thailint-0.1.6-py3-none-any.whl → thailint-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. src/__init__.py +7 -2
  2. src/analyzers/__init__.py +23 -0
  3. src/analyzers/typescript_base.py +148 -0
  4. src/api.py +1 -1
  5. src/cli.py +524 -141
  6. src/config.py +6 -31
  7. src/core/base.py +12 -0
  8. src/core/cli_utils.py +206 -0
  9. src/core/config_parser.py +99 -0
  10. src/core/linter_utils.py +168 -0
  11. src/core/registry.py +17 -92
  12. src/core/rule_discovery.py +132 -0
  13. src/core/violation_builder.py +122 -0
  14. src/linter_config/ignore.py +112 -40
  15. src/linter_config/loader.py +3 -13
  16. src/linters/dry/__init__.py +23 -0
  17. src/linters/dry/base_token_analyzer.py +76 -0
  18. src/linters/dry/block_filter.py +262 -0
  19. src/linters/dry/block_grouper.py +59 -0
  20. src/linters/dry/cache.py +218 -0
  21. src/linters/dry/cache_query.py +61 -0
  22. src/linters/dry/config.py +130 -0
  23. src/linters/dry/config_loader.py +44 -0
  24. src/linters/dry/deduplicator.py +120 -0
  25. src/linters/dry/duplicate_storage.py +126 -0
  26. src/linters/dry/file_analyzer.py +127 -0
  27. src/linters/dry/inline_ignore.py +140 -0
  28. src/linters/dry/linter.py +170 -0
  29. src/linters/dry/python_analyzer.py +517 -0
  30. src/linters/dry/storage_initializer.py +51 -0
  31. src/linters/dry/token_hasher.py +115 -0
  32. src/linters/dry/typescript_analyzer.py +590 -0
  33. src/linters/dry/violation_builder.py +74 -0
  34. src/linters/dry/violation_filter.py +91 -0
  35. src/linters/dry/violation_generator.py +174 -0
  36. src/linters/file_placement/config_loader.py +86 -0
  37. src/linters/file_placement/directory_matcher.py +80 -0
  38. src/linters/file_placement/linter.py +252 -472
  39. src/linters/file_placement/path_resolver.py +61 -0
  40. src/linters/file_placement/pattern_matcher.py +55 -0
  41. src/linters/file_placement/pattern_validator.py +106 -0
  42. src/linters/file_placement/rule_checker.py +229 -0
  43. src/linters/file_placement/violation_factory.py +177 -0
  44. src/linters/nesting/config.py +13 -3
  45. src/linters/nesting/linter.py +76 -152
  46. src/linters/nesting/typescript_analyzer.py +38 -102
  47. src/linters/nesting/typescript_function_extractor.py +130 -0
  48. src/linters/nesting/violation_builder.py +139 -0
  49. src/linters/srp/__init__.py +99 -0
  50. src/linters/srp/class_analyzer.py +113 -0
  51. src/linters/srp/config.py +76 -0
  52. src/linters/srp/heuristics.py +89 -0
  53. src/linters/srp/linter.py +225 -0
  54. src/linters/srp/metrics_evaluator.py +47 -0
  55. src/linters/srp/python_analyzer.py +72 -0
  56. src/linters/srp/typescript_analyzer.py +75 -0
  57. src/linters/srp/typescript_metrics_calculator.py +90 -0
  58. src/linters/srp/violation_builder.py +117 -0
  59. src/orchestrator/core.py +42 -7
  60. src/utils/__init__.py +4 -0
  61. src/utils/project_root.py +84 -0
  62. {thailint-0.1.6.dist-info → thailint-0.2.1.dist-info}/METADATA +414 -63
  63. thailint-0.2.1.dist-info/RECORD +75 -0
  64. src/.ai/layout.yaml +0 -48
  65. thailint-0.1.6.dist-info/RECORD +0 -28
  66. {thailint-0.1.6.dist-info → thailint-0.2.1.dist-info}/LICENSE +0 -0
  67. {thailint-0.1.6.dist-info → thailint-0.2.1.dist-info}/WHEEL +0 -0
  68. {thailint-0.1.6.dist-info → thailint-0.2.1.dist-info}/entry_points.txt +0 -0
src/linters/dry/python_analyzer.py
@@ -0,0 +1,517 @@
+ """
+ Purpose: Python source code tokenization and duplicate block analysis
+
+ Scope: Python-specific code analysis for duplicate detection
+
+ Overview: Analyzes Python source files to extract code blocks for duplicate detection. Inherits
+ from BaseTokenAnalyzer to reuse common token-based hashing and rolling hash window logic.
+ Filters out docstrings at the tokenization level to prevent false positive duplication
+ detection on documentation strings.
+
+ Dependencies: BaseTokenAnalyzer, CodeBlock, DRYConfig, pathlib.Path, ast, TokenHasher
+
+ Exports: PythonDuplicateAnalyzer class
+
+ Interfaces: PythonDuplicateAnalyzer.analyze(file_path: Path, content: str, config: DRYConfig)
+ -> list[CodeBlock]
+
+ Implementation: Uses custom tokenizer that filters docstrings before hashing
+
+ SRP Exception: PythonDuplicateAnalyzer has 32 methods and 358 lines (exceeds max 8 methods/200 lines)
+ Justification: Complex AST analysis algorithm for duplicate code detection with sophisticated
+ false positive filtering. Methods form tightly coupled algorithm pipeline: docstring extraction,
+ tokenization with line tracking, single-statement pattern detection across 5+ AST node types
+ (ClassDef, FunctionDef, Call, Assign, Expr), and context-aware filtering (decorators, function
+ calls, class bodies). Similar to parser or compiler pass architecture where algorithmic
+ cohesion is critical. Splitting would fragment the algorithm logic and make maintenance
+ harder by separating interdependent AST analysis steps. All methods contribute to single
+ responsibility: accurately detecting duplicate Python code while minimizing false positives.
+ """
+
+ import ast
+ from collections.abc import Callable
+ from pathlib import Path
+ from typing import cast
+
+ from .base_token_analyzer import BaseTokenAnalyzer
+ from .block_filter import BlockFilterRegistry, create_default_registry
+ from .cache import CodeBlock
+ from .config import DRYConfig
+
+ # Type alias for AST nodes that have line number attributes
+ # All stmt and expr nodes have lineno and end_lineno after parsing
+ ASTWithLineNumbers = ast.stmt | ast.expr
+
+
+ class PythonDuplicateAnalyzer(BaseTokenAnalyzer):  # thailint: ignore[srp.violation]
+     """Analyzes Python code for duplicate blocks, excluding docstrings.
+
+     SRP suppression: Complex AST analysis algorithm requires 32 methods to implement
+     sophisticated duplicate detection with false positive filtering. See file header for justification.
+     """
+
+     def __init__(self, filter_registry: BlockFilterRegistry | None = None):
+         """Initialize analyzer with optional custom filter registry.
+
+         Args:
+             filter_registry: Custom filter registry (uses defaults if None)
+         """
+         super().__init__()
+         self._filter_registry = filter_registry or create_default_registry()
+
+     def analyze(self, file_path: Path, content: str, config: DRYConfig) -> list[CodeBlock]:
+         """Analyze Python file for duplicate code blocks, excluding docstrings.
+
+         Args:
+             file_path: Path to source file
+             content: File content
+             config: DRY configuration
+
+         Returns:
+             List of CodeBlock instances with hash values
+         """
+         # Get docstring line ranges
+         docstring_ranges = self._get_docstring_ranges_from_content(content)
+
+         # Tokenize with line number tracking
+         lines_with_numbers = self._tokenize_with_line_numbers(content, docstring_ranges)
+
+         # Generate rolling hash windows
+         windows = self._rolling_hash_with_tracking(lines_with_numbers, config.min_duplicate_lines)
+
+         blocks = []
+         for hash_val, start_line, end_line, snippet in windows:
+             # Skip blocks that are single logical statements
+             # Check the original source code, not the normalized snippet
+             if self._is_single_statement_in_source(content, start_line, end_line):
+                 continue
+
+             block = CodeBlock(
+                 file_path=file_path,
+                 start_line=start_line,
+                 end_line=end_line,
+                 snippet=snippet,
+                 hash_value=hash_val,
+             )
+
+             # Apply extensible filters (keyword arguments, imports, etc.)
+             if self._filter_registry.should_filter_block(block, content):
+                 continue
+
+             blocks.append(block)
+
+         return blocks
+
+     def _get_docstring_ranges_from_content(self, content: str) -> set[int]:
+         """Extract line numbers that are part of docstrings.
+
+         Args:
+             content: Python source code
+
+         Returns:
+             Set of line numbers (1-indexed) that are part of docstrings
+         """
+         try:
+             tree = ast.parse(content)
+         except SyntaxError:
+             return set()
+
+         docstring_lines: set[int] = set()
+         for node in ast.walk(tree):
+             self._extract_docstring_lines(node, docstring_lines)
+
+         return docstring_lines
+
+     def _extract_docstring_lines(self, node: ast.AST, docstring_lines: set[int]) -> None:
+         """Extract docstring line numbers from a node."""
+         docstring = self._get_docstring_safe(node)
+         if not docstring:
+             return
+
+         if not hasattr(node, "body") or not node.body:
+             return
+
+         first_stmt = node.body[0]
+         if self._is_docstring_node(first_stmt):
+             self._add_line_range(first_stmt, docstring_lines)
+
+     @staticmethod
+     def _get_docstring_safe(node: ast.AST) -> str | None:
+         """Safely get docstring from node, returning None on error."""
+         try:
+             return ast.get_docstring(node, clean=False)  # type: ignore[arg-type]
+         except TypeError:
+             return None
+
+     @staticmethod
+     def _is_docstring_node(node: ast.stmt) -> bool:
+         """Check if a statement node is a docstring."""
+         return isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant)
+
+     @staticmethod
+     def _add_line_range(node: ast.stmt, line_set: set[int]) -> None:
+         """Add all line numbers from node's line range to the set."""
+         if node.lineno and node.end_lineno:
+             for line_num in range(node.lineno, node.end_lineno + 1):
+                 line_set.add(line_num)
+
+     def _tokenize_with_line_numbers(
+         self, content: str, docstring_lines: set[int]
+     ) -> list[tuple[int, str]]:
+         """Tokenize code while tracking original line numbers and skipping docstrings.
+
+         Args:
+             content: Source code
+             docstring_lines: Set of line numbers that are docstrings
+
+         Returns:
+             List of (original_line_number, normalized_code) tuples
+         """
+         lines_with_numbers = []
+
+         for line_num, line in enumerate(content.split("\n"), start=1):
+             # Skip docstring lines
+             if line_num in docstring_lines:
+                 continue
+
+             # Use hasher's existing tokenization logic
+             line = self._hasher._strip_comments(line)  # pylint: disable=protected-access
+             line = " ".join(line.split())
+
+             if not line:
+                 continue
+
+             if self._hasher._is_import_statement(line):  # pylint: disable=protected-access
+                 continue
+
+             lines_with_numbers.append((line_num, line))
+
+         return lines_with_numbers
+
+     def _rolling_hash_with_tracking(
+         self, lines_with_numbers: list[tuple[int, str]], window_size: int
+     ) -> list[tuple[int, int, int, str]]:
+         """Create rolling hash windows while preserving original line numbers.
+
+         Args:
+             lines_with_numbers: List of (line_number, code) tuples
+             window_size: Number of lines per window
+
+         Returns:
+             List of (hash_value, start_line, end_line, snippet) tuples
+         """
+         if len(lines_with_numbers) < window_size:
+             return []
+
+         hashes = []
+         for i in range(len(lines_with_numbers) - window_size + 1):
+             window = lines_with_numbers[i : i + window_size]
+
+             # Extract just the code for hashing
+             code_lines = [code for _, code in window]
+             snippet = "\n".join(code_lines)
+             hash_val = hash(snippet)
+
+             # Get original line numbers
+             start_line = window[0][0]
+             end_line = window[-1][0]
+
+             hashes.append((hash_val, start_line, end_line, snippet))
+
+         return hashes
+
+     def _is_single_statement_in_source(self, content: str, start_line: int, end_line: int) -> bool:
+         """Check if a line range in the original source is a single logical statement."""
+         tree = self._parse_content_safe(content)
+         if tree is None:
+             return False
+
+         return self._check_overlapping_nodes(tree, start_line, end_line)
+
+     @staticmethod
+     def _parse_content_safe(content: str) -> ast.Module | None:
+         """Parse content, returning None on syntax error."""
+         try:
+             return ast.parse(content)
+         except SyntaxError:
+             return None
+
+     def _check_overlapping_nodes(self, tree: ast.Module, start_line: int, end_line: int) -> bool:
+         """Check if any AST node overlaps and matches single-statement pattern."""
+         for node in ast.walk(tree):
+             if self._node_overlaps_and_matches(node, start_line, end_line):
+                 return True
+         return False
+
+     def _node_overlaps_and_matches(self, node: ast.AST, start_line: int, end_line: int) -> bool:
+         """Check if node overlaps with range and matches single-statement pattern."""
+         if not hasattr(node, "lineno") or not hasattr(node, "end_lineno"):
+             return False
+
+         overlaps = not (node.end_lineno < start_line or node.lineno > end_line)
+         if not overlaps:
+             return False
+
+         return self._is_single_statement_pattern(node, start_line, end_line)
+
+     def _is_single_statement_pattern(self, node: ast.AST, start_line: int, end_line: int) -> bool:
+         """Check if an AST node represents a single-statement pattern to filter.
+
+         Args:
+             node: AST node that overlaps with the line range
+             start_line: Starting line number (1-indexed)
+             end_line: Ending line number (1-indexed)
+
+         Returns:
+             True if this node represents a single logical statement pattern
+         """
+         contains = self._node_contains_range(node, start_line, end_line)
+         if contains is None:
+             return False
+
+         return self._dispatch_pattern_check(node, start_line, end_line, contains)
+
+     def _node_contains_range(self, node: ast.AST, start_line: int, end_line: int) -> bool | None:
+         """Check if node completely contains the range. Returns None if invalid."""
+         if not self._has_valid_line_numbers(node):
+             return None
+         # Type narrowing: _has_valid_line_numbers ensures node has line numbers
+         # Safe to cast after validation check above
+         typed_node = cast(ASTWithLineNumbers, node)
+         # Use type: ignore to suppress MyPy's inability to understand runtime validation
+         return typed_node.lineno <= start_line and typed_node.end_lineno >= end_line  # type: ignore[operator]
+
+     @staticmethod
+     def _has_valid_line_numbers(node: ast.AST) -> bool:
+         """Check if node has valid line number attributes."""
+         if not (hasattr(node, "lineno") and hasattr(node, "end_lineno")):
+             return False
+         return node.lineno is not None and node.end_lineno is not None
+
+     def _dispatch_pattern_check(
+         self, node: ast.AST, start_line: int, end_line: int, contains: bool
+     ) -> bool:
+         """Dispatch to node-type-specific pattern checkers."""
+         # Simple containment check for Expr nodes
+         if isinstance(node, ast.Expr):
+             return contains
+
+         # Delegate to specialized checkers
+         return self._check_specific_pattern(node, start_line, end_line, contains)
+
+     def _check_specific_pattern(
+         self, node: ast.AST, start_line: int, end_line: int, contains: bool
+     ) -> bool:
+         """Check specific node types with their pattern rules."""
+         if isinstance(node, ast.ClassDef):
+             return self._check_class_def_pattern(node, start_line, end_line)
+         if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+             return self._check_function_def_pattern(node, start_line, end_line)
+         if isinstance(node, ast.Call):
+             return self._check_call_pattern(node, start_line, end_line, contains)
+         if isinstance(node, ast.Assign):
+             return self._check_assign_pattern(node, start_line, end_line, contains)
+         return False
+
+     def _check_class_def_pattern(self, node: ast.ClassDef, start_line: int, end_line: int) -> bool:
+         """Check if range is in class field definitions (not method bodies)."""
+         first_method_line = self._find_first_method_line(node)
+         class_start = self._get_class_start_with_decorators(node)
+         return self._is_in_class_fields_area(
+             class_start, start_line, end_line, first_method_line, node.end_lineno
+         )
+
+     @staticmethod
+     def _find_first_method_line(node: ast.ClassDef) -> int | None:
+         """Find line number of first method in class."""
+         for item in node.body:
+             if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                 return item.lineno
+         return None
+
+     @staticmethod
+     def _get_class_start_with_decorators(node: ast.ClassDef) -> int:
+         """Get class start line, including decorators if present."""
+         if node.decorator_list:
+             return min(d.lineno for d in node.decorator_list)
+         return node.lineno
+
+     @staticmethod
+     def _is_in_class_fields_area(
+         class_start: int,
+         start_line: int,
+         end_line: int,
+         first_method_line: int | None,
+         class_end_line: int | None,
+     ) -> bool:
+         """Check if range is in class fields area (before methods)."""
+         if first_method_line is not None:
+             return class_start <= start_line and end_line < first_method_line
+         if class_end_line is not None:
+             return class_start <= start_line and class_end_line >= end_line
+         return False
+
+     def _check_function_def_pattern(
+         self, node: ast.FunctionDef | ast.AsyncFunctionDef, start_line: int, end_line: int
+     ) -> bool:
+         """Check if range is in function decorator pattern."""
+         if not node.decorator_list:
+             return False
+
+         first_decorator_line = min(d.lineno for d in node.decorator_list)
+         first_body_line = self._get_function_body_start(node)
+
+         if first_body_line is None:
+             return False
+
+         return start_line >= first_decorator_line and end_line < first_body_line
+
+     @staticmethod
+     def _get_function_body_start(node: ast.FunctionDef | ast.AsyncFunctionDef) -> int | None:
+         """Get the line number where function body starts."""
+         if not node.body or not hasattr(node.body[0], "lineno"):
+             return None
+         return node.body[0].lineno
+
+     def _check_call_pattern(
+         self, node: ast.Call, start_line: int, end_line: int, contains: bool
+     ) -> bool:
+         """Check if range is part of a function/constructor call."""
+         return self._check_multiline_or_contained(node, start_line, end_line, contains)
+
+     def _check_assign_pattern(
+         self, node: ast.Assign, start_line: int, end_line: int, contains: bool
+     ) -> bool:
+         """Check if range is part of a multi-line assignment."""
+         return self._check_multiline_or_contained(node, start_line, end_line, contains)
+
+     def _check_multiline_or_contained(
+         self, node: ast.AST, start_line: int, end_line: int, contains: bool
+     ) -> bool:
+         """Check if node is multiline containing start, or single-line containing range."""
+         if not self._has_valid_line_numbers(node):
+             return False
+
+         # Type narrowing: _has_valid_line_numbers ensures node has line numbers
+         # Safe to cast after validation check above
+         typed_node = cast(ASTWithLineNumbers, node)
+         # Use type: ignore to suppress MyPy's inability to understand runtime validation
+         is_multiline = typed_node.lineno < typed_node.end_lineno  # type: ignore[operator]
+         if is_multiline:
+             return typed_node.lineno <= start_line <= typed_node.end_lineno  # type: ignore[operator]
+         return contains
+
+     def _is_standalone_single_statement(
+         self, lines: list[str], start_line: int, end_line: int
+     ) -> bool:
+         """Check if the exact range parses as a single statement on its own."""
+         source_lines = lines[start_line - 1 : end_line]
+         source_snippet = "\n".join(source_lines)
+
+         try:
+             tree = ast.parse(source_snippet)
+             return len(tree.body) == 1
+         except SyntaxError:
+             return False
+
+     def _check_ast_context(  # pylint: disable=too-many-arguments,too-many-positional-arguments
+         self,
+         lines: list[str],
+         start_line: int,
+         end_line: int,
+         lookback: int,
+         lookforward: int,
+         predicate: Callable[[ast.Module, int], bool],
+     ) -> bool:
+         """Generic helper for AST-based context checking.
+
+         Args:
+             lines: Source file lines
+             start_line: Starting line number (1-indexed)
+             end_line: Ending line number (1-indexed)
+             lookback: Number of lines to look backward
+             lookforward: Number of lines to look forward
+             predicate: Function that takes AST tree and returns bool
+
+         Returns:
+             True if predicate returns True for the parsed context
+         """
+         lookback_start = max(0, start_line - lookback)
+         lookforward_end = min(len(lines), end_line + lookforward)
+
+         context_lines = lines[lookback_start:lookforward_end]
+         context = "\n".join(context_lines)
+
+         try:
+             tree = ast.parse(context)
+             return predicate(tree, lookback_start)
+         except SyntaxError:
+             pass
+
+         return False
+
+     def _is_part_of_decorator(self, lines: list[str], start_line: int, end_line: int) -> bool:
+         """Check if lines are part of a decorator + function definition.
+
+         A decorator pattern is @something(...) followed by def/class.
+         """
+
+         def has_decorators(tree: ast.Module, _lookback_start: int) -> bool:
+             """Check if any function or class in the tree has decorators."""
+             for stmt in tree.body:
+                 if isinstance(stmt, (ast.FunctionDef, ast.ClassDef)) and stmt.decorator_list:
+                     return True
+             return False
+
+         return self._check_ast_context(lines, start_line, end_line, 10, 10, has_decorators)
+
+     def _is_part_of_function_call(self, lines: list[str], start_line: int, end_line: int) -> bool:
+         """Check if lines are arguments inside a function/constructor call.
+
+         Detects patterns like:
+             obj = Constructor(
+                 arg1=value1,
+                 arg2=value2,
+             )
+         """
+
+         def is_single_non_function_statement(tree: ast.Module, _lookback_start: int) -> bool:
+             """Check if context has exactly one statement that's not a function/class def."""
+             return len(tree.body) == 1 and not isinstance(
+                 tree.body[0], (ast.FunctionDef, ast.ClassDef)
+             )
+
+         return self._check_ast_context(
+             lines, start_line, end_line, 10, 10, is_single_non_function_statement
+         )
+
+     def _is_part_of_class_body(self, lines: list[str], start_line: int, end_line: int) -> bool:
+         """Check if lines are field definitions inside a class body.
+
+         Detects patterns like:
+             class Foo:
+                 field1: Type1
+                 field2: Type2
+         """
+
+         def is_within_class_body(tree: ast.Module, lookback_start: int) -> bool:
+             """Check if flagged range falls within a class body."""
+             for stmt in tree.body:
+                 if not isinstance(stmt, ast.ClassDef):
+                     continue
+
+                 # Adjust line numbers: stmt.lineno is relative to context
+                 # We need to convert back to original file line numbers
+                 class_start_in_context = stmt.lineno
+                 class_end_in_context = stmt.end_lineno if stmt.end_lineno else stmt.lineno
+
+                 # Convert to original file line numbers (1-indexed)
+                 class_start_original = lookback_start + class_start_in_context
+                 class_end_original = lookback_start + class_end_in_context
+
+                 # Check if the flagged range overlaps with class body
+                 if start_line >= class_start_original and end_line <= class_end_original:
+                     return True
+             return False
+
+         return self._check_ast_context(lines, start_line, end_line, 10, 5, is_within_class_body)
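
Illustrative sketch (not part of the diff): how the analyzer above would be driven end to end. The import paths mirror this wheel's layout, but DRYConfig's constructor keyword and CodeBlock's field names are assumptions inferred from their usage in this file, not a documented API.

    from pathlib import Path

    from src.linters.dry.config import DRYConfig
    from src.linters.dry.python_analyzer import PythonDuplicateAnalyzer

    source_path = Path("example.py")
    content = source_path.read_text()

    analyzer = PythonDuplicateAnalyzer()
    config = DRYConfig(min_duplicate_lines=4)  # assumed keyword; analyze() reads this field
    blocks = analyzer.analyze(source_path, content, config)

    # Each CodeBlock carries the original (docstring-free) line range and its hash;
    # an equal hash_value across blocks marks candidate duplicates.
    for block in blocks:
        print(block.file_path, block.start_line, block.end_line, block.hash_value)

Since hash_value comes from Python's built-in hash(), which is salted per process for strings, equal values are comparable within a single run.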
src/linters/dry/storage_initializer.py
@@ -0,0 +1,51 @@
+ """
+ Purpose: Storage initialization for DRY linter
+
+ Scope: Initializes DuplicateStorage with cache or in-memory fallback
+
+ Overview: Handles storage initialization based on DRY configuration. Creates SQLite cache when
+ cache_enabled is true, or triggers in-memory fallback when false (Decision 6). Separates
+ initialization logic from main linter rule to maintain SRP compliance.
+
+ Dependencies: BaseLintContext, DRYConfig, DRYCache, DuplicateStorage, Path
+
+ Exports: StorageInitializer class
+
+ Interfaces: StorageInitializer.initialize(context, config) -> DuplicateStorage
+
+ Implementation: Creates cache if enabled, delegates to DuplicateStorage for storage management
+ """
+
+ from pathlib import Path
+
+ from src.core.base import BaseLintContext
+
+ from .cache import DRYCache
+ from .config import DRYConfig
+ from .duplicate_storage import DuplicateStorage
+
+
+ class StorageInitializer:
+     """Initializes storage for duplicate detection."""
+
+     def initialize(self, context: BaseLintContext, config: DRYConfig) -> DuplicateStorage:
+         """Initialize storage based on configuration.
+
+         Args:
+             context: Lint context
+             config: DRY configuration
+
+         Returns:
+             DuplicateStorage instance
+         """
+         cache = None
+         if config.cache_enabled:
+             # Use SQLite cache
+             metadata = getattr(context, "metadata", {})
+             project_root = metadata.get("_project_root", Path.cwd())
+             cache_path = project_root / config.cache_path
+             cache_path.parent.mkdir(parents=True, exist_ok=True)
+             cache = DRYCache(cache_path)
+         # else: cache = None triggers in-memory fallback in DuplicateStorage
+
+         return DuplicateStorage(cache)
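
A minimal sketch of the path handling above (values assumed for illustration; the real cache_path default lives in DRYConfig):

    from pathlib import Path

    project_root = Path("/home/user/project")      # from context metadata, else Path.cwd()
    cache_path_setting = ".thailint/dry-cache.db"  # hypothetical DRYConfig.cache_path value

    cache_file = project_root / cache_path_setting
    cache_file.parent.mkdir(parents=True, exist_ok=True)  # same call initialize() makes
    # -> SQLite cache at /home/user/project/.thailint/dry-cache.db

With cache_enabled false, initialize() passes cache=None and DuplicateStorage falls back to in-memory storage, so a run proceeds without touching the filesystem.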
src/linters/dry/token_hasher.py
@@ -0,0 +1,115 @@
+ """
+ Purpose: Tokenization and rolling hash generation for code deduplication
+
+ Scope: Code normalization, comment stripping, and hash window generation
+
+ Overview: Implements token-based hashing algorithm (Rabin-Karp) for detecting code duplicates.
+ Normalizes source code by stripping comments and whitespace, then generates rolling hash
+ windows over consecutive lines. Each window represents a potential duplicate code block.
+ Uses Python's built-in hash function for simplicity and performance. Supports both Python
+ and JavaScript/TypeScript comment styles.
+
+ Dependencies: Python built-in hash function
+
+ Exports: TokenHasher class
+
+ Interfaces: TokenHasher.tokenize(code: str) -> list[str],
+ TokenHasher.rolling_hash(lines: list[str], window_size: int) -> list[tuple]
+
+ Implementation: Token-based normalization with rolling window algorithm, language-agnostic approach
+ """
+
+
+ class TokenHasher:
+     """Tokenize code and create rolling hashes for duplicate detection."""
+
+     def tokenize(self, code: str) -> list[str]:
+         """Tokenize code by stripping comments and normalizing whitespace.
+
+         Args:
+             code: Source code string
+
+         Returns:
+             List of normalized code lines (non-empty, comments removed, imports filtered)
+         """
+         lines = []
+
+         for line in code.split("\n"):
+             # Remove comments (language-specific logic can be added)
+             line = self._strip_comments(line)
+
+             # Normalize whitespace (collapse to single space)
+             line = " ".join(line.split())
+
+             # Skip empty lines
+             if not line:
+                 continue
+
+             # Skip import statements (common false positive)
+             if self._is_import_statement(line):
+                 continue
+
+             lines.append(line)
+
+         return lines
+
+     def _strip_comments(self, line: str) -> str:
+         """Remove comments from line (Python # and // style).
+
+         Args:
+             line: Source code line
+
+         Returns:
+             Line with comments removed
+         """
+         # Python comments
+         if "#" in line:
+             line = line[: line.index("#")]
+
+         # JavaScript/TypeScript comments
+         if "//" in line:
+             line = line[: line.index("//")]
+
+         return line
+
+     def _is_import_statement(self, line: str) -> bool:
+         """Check if line is an import statement.
+
+         Args:
+             line: Normalized code line
+
+         Returns:
+             True if line is an import statement
+         """
+         # Check all import/export patterns
+         import_prefixes = ("import ", "from ", "export ")
+         import_tokens = ("{", "}", "} from")
+
+         return line.startswith(import_prefixes) or line in import_tokens
+
+     def rolling_hash(self, lines: list[str], window_size: int) -> list[tuple[int, int, int, str]]:
+         """Create rolling hash windows over code lines.
+
+         Args:
+             lines: List of normalized code lines
+             window_size: Number of lines per window (min_duplicate_lines)
+
+         Returns:
+             List of tuples: (hash_value, start_line, end_line, code_snippet)
+         """
+         if len(lines) < window_size:
+             return []
+
+         hashes = []
+         for i in range(len(lines) - window_size + 1):
+             window = lines[i : i + window_size]
+             snippet = "\n".join(window)
+             hash_val = hash(snippet)
+
+             # Line numbers are 1-indexed
+             start_line = i + 1
+             end_line = i + window_size
+
+             hashes.append((hash_val, start_line, end_line, snippet))
+
+         return hashes
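
Worked example (illustrative, assuming the wheel's module layout is importable): tokenize() drops the import line and the trailing comment, and rolling_hash() then emits one window per run of window_size consecutive normalized lines.

    from src.linters.dry.token_hasher import TokenHasher

    hasher = TokenHasher()
    code = """
    import os

    total = price * qty   # compute line total
    total += shipping
    print(total)
    """

    lines = hasher.tokenize(code)
    # -> ['total = price * qty', 'total += shipping', 'print(total)']

    windows = hasher.rolling_hash(lines, window_size=2)
    # Two overlapping windows over the three normalized lines:
    #   (hash('total = price * qty\ntotal += shipping'), 1, 2, ...)
    #   (hash('total += shipping\nprint(total)'),        2, 3, ...)

Note the start/end numbers here index the normalized line list, not the original file; the Python analyzer earlier in this diff restores original file line numbers via _rolling_hash_with_tracking.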