thailint-0.4.3.tar.gz → thailint-0.4.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {thailint-0.4.3 → thailint-0.4.5}/PKG-INFO +4 -2
  2. {thailint-0.4.3 → thailint-0.4.5}/pyproject.toml +1 -1
  3. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/python_analyzer.py +176 -36
  4. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/token_hasher.py +63 -9
  5. thailint-0.4.5/src/linters/file_header/__init__.py +24 -0
  6. thailint-0.4.5/src/linters/file_header/atemporal_detector.py +87 -0
  7. thailint-0.4.5/src/linters/file_header/config.py +66 -0
  8. thailint-0.4.5/src/linters/file_header/field_validator.py +69 -0
  9. thailint-0.4.5/src/linters/file_header/linter.py +313 -0
  10. thailint-0.4.5/src/linters/file_header/python_parser.py +86 -0
  11. thailint-0.4.5/src/linters/file_header/violation_builder.py +78 -0
  12. {thailint-0.4.3 → thailint-0.4.5}/src/orchestrator/core.py +12 -2
  13. {thailint-0.4.3 → thailint-0.4.5}/CHANGELOG.md +0 -0
  14. {thailint-0.4.3 → thailint-0.4.5}/LICENSE +0 -0
  15. {thailint-0.4.3 → thailint-0.4.5}/README.md +0 -0
  16. {thailint-0.4.3 → thailint-0.4.5}/src/__init__.py +0 -0
  17. {thailint-0.4.3 → thailint-0.4.5}/src/analyzers/__init__.py +0 -0
  18. {thailint-0.4.3 → thailint-0.4.5}/src/analyzers/typescript_base.py +0 -0
  19. {thailint-0.4.3 → thailint-0.4.5}/src/api.py +0 -0
  20. {thailint-0.4.3 → thailint-0.4.5}/src/cli.py +0 -0
  21. {thailint-0.4.3 → thailint-0.4.5}/src/config.py +0 -0
  22. {thailint-0.4.3 → thailint-0.4.5}/src/core/__init__.py +0 -0
  23. {thailint-0.4.3 → thailint-0.4.5}/src/core/base.py +0 -0
  24. {thailint-0.4.3 → thailint-0.4.5}/src/core/cli_utils.py +0 -0
  25. {thailint-0.4.3 → thailint-0.4.5}/src/core/config_parser.py +0 -0
  26. {thailint-0.4.3 → thailint-0.4.5}/src/core/linter_utils.py +0 -0
  27. {thailint-0.4.3 → thailint-0.4.5}/src/core/registry.py +0 -0
  28. {thailint-0.4.3 → thailint-0.4.5}/src/core/rule_discovery.py +0 -0
  29. {thailint-0.4.3 → thailint-0.4.5}/src/core/types.py +0 -0
  30. {thailint-0.4.3 → thailint-0.4.5}/src/core/violation_builder.py +0 -0
  31. {thailint-0.4.3 → thailint-0.4.5}/src/linter_config/__init__.py +0 -0
  32. {thailint-0.4.3 → thailint-0.4.5}/src/linter_config/ignore.py +0 -0
  33. {thailint-0.4.3 → thailint-0.4.5}/src/linter_config/loader.py +0 -0
  34. {thailint-0.4.3 → thailint-0.4.5}/src/linters/__init__.py +0 -0
  35. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/__init__.py +0 -0
  36. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/base_token_analyzer.py +0 -0
  37. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/block_filter.py +0 -0
  38. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/block_grouper.py +0 -0
  39. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/cache.py +0 -0
  40. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/cache_query.py +0 -0
  41. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/config.py +0 -0
  42. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/config_loader.py +0 -0
  43. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/deduplicator.py +0 -0
  44. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/duplicate_storage.py +0 -0
  45. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/file_analyzer.py +0 -0
  46. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/inline_ignore.py +0 -0
  47. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/linter.py +0 -0
  48. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/storage_initializer.py +0 -0
  49. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/typescript_analyzer.py +0 -0
  50. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/violation_builder.py +0 -0
  51. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/violation_filter.py +0 -0
  52. {thailint-0.4.3 → thailint-0.4.5}/src/linters/dry/violation_generator.py +0 -0
  53. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/__init__.py +0 -0
  54. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/config_loader.py +0 -0
  55. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/directory_matcher.py +0 -0
  56. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/linter.py +0 -0
  57. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/path_resolver.py +0 -0
  58. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/pattern_matcher.py +0 -0
  59. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/pattern_validator.py +0 -0
  60. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/rule_checker.py +0 -0
  61. {thailint-0.4.3 → thailint-0.4.5}/src/linters/file_placement/violation_factory.py +0 -0
  62. {thailint-0.4.3 → thailint-0.4.5}/src/linters/magic_numbers/__init__.py +0 -0
  63. {thailint-0.4.3 → thailint-0.4.5}/src/linters/magic_numbers/config.py +0 -0
  64. {thailint-0.4.3 → thailint-0.4.5}/src/linters/magic_numbers/context_analyzer.py +0 -0
  65. {thailint-0.4.3 → thailint-0.4.5}/src/linters/magic_numbers/linter.py +0 -0
  66. {thailint-0.4.3 → thailint-0.4.5}/src/linters/magic_numbers/python_analyzer.py +0 -0
  67. {thailint-0.4.3 → thailint-0.4.5}/src/linters/magic_numbers/typescript_analyzer.py +0 -0
  68. {thailint-0.4.3 → thailint-0.4.5}/src/linters/magic_numbers/violation_builder.py +0 -0
  69. {thailint-0.4.3 → thailint-0.4.5}/src/linters/nesting/__init__.py +0 -0
  70. {thailint-0.4.3 → thailint-0.4.5}/src/linters/nesting/config.py +0 -0
  71. {thailint-0.4.3 → thailint-0.4.5}/src/linters/nesting/linter.py +0 -0
  72. {thailint-0.4.3 → thailint-0.4.5}/src/linters/nesting/python_analyzer.py +0 -0
  73. {thailint-0.4.3 → thailint-0.4.5}/src/linters/nesting/typescript_analyzer.py +0 -0
  74. {thailint-0.4.3 → thailint-0.4.5}/src/linters/nesting/typescript_function_extractor.py +0 -0
  75. {thailint-0.4.3 → thailint-0.4.5}/src/linters/nesting/violation_builder.py +0 -0
  76. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/__init__.py +0 -0
  77. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/class_analyzer.py +0 -0
  78. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/config.py +0 -0
  79. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/heuristics.py +0 -0
  80. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/linter.py +0 -0
  81. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/metrics_evaluator.py +0 -0
  82. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/python_analyzer.py +0 -0
  83. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/typescript_analyzer.py +0 -0
  84. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/typescript_metrics_calculator.py +0 -0
  85. {thailint-0.4.3 → thailint-0.4.5}/src/linters/srp/violation_builder.py +0 -0
  86. {thailint-0.4.3 → thailint-0.4.5}/src/orchestrator/__init__.py +0 -0
  87. {thailint-0.4.3 → thailint-0.4.5}/src/orchestrator/language_detector.py +0 -0
  88. {thailint-0.4.3 → thailint-0.4.5}/src/templates/thailint_config_template.yaml +0 -0
  89. {thailint-0.4.3 → thailint-0.4.5}/src/utils/__init__.py +0 -0
  90. {thailint-0.4.3 → thailint-0.4.5}/src/utils/project_root.py +0 -0
--- thailint-0.4.3/PKG-INFO
+++ thailint-0.4.5/PKG-INFO
@@ -1,8 +1,9 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: thailint
-Version: 0.4.3
+Version: 0.4.5
 Summary: The AI Linter - Enterprise-grade linting and governance for AI-generated code across multiple languages
 License: MIT
+License-File: LICENSE
 Keywords: linter,ai,code-quality,static-analysis,file-placement,governance,multi-language,cli,docker,python
 Author: Steve Jackson
 Requires-Python: >=3.11,<4.0
@@ -15,6 +16,7 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Software Development :: Quality Assurance
--- thailint-0.4.3/pyproject.toml
+++ thailint-0.4.5/pyproject.toml
@@ -17,7 +17,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "thailint"
-version = "0.4.3"
+version = "0.4.5"
 description = "The AI Linter - Enterprise-grade linting and governance for AI-generated code across multiple languages"
 authors = ["Steve Jackson"]
 license = "MIT"
--- thailint-0.4.3/src/linters/dry/python_analyzer.py
+++ thailint-0.4.5/src/linters/dry/python_analyzer.py
@@ -62,8 +62,15 @@ class PythonDuplicateAnalyzer(BaseTokenAnalyzer): # thailint: ignore[srp.violat
         """
         super().__init__()
         self._filter_registry = filter_registry or create_default_registry()
-
-    def analyze(self, file_path: Path, content: str, config: DRYConfig) -> list[CodeBlock]:
+        # Performance optimization: Cache parsed AST to avoid re-parsing for each hash window
+        self._cached_ast: ast.Module | None = None
+        self._cached_content: str | None = None
+        # Performance optimization: Line-to-node index for O(1) lookups instead of O(n) ast.walk()
+        self._line_to_nodes: dict[int, list[ast.AST]] | None = None
+
+    def analyze(  # thailint: ignore[nesting.excessive-depth]
+        self, file_path: Path, content: str, config: DRYConfig
+    ) -> list[CodeBlock]:
         """Analyze Python file for duplicate code blocks, excluding docstrings.
 
         Args:
@@ -74,37 +81,73 @@ class PythonDuplicateAnalyzer(BaseTokenAnalyzer): # thailint: ignore[srp.violat
         Returns:
             List of CodeBlock instances with hash values
         """
-        # Get docstring line ranges
-        docstring_ranges = self._get_docstring_ranges_from_content(content)
+        # Performance optimization: Parse AST once and cache for _is_single_statement_in_source() calls
+        self._cached_ast = self._parse_content_safe(content)
+        self._cached_content = content
+
+        # Performance optimization: Build line-to-node index for O(1) lookups
+        self._line_to_nodes = self._build_line_to_node_index(self._cached_ast)
+
+        try:
+            # Get docstring line ranges
+            docstring_ranges = self._get_docstring_ranges_from_content(content)
 
-        # Tokenize with line number tracking
-        lines_with_numbers = self._tokenize_with_line_numbers(content, docstring_ranges)
+            # Tokenize with line number tracking
+            lines_with_numbers = self._tokenize_with_line_numbers(content, docstring_ranges)
 
-        # Generate rolling hash windows
-        windows = self._rolling_hash_with_tracking(lines_with_numbers, config.min_duplicate_lines)
+            # Generate rolling hash windows
+            windows = self._rolling_hash_with_tracking(
+                lines_with_numbers, config.min_duplicate_lines
+            )
+
+            return self._filter_valid_blocks(windows, file_path, content)
+        finally:
+            # Clear cache after analysis to avoid memory leaks
+            self._cached_ast = None
+            self._cached_content = None
+            self._line_to_nodes = None
 
+    def _filter_valid_blocks(
+        self,
+        windows: list[tuple[int, int, int, str]],
+        file_path: Path,
+        content: str,
+    ) -> list[CodeBlock]:
+        """Filter hash windows and create valid CodeBlock instances."""
         blocks = []
         for hash_val, start_line, end_line, snippet in windows:
-            # Skip blocks that are single logical statements
-            # Check the original source code, not the normalized snippet
-            if self._is_single_statement_in_source(content, start_line, end_line):
-                continue
-
-            block = CodeBlock(
-                file_path=file_path,
-                start_line=start_line,
-                end_line=end_line,
-                snippet=snippet,
-                hash_value=hash_val,
+            block = self._create_block_if_valid(
+                file_path, content, hash_val, start_line, end_line, snippet
             )
+            if block:
+                blocks.append(block)
+        return blocks
 
-            # Apply extensible filters (keyword arguments, imports, etc.)
-            if self._filter_registry.should_filter_block(block, content):
-                continue
+    def _create_block_if_valid(  # pylint: disable=too-many-arguments,too-many-positional-arguments
+        self,
+        file_path: Path,
+        content: str,
+        hash_val: int,
+        start_line: int,
+        end_line: int,
+        snippet: str,
+    ) -> CodeBlock | None:
+        """Create CodeBlock if it passes all validation checks."""
+        if self._is_single_statement_in_source(content, start_line, end_line):
+            return None
 
-            blocks.append(block)
+        block = CodeBlock(
+            file_path=file_path,
+            start_line=start_line,
+            end_line=end_line,
+            snippet=snippet,
+            hash_value=hash_val,
+        )
 
-        return blocks
+        if self._filter_registry.should_filter_block(block, content):
+            return None
+
+        return block
 
     def _get_docstring_ranges_from_content(self, content: str) -> set[int]:
         """Extract line numbers that are part of docstrings.
@@ -172,20 +215,21 @@ class PythonDuplicateAnalyzer(BaseTokenAnalyzer): # thailint: ignore[srp.violat
             List of (original_line_number, normalized_code) tuples
         """
         lines_with_numbers = []
+        in_multiline_import = False
 
         for line_num, line in enumerate(content.split("\n"), start=1):
-            # Skip docstring lines
             if line_num in docstring_lines:
                 continue
 
-            # Use hasher's existing tokenization logic
-            line = self._hasher._strip_comments(line)  # pylint: disable=protected-access
-            line = " ".join(line.split())
-
+            line = self._hasher._normalize_line(line)  # pylint: disable=protected-access
             if not line:
                 continue
 
-            if self._hasher._is_import_statement(line):  # pylint: disable=protected-access
+            # Update multi-line import state and check if line should be skipped
+            in_multiline_import, should_skip = self._hasher._should_skip_import_line(  # pylint: disable=protected-access
+                line, in_multiline_import
+            )
+            if should_skip:
                 continue
 
             lines_with_numbers.append((line_num, line))
@@ -225,10 +269,20 @@ class PythonDuplicateAnalyzer(BaseTokenAnalyzer): # thailint: ignore[srp.violat
         return hashes
 
     def _is_single_statement_in_source(self, content: str, start_line: int, end_line: int) -> bool:
-        """Check if a line range in the original source is a single logical statement."""
-        tree = self._parse_content_safe(content)
-        if tree is None:
-            return False
+        """Check if a line range in the original source is a single logical statement.
+
+        Performance optimization: Uses cached AST if available (set by analyze() method)
+        to avoid re-parsing the entire file for each hash window check.
+        """
+        # Use cached AST if available and content matches
+        tree: ast.Module | None
+        if self._cached_ast is not None and content == self._cached_content:
+            tree = self._cached_ast
+        else:
+            # Fallback: parse content (used by tests or standalone calls)
+            tree = self._parse_content_safe(content)
+            if tree is None:
+                return False
 
         return self._check_overlapping_nodes(tree, start_line, end_line)
 
@@ -240,13 +294,99 @@ class PythonDuplicateAnalyzer(BaseTokenAnalyzer): # thailint: ignore[srp.violat
         except SyntaxError:
             return None
 
+    @staticmethod
+    def _build_line_to_node_index(tree: ast.Module | None) -> dict[int, list[ast.AST]] | None:
+        """Build an index mapping each line number to overlapping AST nodes.
+
+        Performance optimization: This allows O(1) lookups instead of O(n) ast.walk() calls.
+        For a file with 5,144 nodes and 673 hash windows, this reduces 3.46M node operations
+        to just ~3,365 relevant node checks (99.9% reduction).
+
+        Args:
+            tree: Parsed AST tree (None if parsing failed)
+
+        Returns:
+            Dictionary mapping line numbers to list of AST nodes overlapping that line,
+            or None if tree is None
+        """
+        if tree is None:
+            return None
+
+        line_to_nodes: dict[int, list[ast.AST]] = {}
+        for node in ast.walk(tree):
+            if PythonDuplicateAnalyzer._node_has_line_info(node):
+                PythonDuplicateAnalyzer._add_node_to_index(node, line_to_nodes)
+
+        return line_to_nodes
+
+    @staticmethod
+    def _node_has_line_info(node: ast.AST) -> bool:
+        """Check if node has valid line number information."""
+        if not hasattr(node, "lineno") or not hasattr(node, "end_lineno"):
+            return False
+        return node.lineno is not None and node.end_lineno is not None
+
+    @staticmethod
+    def _add_node_to_index(node: ast.AST, line_to_nodes: dict[int, list[ast.AST]]) -> None:
+        """Add node to all lines it overlaps in the index."""
+        for line_num in range(node.lineno, node.end_lineno + 1):  # type: ignore[attr-defined]
+            if line_num not in line_to_nodes:
+                line_to_nodes[line_num] = []
+            line_to_nodes[line_num].append(node)
+
     def _check_overlapping_nodes(self, tree: ast.Module, start_line: int, end_line: int) -> bool:
-        """Check if any AST node overlaps and matches single-statement pattern."""
+        """Check if any AST node overlaps and matches single-statement pattern.
+
+        Performance optimization: Use line-to-node index for O(1) lookups instead of O(n) ast.walk().
+        """
+        if self._line_to_nodes is not None:
+            return self._check_nodes_via_index(start_line, end_line)
+        return self._check_nodes_via_walk(tree, start_line, end_line)
+
+    def _check_nodes_via_index(self, start_line: int, end_line: int) -> bool:
+        """Check nodes using line-to-node index for O(1) lookups."""
+        candidates = self._collect_candidate_nodes_from_index(start_line, end_line)
+        return self._any_node_matches_pattern(candidates, start_line, end_line)
+
+    def _collect_candidate_nodes_from_index(self, start_line: int, end_line: int) -> set[ast.AST]:
+        """Collect unique nodes that overlap with the line range from index."""
+        candidate_nodes: set[ast.AST] = set()
+        for line_num in range(start_line, end_line + 1):
+            if self._line_to_nodes and line_num in self._line_to_nodes:
+                candidate_nodes.update(self._line_to_nodes[line_num])
+        return candidate_nodes
+
+    def _any_node_matches_pattern(
+        self, nodes: set[ast.AST], start_line: int, end_line: int
+    ) -> bool:
+        """Check if any node matches single-statement pattern."""
+        for node in nodes:
+            if self._is_single_statement_pattern(node, start_line, end_line):
+                return True
+        return False
+
+    def _check_nodes_via_walk(self, tree: ast.Module, start_line: int, end_line: int) -> bool:
+        """Check nodes using ast.walk() fallback for tests or standalone calls."""
         for node in ast.walk(tree):
-            if self._node_overlaps_and_matches(node, start_line, end_line):
+            if self._node_matches_via_walk(node, start_line, end_line):
                 return True
         return False
 
+    def _node_matches_via_walk(self, node: ast.AST, start_line: int, end_line: int) -> bool:
+        """Check if a single node overlaps and matches pattern."""
+        if not self._node_overlaps_range(node, start_line, end_line):
+            return False
+        return self._is_single_statement_pattern(node, start_line, end_line)
+
+    @staticmethod
+    def _node_overlaps_range(node: ast.AST, start_line: int, end_line: int) -> bool:
+        """Check if node overlaps with the given line range."""
+        if not hasattr(node, "lineno") or not hasattr(node, "end_lineno"):
+            return False
+        node_end = node.end_lineno
+        node_start = node.lineno
+        return not (node_end < start_line or node_start > end_line)
+
     def _node_overlaps_and_matches(self, node: ast.AST, start_line: int, end_line: int) -> bool:
         """Check if node overlaps with range and matches single-statement pattern."""
         if not hasattr(node, "lineno") or not hasattr(node, "end_lineno"):
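
To make the indexing trick concrete: pay one up-front pass over the tree to bucket nodes by line, then answer each window query by unioning a handful of per-line buckets instead of walking every node again. The sketch below is a standalone illustration of that idea; `build_index` and `nodes_in_range` are hypothetical names, not thailint's API.

```python
import ast


def build_index(tree: ast.AST) -> dict[int, list[ast.AST]]:
    """Map each source line to the AST nodes that overlap it."""
    index: dict[int, list[ast.AST]] = {}
    for node in ast.walk(tree):
        start = getattr(node, "lineno", None)
        end = getattr(node, "end_lineno", None)
        if start is None or end is None:
            continue  # e.g. Module and other nodes without positions
        for line in range(start, end + 1):
            index.setdefault(line, []).append(node)
    return index


def nodes_in_range(index: dict[int, list[ast.AST]], start: int, end: int) -> set[ast.AST]:
    """Collect candidate nodes for a window without re-walking the whole tree."""
    found: set[ast.AST] = set()
    for line in range(start, end + 1):
        found.update(index.get(line, []))
    return found


source = "x = 1\nif x:\n    y = x + 1\n"
idx = build_index(ast.parse(source))
print(len(nodes_in_range(idx, 2, 3)))  # only nodes touching lines 2-3 are examined
```
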
--- thailint-0.4.3/src/linters/dry/token_hasher.py
+++ thailint-0.4.5/src/linters/dry/token_hasher.py
@@ -33,26 +33,80 @@ class TokenHasher:
         List of normalized code lines (non-empty, comments removed, imports filtered)
         """
         lines = []
+        in_multiline_import = False
 
         for line in code.split("\n"):
-            # Remove comments (language-specific logic can be added)
-            line = self._strip_comments(line)
-
-            # Normalize whitespace (collapse to single space)
-            line = " ".join(line.split())
-
-            # Skip empty lines
+            line = self._normalize_line(line)
             if not line:
                 continue
 
-            # Skip import statements (common false positive)
-            if self._is_import_statement(line):
+            # Update multi-line import state and check if line should be skipped
+            in_multiline_import, should_skip = self._should_skip_import_line(
+                line, in_multiline_import
+            )
+            if should_skip:
                 continue
 
             lines.append(line)
 
         return lines
 
+    def _normalize_line(self, line: str) -> str:
+        """Normalize a line by removing comments and excess whitespace.
+
+        Args:
+            line: Raw source code line
+
+        Returns:
+            Normalized line (empty string if line has no content)
+        """
+        line = self._strip_comments(line)
+        return " ".join(line.split())
+
+    def _should_skip_import_line(self, line: str, in_multiline_import: bool) -> tuple[bool, bool]:
+        """Determine if an import line should be skipped.
+
+        Args:
+            line: Normalized code line
+            in_multiline_import: Whether we're currently inside a multi-line import
+
+        Returns:
+            Tuple of (new_in_multiline_import_state, should_skip_line)
+        """
+        if self._is_multiline_import_start(line):
+            return True, True
+
+        if in_multiline_import:
+            return self._handle_multiline_import_continuation(line)
+
+        if self._is_import_statement(line):
+            return False, True
+
+        return False, False
+
+    def _is_multiline_import_start(self, line: str) -> bool:
+        """Check if line starts a multi-line import statement.
+
+        Args:
+            line: Normalized code line
+
+        Returns:
+            True if line starts a multi-line import (has opening paren but no closing)
+        """
+        return self._is_import_statement(line) and "(" in line and ")" not in line
+
+    def _handle_multiline_import_continuation(self, line: str) -> tuple[bool, bool]:
+        """Handle a line that's part of a multi-line import.
+
+        Args:
+            line: Normalized code line inside a multi-line import
+
+        Returns:
+            Tuple of (still_in_import, should_skip)
+        """
+        closes_import = ")" in line
+        return not closes_import, True
+
     def _strip_comments(self, line: str) -> str:
         """Remove comments from line (Python # and // style).
 
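
The two-state import filter above is easiest to see on sample input. The sketch below mirrors the `(new_state, should_skip)` contract of `_should_skip_import_line` with a simplified stand-in for `_is_import_statement`; it is illustrative, not the shipped code.

```python
def should_skip_import_line(line: str, in_import: bool) -> tuple[bool, bool]:
    """Return (new_in_import_state, skip_this_line); simplified stand-in."""
    is_import = line.startswith(("import ", "from "))
    if is_import and "(" in line and ")" not in line:
        return True, True  # opens a multi-line import: skip it, enter import state
    if in_import:
        return ")" not in line, True  # skip continuation lines until the paren closes
    return False, is_import  # single-line imports are skipped too


state = False
for line in ["from os import (", "path,", "sep,", ")", "x = 1"]:
    state, skip = should_skip_import_line(line, state)
    print(f"{line!r}: skip={skip}")
# Every import line is filtered; only 'x = 1' would reach the hash windows.
```
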
--- /dev/null
+++ thailint-0.4.5/src/linters/file_header/__init__.py
@@ -0,0 +1,24 @@
+"""
+File: src/linters/file_header/__init__.py
+Purpose: File header linter module initialization
+Exports: FileHeaderRule
+Depends: linter.FileHeaderRule
+Implements: Module-level exports for clean API
+Related: linter.py for main rule implementation
+
+Overview:
+    Initializes the file header linter module providing multi-language file header
+    validation with mandatory field checking, atemporal language detection, and configuration
+    support. Main entry point for file header linting functionality.
+
+Usage:
+    from src.linters.file_header import FileHeaderRule
+    rule = FileHeaderRule()
+    violations = rule.check(context)
+
+Notes: Follows standard Python module initialization pattern with __all__ export control
+"""
+
+from .linter import FileHeaderRule
+
+__all__ = ["FileHeaderRule"]
--- /dev/null
+++ thailint-0.4.5/src/linters/file_header/atemporal_detector.py
@@ -0,0 +1,87 @@
+"""
+File: src/linters/file_header/atemporal_detector.py
+Purpose: Detects temporal language patterns in file headers
+Exports: AtemporalDetector class
+Depends: re module for regex matching
+Implements: Regex-based pattern matching with configurable patterns
+Related: linter.py for detector usage, violation_builder.py for violation creation
+
+Overview:
+    Implements pattern-based detection of temporal language that violates atemporal
+    documentation requirements. Detects dates, temporal qualifiers, state change language,
+    and future references using regex patterns. Provides violation details for each pattern match.
+
+Usage:
+    detector = AtemporalDetector()
+    violations = detector.detect_violations(header_text)
+
+Notes: Four pattern categories - dates, temporal qualifiers, state changes, future references
+"""
+
+import re
+
+
+class AtemporalDetector:
+    """Detects temporal language patterns in text."""
+
+    # Date patterns
+    DATE_PATTERNS = [
+        (r"\d{4}-\d{2}-\d{2}", "ISO date format (YYYY-MM-DD)"),
+        (
+            r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}",
+            "Month Year format",
+        ),
+        (r"(?:Created|Updated|Modified):\s*\d{4}", "Date metadata"),
+    ]
+
+    # Temporal qualifiers
+    TEMPORAL_QUALIFIERS = [
+        (r"\bcurrently\b", 'temporal qualifier "currently"'),
+        (r"\bnow\b", 'temporal qualifier "now"'),
+        (r"\brecently\b", 'temporal qualifier "recently"'),
+        (r"\bsoon\b", 'temporal qualifier "soon"'),
+        (r"\bfor now\b", 'temporal qualifier "for now"'),
+    ]
+
+    # State change language
+    STATE_CHANGE = [
+        (r"\breplaces?\b", 'state change "replaces"'),
+        (r"\bmigrated from\b", 'state change "migrated from"'),
+        (r"\bformerly\b", 'state change "formerly"'),
+        (r"\bold implementation\b", 'state change "old"'),
+        (r"\bnew implementation\b", 'state change "new"'),
+    ]
+
+    # Future references
+    FUTURE_REFS = [
+        (r"\bwill be\b", 'future reference "will be"'),
+        (r"\bplanned\b", 'future reference "planned"'),
+        (r"\bto be added\b", 'future reference "to be added"'),
+        (r"\bcoming soon\b", 'future reference "coming soon"'),
+    ]
+
+    def detect_violations(  # thailint: ignore[nesting]
+        self, text: str
+    ) -> list[tuple[str, str, int]]:
+        """Detect all temporal language violations in text.
+
+        Args:
+            text: Text to check for temporal language
+
+        Returns:
+            List of (pattern, description, line_number) tuples for each violation
+        """
+        violations = []
+
+        # Check all pattern categories
+        all_patterns = (
+            self.DATE_PATTERNS + self.TEMPORAL_QUALIFIERS + self.STATE_CHANGE + self.FUTURE_REFS
+        )
+
+        lines = text.split("\n")
+        for line_num, line in enumerate(lines, start=1):
+            for pattern, description in all_patterns:
+                if re.search(pattern, line, re.IGNORECASE):
+                    violations.append((pattern, description, line_num))
+
+        return violations
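
A quick illustrative run of the new detector, importing it by its module path since the package `__init__` only re-exports `FileHeaderRule`; the expected output follows the documented (pattern, description, line_number) tuples.

```python
from src.linters.file_header.atemporal_detector import AtemporalDetector

detector = AtemporalDetector()
header = "Purpose: Parse configs\nOverview: Currently replaces the old implementation"
for pattern, description, line_num in detector.detect_violations(header):
    print(f"line {line_num}: {description}")
# line 2: temporal qualifier "currently"
# line 2: state change "replaces"
# line 2: state change "old"
```
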
--- /dev/null
+++ thailint-0.4.5/src/linters/file_header/config.py
@@ -0,0 +1,66 @@
+"""
+File: src/linters/file_header/config.py
+Purpose: Configuration model for file header linter
+Exports: FileHeaderConfig dataclass
+Depends: dataclasses, pathlib
+Implements: Configuration with validation and defaults
+Related: linter.py for configuration usage
+
+Overview:
+    Defines configuration structure for file header linter including required fields
+    per language, ignore patterns, and validation options. Provides defaults matching
+    ai-doc-standard.md requirements and supports loading from .thailint.yaml configuration.
+
+Usage:
+    config = FileHeaderConfig()
+    config = FileHeaderConfig.from_dict(config_dict, "python")
+
+Notes: Dataclass with validation and language-specific defaults
+"""
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class FileHeaderConfig:
+    """Configuration for file header linting."""
+
+    # Required fields by language
+    required_fields_python: list[str] = field(
+        default_factory=lambda: [
+            "Purpose",
+            "Scope",
+            "Overview",
+            "Dependencies",
+            "Exports",
+            "Interfaces",
+            "Implementation",
+        ]
+    )
+
+    # Enforce atemporal language checking
+    enforce_atemporal: bool = True
+
+    # Patterns to ignore (file paths)
+    ignore: list[str] = field(
+        default_factory=lambda: ["test/**", "**/migrations/**", "**/__init__.py"]
+    )
+
+    @classmethod
+    def from_dict(cls, config_dict: dict, language: str) -> "FileHeaderConfig":
+        """Create config from dictionary.
+
+        Args:
+            config_dict: Dictionary of configuration values
+            language: Programming language for language-specific config
+
+        Returns:
+            FileHeaderConfig instance with values from dictionary
+        """
+        return cls(
+            required_fields_python=config_dict.get("required_fields", {}).get(
+                "python", cls().required_fields_python
+            ),
+            enforce_atemporal=config_dict.get("enforce_atemporal", True),
+            ignore=config_dict.get("ignore", cls().ignore),
+        )
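
A hedged sketch of round-tripping the config through `from_dict`: the dict shape below follows what `from_dict` reads, but the exact `.thailint.yaml` key layout for this linter is an assumption, not confirmed by this diff.

```python
from src.linters.file_header.config import FileHeaderConfig

raw = {
    "required_fields": {"python": ["Purpose", "Scope", "Overview"]},  # assumed YAML shape
    "enforce_atemporal": True,
    "ignore": ["test/**"],
}
config = FileHeaderConfig.from_dict(raw, "python")
print(config.required_fields_python)  # ['Purpose', 'Scope', 'Overview']
print(config.ignore)                  # ['test/**']
```
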
--- /dev/null
+++ thailint-0.4.5/src/linters/file_header/field_validator.py
@@ -0,0 +1,69 @@
+"""
+File: src/linters/file_header/field_validator.py
+Purpose: Validates mandatory fields in file headers
+Exports: FieldValidator class
+Depends: FileHeaderConfig for field requirements
+Implements: Configuration-driven validation with field presence checking
+Related: linter.py for validator usage, config.py for configuration
+
+Overview:
+    Validates presence and quality of mandatory header fields. Checks that all
+    required fields are present, non-empty, and meet minimum content requirements.
+    Supports language-specific required fields and provides detailed violation messages.
+
+Usage:
+    validator = FieldValidator(config)
+    violations = validator.validate_fields(fields, "python")
+
+Notes: Language-specific field requirements defined in config
+"""
+
+from .config import FileHeaderConfig
+
+
+class FieldValidator:
+    """Validates mandatory fields in headers."""
+
+    def __init__(self, config: FileHeaderConfig):
+        """Initialize validator with configuration.
+
+        Args:
+            config: File header configuration with required fields
+        """
+        self.config = config
+
+    def validate_fields(  # thailint: ignore[nesting]
+        self, fields: dict[str, str], language: str
+    ) -> list[tuple[str, str]]:
+        """Validate all required fields are present.
+
+        Args:
+            fields: Dictionary of parsed header fields
+            language: File language (python, typescript, etc.)
+
+        Returns:
+            List of (field_name, error_message) tuples for missing/invalid fields
+        """
+        violations = []
+        required_fields = self._get_required_fields(language)
+
+        for field_name in required_fields:
+            if field_name not in fields:
+                violations.append((field_name, f"Missing mandatory field: {field_name}"))
+            elif not fields[field_name] or len(fields[field_name].strip()) == 0:
+                violations.append((field_name, f"Empty mandatory field: {field_name}"))
+
+        return violations
+
+    def _get_required_fields(self, language: str) -> list[str]:
+        """Get required fields for language.
+
+        Args:
+            language: Programming language
+
+        Returns:
+            List of required field names for the language
+        """
+        if language == "python":
+            return self.config.required_fields_python
+        return []  # Other languages in PR5
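
And an illustrative run of the validator against a header missing most mandatory fields; header parsing itself lives in python_parser.py, which this excerpt does not show.

```python
from src.linters.file_header.config import FileHeaderConfig
from src.linters.file_header.field_validator import FieldValidator

validator = FieldValidator(FileHeaderConfig())
fields = {"Purpose": "Parse configs", "Scope": "", "Overview": "High-level notes"}
for name, message in validator.validate_fields(fields, "python"):
    print(message)
# Empty mandatory field: Scope
# Missing mandatory field: Dependencies
# Missing mandatory field: Exports
# Missing mandatory field: Interfaces
# Missing mandatory field: Implementation
```
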