ai-codeindex 0.7.0__py3-none-any.whl

@@ -0,0 +1,342 @@
+ """Docstring Processor - AI-powered documentation extraction.
+
+ Story 9.1: Docstring Processor Core
+
+ This module provides AI-powered docstring extraction and normalization
+ for any programming language, following the KISS principle (no complex parsers).
+
+ Modes:
+ - hybrid: Simple extraction + selective AI (cost-effective, <$1 per 250 dirs)
+ - all-ai: AI processes everything (highest quality, higher cost)
+
+ Architecture:
+ - Batch processing: 1 AI call per file (not per comment)
+ - Fallback strategy: Graceful degradation if AI fails
+ - Cost tracking: Token counting for budget management
+ """
+
+ import json
+ import re
+ import subprocess
+ from pathlib import Path
+
+ from .parser import Symbol
+
+
+ class DocstringProcessor:
+     """AI-powered docstring extraction and normalization.
+
+     Uses external AI CLI (Claude, GPT-4, etc.) to understand and normalize
+     documentation comments from any format:
+     - PHPDoc (/** @param */)
+     - JavaDoc (/** ... */)
+     - JSDoc (/** ... */)
+     - Inline comments (// ...)
+     - Mixed language (Chinese + English)
+     - Irregular formats
+
+     Attributes:
+         ai_command: AI CLI command template with {prompt} placeholder
+         mode: Processing mode ("hybrid" or "all-ai")
+         total_tokens: Total tokens processed (for cost tracking)
+     """
+
+     def __init__(self, ai_command: str, mode: str = "hybrid"):
+         """
+         Initialize docstring processor.
+
+         Args:
+             ai_command: AI CLI command template (e.g., 'claude -p "{prompt}"')
+             mode: Processing mode - "hybrid" (default) or "all-ai"
+         """
+         if mode not in ("hybrid", "all-ai"):
+             raise ValueError(f"Invalid mode: {mode}. Must be 'hybrid' or 'all-ai'")
+
+         self.ai_command = ai_command
+         self.mode = mode
+         self.total_tokens = 0
+
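As a quick orientation, a minimal usage sketch of the two modes. The command string is a hypothetical template (echoing the docstring's own example), not something this package ships:

    # Hypothetical CLI template; any command accepting a {prompt} placeholder fits
    cheap = DocstringProcessor(ai_command='claude -p "{prompt}"', mode="hybrid")
    thorough = DocstringProcessor(ai_command='claude -p "{prompt}"', mode="all-ai")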
+     def process_file(
+         self, file_path: Path, symbols: list[Symbol]
+     ) -> dict[str, str]:
+         """
+         Process all docstrings in a file.
+
+         Batch processing: makes a single AI call for all symbols in the file
+         (not one call per symbol).
+
+         Args:
+             file_path: Path to source file
+             symbols: List of symbols with raw docstrings
+
+         Returns:
+             Dict mapping symbol name to normalized description
+         """
+         if not symbols:
+             return {}
+
+         # Filter symbols that need processing
+         symbols_to_process = [
+             s for s in symbols if self._should_process(s.docstring)
+         ]
+
+         if not symbols_to_process:
+             return {}
+
+         # Decide whether to use AI
+         if self.mode == "all-ai":
+             # All-AI mode: always use AI
+             return self._process_with_ai(file_path, symbols_to_process, symbols)
+
+         # Hybrid mode: selective AI usage
+         needs_ai = any(self._should_use_ai(s.docstring) for s in symbols_to_process)
+
+         if needs_ai:
+             return self._process_with_ai(file_path, symbols_to_process, symbols)
+
+         # Simple extraction without AI
+         return self._process_simple(symbols_to_process)
+
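For context, a sketch of a call site, assuming `symbols` came from this package's parser (the file path is illustrative):

    # Illustrative path; `symbols` are Symbol objects from codeindex's parser
    descriptions = cheap.process_file(Path("app/Controller/UserController.php"), symbols)
    # -> {"getUser": "Get user by ID", ...}, or {} when nothing needs processing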
+     def _should_process(self, docstring: str) -> bool:
+         """Check if docstring should be processed."""
+         return bool(docstring and docstring.strip())
+
+     def _should_use_ai(self, docstring: str) -> bool:
+         """
+         Decide if AI is needed for this docstring.
+
+         Hybrid mode uses AI only when necessary:
+         - Simple cases: no AI (fast, free)
+         - Complex cases: AI (accurate, costs tokens)
+
+         Args:
+             docstring: Raw docstring text
+
+         Returns:
+             True if AI is needed
+         """
+         if not docstring or not docstring.strip():
+             return False
+
+         # Check for structured documentation markers
+         structured_markers = ["@param", "@return", "@throws", "@var", "/**", "*/"]
+         if any(marker in docstring for marker in structured_markers):
+             return True  # Structured doc → AI
+
+         # Simple case: clean one-liner in English (<= 60 chars, no newlines)
+         if len(docstring) <= 60 and "\n" not in docstring:
+             # Check if it contains non-ASCII (Chinese, etc.)
+             if not self._contains_non_ascii(docstring):
+                 return False  # Simple English → no AI
+
+         # Complex cases that need AI:
+         # - Mixed language (Chinese + English)
+         # - Multi-line with structure (@param, @return)
+         # - Irregular formatting
+         # - Very long (> 60 chars)
+         return True
+
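To make the decision concrete, a few illustrative (hypothetical) inputs and the branch they take:

    # "Get user by ID"          -> False: short ASCII one-liner, no markers
    # "/** @param int $id */"   -> True:  structured markers (/**, @param)
    # "获取用户列表"              -> True:  non-ASCII, needs normalization
    # any ASCII text > 60 chars -> True:  too long for simple extraction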
+     def _contains_non_ascii(self, text: str) -> bool:
+         """Check if text contains non-ASCII characters."""
+         return any(ord(c) > 127 for c in text)
+
+     def _process_simple(self, symbols: list[Symbol]) -> dict[str, str]:
+         """
+         Process docstrings without AI (simple extraction).
+
+         Args:
+             symbols: Symbols to process
+
+         Returns:
+             Dict mapping symbol name to description
+         """
+         result = {}
+         for symbol in symbols:
+             if symbol.docstring:
+                 result[symbol.name] = self._fallback_extract(symbol.docstring)
+         return result
+
+     def _process_with_ai(
+         self,
+         file_path: Path,
+         symbols_to_process: list[Symbol],
+         all_symbols: list[Symbol],
+     ) -> dict[str, str]:
+         """
+         Process docstrings with AI (batch processing).
+
+         Args:
+             file_path: Source file path
+             symbols_to_process: Symbols that need processing
+             all_symbols: All symbols (for context)
+
+         Returns:
+             Dict mapping symbol name to normalized description
+         """
+         # Generate prompt
+         prompt = self._generate_prompt(file_path, symbols_to_process)
+
+         # Call AI
+         try:
+             ai_result = self._call_ai(prompt)
+
+             # Parse JSON response
+             parsed = self._parse_ai_response(ai_result)
+
+             # Update token count (rough estimate: ~4 characters per token)
+             self.total_tokens += len(prompt) // 4 + len(ai_result) // 4
+
+             return parsed
+
+         except Exception:
+             # Fall back to simple extraction if the AI call or parsing fails
+             return self._process_simple(symbols_to_process)
+
+     def _generate_prompt(self, file_path: Path, symbols: list[Symbol]) -> str:
+         """
+         Generate AI prompt for batch processing.
+
+         Args:
+             file_path: Source file path
+             symbols: Symbols to process
+
+         Returns:
+             Prompt string
+         """
+         symbols_list = "\n".join(
+             (
+                 f"- {s.name} ({s.kind}): {s.docstring[:100]}..."
+                 if len(s.docstring) > 100
+                 else f"- {s.name} ({s.kind}): {s.docstring}"
+             )
+             for s in symbols
+         )
+
+         prompt = f"""You are analyzing source code documentation comments.
+
+ Extract and normalize docstrings for the following symbols:
+
+ File: {file_path}
+
+ Symbols:
+ {symbols_list}
+
+ For each symbol, generate a concise description (max 60 characters):
+ 1. Use imperative mood ("Get user list", not "Gets user list")
+ 2. Focus on WHAT the code does, not HOW
+ 3. Combine information from all comment types (PHPDoc, inline, etc.)
+ 4. Handle mixed languages (prefer English if available)
+ 5. Remove noise (@param, @return, TODO, etc.)
+
+ Return JSON format:
+ {{
+   "symbols": [
+     {{
+       "name": "methodName",
+       "description": "Concise description here",
+       "quality": "high|medium|low"
+     }}
+   ]
+ }}
+
+ If a symbol has no meaningful documentation, omit it from the response."""
+
+         return prompt
+
+     def _call_ai(self, prompt: str) -> str:
+         """
+         Call AI CLI and get response.
+
+         Args:
+             prompt: Prompt to send
+
+         Returns:
+             AI response text
+
+         Raises:
+             Exception: If AI call fails
+         """
+         # Replace {prompt} placeholder in command.
+         # Note: the prompt is substituted verbatim; with shell=True the
+         # command template is expected to handle any quoting/escaping.
+         command = self.ai_command.replace("{prompt}", prompt)
+
+         # Execute AI CLI (raises TimeoutExpired after 120s, caught upstream)
+         result = subprocess.run(
+             command,
+             shell=True,
+             capture_output=True,
+             text=True,
+             timeout=120,
+         )
+
+         if result.returncode != 0:
+             raise Exception(f"AI CLI failed: {result.stderr}")
+
+         return result.stdout
+
+     def _parse_ai_response(self, response: str) -> dict[str, str]:
+         """
+         Parse AI JSON response.
+
+         Args:
+             response: AI response text
+
+         Returns:
+             Dict mapping symbol name to description
+
+         Raises:
+             Exception: If JSON parsing fails
+         """
+         try:
+             data = json.loads(response)
+             symbols = data.get("symbols", [])
+
+             result = {}
+             for symbol in symbols:
+                 name = symbol.get("name")
+                 description = symbol.get("description")
+                 if name and description:
+                     result[name] = description
+
+             return result
+
+         except json.JSONDecodeError as e:
+             raise Exception(f"Failed to parse AI JSON response: {e}") from e
+
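A quick sanity check of the expected shape, with a hypothetical payload matching the prompt's JSON contract:

    sample = '{"symbols": [{"name": "getUser", "description": "Get user by ID", "quality": "high"}]}'
    assert cheap._parse_ai_response(sample) == {"getUser": "Get user by ID"}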
+     def _fallback_extract(self, docstring: str) -> str:
+         """
+         Simple fallback: extract first line, max 60 chars.
+
+         Args:
+             docstring: Raw docstring text
+
+         Returns:
+             Cleaned description (truncated to 60 chars + "..." if longer)
+         """
+         if not docstring:
+             return ""
+
+         # Clean up docstring
+         cleaned = docstring.strip()
+
+         # Remove comment markers
+         cleaned = re.sub(r"^/\*\*\s*", "", cleaned)  # /** at start
+         cleaned = re.sub(r"\s*\*/$", "", cleaned)  # */ at end
+         cleaned = re.sub(r"^\s*\*\s*", "", cleaned, flags=re.MULTILINE)  # * lines
+         cleaned = re.sub(r"^//\s*", "", cleaned)  # // comments
+         cleaned = re.sub(r"^#\s*", "", cleaned)  # # comments
+
+         # Take first non-empty line
+         lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
+         if not lines:
+             return ""
+
+         first_line = lines[0]
+
+         # Remove @tags and anything after them
+         first_line = re.sub(r"@\w+.*", "", first_line).strip()
+
+         # Truncate if too long
+         if len(first_line) > 60:
+             return first_line[:60] + "..."
+
+         return first_line
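Traced by hand, the fallback reduces a typical PHPDoc block (hypothetical input) to its summary line:

    raw = "/**\n * 获取用户列表 Get user list\n * @param int $page\n */"
    cheap._fallback_extract(raw)  # -> "获取用户列表 Get user list"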
codeindex/errors.py ADDED
@@ -0,0 +1,62 @@
+ """Error codes and structures for JSON output.
+
+ Story 4: Structured error handling for machine-readable errors.
+ """
+
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Optional
+
+
+ class ErrorCode(str, Enum):
+     """Error codes for command-level errors."""
+
+     DIRECTORY_NOT_FOUND = "DIRECTORY_NOT_FOUND"
+     NO_CONFIG_FOUND = "NO_CONFIG_FOUND"
+     INVALID_PATH = "INVALID_PATH"
+     PARSE_ERROR = "PARSE_ERROR"  # File-level
+     UNKNOWN_ERROR = "UNKNOWN_ERROR"
+
+
+ @dataclass
+ class ErrorInfo:
+     """Structured error information for JSON output."""
+
+     code: str  # ErrorCode value
+     message: str
+     detail: Optional[str] = None
+
+     def to_dict(self) -> dict:
+         """Convert to JSON-serializable dict."""
+         return {
+             "code": self.code,
+             "message": self.message,
+             "detail": self.detail,
+         }
+
+
+ def create_error_response(
+     error: ErrorInfo,
+     results: Optional[list] = None,
+ ) -> dict:
+     """
+     Create standardized error response for JSON output.
+
+     Args:
+         error: Error information
+         results: Optional partial results (for partial success)
+
+     Returns:
+         JSON-serializable error response dict
+     """
+     return {
+         "success": False,
+         "error": error.to_dict(),
+         "results": results or [],
+         "summary": {
+             "total_files": len(results) if results else 0,
+             "total_symbols": sum(len(r.get("symbols", [])) for r in (results or [])),
+             "total_imports": sum(len(r.get("imports", [])) for r in (results or [])),
+             "errors": 1,
+         },
+     }
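A sketch of how a command handler might emit this payload; the message and path are illustrative:

    import json

    # ErrorCode is a str enum, so it serializes as its plain string value
    err = ErrorInfo(
        code=ErrorCode.DIRECTORY_NOT_FOUND,
        message="Directory does not exist",
        detail="./src/missing",
    )
    print(json.dumps(create_error_response(err), indent=2))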
@@ -0,0 +1,9 @@
+ """Framework-specific route extractors.
+
+ This package contains route extractors for different frameworks.
+ Each extractor implements the RouteExtractor interface.
+ """
+
+ from .thinkphp import ThinkPHPRouteExtractor
+
+ __all__ = ["ThinkPHPRouteExtractor"]
@@ -0,0 +1,132 @@
+ """ThinkPHP route extractor.
+
+ Extracts routes from ThinkPHP framework controllers using convention-based routing.
+
+ ThinkPHP routing convention:
+ - URL: /module/controller/action
+ - Example: /admin/index/home -> Admin\\Controller\\IndexController::home()
+
+ Epic 6: Framework-agnostic route extraction
+ """
+
+ from ..framework_detect import RouteInfo
+ from ..route_extractor import ExtractionContext, RouteExtractor
+
+
+ class ThinkPHPRouteExtractor(RouteExtractor):
+     """
+     Route extractor for ThinkPHP framework.
+
+     ThinkPHP uses convention-based routing where:
+     - Controllers are in Application/{Module}/Controller/ directories
+     - URL pattern: /{module}/{controller}/{action}
+     - Only public methods are routes
+     - Magic methods (__construct, __call, etc.) are excluded
+     - Internal methods (starting with _) are excluded
+     """
+
+     @property
+     def framework_name(self) -> str:
+         """Return framework name."""
+         return "thinkphp"
+
+     def can_extract(self, context: ExtractionContext) -> bool:
+         """
+         Check if routes should be extracted from this directory.
+
+         Routes are extracted only from Controller directories.
+
+         Args:
+             context: Extraction context
+
+         Returns:
+             True if current directory is a Controller directory
+         """
+         return context.current_dir.name == "Controller"
+
+     def extract_routes(self, context: ExtractionContext) -> list[RouteInfo]:
+         """
+         Extract routes from ThinkPHP controllers.
+
+         Args:
+             context: Extraction context with parse results
+
+         Returns:
+             List of RouteInfo objects for each public method in controllers
+         """
+         routes = []
+
+         # Get module name from directory structure
+         # e.g., /Application/Admin/Controller -> module = "Admin"
+         module_name = context.current_dir.parent.name
+
+         for result in context.parse_results:
+             if result.error:
+                 continue
+
+             # Find controller class
+             controller_class = None
+             for symbol in result.symbols:
+                 if symbol.kind == "class" and symbol.name.endswith("Controller"):
+                     controller_class = symbol.name
+                     break
+
+             if not controller_class:
+                 continue
+
+             # Extract controller name (strip only the trailing "Controller")
+             controller_name = controller_class.removesuffix("Controller").lower()
+
+             # Find public methods (actions)
+             for symbol in result.symbols:
+                 if symbol.kind != "method":
+                     continue
+
+                 # Only public methods are routes
+                 if "public" not in symbol.signature.lower():
+                     continue
+
+                 # Skip internal methods; the "_" prefix check also covers
+                 # magic methods such as __construct and __call
+                 method_name = symbol.name.split("::")[-1]
+                 if method_name.startswith("_"):
+                     continue
+
+                 # Build route URL: /module/controller/action
+                 url = f"/{module_name.lower()}/{controller_name}/{method_name}"
+
+                 routes.append(
+                     RouteInfo(
+                         url=url,
+                         controller=controller_class,
+                         action=method_name,
+                         method_signature=symbol.signature,
+                         line_number=symbol.line_start,
+                         file_path=result.path.name,
+                         description=self._extract_description(symbol),
+                     )
+                 )
+
+         return routes
+
+
+     def _extract_description(self, symbol) -> str:
+         """
+         Extract description from symbol docstring.
+
+         Limits the description to 60 characters for table display.
+
+         Args:
+             symbol: Symbol with docstring
+
+         Returns:
+             Stripped description (truncated to 60 chars + "..." if longer)
+         """
+         if not symbol.docstring:
+             return ""
+
+         description = symbol.docstring.strip()
+
+         # Limit length for table display
+         if len(description) > 60:
+             return description[:60] + "..."
+
+         return description
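Putting the convention together, a hedged sketch of a caller; the ExtractionContext construction is assumed from the fields used above:

    # `context` is assumed to carry current_dir and parse_results
    extractor = ThinkPHPRouteExtractor()
    if extractor.can_extract(context):
        routes = extractor.extract_routes(context)
        # a public IndexController::home() under Application/Admin/Controller
        # yields RouteInfo(url="/admin/index/home", ...)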
@@ -0,0 +1,148 @@
+ """Unified file size classification system (Epic 4 Story 4.2).
+
+ This module provides a unified approach to file size classification,
+ replacing hard-coded constants in the tech_debt and ai_enhancement modules.
+ """
+
+ from dataclasses import dataclass
+ from enum import Enum
+
+ from codeindex.config import Config
+ from codeindex.parser import ParseResult
+
+
+ class FileSizeCategory(Enum):
+     """File size categories for classification."""
+
+     TINY = "tiny"                # < 500 lines
+     SMALL = "small"              # 500-1000 lines
+     MEDIUM = "medium"            # 1000-2000 lines
+     LARGE = "large"              # 2000-5000 lines (or 40-100 symbols)
+     SUPER_LARGE = "super_large"  # > 5000 lines OR > 100 symbols
+
+
+ @dataclass
+ class FileSizeAnalysis:
+     """Result of file size analysis.
+
+     Attributes:
+         category: File size category (enum)
+         file_lines: Number of lines in the file
+         symbol_count: Number of symbols in the file
+         exceeds_line_threshold: True if file exceeds the super_large_lines threshold
+         exceeds_symbol_threshold: True if file exceeds the super_large_symbols threshold
+         reason: Human-readable reason (e.g., "excessive_lines", "excessive_symbols")
+     """
+
+     category: FileSizeCategory
+     file_lines: int
+     symbol_count: int
+     exceeds_line_threshold: bool
+     exceeds_symbol_threshold: bool
+     reason: str | None = None
+
+
+ class FileSizeClassifier:
+     """Unified file size classifier for all modules.
+
+     This classifier provides consistent file size detection across the
+     tech_debt and ai_enhancement modules.
+
+     Example:
+         >>> config = Config.load()
+         >>> classifier = FileSizeClassifier(config)
+         >>> analysis = classifier.classify(parse_result)
+         >>> if analysis.category == FileSizeCategory.SUPER_LARGE:
+         ...     pass  # Super large file detected
+     """
+
+     def __init__(self, config: Config):
+         """Initialize classifier with configuration.
+
+         Args:
+             config: Configuration (stored for callers; the super-large
+                 thresholds below are currently fixed defaults)
+         """
+         self.config = config
+         # Super large thresholds for tech debt detection (fixed defaults)
+         self.super_large_lines = 5000
+         self.super_large_symbols = 100
+
+     def classify(self, parse_result: ParseResult) -> FileSizeAnalysis:
+         """Classify file size based on lines and symbol count.
+
+         Classification rules:
+         - TINY: < 500 lines
+         - SMALL: 500-1000 lines
+         - MEDIUM: 1000-2000 lines
+         - LARGE: 2000-5000 lines (or 40-100 symbols)
+         - SUPER_LARGE: > super_large_lines OR > super_large_symbols
+
+         Args:
+             parse_result: Parsed file data with lines and symbols
+
+         Returns:
+             FileSizeAnalysis with category, thresholds, and reason
+         """
+         file_lines = parse_result.file_lines
+         symbol_count = len(parse_result.symbols)
+
+         # Check super large thresholds
+         exceeds_lines = file_lines > self.super_large_lines
+         exceeds_symbols = symbol_count > self.super_large_symbols
+
+         # Build reason string
+         reasons = []
+         if exceeds_lines:
+             reasons.append("excessive_lines")
+         if exceeds_symbols:
+             reasons.append("excessive_symbols")
+         reason = ",".join(reasons) if reasons else None
+
+         # Determine category
+         if exceeds_lines or exceeds_symbols:
+             category = FileSizeCategory.SUPER_LARGE
+         elif file_lines > 2000 or symbol_count > 40:
+             category = FileSizeCategory.LARGE
+         elif file_lines > 1000:
+             category = FileSizeCategory.MEDIUM
+         elif file_lines > 500:
+             category = FileSizeCategory.SMALL
+         else:
+             category = FileSizeCategory.TINY
+
+         return FileSizeAnalysis(
+             category=category,
+             file_lines=file_lines,
+             symbol_count=symbol_count,
+             exceeds_line_threshold=exceeds_lines,
+             exceeds_symbol_threshold=exceeds_symbols,
+             reason=reason,
+         )
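Worked through the rules above, a few illustrative classifications (line/symbol counts are hypothetical):

    # 6,000 lines, 30 symbols  -> SUPER_LARGE, reason="excessive_lines"
    # 2,500 lines, 120 symbols -> SUPER_LARGE, reason="excessive_symbols"
    # 1,200 lines, 10 symbols  -> MEDIUM, reason=None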
+
+     def is_super_large(self, parse_result: ParseResult) -> bool:
+         """Check if file is super large.
+
+         Convenience method that returns True if category is SUPER_LARGE.
+
+         Args:
+             parse_result: Parsed file data
+
+         Returns:
+             True if file is super large, False otherwise
+         """
+         analysis = self.classify(parse_result)
+         return analysis.category == FileSizeCategory.SUPER_LARGE
+
+     def is_large(self, parse_result: ParseResult) -> bool:
+         """Check if file is large or super large.
+
+         Convenience method for checking if a file needs special handling.
+
+         Args:
+             parse_result: Parsed file data
+
+         Returns:
+             True if file is large or super large, False otherwise
+         """
+         analysis = self.classify(parse_result)
+         return analysis.category in (FileSizeCategory.LARGE, FileSizeCategory.SUPER_LARGE)