PyPI - ai-codeindex - Versions diffs - 0.7.0__py3-none-any.whl - Mend

ai-codeindex 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

ai_codeindex-0.7.0.dist-info/METADATA +966 -0
ai_codeindex-0.7.0.dist-info/RECORD +41 -0
ai_codeindex-0.7.0.dist-info/WHEEL +4 -0
ai_codeindex-0.7.0.dist-info/entry_points.txt +2 -0
ai_codeindex-0.7.0.dist-info/licenses/LICENSE +21 -0
codeindex/README_AI.md +767 -0
codeindex/__init__.py +11 -0
codeindex/adaptive_config.py +83 -0
codeindex/adaptive_selector.py +171 -0
codeindex/ai_helper.py +48 -0
codeindex/cli.py +40 -0
codeindex/cli_common.py +10 -0
codeindex/cli_config.py +97 -0
codeindex/cli_docs.py +66 -0
codeindex/cli_hooks.py +765 -0
codeindex/cli_scan.py +562 -0
codeindex/cli_symbols.py +295 -0
codeindex/cli_tech_debt.py +238 -0
codeindex/config.py +479 -0
codeindex/directory_tree.py +229 -0
codeindex/docstring_processor.py +342 -0
codeindex/errors.py +62 -0
codeindex/extractors/__init__.py +9 -0
codeindex/extractors/thinkphp.py +132 -0
codeindex/file_classifier.py +148 -0
codeindex/framework_detect.py +323 -0
codeindex/hierarchical.py +428 -0
codeindex/incremental.py +278 -0
codeindex/invoker.py +260 -0
codeindex/parallel.py +155 -0
codeindex/parser.py +740 -0
codeindex/route_extractor.py +98 -0
codeindex/route_registry.py +77 -0
codeindex/scanner.py +167 -0
codeindex/semantic_extractor.py +408 -0
codeindex/smart_writer.py +737 -0
codeindex/symbol_index.py +199 -0
codeindex/symbol_scorer.py +283 -0
codeindex/tech_debt.py +619 -0
codeindex/tech_debt_formatters.py +234 -0
codeindex/writer.py +164 -0

codeindex/symbol_index.py ADDED Viewed

@@ -0,0 +1,199 @@
+"""Global symbol index generator for PROJECT_SYMBOLS.md."""
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from .config import Config
+from .directory_tree import DirectoryTree
+from .parallel import parse_files_parallel
+from .scanner import scan_directory
@dataclass
class SymbolEntry:
    """Single record stored in the project-wide symbol index.

    The field order below is also the positional-argument order of the
    generated ``__init__``; only ``docstring`` is optional.
    """

    # Identifier of the symbol as reported by the parser.
    name: str
    # Symbol category; the index collector only stores "class" and "function".
    kind: str
    # Namespace of the defining file; the collector stores "" when there is none.
    namespace: str
    # Location of the defining file (rendered relative to the index root).
    file_path: Path
    # Declaration text of the symbol.
    signature: str
    # Short description; the collector truncates it to 100 characters.
    docstring: str = ""
class GlobalSymbolIndex:
    """Build and write a project-wide symbol index (PROJECT_SYMBOLS.md).

    ``collect_symbols`` gathers top-level classes and functions from every
    directory of the project tree; ``generate_index`` renders them as
    Markdown, grouped by class-name suffix and listed alphabetically.

    Attributes:
        root: Resolved project root; file paths are rendered relative to it.
        config: Project configuration, passed to the scanner/parser and read
            for ``indexing.grouping``.
        symbols: Entries collected so far (filled by ``collect_symbols``).
    """

    def __init__(self, root: Path, config: Config):
        self.root = root.resolve()
        self.config = config
        self.symbols: list[SymbolEntry] = []

    @staticmethod
    def _one_line(text: str) -> str:
        """Collapse every whitespace run (including newlines) to one space."""
        return " ".join(text.split()) if text else ""

    def collect_symbols(self, quiet: bool = False) -> dict:
        """Scan the project and (re)populate ``self.symbols``.

        Args:
            quiet: Accepted for interface compatibility. It is not consulted:
                parsing is always invoked with ``quiet=True``.

        Returns:
            dict with the integer keys ``"directories"``, ``"files"`` and
            ``"symbols"`` describing what was collected.
        """
        # Start from a clean slate so that calling this method twice does not
        # append every symbol a second time (and inflate the returned count).
        self.symbols = []

        tree = DirectoryTree(self.root, self.config)
        dirs = list(tree.nodes.keys())

        # Each directory is scanned non-recursively; the set removes any file
        # that is nevertheless reported more than once.
        all_files: set[Path] = set()
        for dir_path in dirs:
            result = scan_directory(dir_path, self.config, recursive=False)
            all_files.update(result.files)

        if not all_files:
            return {"directories": len(dirs), "files": 0, "symbols": 0}

        # Set iteration order is arbitrary; hand the parser a sorted list so
        # the work order is reproducible from run to run.
        parse_results = parse_files_parallel(sorted(all_files), self.config, quiet=True)

        # (file path, symbol name) pairs already recorded.
        seen: set[tuple[str, str]] = set()
        for pr in parse_results:
            if pr.error:
                continue
            for symbol in pr.symbols:
                # Only classes and functions are indexed (methods are skipped).
                if symbol.kind not in ("class", "function"):
                    continue
                key = (str(pr.path), symbol.name)
                if key in seen:
                    continue
                seen.add(key)
                self.symbols.append(
                    SymbolEntry(
                        name=symbol.name,
                        kind=symbol.kind,
                        namespace=pr.namespace or "",
                        file_path=pr.path,
                        signature=symbol.signature,
                        # Flatten before truncating so the 100 characters are
                        # spent on text, not on indentation and line breaks.
                        docstring=self._one_line(symbol.docstring)[:100],
                    )
                )

        return {
            "directories": len(dirs),
            "files": len(all_files),
            "symbols": len(self.symbols),
        }

    def generate_index(self, output_file: str = "PROJECT_SYMBOLS.md") -> Path:
        """Render ``self.symbols`` as Markdown and write it under ``root``.

        The file contains a per-suffix class listing, an alphabetical class
        table (its header is emitted even when there are no classes) and,
        when any exist, a list of functions.

        Args:
            output_file: File name, relative to ``self.root``.

        Returns:
            Path of the written file.

        Raises:
            ValueError: If a symbol's ``file_path`` is not located under
                ``self.root`` (raised by ``Path.relative_to``).
        """
        timestamp = datetime.now().isoformat()
        lines = [
            f"<!-- Generated by codeindex at {timestamp} -->",
            "",
            f"# Project Symbol Index: {self.root.name}",
            "",
            f"Total: {len(self.symbols)} symbols",
            "",
        ]

        # --- Classes grouped by name suffix -------------------------------
        lines.append("## Symbols by Type")
        lines.append("")
        for group_name, members in self._group_by_type().items():
            if not members:
                continue
            lines.append(f"### {group_name} ({len(members)})")
            lines.append("")
            # The path is a tie-breaker: equally named symbols would otherwise
            # keep their (arbitrary) collection order.
            for sym in sorted(members, key=lambda s: (s.name, str(s.file_path))):
                rel_path = sym.file_path.relative_to(self.root)
                full_name = f"{sym.namespace}\\{sym.name}" if sym.namespace else sym.name
                # A raw newline would terminate the Markdown list item early.
                summary = self._one_line(sym.docstring)
                desc = f" - {summary}" if summary else ""
                lines.append(f"- `{full_name}` - {rel_path}{desc}")
            lines.append("")

        # --- Alphabetical class table --------------------------------------
        lines.append("## All Classes (alphabetical)")
        lines.append("")
        lines.append("| Class | Namespace | File |")
        lines.append("|-------|-----------|------|")
        classes = [s for s in self.symbols if s.kind == "class"]
        for sym in sorted(classes, key=lambda s: (s.name.lower(), s.name, str(s.file_path))):
            rel_path = sym.file_path.relative_to(self.root)
            ns = sym.namespace if sym.namespace else "-"
            lines.append(f"| {sym.name} | `{ns}` | {rel_path} |")
        lines.append("")

        # --- Functions (section omitted when there are none) ---------------
        functions = [s for s in self.symbols if s.kind == "function"]
        if functions:
            lines.append("## Functions")
            lines.append("")
            for sym in sorted(functions, key=lambda s: (s.name.lower(), s.name, str(s.file_path))):
                rel_path = sym.file_path.relative_to(self.root)
                lines.append(f"- `{sym.name}()` - {rel_path}")
            lines.append("")

        output_path = self.root / output_file
        output_path.write_text("\n".join(lines), encoding="utf-8")
        return output_path

    def _group_by_type(self) -> dict[str, list[SymbolEntry]]:
        """Bucket class symbols by the first configured suffix they end with.

        Suffixes are the keys of ``config.indexing.grouping.patterns`` and are
        only used when ``config.indexing.grouping.enabled`` is true. Classes
        matching no suffix go to ``"Other"``; non-class symbols are ignored.

        Returns:
            Non-empty groups only, in configuration order, with ``"Other"``
            last.
        """
        grouping = self.config.indexing.grouping
        patterns = grouping.patterns if grouping.enabled else {}

        groups = defaultdict(list)
        for sym in self.symbols:
            if sym.kind != "class":
                continue
            # First matching suffix wins, so configuration order matters.
            bucket = next((p for p in patterns if sym.name.endswith(p)), "Other")
            groups[bucket].append(sym)

        ordered = {p: groups[p] for p in patterns if p in groups}
        if "Other" in groups:
            ordered["Other"] = groups["Other"]
        return ordered

codeindex/symbol_scorer.py ADDED Viewed

@@ -0,0 +1,283 @@
+"""Symbol importance scoring system.
+This module provides functionality to score symbols based on their importance,
+helping to prioritize which symbols should be included in README_AI.md files.
+"""
+from dataclasses import dataclass
+from typing import Optional
+from codeindex.parser import Symbol
@dataclass
class ScoringContext:
    """Optional background information handed to a symbol scorer.

    Every field has a default, so ``ScoringContext()`` is a valid
    "nothing known" context.
    """

    # Framework in use, e.g. 'thinkphp' or 'django'.
    framework: str = "unknown"
    # Role of the file, e.g. 'controller', 'model' or 'service'.
    file_type: str = "unknown"
    # Number of symbols found in the file.
    total_symbols: int = 0
class SymbolImportanceScorer:
    """Rank symbols by how much they deserve a place in generated docs.

    Five independent heuristics (visibility, name semantics, documentation,
    size, naming noise) are summed by ``score`` and clamped to ``[0, 100]``.
    Higher means more important.

    Attributes:
        context: ScoringContext describing the codebase. It is stored on the
            instance; none of the heuristics in this class read it.
    """

    # Name fragments that indicate core business functionality.
    CRITICAL_KEYWORDS = [
        "create",
        "update",
        "delete",
        "remove",
        "save",
        "insert",
        "process",
        "handle",
        "execute",
        "run",
        "pay",
        "notify",
        "callback",
        "validate",
        "sign",
        "auth",
        "login",
        "logout",
        "register",
    ]

    # Name fragments that indicate query/retrieval operations.
    SECONDARY_KEYWORDS = [
        "find",
        "search",
        "query",
        "list",
        "show",
        "display",
        "fetch",
        "load",
    ]

    def __init__(self, context: Optional[ScoringContext] = None):
        """Create a scorer.

        Args:
            context: Optional ScoringContext; a default one is created when
                omitted.
        """
        self.context = context or ScoringContext()

    def _score_visibility(self, symbol: Symbol) -> float:
        """Score a symbol by how public it is.

        Scoring:
        - ``public`` modifier: 20 points
        - ``protected`` modifier: 10 points
        - ``private`` modifier: 0 points
        - no modifier, name without leading underscore: 15 points
        - no modifier, name starting with ``_``: 5 points

        A modifier only counts when it is a whole whitespace-delimited token
        of the signature (case-insensitive). A plain substring test would
        misread names such as ``makePublic`` or ``_private_helper`` as
        visibility keywords.

        Args:
            symbol: Object exposing ``signature`` and ``name``.

        Returns:
            float: Visibility score (0-20).
        """
        modifiers = set(symbol.signature.lower().split())

        # Explicit visibility keywords, most visible first.
        if "public" in modifiers:
            return 20.0
        if "protected" in modifiers:
            return 10.0
        if "private" in modifiers:
            return 0.0

        # No keyword: fall back to the leading-underscore naming convention.
        return 5.0 if symbol.name.startswith("_") else 15.0

    def _score_semantics(self, symbol: Symbol) -> float:
        """Score a symbol by keywords contained in its name.

        Scoring (case-insensitive substring match on the name):
        - any CRITICAL_KEYWORDS entry: 25 points
        - otherwise any SECONDARY_KEYWORDS entry: 15 points
        - otherwise: 5 points

        Args:
            symbol: Object exposing ``name``.

        Returns:
            float: Semantic importance score (5-25).
        """
        name_lower = symbol.name.lower()
        if any(keyword in name_lower for keyword in self.CRITICAL_KEYWORDS):
            return 25.0
        if any(keyword in name_lower for keyword in self.SECONDARY_KEYWORDS):
            return 15.0
        return 5.0

    def _score_documentation(self, symbol: Symbol) -> float:
        """Score a symbol by the length of its stripped docstring.

        Scoring:
        - more than 200 characters: 15 points
        - more than 50 characters: 10 points
        - at least one character: 5 points
        - missing or whitespace-only: 0 points

        Args:
            symbol: Object exposing ``docstring``.

        Returns:
            float: Documentation score (0-15).
        """
        if not symbol.docstring:
            return 0.0

        doc_length = len(symbol.docstring.strip())
        if doc_length > 200:
            return 15.0
        if doc_length > 50:
            return 10.0
        # A whitespace-only docstring strips down to length 0.
        return 5.0 if doc_length > 0 else 0.0

    def _score_complexity(self, symbol: Symbol) -> float:
        """Score a symbol by its size in source lines.

        The line count is ``line_end - line_start + 1``.

        Scoring:
        - more than 100 lines: 20 points
        - 50 to 100 lines: 15 points
        - 20 to 49 lines: 10 points
        - fewer than 20 lines: 5 points

        Args:
            symbol: Object exposing ``line_start`` and ``line_end``.

        Returns:
            float: Complexity score (5-20).
        """
        lines = symbol.line_end - symbol.line_start + 1
        if lines > 100:
            return 20.0
        if lines >= 50:
            return 15.0
        if lines >= 20:
            return 10.0
        return 5.0

    def _score_naming_pattern(self, symbol: Symbol) -> float:
        """Penalise names that usually denote documentation noise.

        Scoring (penalties):
        - name starts with ``__``: -20 points
        - name starts with ``_``: -15 points
        - accessor/checker prefix ``get``/``set``/``is``/``has``: -10 points
        - anything else: 0 points

        An accessor prefix only counts when it ends at a word boundary: the
        end of the name, an underscore, a digit or an upper-case letter
        (``getName``, ``is_valid``, ``get``). Names that merely begin with
        the same letters (``setup``, ``hashPassword``, ``issueRefund``) are
        not penalised.

        Args:
            symbol: Object exposing ``name``.

        Returns:
            float: Naming pattern score (-20 to 0).
        """
        name = symbol.name

        # Order matters: "__" must be tested before the single underscore.
        if name.startswith("__"):
            return -20.0
        if name.startswith("_"):
            return -15.0

        name_lower = name.lower()
        for prefix in ("get", "set", "is", "has"):
            if not name_lower.startswith(prefix):
                continue
            rest = name[len(prefix):]
            if not rest or rest[0] == "_" or rest[0].isdigit() or rest[0].isupper():
                return -10.0

        return 0.0

    def score(self, symbol: Symbol) -> float:
        """Return the overall importance of ``symbol``.

        Sum of the five partial scores:
        - Visibility: 0 to 20
        - Name semantics: 5 to 25
        - Documentation: 0 to 15
        - Complexity: 5 to 20
        - Naming pattern: -20 to 0

        The raw sum therefore lies between -10 and 80; it is clamped to
        ``[0, 100]`` before being returned.

        Args:
            symbol: Object exposing ``name``, ``signature``, ``docstring``,
                ``line_start`` and ``line_end``.

        Returns:
            float: Score between 0 and 100 (at most 80 with the current
            heuristics).
        """
        total = (
            self._score_visibility(symbol)
            + self._score_semantics(symbol)
            + self._score_documentation(symbol)
            + self._score_complexity(symbol)
            + self._score_naming_pattern(symbol)
        )
        return max(0.0, min(100.0, total))