ai-codeindex 0.7.0__py3-none-any.whl

@@ -0,0 +1,342 @@
+ """Docstring Processor - AI-powered documentation extraction.
+
+ Story 9.1: Docstring Processor Core
+
+ This module provides AI-powered docstring extraction and normalization
+ for any programming language, following the KISS principle (no complex parsers).
+
+ Modes:
+ - hybrid: Simple extraction + selective AI (cost-effective, <$1 per 250 dirs)
+ - all-ai: AI processes everything (highest quality, higher cost)
+
+ Architecture:
+ - Batch processing: 1 AI call per file (not per comment)
+ - Fallback strategy: Graceful degradation if AI fails
+ - Cost tracking: Token counting for budget management
+ """
+
+ import json
+ import re
+ import subprocess
+ from pathlib import Path
+
+ from .parser import Symbol
+
+
+ class DocstringProcessor:
+     """AI-powered docstring extraction and normalization.
+
+     Uses external AI CLI (Claude, GPT-4, etc.) to understand and normalize
+     documentation comments from any format:
+     - PHPDoc (/** @param */)
+     - JavaDoc (/** ... */)
+     - JSDoc (/** ... */)
+     - Inline comments (// ...)
+     - Mixed language (Chinese + English)
+     - Irregular formats
+
+     Attributes:
+         ai_command: AI CLI command template with {prompt} placeholder
+         mode: Processing mode ("hybrid" or "all-ai")
+         total_tokens: Total tokens processed (for cost tracking)
+     """
+
+     def __init__(self, ai_command: str, mode: str = "hybrid"):
+         """
+         Initialize docstring processor.
+
+         Args:
+             ai_command: AI CLI command template (e.g., 'claude -p "{prompt}"')
+             mode: Processing mode - "hybrid" (default) or "all-ai"
+         """
+         if mode not in ("hybrid", "all-ai"):
+             raise ValueError(f"Invalid mode: {mode}. Must be 'hybrid' or 'all-ai'")
+
+         self.ai_command = ai_command
+         self.mode = mode
+         self.total_tokens = 0
+
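As a quick orientation, a minimal usage sketch of the two modes. The command string is a hypothetical template (echoing the docstring's own example), not something this package ships:

    # Hypothetical CLI template; any command accepting a {prompt} placeholder fits
    cheap = DocstringProcessor(ai_command='claude -p "{prompt}"', mode="hybrid")
    thorough = DocstringProcessor(ai_command='claude -p "{prompt}"', mode="all-ai")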
+     def process_file(
+         self, file_path: Path, symbols: list[Symbol]
+     ) -> dict[str, str]:
+         """
+         Process all docstrings in a file.
+
+         Batch processing: makes a single AI call for all symbols in the file
+         (not one call per symbol).
+
+         Args:
+             file_path: Path to source file
+             symbols: List of symbols with raw docstrings
+
+         Returns:
+             Dict mapping symbol name to normalized description
+         """
+         if not symbols:
+             return {}
+
+         # Filter symbols that need processing
+         symbols_to_process = [
+             s for s in symbols if self._should_process(s.docstring)
+         ]
+
+         if not symbols_to_process:
+             return {}
+
+         # Decide whether to use AI
+         if self.mode == "all-ai":
+             # All-AI mode: always use AI
+             return self._process_with_ai(file_path, symbols_to_process, symbols)
+
+         # Hybrid mode: selective AI usage
+         needs_ai = any(self._should_use_ai(s.docstring) for s in symbols_to_process)
+
+         if needs_ai:
+             return self._process_with_ai(file_path, symbols_to_process, symbols)
+
+         # Simple extraction without AI
+         return self._process_simple(symbols_to_process)
+
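For context, a sketch of a call site, assuming `symbols` came from this package's parser (the file path is illustrative):

    # Illustrative path; `symbols` are Symbol objects from codeindex's parser
    descriptions = cheap.process_file(Path("app/Controller/UserController.php"), symbols)
    # -> {"getUser": "Get user by ID", ...}, or {} when nothing needs processing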
+     def _should_process(self, docstring: str) -> bool:
+         """Check if docstring should be processed."""
+         return bool(docstring and docstring.strip())
+
+     def _should_use_ai(self, docstring: str) -> bool:
+         """
+         Decide if AI is needed for this docstring.
+
+         Hybrid mode uses AI only when necessary:
+         - Simple cases: no AI (fast, free)
+         - Complex cases: AI (accurate, costs tokens)
+
+         Args:
+             docstring: Raw docstring text
+
+         Returns:
+             True if AI is needed
+         """
+         if not docstring or not docstring.strip():
+             return False
+
+         # Check for structured documentation markers
+         structured_markers = ["@param", "@return", "@throws", "@var", "/**", "*/"]
+         if any(marker in docstring for marker in structured_markers):
+             return True  # Structured doc → AI
+
+         # Simple case: clean one-liner in English (<= 60 chars, no newlines)
+         if len(docstring) <= 60 and "\n" not in docstring:
+             # Check if it contains non-ASCII (Chinese, etc.)
+             if not self._contains_non_ascii(docstring):
+                 return False  # Simple English → no AI
+
+         # Complex cases that need AI:
+         # - Mixed language (Chinese + English)
+         # - Multi-line with structure (@param, @return)
+         # - Irregular formatting
+         # - Very long (> 60 chars)
+         return True
+
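To make the decision concrete, a few illustrative (hypothetical) inputs and the branch they take:

    # "Get user by ID"          -> False: short ASCII one-liner, no markers
    # "/** @param int $id */"   -> True:  structured markers (/**, @param)
    # "获取用户列表"              -> True:  non-ASCII, needs normalization
    # any ASCII text > 60 chars -> True:  too long for simple extraction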
+     def _contains_non_ascii(self, text: str) -> bool:
+         """Check if text contains non-ASCII characters."""
+         return any(ord(c) > 127 for c in text)
+
+     def _process_simple(self, symbols: list[Symbol]) -> dict[str, str]:
+         """
+         Process docstrings without AI (simple extraction).
+
+         Args:
+             symbols: Symbols to process
+
+         Returns:
+             Dict mapping symbol name to description
+         """
+         result = {}
+         for symbol in symbols:
+             if symbol.docstring:
+                 result[symbol.name] = self._fallback_extract(symbol.docstring)
+         return result
+
+     def _process_with_ai(
+         self,
+         file_path: Path,
+         symbols_to_process: list[Symbol],
+         all_symbols: list[Symbol],
+     ) -> dict[str, str]:
+         """
+         Process docstrings with AI (batch processing).
+
+         Args:
+             file_path: Source file path
+             symbols_to_process: Symbols that need processing
+             all_symbols: All symbols (for context)
+
+         Returns:
+             Dict mapping symbol name to normalized description
+         """
+         # Generate prompt
+         prompt = self._generate_prompt(file_path, symbols_to_process)
+
+         # Call AI
+         try:
+             ai_result = self._call_ai(prompt)
+
+             # Parse JSON response
+             parsed = self._parse_ai_response(ai_result)
+
+             # Update token count (rough estimate: ~4 characters per token)
+             self.total_tokens += len(prompt) // 4 + len(ai_result) // 4
+
+             return parsed
+
+         except Exception:
+             # Fall back to simple extraction if the AI call or parsing fails
+             return self._process_simple(symbols_to_process)
+
+     def _generate_prompt(self, file_path: Path, symbols: list[Symbol]) -> str:
+         """
+         Generate AI prompt for batch processing.
+
+         Args:
+             file_path: Source file path
+             symbols: Symbols to process
+
+         Returns:
+             Prompt string
+         """
+         symbols_list = "\n".join(
+             (
+                 f"- {s.name} ({s.kind}): {s.docstring[:100]}..."
+                 if len(s.docstring) > 100
+                 else f"- {s.name} ({s.kind}): {s.docstring}"
+             )
+             for s in symbols
+         )
+
+         prompt = f"""You are analyzing source code documentation comments.
+
+ Extract and normalize docstrings for the following symbols:
+
+ File: {file_path}
+
+ Symbols:
+ {symbols_list}
+
+ For each symbol, generate a concise description (max 60 characters):
+ 1. Use imperative mood ("Get user list", not "Gets user list")
+ 2. Focus on WHAT the code does, not HOW
+ 3. Combine information from all comment types (PHPDoc, inline, etc.)
+ 4. Handle mixed languages (prefer English if available)
+ 5. Remove noise (@param, @return, TODO, etc.)
+
+ Return JSON format:
+ {{
+   "symbols": [
+     {{
+       "name": "methodName",
+       "description": "Concise description here",
+       "quality": "high|medium|low"
+     }}
+   ]
+ }}
+
+ If a symbol has no meaningful documentation, omit it from the response."""
+
+         return prompt
+
+     def _call_ai(self, prompt: str) -> str:
+         """
+         Call AI CLI and get response.
+
+         Args:
+             prompt: Prompt to send
+
+         Returns:
+             AI response text
+
+         Raises:
+             Exception: If AI call fails
+         """
+         # Replace {prompt} placeholder in command.
+         # Note: the prompt is substituted verbatim; with shell=True the
+         # command template is expected to handle any quoting/escaping.
+         command = self.ai_command.replace("{prompt}", prompt)
+
+         # Execute AI CLI (raises TimeoutExpired after 120s, caught upstream)
+         result = subprocess.run(
+             command,
+             shell=True,
+             capture_output=True,
+             text=True,
+             timeout=120,
+         )
+
+         if result.returncode != 0:
+             raise Exception(f"AI CLI failed: {result.stderr}")
+
+         return result.stdout
+
+     def _parse_ai_response(self, response: str) -> dict[str, str]:
+         """
+         Parse AI JSON response.
+
+         Args:
+             response: AI response text
+
+         Returns:
+             Dict mapping symbol name to description
+
+         Raises:
+             Exception: If JSON parsing fails
+         """
+         try:
+             data = json.loads(response)
+             symbols = data.get("symbols", [])
+
+             result = {}
+             for symbol in symbols:
+                 name = symbol.get("name")
+                 description = symbol.get("description")
+                 if name and description:
+                     result[name] = description
+
+             return result
+
+         except json.JSONDecodeError as e:
+             raise Exception(f"Failed to parse AI JSON response: {e}") from e
+
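A quick sanity check of the expected shape, with a hypothetical payload matching the prompt's JSON contract:

    sample = '{"symbols": [{"name": "getUser", "description": "Get user by ID", "quality": "high"}]}'
    assert cheap._parse_ai_response(sample) == {"getUser": "Get user by ID"}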
+     def _fallback_extract(self, docstring: str) -> str:
+         """
+         Simple fallback: extract first line, max 60 chars.
+
+         Args:
+             docstring: Raw docstring text
+
+         Returns:
+             Cleaned description (truncated to 60 chars + "..." if longer)
+         """
+         if not docstring:
+             return ""
+
+         # Clean up docstring
+         cleaned = docstring.strip()
+
+         # Remove comment markers
+         cleaned = re.sub(r"^/\*\*\s*", "", cleaned)  # /** at start
+         cleaned = re.sub(r"\s*\*/$", "", cleaned)  # */ at end
+         cleaned = re.sub(r"^\s*\*\s*", "", cleaned, flags=re.MULTILINE)  # * lines
+         cleaned = re.sub(r"^//\s*", "", cleaned)  # // comments
+         cleaned = re.sub(r"^#\s*", "", cleaned)  # # comments
+
+         # Take first non-empty line
+         lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
+         if not lines:
+             return ""
+
+         first_line = lines[0]
+
+         # Remove @tags and anything after them
+         first_line = re.sub(r"@\w+.*", "", first_line).strip()
+
+         # Truncate if too long
+         if len(first_line) > 60:
+             return first_line[:60] + "..."
+
+         return first_line
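Traced by hand, the fallback reduces a typical PHPDoc block (hypothetical input) to its summary line:

    raw = "/**\n * 获取用户列表 Get user list\n * @param int $page\n */"
    cheap._fallback_extract(raw)  # -> "获取用户列表 Get user list"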
codeindex/errors.py ADDED
@@ -0,0 +1,62 @@
+ """Error codes and structures for JSON output.
+
+ Story 4: Structured error handling for machine-readable errors.
+ """
+
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Optional
+
+
+ class ErrorCode(str, Enum):
+     """Error codes for command-level errors."""
+
+     DIRECTORY_NOT_FOUND = "DIRECTORY_NOT_FOUND"
+     NO_CONFIG_FOUND = "NO_CONFIG_FOUND"
+     INVALID_PATH = "INVALID_PATH"
+     PARSE_ERROR = "PARSE_ERROR"  # File-level
+     UNKNOWN_ERROR = "UNKNOWN_ERROR"
+
+
+ @dataclass
+ class ErrorInfo:
+     """Structured error information for JSON output."""
+
+     code: str  # ErrorCode value
+     message: str
+     detail: Optional[str] = None
+
+     def to_dict(self) -> dict:
+         """Convert to JSON-serializable dict."""
+         return {
+             "code": self.code,
+             "message": self.message,
+             "detail": self.detail,
+         }
+
+
+ def create_error_response(
+     error: ErrorInfo,
+     results: Optional[list] = None,
+ ) -> dict:
+     """
+     Create standardized error response for JSON output.
+
+     Args:
+         error: Error information
+         results: Optional partial results (for partial success)
+
+     Returns:
+         JSON-serializable error response dict
+     """
+     return {
+         "success": False,
+         "error": error.to_dict(),
+         "results": results or [],
+         "summary": {
+             "total_files": len(results) if results else 0,
+             "total_symbols": sum(len(r.get("symbols", [])) for r in (results or [])),
+             "total_imports": sum(len(r.get("imports", [])) for r in (results or [])),
+             "errors": 1,
+         },
+     }
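A sketch of how a command handler might emit this payload; the message and path are illustrative:

    import json

    # ErrorCode is a str enum, so it serializes as its plain string value
    err = ErrorInfo(
        code=ErrorCode.DIRECTORY_NOT_FOUND,
        message="Directory does not exist",
        detail="./src/missing",
    )
    print(json.dumps(create_error_response(err), indent=2))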
@@ -0,0 +1,9 @@
+ """Framework-specific route extractors.
+
+ This package contains route extractors for different frameworks.
+ Each extractor implements the RouteExtractor interface.
+ """
+
+ from .thinkphp import ThinkPHPRouteExtractor
+
+ __all__ = ["ThinkPHPRouteExtractor"]
@@ -0,0 +1,132 @@
+ """ThinkPHP route extractor.
+
+ Extracts routes from ThinkPHP framework controllers using convention-based routing.
+
+ ThinkPHP routing convention:
+ - URL: /module/controller/action
+ - Example: /admin/index/home -> Admin\\Controller\\IndexController::home()
+
+ Epic 6: Framework-agnostic route extraction
+ """
+
+ from ..framework_detect import RouteInfo
+ from ..route_extractor import ExtractionContext, RouteExtractor
+
+
+ class ThinkPHPRouteExtractor(RouteExtractor):
+     """
+     Route extractor for ThinkPHP framework.
+
+     ThinkPHP uses convention-based routing where:
+     - Controllers are in Application/{Module}/Controller/ directories
+     - URL pattern: /{module}/{controller}/{action}
+     - Only public methods are routes
+     - Magic methods (__construct, __call, etc.) are excluded
+     - Internal methods (starting with _) are excluded
+     """
+
+     @property
+     def framework_name(self) -> str:
+         """Return framework name."""
+         return "thinkphp"
+
+     def can_extract(self, context: ExtractionContext) -> bool:
+         """
+         Check if routes should be extracted from this directory.
+
+         Routes are extracted only from Controller directories.
+
+         Args:
+             context: Extraction context
+
+         Returns:
+             True if current directory is a Controller directory
+         """
+         return context.current_dir.name == "Controller"
+
+     def extract_routes(self, context: ExtractionContext) -> list[RouteInfo]:
+         """
+         Extract routes from ThinkPHP controllers.
+
+         Args:
+             context: Extraction context with parse results
+
+         Returns:
+             List of RouteInfo objects for each public method in controllers
+         """
+         routes = []
+
+         # Get module name from directory structure
+         # e.g., /Application/Admin/Controller -> module = "Admin"
+         module_name = context.current_dir.parent.name
+
+         for result in context.parse_results:
+             if result.error:
+                 continue
+
+             # Find controller class
+             controller_class = None
+             for symbol in result.symbols:
+                 if symbol.kind == "class" and symbol.name.endswith("Controller"):
+                     controller_class = symbol.name
+                     break
+
+             if not controller_class:
+                 continue
+
+             # Extract controller name (strip only the trailing "Controller")
+             controller_name = controller_class.removesuffix("Controller").lower()
+
+             # Find public methods (actions)
+             for symbol in result.symbols:
+                 if symbol.kind != "method":
+                     continue
+
+                 # Only public methods are routes
+                 if "public" not in symbol.signature.lower():
+                     continue
+
+                 # Skip internal methods; the "_" prefix check also covers
+                 # magic methods such as __construct and __call
+                 method_name = symbol.name.split("::")[-1]
+                 if method_name.startswith("_"):
+                     continue
+
+                 # Build route URL: /module/controller/action
+                 url = f"/{module_name.lower()}/{controller_name}/{method_name}"
+
+                 routes.append(
+                     RouteInfo(
+                         url=url,
+                         controller=controller_class,
+                         action=method_name,
+                         method_signature=symbol.signature,
+                         line_number=symbol.line_start,
+                         file_path=result.path.name,
+                         description=self._extract_description(symbol),
+                     )
+                 )
+
+         return routes
+
+
+     def _extract_description(self, symbol) -> str:
+         """
+         Extract description from symbol docstring.
+
+         Limits the description to 60 characters for table display.
+
+         Args:
+             symbol: Symbol with docstring
+
+         Returns:
+             Stripped description (truncated to 60 chars + "..." if longer)
+         """
+         if not symbol.docstring:
+             return ""
+
+         description = symbol.docstring.strip()
+
+         # Limit length for table display
+         if len(description) > 60:
+             return description[:60] + "..."
+
+         return description
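Putting the convention together, a hedged sketch of a caller; the ExtractionContext construction is assumed from the fields used above:

    # `context` is assumed to carry current_dir and parse_results
    extractor = ThinkPHPRouteExtractor()
    if extractor.can_extract(context):
        routes = extractor.extract_routes(context)
        # a public IndexController::home() under Application/Admin/Controller
        # yields RouteInfo(url="/admin/index/home", ...)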
@@ -0,0 +1,148 @@
+ """Unified file size classification system (Epic 4 Story 4.2).
+
+ This module provides a unified approach to file size classification,
+ replacing hard-coded constants in the tech_debt and ai_enhancement modules.
+ """
+
+ from dataclasses import dataclass
+ from enum import Enum
+
+ from codeindex.config import Config
+ from codeindex.parser import ParseResult
+
+
+ class FileSizeCategory(Enum):
+     """File size categories for classification."""
+
+     TINY = "tiny"                # < 500 lines
+     SMALL = "small"              # 500-1000 lines
+     MEDIUM = "medium"            # 1000-2000 lines
+     LARGE = "large"              # 2000-5000 lines (or 40-100 symbols)
+     SUPER_LARGE = "super_large"  # > 5000 lines OR > 100 symbols
+
+
+ @dataclass
+ class FileSizeAnalysis:
+     """Result of file size analysis.
+
+     Attributes:
+         category: File size category (enum)
+         file_lines: Number of lines in the file
+         symbol_count: Number of symbols in the file
+         exceeds_line_threshold: True if file exceeds the super_large_lines threshold
+         exceeds_symbol_threshold: True if file exceeds the super_large_symbols threshold
+         reason: Human-readable reason (e.g., "excessive_lines", "excessive_symbols")
+     """
+
+     category: FileSizeCategory
+     file_lines: int
+     symbol_count: int
+     exceeds_line_threshold: bool
+     exceeds_symbol_threshold: bool
+     reason: str | None = None
+
+
+ class FileSizeClassifier:
+     """Unified file size classifier for all modules.
+
+     This classifier provides consistent file size detection across the
+     tech_debt and ai_enhancement modules.
+
+     Example:
+         >>> config = Config.load()
+         >>> classifier = FileSizeClassifier(config)
+         >>> analysis = classifier.classify(parse_result)
+         >>> if analysis.category == FileSizeCategory.SUPER_LARGE:
+         ...     pass  # Super large file detected
+     """
+
+     def __init__(self, config: Config):
+         """Initialize classifier with configuration.
+
+         Args:
+             config: Configuration (stored for callers; the super-large
+                 thresholds below are currently fixed defaults)
+         """
+         self.config = config
+         # Super large thresholds for tech debt detection (fixed defaults)
+         self.super_large_lines = 5000
+         self.super_large_symbols = 100
+
+     def classify(self, parse_result: ParseResult) -> FileSizeAnalysis:
+         """Classify file size based on lines and symbol count.
+
+         Classification rules:
+         - TINY: < 500 lines
+         - SMALL: 500-1000 lines
+         - MEDIUM: 1000-2000 lines
+         - LARGE: 2000-5000 lines (or 40-100 symbols)
+         - SUPER_LARGE: > super_large_lines OR > super_large_symbols
+
+         Args:
+             parse_result: Parsed file data with lines and symbols
+
+         Returns:
+             FileSizeAnalysis with category, thresholds, and reason
+         """
+         file_lines = parse_result.file_lines
+         symbol_count = len(parse_result.symbols)
+
+         # Check super large thresholds
+         exceeds_lines = file_lines > self.super_large_lines
+         exceeds_symbols = symbol_count > self.super_large_symbols
+
+         # Build reason string
+         reasons = []
+         if exceeds_lines:
+             reasons.append("excessive_lines")
+         if exceeds_symbols:
+             reasons.append("excessive_symbols")
+         reason = ",".join(reasons) if reasons else None
+
+         # Determine category
+         if exceeds_lines or exceeds_symbols:
+             category = FileSizeCategory.SUPER_LARGE
+         elif file_lines > 2000 or symbol_count > 40:
+             category = FileSizeCategory.LARGE
+         elif file_lines > 1000:
+             category = FileSizeCategory.MEDIUM
+         elif file_lines > 500:
+             category = FileSizeCategory.SMALL
+         else:
+             category = FileSizeCategory.TINY
+
+         return FileSizeAnalysis(
+             category=category,
+             file_lines=file_lines,
+             symbol_count=symbol_count,
+             exceeds_line_threshold=exceeds_lines,
+             exceeds_symbol_threshold=exceeds_symbols,
+             reason=reason,
+         )
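Worked through the rules above, a few illustrative classifications (line/symbol counts are hypothetical):

    # 6,000 lines, 30 symbols  -> SUPER_LARGE, reason="excessive_lines"
    # 2,500 lines, 120 symbols -> SUPER_LARGE, reason="excessive_symbols"
    # 1,200 lines, 10 symbols  -> MEDIUM, reason=None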
+
+     def is_super_large(self, parse_result: ParseResult) -> bool:
+         """Check if file is super large.
+
+         Convenience method that returns True if category is SUPER_LARGE.
+
+         Args:
+             parse_result: Parsed file data
+
+         Returns:
+             True if file is super large, False otherwise
+         """
+         analysis = self.classify(parse_result)
+         return analysis.category == FileSizeCategory.SUPER_LARGE
+
+     def is_large(self, parse_result: ParseResult) -> bool:
+         """Check if file is large or super large.
+
+         Convenience method for checking if a file needs special handling.
+
+         Args:
+             parse_result: Parsed file data
+
+         Returns:
+             True if file is large or super large, False otherwise
+         """
+         analysis = self.classify(parse_result)
+         return analysis.category in (FileSizeCategory.LARGE, FileSizeCategory.SUPER_LARGE)