ai-codeindex 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,199 @@
1
+ """Global symbol index generator for PROJECT_SYMBOLS.md."""
2
+
3
+ from collections import defaultdict
4
+ from dataclasses import dataclass
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+
8
+ from .config import Config
9
+ from .directory_tree import DirectoryTree
10
+ from .parallel import parse_files_parallel
11
+ from .scanner import scan_directory
12
+
13
+
14
@dataclass
class SymbolEntry:
    """One row of the project-wide symbol index.

    Attributes:
        name: Symbol name as reported by the parser.
        kind: Symbol kind; the collector only stores "class" and "function".
        namespace: Namespace of the file the symbol came from ("" when none).
        file_path: Path of the source file that defines the symbol.
        signature: Declaration signature as reported by the parser.
        docstring: Docstring excerpt (the collector keeps at most 100
            characters); empty when the symbol has no docstring.
    """

    name: str
    kind: str
    namespace: str
    file_path: Path
    signature: str
    docstring: str = ""
23
+
24
+
25
class GlobalSymbolIndex:
    """Generate a global symbol index (PROJECT_SYMBOLS.md) for a project.

    Collects the classes and functions found across all indexed directories
    and renders them as a Markdown document: classes grouped by configured
    name suffix, an alphabetical class table, and a function list.
    """

    def __init__(self, root: Path, config: "Config"):
        """Remember the (resolved) project root and the configuration.

        Args:
            root: Project root directory; stored in resolved form.
            config: Project configuration passed to the scanner/parser and
                consulted for the grouping patterns.
        """
        self.root = root.resolve()
        self.config = config
        self.symbols: list[SymbolEntry] = []

    @staticmethod
    def _summarize_docstring(docstring, limit: int = 100) -> str:
        """Return a single-line excerpt of *docstring* of at most *limit* chars.

        Whitespace runs (including newlines) are collapsed to one space so the
        excerpt can be embedded in a one-line Markdown list item.
        """
        if not docstring:
            return ""
        return " ".join(docstring.split())[:limit]

    def collect_symbols(self, quiet: bool = False) -> dict:
        """Collect class and function symbols from every indexed directory.

        Any symbols gathered by a previous call are discarded first, so the
        method can be called repeatedly without duplicating entries.

        Args:
            quiet: Accepted for interface compatibility; currently unused
                (parsing always runs with ``quiet=True``).

        Returns:
            dict with the counts ``"directories"``, ``"files"`` and
            ``"symbols"``.
        """
        # Start from a clean slate: the de-duplication set below is local to
        # this call, so stale entries would otherwise be appended again.
        self.symbols = []

        # Build directory tree
        tree = DirectoryTree(self.root, self.config)
        dirs = list(tree.nodes.keys())

        # Scan each directory non-recursively; the set removes any file that
        # is nevertheless reported more than once.
        all_files: set[Path] = set()
        for dir_path in dirs:
            result = scan_directory(dir_path, self.config, recursive=False)
            all_files.update(result.files)

        if not all_files:
            return {"directories": len(dirs), "files": 0, "symbols": 0}

        # Parse all files
        parse_results = parse_files_parallel(list(all_files), self.config, quiet=True)

        # (file_path, symbol_name) pairs already recorded
        seen: set[tuple[str, str]] = set()

        for pr in parse_results:
            if pr.error:
                continue

            for symbol in pr.symbols:
                # Only include classes and functions (not methods)
                if symbol.kind not in ("class", "function"):
                    continue

                key = (str(pr.path), symbol.name)
                if key in seen:
                    continue
                seen.add(key)

                self.symbols.append(
                    SymbolEntry(
                        name=symbol.name,
                        kind=symbol.kind,
                        namespace=pr.namespace or "",
                        file_path=pr.path,
                        signature=symbol.signature,
                        docstring=self._summarize_docstring(symbol.docstring),
                    )
                )

        return {
            "directories": len(dirs),
            "files": len(all_files),
            "symbols": len(self.symbols),
        }

    def generate_index(self, output_file: str = "PROJECT_SYMBOLS.md") -> Path:
        """Render the collected symbols to a Markdown file under the root.

        Args:
            output_file: File name (relative to the project root) to write.

        Returns:
            Path of the written file.

        Raises:
            ValueError: If a symbol's ``file_path`` is not located under the
                project root (raised by ``Path.relative_to``).
        """

        def by_name(sym):
            # One case-insensitive ordering shared by every section.
            return sym.name.lower()

        timestamp = datetime.now().isoformat()
        lines = [
            f"<!-- Generated by codeindex at {timestamp} -->",
            "",
            f"# Project Symbol Index: {self.root.name}",
            "",
            f"Total: {len(self.symbols)} symbols",
            "",
        ]

        # Classes grouped by configured suffix pattern
        lines.append("## Symbols by Type")
        lines.append("")

        for group_name, symbols in self._group_by_type().items():
            if not symbols:
                continue

            lines.append(f"### {group_name} ({len(symbols)})")
            lines.append("")

            for sym in sorted(symbols, key=by_name):
                rel_path = sym.file_path.relative_to(self.root)
                full_name = f"{sym.namespace}\\{sym.name}" if sym.namespace else sym.name
                desc = f" - {sym.docstring}" if sym.docstring else ""
                lines.append(f"- `{full_name}` - {rel_path}{desc}")

            lines.append("")

        # Alphabetical class table
        lines.append("## All Classes (alphabetical)")
        lines.append("")
        lines.append("| Class | Namespace | File |")
        lines.append("|-------|-----------|------|")

        classes = [s for s in self.symbols if s.kind == "class"]
        for sym in sorted(classes, key=by_name):
            rel_path = sym.file_path.relative_to(self.root)
            ns = sym.namespace if sym.namespace else "-"
            lines.append(f"| {sym.name} | `{ns}` | {rel_path} |")

        lines.append("")

        # Functions index (only emitted when there are functions)
        functions = [s for s in self.symbols if s.kind == "function"]
        if functions:
            lines.append("## Functions")
            lines.append("")
            for sym in sorted(functions, key=by_name):
                rel_path = sym.file_path.relative_to(self.root)
                lines.append(f"- `{sym.name}()` - {rel_path}")
            lines.append("")

        # Write file
        output_path = self.root / output_file
        output_path.write_text("\n".join(lines), encoding="utf-8")

        return output_path

    def _group_by_type(self) -> "dict[str, list[SymbolEntry]]":
        """Group class symbols by name suffix (Controller, Service, Model, ...).

        Suffix patterns come from ``config.indexing.grouping.patterns`` when
        grouping is enabled; a class is placed in the first pattern its name
        ends with, otherwise in ``"Other"``. Non-class symbols are ignored.

        Returns:
            Mapping of group name to symbols, ordered by pattern order with
            ``"Other"`` last; groups without members are omitted.
        """
        groups = defaultdict(list)

        grouping = self.config.indexing.grouping
        patterns = grouping.patterns if grouping.enabled else {}

        for sym in self.symbols:
            if sym.kind != "class":
                continue

            # First matching suffix wins; no match falls back to "Other".
            matched = next((p for p in patterns if sym.name.endswith(p)), None)
            groups["Other" if matched is None else matched].append(sym)

        # Emit groups in pattern order, then the catch-all group.
        ordered = {}
        for pattern in patterns:
            if pattern in groups:
                ordered[pattern] = groups[pattern]

        if "Other" in groups:
            ordered["Other"] = groups["Other"]

        return ordered
@@ -0,0 +1,283 @@
1
+ """Symbol importance scoring system.
2
+
3
+ This module provides functionality to score symbols based on their importance,
4
+ helping to prioritize which symbols should be included in README_AI.md files.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Optional
9
+
10
+ from codeindex.parser import Symbol
11
+
12
+
13
@dataclass
class ScoringContext:
    """Background information supplied to the scorer.

    Attributes:
        framework: Name of the framework in use (e.g. 'thinkphp', 'django');
            defaults to "unknown".
        file_type: Kind of file being scored (e.g. 'controller', 'model',
            'service'); defaults to "unknown".
        total_symbols: Number of symbols in the file; defaults to 0.
    """

    framework: str = "unknown"
    file_type: str = "unknown"
    total_symbols: int = 0
26
+
27
+
28
class SymbolImportanceScorer:
    """Score symbols by importance for inclusion in documentation.

    The scorer sums five independent dimensions (visibility, name semantics,
    documentation, size, naming noise). Higher totals mark symbols that
    should be prioritized for inclusion.

    Attributes:
        context: ScoringContext describing the codebase being scored; a
            default context is created when none is supplied.
    """

    # Critical keywords indicating core business functionality
    CRITICAL_KEYWORDS = [
        "create",
        "update",
        "delete",
        "remove",
        "save",
        "insert",
        "process",
        "handle",
        "execute",
        "run",
        "pay",
        "notify",
        "callback",
        "validate",
        "sign",
        "auth",
        "login",
        "logout",
        "register",
    ]

    # Secondary keywords for query/retrieval operations
    SECONDARY_KEYWORDS = [
        "find",
        "search",
        "query",
        "list",
        "show",
        "display",
        "fetch",
        "load",
    ]

    # Score awarded for an explicit visibility modifier in the signature
    _VISIBILITY_SCORES = {"public": 20.0, "protected": 10.0, "private": 0.0}

    # Accessor-style name prefixes treated as documentation noise
    _NOISE_PREFIXES = ("get", "set", "is", "has")

    def __init__(self, context: Optional["ScoringContext"] = None):
        """Initialize the scorer with optional context.

        Args:
            context: Optional ScoringContext. If not provided, uses defaults.
        """
        self.context = context or ScoringContext()

    def _score_visibility(self, symbol: "Symbol") -> float:
        """Score symbol based on its visibility.

        Scoring:
            - explicit ``public`` modifier: 20 points
            - explicit ``protected`` modifier: 10 points
            - explicit ``private`` modifier: 0 points
            - no modifier, name without leading ``_``: 15 points
            - no modifier, name with leading ``_``: 5 points

        Only whole modifier words count, and the first one in the signature
        wins, so ``private function getPublicKey()`` is private and
        ``def get_public_key()`` has no modifier at all.

        Args:
            symbol: The Symbol to score

        Returns:
            float: Visibility score (0-20)
        """
        # Detach the name from its parameter list so that a modifier written
        # directly before "(" is still seen as its own token; parameters such
        # as "$public" or "public," never equal a bare modifier word.
        for token in symbol.signature.lower().replace("(", " ").split():
            if token in self._VISIBILITY_SCORES:
                return self._VISIBILITY_SCORES[token]

        # No explicit modifier: fall back to the leading-underscore convention.
        return 5.0 if symbol.name.startswith("_") else 15.0

    def _score_semantics(self, symbol: "Symbol") -> float:
        """Score symbol based on semantic importance of its name.

        Scoring:
            - name contains a critical keyword (pay, create, ...): 25 points
            - name contains a secondary keyword (find, list, ...): 15 points
            - anything else: 5 points

        Matching is a case-insensitive substring test.

        Args:
            symbol: The Symbol to score

        Returns:
            float: Semantic importance score (5-25)
        """
        name_lower = symbol.name.lower()

        if any(keyword in name_lower for keyword in self.CRITICAL_KEYWORDS):
            return 25.0
        if any(keyword in name_lower for keyword in self.SECONDARY_KEYWORDS):
            return 15.0
        return 5.0

    def _score_documentation(self, symbol: "Symbol") -> float:
        """Score symbol based on the length of its (stripped) docstring.

        Scoring:
            - more than 200 chars: 15 points
            - more than 50 chars: 10 points
            - any non-blank text: 5 points
            - missing or blank: 0 points

        Args:
            symbol: The Symbol to score

        Returns:
            float: Documentation quality score (0-15)
        """
        if not symbol.docstring:
            return 0.0

        doc_length = len(symbol.docstring.strip())

        if doc_length > 200:
            return 15.0
        if doc_length > 50:
            return 10.0
        if doc_length > 0:
            return 5.0
        return 0.0

    def _score_complexity(self, symbol: "Symbol") -> float:
        """Score symbol based on its size in source lines.

        The line count is ``line_end - line_start + 1``.

        Scoring:
            - more than 100 lines: 20 points
            - 50 to 100 lines: 15 points
            - 20 to 49 lines: 10 points
            - fewer than 20 lines: 5 points

        Args:
            symbol: The Symbol to score

        Returns:
            float: Complexity score (5-20)
        """
        lines = symbol.line_end - symbol.line_start + 1

        if lines > 100:
            return 20.0
        if lines >= 50:
            return 15.0
        if lines >= 20:
            return 10.0
        return 5.0

    def _score_naming_pattern(self, symbol: "Symbol") -> float:
        """Score symbol based on naming patterns (noise detection).

        Scoring (penalties):
            - name starts with ``__``: -20 points
            - name starts with ``_``: -15 points
            - accessor-style name (get/set/is/has prefix): -10 points
            - anything else: 0 points

        The accessor prefix must end at a word boundary: it has to be the
        whole name or be followed by something other than a lowercase letter
        (``getName``, ``is_valid``). Names that merely begin with the same
        letters (``issue_refund``, ``setup``, ``hash_password``) are not
        penalized.

        Args:
            symbol: The Symbol to score

        Returns:
            float: Naming pattern score (-20 to 0)
        """
        name = symbol.name

        # Double underscore must be tested before the single-underscore case.
        if name.startswith("__"):
            return -20.0
        if name.startswith("_"):
            return -15.0

        name_lower = name.lower()
        for prefix in self._NOISE_PREFIXES:
            if name_lower.startswith(prefix):
                remainder = name[len(prefix):]
                # camelCase / snake_case boundary check
                if not remainder or not remainder[0].islower():
                    return -10.0

        return 0.0

    def score(self, symbol: "Symbol") -> float:
        """Calculate importance score for a symbol.

        Multi-dimensional scoring based on:
            - Visibility (public/private): 0-20 points
            - Semantic importance (keywords): 5-25 points
            - Documentation quality: 0-15 points
            - Code complexity: 5-20 points
            - Naming patterns (noise detection): -20-0 points

        The raw sum therefore lies between -10 and 80; the result is clamped
        to the range 0-100.

        Args:
            symbol: The Symbol to score

        Returns:
            float: Score between 0-100
        """
        total = (
            self._score_visibility(symbol)  # 0-20
            + self._score_semantics(symbol)  # 5-25
            + self._score_documentation(symbol)  # 0-15
            + self._score_complexity(symbol)  # 5-20
            + self._score_naming_pattern(symbol)  # -20-0
        )

        # Ensure score stays in valid range [0, 100]
        return max(0.0, min(100.0, total))