ai-codeindex 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_codeindex-0.7.0.dist-info/METADATA +966 -0
- ai_codeindex-0.7.0.dist-info/RECORD +41 -0
- ai_codeindex-0.7.0.dist-info/WHEEL +4 -0
- ai_codeindex-0.7.0.dist-info/entry_points.txt +2 -0
- ai_codeindex-0.7.0.dist-info/licenses/LICENSE +21 -0
- codeindex/README_AI.md +767 -0
- codeindex/__init__.py +11 -0
- codeindex/adaptive_config.py +83 -0
- codeindex/adaptive_selector.py +171 -0
- codeindex/ai_helper.py +48 -0
- codeindex/cli.py +40 -0
- codeindex/cli_common.py +10 -0
- codeindex/cli_config.py +97 -0
- codeindex/cli_docs.py +66 -0
- codeindex/cli_hooks.py +765 -0
- codeindex/cli_scan.py +562 -0
- codeindex/cli_symbols.py +295 -0
- codeindex/cli_tech_debt.py +238 -0
- codeindex/config.py +479 -0
- codeindex/directory_tree.py +229 -0
- codeindex/docstring_processor.py +342 -0
- codeindex/errors.py +62 -0
- codeindex/extractors/__init__.py +9 -0
- codeindex/extractors/thinkphp.py +132 -0
- codeindex/file_classifier.py +148 -0
- codeindex/framework_detect.py +323 -0
- codeindex/hierarchical.py +428 -0
- codeindex/incremental.py +278 -0
- codeindex/invoker.py +260 -0
- codeindex/parallel.py +155 -0
- codeindex/parser.py +740 -0
- codeindex/route_extractor.py +98 -0
- codeindex/route_registry.py +77 -0
- codeindex/scanner.py +167 -0
- codeindex/semantic_extractor.py +408 -0
- codeindex/smart_writer.py +737 -0
- codeindex/symbol_index.py +199 -0
- codeindex/symbol_scorer.py +283 -0
- codeindex/tech_debt.py +619 -0
- codeindex/tech_debt_formatters.py +234 -0
- codeindex/writer.py +164 -0
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Business Semantic Extractor
|
|
3
|
+
|
|
4
|
+
Story 4.4: Extract business semantics from directory structure
|
|
5
|
+
Task 4.4.5: KISS Universal Description Generator
|
|
6
|
+
|
|
7
|
+
This module provides universal, language-agnostic code directory descriptions.
|
|
8
|
+
No domain knowledge assumptions, no translations, just objective information extraction.
|
|
9
|
+
"""
|
|
10
|
+
import json
|
|
11
|
+
import re
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import List, Optional
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class DirectoryContext:
    """
    Raw facts collected about one directory.

    Acts as the input record handed to the semantic extraction step; it
    carries data only and has no behaviour of its own.
    """

    # Directory path this context describes
    path: str
    # Entries listed as files for this directory
    files: List[str]
    # Entries listed as sub-directories for this directory
    subdirs: List[str]
    # Class names, function names
    symbols: List[str]
    # Import statements
    imports: List[str]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class BusinessSemantic:
    """
    Result of semantic extraction for one directory.

    Holds the generated description of what the directory does, together
    with its purpose and a short list of notable components.
    """

    # Human-readable description,
    # e.g. "Admin/Controller: 15 controllers (AdminJurUsers, Permission, ...)"
    description: str
    # Main purpose
    purpose: str
    # Key components/features
    key_components: List[str]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SimpleDescriptionGenerator:
    """
    Universal description generator: zero assumptions, zero semantic understanding

    Only extracts objective information (path levels, symbol names and their
    naming suffixes); no subjective judgments are made and nothing is
    translated.
    """

    # Role suffixes recognised at the end of a symbol name. Kept in one place
    # so pattern analysis and entity extraction can never disagree.
    # The first matching entry wins.
    _COMMON_SUFFIXES = (
        "Controller", "Service", "Model", "Manager",
        "Handler", "Provider", "Repository", "Util",
        "Helper", "Factory", "Builder", "Strategy",
        "Observer", "Listener", "Adapter", "Facade",
        "Test", "Spec",
    )

    # Explicit plural labels. Suffixes missing here (Builder, Listener,
    # Facade) fall back to "<lowercased suffix>s" in _pluralize.
    _PLURALS = {
        "Controller": "controllers",
        "Service": "services",
        "Model": "models",
        "Manager": "managers",
        "Handler": "handlers",
        "Provider": "providers",
        "Repository": "repositories",
        "Util": "utilities",
        "Helper": "helpers",
        "Factory": "factories",
        "Strategy": "strategies",
        "Observer": "observers",
        "Adapter": "adapters",
        "Test": "tests",
        "Spec": "specs",
    }

    # Maximum number of entity names spelled out in a description
    _SAMPLE_SIZE = 5

    def generate(self, context: DirectoryContext) -> str:
        """
        Generate description: {path}: {count} {pattern} ({symbols})

        Strategy:
        1. Extract path context (last 1-2 levels)
        2. Identify symbol pattern (dominant suffix)
        3. List key symbols (sorted, deduplicated, truncated)
        4. Simple concatenation

        Args:
            context: Directory context; only ``path`` and ``symbols`` are read

        Returns:
            "{path} (empty directory)" when no entity name can be derived
            from the symbols, otherwise the concatenated description
        """
        path_context = self._extract_path_context(context.path)
        pattern = self._analyze_symbol_pattern(context.symbols)

        # Deduplicate and sort so the output is stable across runs
        entities = sorted(set(self._extract_entity_names(context.symbols)))
        if not entities:
            return f"{path_context} (empty directory)"

        entity_count = len(entities)
        entity_str = ", ".join(entities[:self._SAMPLE_SIZE])
        if entity_count > self._SAMPLE_SIZE:
            entity_str += f", ... ({entity_count} total)"

        return f"{path_context}: {entity_count} {pattern} ({entity_str})"

    def _extract_path_context(self, path: str) -> str:
        """
        Extract path context (last 1-2 levels)

        The root/drive anchor of an absolute path is not counted as a level,
        so "/foo" yields "foo" rather than "//foo".

        Args:
            path: Directory path, relative or absolute

        Returns:
            The last two path levels joined by "/", the single level if there
            is only one, the anchor itself for a bare root, or "." when the
            path has no components at all
        """
        location = Path(path)
        anchor = location.anchor
        # When an anchor exists it is always the first element of .parts
        levels = location.parts[1:] if anchor else location.parts
        if not levels:
            return anchor or "."
        return "/".join(levels[-2:])

    def _match_suffix(self, symbol: str) -> Optional[str]:
        """Return the first known role suffix that ``symbol`` ends with, else None"""
        for suffix in self._COMMON_SUFFIXES:
            if symbol.endswith(suffix):
                return suffix
        return None

    def _analyze_symbol_pattern(self, symbols: List[str]) -> str:
        """
        Analyze symbol pattern (identify the dominant suffix)

        Each symbol is matched against the known singular role suffixes
        (Controller, Service, Model, ...). Plural spellings such as
        "Controllers" or "Utils" are not recognised.

        Args:
            symbols: Symbol names found in the directory

        Returns:
            - "items" when there are no symbols
            - the plural label of the most frequent suffix (e.g.
              "controllers", "utilities", "helpers") when more than 50% of
              all symbols end with it
            - "modules" otherwise
        """
        if not symbols:
            return "items"

        suffix_count = defaultdict(int)
        for symbol in symbols:
            suffix = self._match_suffix(symbol)
            if suffix is not None:
                suffix_count[suffix] += 1

        if not suffix_count:
            return "modules"

        # On a tie, max() keeps the suffix that was encountered first
        dominant_suffix, count = max(suffix_count.items(), key=lambda item: item[1])

        # The ratio is taken over all symbols, including unmatched ones
        if count / len(symbols) > 0.5:
            return self._pluralize(dominant_suffix)
        return "modules"

    def _pluralize(self, suffix: str) -> str:
        """Convert a role suffix to its lowercase plural label (simple rules)"""
        return self._PLURALS.get(suffix, suffix.lower() + "s")

    def _extract_entity_names(self, symbols: List[str]) -> List[str]:
        """
        Extract entity names (remove common suffixes and prefixes)

        "AdminJurUsersController" → "AdminJurUsers"
        "UserRoleService" → "UserRole"
        "ProductModel" → "Product"
        "IUserRepository" → "User"
        "AbstractUserService" → "User"

        A symbol that would be stripped down to nothing (e.g. "Controller",
        "AbstractService") is kept under its full name instead of being
        dropped, so a directory with symbols is never reported as empty.

        The interface prefix "I" is only removed when it is followed by an
        uppercase letter and then a lowercase one ("IUser"); acronym-led
        names such as "IPAddress" or "IO" are left untouched.

        Args:
            symbols: Symbol names found in the directory

        Returns:
            One entity name per non-empty symbol, in input order
            (duplicates are not removed here)
        """
        entities = []
        abstract_prefix = "Abstract"

        for symbol in symbols:
            entity = symbol

            suffix = self._match_suffix(entity)
            if suffix is not None:
                entity = entity[:-len(suffix)]

            # Interface prefix: require I + Upper + lower to avoid eating acronyms
            if (
                len(entity) > 2
                and entity[0] == "I"
                and entity[1].isupper()
                and entity[2].islower()
            ):
                entity = entity[1:]

            if entity.startswith(abstract_prefix):
                entity = entity[len(abstract_prefix):]

            # Nothing left after stripping: fall back to the original name
            entity = entity or symbol
            if entity:  # Prevent empty strings
                entities.append(entity)

        return entities
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class SemanticExtractor:
    """
    Extract business semantics from directory context

    Supports two modes:
    - Heuristic mode: KISS universal description (fast, offline)
    - AI mode: LLM-powered semantic understanding (requires an AI command);
      falls back to heuristic mode when the AI call or its output is unusable
    """

    def __init__(
        self,
        use_ai: bool = False,
        ai_command: Optional[str] = None,
        ai_timeout: int = 30,
    ):
        """
        Initialize SemanticExtractor

        Args:
            use_ai: If True, use AI for extraction; if False, use heuristic rules
            ai_command: AI command template (required if use_ai=True)
            ai_timeout: Timeout value passed to the AI invocation (default 30,
                the value that was previously hard-coded)

        Raises:
            ValueError: If use_ai is True but no ai_command is given
        """
        self.use_ai = use_ai
        self.ai_command = ai_command
        self.ai_timeout = ai_timeout

        if use_ai and not ai_command:
            raise ValueError("ai_command is required when use_ai=True")

    def extract_directory_semantic(
        self,
        context: DirectoryContext
    ) -> BusinessSemantic:
        """
        Extract business semantic from directory context

        Args:
            context: Directory context information

        Returns:
            BusinessSemantic with description and purpose

        Strategy:
        - If use_ai=True, call AI for extraction
        - Otherwise, use KISS universal description
        """
        if self.use_ai:
            return self._ai_extract(context)
        return self._heuristic_extract(context)

    def _heuristic_extract(self, context: DirectoryContext) -> BusinessSemantic:
        """
        Extract semantic using KISS universal description

        Strategy:
        1. Use SimpleDescriptionGenerator for universal format
        2. No domain assumptions, no translations
        3. Just objective information extraction

        Args:
            context: Directory context

        Returns:
            BusinessSemantic whose purpose equals its description and whose
            key_components are the first 10 sorted, deduplicated entity names
        """
        generator = SimpleDescriptionGenerator()
        description = generator.generate(context)

        # Extract entities for key_components
        entities = generator._extract_entity_names(context.symbols)
        key_components = sorted(set(entities))[:10]

        return BusinessSemantic(
            description=description,
            purpose=description,  # Simplified: purpose = description
            key_components=key_components
        )

    def _ai_extract(self, context: DirectoryContext) -> BusinessSemantic:
        """
        Extract semantic using AI, falling back to the heuristic on failure

        Args:
            context: Directory context

        Returns:
            BusinessSemantic parsed from the AI output, or the heuristic
            result when the invocation reports failure or its output cannot
            be parsed
        """
        prompt = self._build_ai_prompt(context)

        # Kept as a function-local import, as in the original code
        # NOTE(review): presumably to avoid loading the invoker in heuristic
        # mode or an import cycle - confirm before moving it to module level
        from codeindex.invoker import invoke_ai_cli

        result = invoke_ai_cli(
            command_template=self.ai_command,
            prompt=prompt,
            timeout=self.ai_timeout
        )

        if not result.success:
            # Fallback to heuristic if AI fails
            return self._heuristic_extract(context)

        try:
            return self._parse_ai_response(result.output)
        except (ValueError, TypeError):
            # Unparseable or invalid AI output: fallback to heuristic
            return self._heuristic_extract(context)

    @staticmethod
    def _summarize(items: List[str], limit: int) -> str:
        """Join the first ``limit`` items with ", " and note how many were left out"""
        summary = ", ".join(items[:limit])
        if len(items) > limit:
            summary += f" (and {len(items) - limit} more)"
        return summary

    def _build_ai_prompt(self, context: DirectoryContext) -> str:
        """
        Build AI prompt for semantic extraction

        Lists are truncated (10 files, 10 subdirs, 20 symbols, 10 imports) to
        keep the prompt short; the full counts are still reported.

        Args:
            context: Directory context

        Returns:
            Formatted prompt string
        """
        files_str = self._summarize(context.files, 10)
        subdirs_str = self._summarize(context.subdirs, 10)
        symbols_str = self._summarize(context.symbols, 20)
        imports_str = self._summarize(context.imports, 10)

        prompt = f"""分析以下代码目录的业务语义,提供准确且有意义的描述。

目录路径: {context.path}

文件列表 ({len(context.files)} 个):
{files_str or "无"}

子目录 ({len(context.subdirs)} 个):
{subdirs_str or "无"}

代码符号 ({len(context.symbols)} 个):
{symbols_str or "无"}

导入模块 ({len(context.imports)} 个):
{imports_str or "无"}

请分析这个目录的业务含义,返回 JSON 格式:

{{
"description": "业务描述(中文,简洁明确,避免通用化描述如'业务模块')",
"purpose": "主要用途(中文,说明这个目录的核心职责)",
"key_components": ["组件1", "组件2", "组件3"]
}}

要求:
1. description 必须反映实际业务含义,不要使用"业务模块"、"代码目录"等通用描述
2. 结合路径名、文件名、符号名推断业务领域
3. 优先识别架构模式(Controller/Model/Service等)和业务领域(User/Product/Order等)
4. key_components 列举2-4个关键组成部分
5. 如果是PHP项目,识别ThinkPHP等框架的特定模式

只返回 JSON,不要其他解释。"""

        return prompt

    @staticmethod
    def _extract_json_object(response: str) -> dict:
        """
        Locate and decode the first JSON object contained in ``response``

        A markdown code fence is tried first, then the raw text. Decoding
        starts at each "{" in turn and stops at the end of the first complete
        object, so prose or stray braces after the JSON do not break parsing.

        Args:
            response: AI output string

        Returns:
            The decoded JSON object as a dict

        Raises:
            ValueError: If the response is not text or holds no decodable
                JSON object
        """
        if not isinstance(response, str):
            raise ValueError("AI response is not text")

        # AI might wrap JSON in markdown code blocks
        fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
        candidates = [fenced.group(1), response] if fenced else [response]

        decoder = json.JSONDecoder()
        last_error = None
        for text in candidates:
            start = text.find("{")
            while start != -1:
                try:
                    # Decoding from "{" can only produce an object (dict)
                    data, _ = decoder.raw_decode(text, start)
                except json.JSONDecodeError as err:
                    last_error = err
                    start = text.find("{", start + 1)
                else:
                    return data

        raise ValueError("No JSON found in AI response") from last_error

    def _parse_ai_response(self, response: str) -> BusinessSemantic:
        """
        Parse AI response into BusinessSemantic

        Args:
            response: AI output string

        Returns:
            BusinessSemantic parsed from response. ``purpose`` defaults to ""
            and ``key_components`` to [] when missing or of the wrong type;
            non-string or empty components are discarded.

        Raises:
            ValueError: If no JSON object can be found, or the 'description'
                field is missing, empty or not a string
        """
        data = self._extract_json_object(response)

        # Validate required fields
        if "description" not in data:
            raise ValueError("Missing 'description' field in AI response")
        description = data["description"]
        if not isinstance(description, str) or not description.strip():
            raise ValueError("Invalid 'description' field in AI response")

        purpose = data.get("purpose")
        if not isinstance(purpose, str):
            purpose = ""

        components = data.get("key_components")
        if isinstance(components, str):
            # A single component given as plain text instead of a list
            components = [components]
        elif not isinstance(components, list):
            components = []
        key_components = [
            item for item in components if isinstance(item, str) and item
        ]

        return BusinessSemantic(
            description=description,
            purpose=purpose,
            key_components=key_components
        )
|