ai-codeindex 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,408 @@
1
+ """
2
+ Business Semantic Extractor
3
+
4
+ Story 4.4: Extract business semantics from directory structure
5
+ Task 4.4.5: KISS Universal Description Generator
6
+
7
+ This module provides universal, language-agnostic code directory descriptions.
8
+ No domain knowledge assumptions, no translations, just objective information extraction.
9
+ """
10
+ import json
11
+ import re
12
+ from collections import defaultdict
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+ from typing import List, Optional
16
+
17
+
18
@dataclass
class DirectoryContext:
    """
    Raw facts gathered about a single directory

    Instances are the input of the semantic extraction step; they only
    carry data and perform no processing themselves.
    """

    # Directory path
    path: str
    # Names of the files in the directory
    files: List[str]
    # Names of the sub-directories
    subdirs: List[str]
    # Symbol names found in the directory (class names, function names)
    symbols: List[str]
    # Import statements found in the directory
    imports: List[str]
30
+
31
+
32
@dataclass
class BusinessSemantic:
    """
    Result of a semantic extraction

    Describes what a directory does.
    """

    # Description, e.g. "Admin/Controller: 15 controllers (AdminJurUsers, Permission, ...)"
    description: str
    # Main purpose of the directory
    purpose: str
    # Key components / features
    key_components: List[str]
43
+
44
+
45
class SimpleDescriptionGenerator:
    """
    Universal description generator: zero assumptions, zero semantic understanding

    Only extracts objective information (path, symbol-name suffixes, symbol
    names); it makes no subjective judgments about the code.
    """

    # Role suffixes recognised at the end of a symbol name. Shared by pattern
    # detection and entity-name extraction so both always agree.
    _COMMON_SUFFIXES = (
        "Controller", "Service", "Model", "Manager",
        "Handler", "Provider", "Repository", "Util",
        "Helper", "Factory", "Builder", "Strategy",
        "Observer", "Listener", "Adapter", "Facade",
        "Test", "Spec",
    )

    # Suffixes whose plural is not simply "lower-case + s".
    _IRREGULAR_PLURALS = {
        "Repository": "repositories",
        "Util": "utilities",
        "Factory": "factories",
        "Strategy": "strategies",
    }

    def generate(self, context: "DirectoryContext") -> str:
        """
        Generate description: {path}: {count} {pattern} ({symbols})

        Strategy:
        1. Extract path context (last 1-2 levels)
        2. Identify symbol pattern (dominant suffix)
        3. List entity names (sorted, deduplicated, at most 5 shown)
        4. Simple concatenation

        Args:
            context: Object providing ``path`` and ``symbols``

        Returns:
            "{path} (empty directory)" when no entity name could be derived
            from ``context.symbols`` (this is decided from the symbols only,
            not from the file list); otherwise the formatted description.
        """
        path_context = self._extract_path_context(context.path)
        pattern = self._analyze_symbol_pattern(context.symbols)

        # Deduplicate after suffix removal: "UserService" and "UserModel"
        # both collapse into the single entity "User".
        entities = sorted(set(self._extract_entity_names(context.symbols)))
        entity_count = len(entities)

        if not entities:
            return f"{path_context} (empty directory)"

        entity_str = ", ".join(entities[:5])
        if entity_count > 5:
            entity_str += f", ... ({entity_count} total)"

        return f"{path_context}: {entity_count} {pattern} ({entity_str})"

    def _extract_path_context(self, path: str) -> str:
        """
        Extract path context (last 1-2 directory names)

        The filesystem anchor (root and/or drive) is not a directory name and
        is skipped, so "/foo" yields "foo" instead of "//foo".

        Returns:
            "parent/name", or "name" when there is a single level, or the
            anchor itself for a bare root path, or "." for an empty path.
        """
        location = Path(path)
        names = [part for part in location.parts if part != location.anchor]
        if not names:
            return location.anchor or "."
        return "/".join(names[-2:])

    def _match_suffix(self, symbol: str) -> Optional[str]:
        """Return the first known role suffix that ``symbol`` ends with, or None."""
        return next(
            (suffix for suffix in self._COMMON_SUFFIXES if symbol.endswith(suffix)),
            None,
        )

    def _analyze_symbol_pattern(self, symbols: List[str]) -> str:
        """
        Analyze symbol pattern (identify the dominant suffix)

        Each symbol is matched (case-sensitively, singular form only) against
        the known role suffixes. If one suffix is carried by more than half of
        all symbols, its plural is returned, e.g.:
        - Controller → "controllers"
        - Repository → "repositories"
        - Util → "utilities"

        Returns:
            "items" when ``symbols`` is empty, the plural of the dominant
            suffix when it covers more than 50% of the symbols, and
            "modules" otherwise.
        """
        if not symbols:
            return "items"

        suffix_count = defaultdict(int)
        for symbol in symbols:
            suffix = self._match_suffix(symbol)
            if suffix is not None:
                suffix_count[suffix] += 1

        if not suffix_count:
            return "modules"

        # On a tie, max() keeps the suffix that was encountered first.
        dominant_suffix, count = max(suffix_count.items(), key=lambda item: item[1])

        # Strictly more than half: exactly 50% is not considered dominant.
        if count / len(symbols) > 0.5:
            return self._pluralize(dominant_suffix)
        return "modules"

    def _pluralize(self, suffix: str) -> str:
        """Convert a suffix to its lower-case plural (irregular forms are looked up)"""
        return self._IRREGULAR_PLURALS.get(suffix, suffix.lower() + "s")

    @staticmethod
    def _has_interface_prefix(name: str) -> bool:
        """
        Tell whether ``name`` starts with an interface-style "I" prefix

        The "I" must be followed by an upper-case letter and then a lower-case
        one ("IUser"). Requiring the lower-case letter keeps acronyms such as
        "IO" or "IPAddress" intact instead of turning them into "O" or
        "PAddress".
        """
        return (
            len(name) > 2
            and name[0] == "I"
            and name[1].isupper()
            and name[2].islower()
        )

    def _extract_entity_names(self, symbols: List[str]) -> List[str]:
        """
        Extract entity names (remove role suffix and Interface/Abstract prefix)

        "AdminJurUsersController" → "AdminJurUsers"
        "IUserRoleService" → "UserRole"
        "AbstractProductModel" → "Product"

        Returns:
            Entity names in input order, duplicates included. Symbols that
            are reduced to an empty string (e.g. a symbol that is exactly
            "Test") are left out.
        """
        entities = []
        for symbol in symbols:
            entity = symbol

            suffix = self._match_suffix(entity)
            if suffix is not None:
                entity = entity[:-len(suffix)]

            if self._has_interface_prefix(entity):
                entity = entity[1:]
            if entity.startswith("Abstract"):
                entity = entity[len("Abstract"):]

            if entity:  # Prevent empty strings
                entities.append(entity)

        return entities
198
+
199
+
200
class SemanticExtractor:
    """
    Extract business semantics from directory context

    Supports two modes:
    - Heuristic mode: KISS universal description (no external call)
    - AI mode: description produced by an external AI command, with
      automatic fallback to the heuristic mode when the call or the
      parsing of its answer fails
    """

    def __init__(self, use_ai: bool = False, ai_command: Optional[str] = None):
        """
        Initialize SemanticExtractor

        Args:
            use_ai: If True, use AI for extraction; if False, use heuristic rules
            ai_command: AI command template (required if use_ai=True)

        Raises:
            ValueError: If use_ai is True and no ai_command is given
        """
        if use_ai and not ai_command:
            raise ValueError("ai_command is required when use_ai=True")

        self.use_ai = use_ai
        self.ai_command = ai_command

    def extract_directory_semantic(
        self,
        context: "DirectoryContext"
    ) -> "BusinessSemantic":
        """
        Extract business semantic from directory context

        Args:
            context: Directory context information

        Returns:
            BusinessSemantic produced by the AI mode when use_ai=True,
            otherwise by the KISS universal description
        """
        if self.use_ai:
            return self._ai_extract(context)
        return self._heuristic_extract(context)

    def _heuristic_extract(self, context: "DirectoryContext") -> "BusinessSemantic":
        """
        Extract semantic using KISS universal description

        Args:
            context: Directory context

        Returns:
            BusinessSemantic whose description and purpose are both the
            generated description, and whose key_components are the first
            10 entity names (sorted, deduplicated)
        """
        generator = SimpleDescriptionGenerator()
        description = generator.generate(context)

        entities = generator._extract_entity_names(context.symbols)
        key_components = sorted(set(entities))[:10]

        return BusinessSemantic(
            description=description,
            purpose=description,  # Simplified: purpose = description
            key_components=key_components
        )

    def _ai_extract(self, context: "DirectoryContext") -> "BusinessSemantic":
        """
        Extract semantic using AI, falling back to the heuristic on failure

        Args:
            context: Directory context

        Returns:
            BusinessSemantic parsed from the AI answer, or the heuristic
            result when the invocation is unsuccessful or its output cannot
            be parsed
        """
        prompt = self._build_ai_prompt(context)

        # Imported here so that the invoker module is only loaded when the
        # AI mode is actually used.
        from codeindex.invoker import invoke_ai_cli

        result = invoke_ai_cli(
            command_template=self.ai_command,
            prompt=prompt,
            timeout=30
        )

        if not result.success:
            return self._heuristic_extract(context)

        try:
            return self._parse_ai_response(result.output)
        except Exception:
            # Deliberate best-effort: any problem with the AI answer must
            # degrade to the heuristic description, never abort extraction.
            return self._heuristic_extract(context)

    @staticmethod
    def _summarize(items: List[str], limit: int) -> str:
        """Join the first ``limit`` items and note how many were left out."""
        summary = ", ".join(items[:limit])
        if len(items) > limit:
            summary += f" (and {len(items) - limit} more)"
        return summary

    def _build_ai_prompt(self, context: "DirectoryContext") -> str:
        """
        Build AI prompt for semantic extraction

        At most 10 files, 10 sub-directories, 20 symbols and 10 imports are
        listed; the total count of each is always included.

        Args:
            context: Directory context

        Returns:
            Formatted prompt string (in Chinese) asking for a JSON answer
            with "description", "purpose" and "key_components"
        """
        files_str = self._summarize(context.files, 10)
        subdirs_str = self._summarize(context.subdirs, 10)
        symbols_str = self._summarize(context.symbols, 20)
        imports_str = self._summarize(context.imports, 10)

        prompt = f"""分析以下代码目录的业务语义,提供准确且有意义的描述。

目录路径: {context.path}

文件列表 ({len(context.files)} 个):
{files_str or "无"}

子目录 ({len(context.subdirs)} 个):
{subdirs_str or "无"}

代码符号 ({len(context.symbols)} 个):
{symbols_str or "无"}

导入模块 ({len(context.imports)} 个):
{imports_str or "无"}

请分析这个目录的业务含义,返回 JSON 格式:

{{
"description": "业务描述(中文,简洁明确,避免通用化描述如'业务模块')",
"purpose": "主要用途(中文,说明这个目录的核心职责)",
"key_components": ["组件1", "组件2", "组件3"]
}}

要求:
1. description 必须反映实际业务含义,不要使用"业务模块"、"代码目录"等通用描述
2. 结合路径名、文件名、符号名推断业务领域
3. 优先识别架构模式(Controller/Model/Service等)和业务领域(User/Product/Order等)
4. key_components 列举2-4个关键组成部分
5. 如果是PHP项目,识别ThinkPHP等框架的特定模式

只返回 JSON,不要其他解释。"""

        return prompt

    def _parse_ai_response(self, response: str) -> "BusinessSemantic":
        """
        Parse AI response into BusinessSemantic

        The JSON object is taken from a markdown code fence when there is
        one, otherwise from the first "{" to the last "}" of the response.

        Args:
            response: AI output string

        Returns:
            BusinessSemantic parsed from response. ``purpose`` is "" when it
            is missing or not a string; ``key_components`` keeps only string
            items (a single string becomes a one-element list, anything else
            becomes an empty list).

        Raises:
            ValueError: If no JSON object is found, the JSON is invalid
                (json.JSONDecodeError is a ValueError), or "description" is
                missing, blank or not a string
        """
        fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
        if fenced:
            json_str = fenced.group(1)
        else:
            bare = re.search(r'\{.*\}', response, re.DOTALL)
            if bare is None:
                raise ValueError("No JSON found in AI response")
            json_str = bare.group(0)

        data = json.loads(json_str)

        if "description" not in data:
            raise ValueError("Missing 'description' field in AI response")

        # A null/blank description is useless to callers; rejecting it lets
        # the caller fall back to the heuristic description instead.
        description = data["description"]
        if not isinstance(description, str) or not description.strip():
            raise ValueError("Empty or non-string 'description' field in AI response")

        purpose = data.get("purpose")
        if not isinstance(purpose, str):
            purpose = ""

        # Keep the declared List[str] shape whatever the model returned.
        components = data.get("key_components")
        if isinstance(components, str):
            components = [components]
        elif isinstance(components, list):
            components = [item for item in components if isinstance(item, str)]
        else:
            components = []

        return BusinessSemantic(
            description=description,
            purpose=purpose,
            key_components=components
        )