ai_codeindex-0.7.0-py3-none-any.whl

This diff reflects the content of publicly available package versions as released to their respective public registries, and is provided for informational purposes only.
codeindex/cli_scan.py ADDED
@@ -0,0 +1,562 @@
1
+ """CLI commands for scanning directories and generating README files.
2
+
3
+ This module provides the core scanning functionality, including single directory
4
+ scans and bulk scanning of entire projects with parallel processing and AI enhancement.
5
+ """
6
+
7
+ import concurrent.futures
8
+ from pathlib import Path
9
+
10
+ import click
11
+
12
+ from .cli_common import console
13
+ from .config import Config
14
+ from .directory_tree import DirectoryTree
15
+ from .docstring_processor import DocstringProcessor
16
+ from .invoker import (
17
+ clean_ai_output,
18
+ format_prompt,
19
+ invoke_ai_cli,
20
+ validate_markdown_output,
21
+ )
22
+ from .parallel import parse_files_parallel
23
+ from .scanner import scan_directory
24
+ from .smart_writer import SmartWriter
25
+ from .writer import (
26
+ format_files_for_prompt,
27
+ format_imports_for_prompt,
28
+ format_symbols_for_prompt,
29
+ generate_fallback_readme,
30
+ write_readme,
31
+ )
32
+
33
+ # ========== Helper functions for scan_all (extracted from nested functions) ==========
34
+
35
+
36
+ def _process_directory_with_smartwriter(
37
+ dir_path: Path,
38
+ tree: DirectoryTree,
39
+ config: Config,
40
+ docstring_processor=None,
41
+ ) -> tuple[Path, bool, str, int]:
42
+ """Process a single directory with SmartWriter.
43
+
44
+ Args:
45
+ dir_path: Directory to process
46
+ tree: DirectoryTree for level and child information
47
+ config: Configuration
48
+ docstring_processor: Optional DocstringProcessor for AI docstring extraction
49
+
50
+ Returns:
51
+ Tuple of (path, success, status_message, size_bytes)
52
+ """
53
+ try:
54
+ level = tree.get_level(dir_path)
55
+ child_dirs = tree.get_children(dir_path)
56
+
57
+ # Scan directory (non-recursive for overview to avoid huge file lists)
58
+ scan_recursive = level != "overview"
59
+ result = scan_directory(dir_path, config, recursive=scan_recursive)
60
+
61
+ # Parse files (if any)
62
+ parse_results = []
63
+ if result.files:
64
+ parse_results = parse_files_parallel(result.files, config, quiet=True)
65
+
66
+ # Use SmartWriter with docstring processor
67
+ writer = SmartWriter(config.indexing, docstring_processor=docstring_processor)
68
+ write_result = writer.write_readme(
69
+ dir_path=dir_path,
70
+ parse_results=parse_results,
71
+ level=level,
72
+ child_dirs=child_dirs,
73
+ output_file=config.output_file,
74
+ )
75
+
76
+ if write_result.success:
77
+ size_kb = write_result.size_bytes / 1024
78
+ truncated = " [truncated]" if write_result.truncated else ""
79
+ status_msg = f"[{level}] {size_kb:.1f}KB{truncated}"
80
+ return dir_path, True, status_msg, write_result.size_bytes
81
+ else:
82
+ return dir_path, False, write_result.error, 0
83
+
84
+ except Exception as e:
85
+ return dir_path, False, str(e), 0
86
+
87
+
88
+ # ========== CLI Commands ==========
89
+
90
+
91
+ @click.command()
92
+ @click.argument("path", type=click.Path(exists=False, file_okay=False, path_type=Path))
93
+ @click.option("--dry-run", is_flag=True, help="Show what would be done without executing")
94
+ @click.option("--fallback", is_flag=True, help="Generate basic README without AI")
95
+ @click.option("--quiet", "-q", is_flag=True, help="Minimal output")
96
+ @click.option("--timeout", default=120, help="AI CLI timeout in seconds")
97
+ @click.option("--parallel", "-p", type=int, help="Override parallel workers (from config)")
98
+ @click.option(
99
+ "--docstring-mode",
100
+ type=click.Choice(["off", "hybrid", "all-ai"]),
101
+ default=None,
102
+ help="Docstring extraction mode (off=disabled, hybrid=selective AI, "
103
+ "all-ai=maximum quality). Overrides config value.",
104
+ )
105
+ @click.option(
106
+ "--show-cost",
107
+ is_flag=True,
108
+ help="Display AI token usage and estimated cost for docstring processing",
109
+ )
110
+ @click.option(
111
+ "--output",
112
+ type=click.Choice(["markdown", "json"]),
113
+ default="markdown",
114
+ help="Output format (markdown writes README_AI.md, json prints to stdout)",
115
+ )
116
+ def scan(
117
+ path: Path,
118
+ dry_run: bool,
119
+ fallback: bool,
120
+ quiet: bool,
121
+ timeout: int,
122
+ parallel: int | None,
123
+ docstring_mode: str | None,
124
+ show_cost: bool,
125
+ output: str,
126
+ ):
127
+ """
128
+ Scan a directory and generate README_AI.md.
129
+
130
+ PATH is the directory to scan.
131
+ """
132
+ path = path.resolve()
133
+
134
+ # Check if path exists (handle JSON error output)
135
+ if not path.exists():
136
+ if output == "json":
137
+ import json
138
+
139
+ from .errors import ErrorCode, ErrorInfo, create_error_response
140
+
141
+ error = ErrorInfo(
142
+ code=ErrorCode.DIRECTORY_NOT_FOUND,
143
+ message=f"Directory does not exist: {path}",
144
+ detail=None,
145
+ )
146
+ click.echo(json.dumps(create_error_response(error), indent=2, ensure_ascii=False))
147
+ raise SystemExit(1)
148
+ else:
149
+ # Keep original Click behavior for markdown mode
150
+ raise click.BadParameter(f"Directory '{path}' does not exist.")
151
+
152
+ # Check if it's a directory
153
+ if not path.is_dir():
154
+ if output == "json":
155
+ import json
156
+
157
+ from .errors import ErrorCode, ErrorInfo, create_error_response
158
+
159
+ error = ErrorInfo(
160
+ code=ErrorCode.INVALID_PATH,
161
+ message=f"Path is not a directory: {path}",
162
+ detail=None,
163
+ )
164
+ click.echo(json.dumps(create_error_response(error), indent=2, ensure_ascii=False))
165
+ raise SystemExit(1)
166
+ else:
167
+ raise click.BadParameter(f"Path '{path}' is not a directory.")
168
+
169
+ # Load config
170
+ config = Config.load()
171
+
172
+ # Override parallel workers if specified
173
+ if parallel is not None:
174
+ config.parallel_workers = parallel
175
+
176
+ # Determine docstring mode (CLI overrides config)
177
+ effective_docstring_mode = (
178
+ docstring_mode if docstring_mode is not None else config.docstrings.mode
179
+ )
180
+
181
+ # Create DocstringProcessor if needed
182
+ docstring_processor = None
183
+ if effective_docstring_mode != "off" and config.docstrings.ai_command:
184
+ docstring_processor = DocstringProcessor(
185
+ ai_command=config.docstrings.ai_command,
186
+ mode=effective_docstring_mode,
187
+ )
188
+
189
+ if not quiet:
190
+ console.print(f"[bold]Scanning:[/bold] {path}")
191
+
192
+ # Scan directory
193
+ if not quiet:
194
+ console.print(" [dim]→ Scanning directory...[/dim]")
195
+ result = scan_directory(path, config, path.parent)
196
+
197
+ if not result.files:
198
+ if output == "json":
199
+ import json
200
+ # Output empty results JSON
201
+ json_output = {
202
+ "success": True,
203
+ "results": [],
204
+ "summary": {
205
+ "total_files": 0,
206
+ "total_symbols": 0,
207
+ "total_imports": 0,
208
+ "errors": 0,
209
+ },
210
+ }
211
+ click.echo(json.dumps(json_output, indent=2, ensure_ascii=False))
212
+ return
213
+ else:
214
+ if not quiet:
215
+ console.print(f"[yellow]No indexable files found in {path}[/yellow]")
216
+ return
217
+
218
+ if not quiet:
219
+ console.print(f" [dim]→ Found {len(result.files)} files[/dim]")
220
+
221
+ # Parse files
222
+ if not quiet:
223
+ console.print(" [dim]→ Parsing with tree-sitter...[/dim]")
224
+ parse_results = parse_files_parallel(result.files, config, quiet)
225
+ total_symbols = sum(len(r.symbols) for r in parse_results)
226
+ if not quiet:
227
+ console.print(f" [dim]→ Extracted {total_symbols} symbols[/dim]")
228
+
229
+ # Handle JSON output mode
230
+ if output == "json":
231
+ import json
232
+
233
+ # Build JSON output
234
+ json_output = {
235
+ "success": True,
236
+ "results": [r.to_dict() for r in parse_results],
237
+ "summary": {
238
+ "total_files": len(parse_results),
239
+ "total_symbols": sum(len(r.symbols) for r in parse_results),
240
+ "total_imports": sum(len(r.imports) for r in parse_results),
241
+ "errors": sum(1 for r in parse_results if r.error),
242
+ },
243
+ }
244
+
245
+ # Output to stdout
246
+ click.echo(json.dumps(json_output, indent=2, ensure_ascii=False))
247
+ return
248
+
249
+ # Format for prompt
250
+ if not quiet:
251
+ console.print(" [dim]→ Formatting prompt...[/dim]")
252
+ files_info = format_files_for_prompt(parse_results)
253
+ symbols_info = format_symbols_for_prompt(parse_results)
254
+ imports_info = format_imports_for_prompt(parse_results)
255
+
256
+ if fallback:
257
+ # Generate smart README without AI
258
+ if not quiet:
259
+ console.print(" [dim]→ Writing smart README...[/dim]")
260
+
261
+ # For single directory scan, always use detailed level
262
+ # (overview/navigation only make sense in hierarchical mode)
263
+ level = "detailed"
264
+
265
+ writer = SmartWriter(config.indexing, docstring_processor=docstring_processor)
266
+ write_result = writer.write_readme(
267
+ dir_path=path,
268
+ parse_results=parse_results,
269
+ level=level,
270
+ child_dirs=[],
271
+ output_file=config.output_file,
272
+ )
273
+
274
+ if write_result.success:
275
+ size_kb = write_result.size_bytes / 1024
276
+ truncated_msg = " [truncated]" if write_result.truncated else ""
277
+ msg = f"[green]✓ Created ({level}, {size_kb:.1f}KB{truncated_msg}):[/green]"
278
+ console.print(f"{msg} {write_result.path}")
279
+
280
+ # Show cost information if requested
281
+ if show_cost and docstring_processor:
282
+ tokens = docstring_processor.total_tokens
283
+ estimated_cost = (tokens / 1_000_000) * 3.0 # Rough estimate: $3 per 1M tokens
284
+ console.print(
285
+ f" [dim]→ Docstring processing: {tokens} tokens "
286
+ f"(~${estimated_cost:.4f})[/dim]"
287
+ )
288
+ else:
289
+ console.print(f"[red]✗ Error:[/red] {write_result.error}")
290
+ return
291
+
292
+ # Format prompt
293
+ prompt = format_prompt(path, files_info, symbols_info, imports_info)
294
+
295
+ if dry_run:
296
+ console.print("\n[dim]Prompt preview:[/dim]")
297
+ console.print(prompt[:500] + "..." if len(prompt) > 500 else prompt)
298
+ console.print(f"\n[dim]Total prompt length: {len(prompt)} chars[/dim]")
299
+ return
300
+
301
+ # Invoke AI CLI
302
+ if not quiet:
303
+ console.print(f" [dim]→ Invoking AI CLI (timeout: {timeout}s)...[/dim]")
304
+ console.print(f" [dim] Command: {config.ai_command[:50]}...[/dim]")
305
+
306
+ invoke_result = invoke_ai_cli(config.ai_command, prompt, timeout=timeout)
307
+
308
+ if not invoke_result.success:
309
+ console.print(f"[red]✗ AI CLI error:[/red] {invoke_result.error}")
310
+ console.print("[yellow]Tip: Use --fallback to generate basic README without AI[/yellow]")
311
+ return
312
+
313
+ if not quiet:
314
+ console.print(f" [dim]→ AI responded ({len(invoke_result.output)} chars)[/dim]")
315
+
316
+ # Clean and validate AI output
317
+ cleaned_output = clean_ai_output(invoke_result.output)
318
+
319
+ if not validate_markdown_output(cleaned_output):
320
+ console.print("[yellow]⚠ AI output validation failed, using fallback[/yellow]")
321
+ write_result = generate_fallback_readme(path, parse_results, config.output_file)
322
+ if write_result.success:
323
+ console.print(f"[green]✓ Created (fallback):[/green] {write_result.path}")
324
+ else:
325
+ console.print(f"[red]✗ Error:[/red] {write_result.error}")
326
+ return
327
+
328
+ # Write output
329
+ if not quiet:
330
+ console.print(" [dim]→ Writing README_AI.md...[/dim]")
331
+ write_result = write_readme(path, cleaned_output, config.output_file)
332
+
333
+ if write_result.success:
334
+ if not quiet:
335
+ console.print(f"[green]✓ Created:[/green] {write_result.path}")
336
+ else:
337
+ print(write_result.path)
338
+ else:
339
+ console.print(f"[red]✗ Write error:[/red] {write_result.error}")
340
+
341
+
342
+ @click.command()
343
+ @click.option("--root", type=click.Path(exists=True, file_okay=False, path_type=Path), default=".")
344
+ @click.option("--parallel", "-p", type=int, help="Override parallel workers")
345
+ @click.option("--timeout", default=120, help="Timeout per directory in seconds")
346
+ @click.option("--no-ai", is_flag=True, help="Disable AI enhancement, use SmartWriter only")
347
+ @click.option("--fallback", is_flag=True, help="Alias for --no-ai (deprecated)")
348
+ @click.option("--quiet", "-q", is_flag=True, help="Minimal output")
349
+ @click.option("--hierarchical", "-h", is_flag=True, help="Use hierarchical processing (bottom-up)")
350
+ @click.option(
351
+ "--docstring-mode",
352
+ type=click.Choice(["off", "hybrid", "all-ai"]),
353
+ default=None,
354
+ help="Docstring extraction mode (off=disabled, hybrid=selective AI, "
355
+ "all-ai=maximum quality). Overrides config value.",
356
+ )
357
+ @click.option(
358
+ "--show-cost",
359
+ is_flag=True,
360
+ help="Display AI token usage and estimated cost for docstring processing",
361
+ )
362
+ @click.option(
363
+ "--output",
364
+ type=click.Choice(["markdown", "json"]),
365
+ default="markdown",
366
+ help="Output format (markdown writes README_AI.md files, json prints to stdout)",
367
+ )
368
+ def scan_all(
369
+ root: Path | None,
370
+ parallel: int | None,
371
+ timeout: int,
372
+ no_ai: bool,
373
+ fallback: bool,
374
+ quiet: bool,
375
+ hierarchical: bool,
376
+ docstring_mode: str | None,
377
+ show_cost: bool,
378
+ output: str,
379
+ ):
380
+ """Scan all project directories for README_AI.md generation.
381
+
382
+ Generates SmartWriter READMEs for all directories in parallel.
383
+ """
384
+ # Determine root path first (needed for config loading)
385
+ root = Path.cwd() if root is None else root
386
+
387
+ # Check if config file exists (for JSON mode)
388
+ config_path = root / ".codeindex.yaml"
389
+ if not config_path.exists():
390
+ if output == "json":
391
+ import json
392
+
393
+ from .errors import ErrorCode, ErrorInfo, create_error_response
394
+
395
+ error = ErrorInfo(
396
+ code=ErrorCode.NO_CONFIG_FOUND,
397
+ message=f"Configuration file not found: {config_path}",
398
+ detail="Run 'codeindex init' to create .codeindex.yaml",
399
+ )
400
+ click.echo(json.dumps(create_error_response(error), indent=2, ensure_ascii=False))
401
+ raise SystemExit(1)
402
+
403
+ # Load config from root directory
404
+ config = Config.load(config_path if config_path.exists() else None)
405
+
406
+ # --fallback is alias for --no-ai
407
+ use_ai = not (no_ai or fallback)
408
+
409
+ # Override parallel workers if specified
410
+ if parallel is not None:
411
+ config.parallel_workers = parallel
412
+
413
+ # Determine docstring mode (CLI overrides config)
414
+ effective_docstring_mode = (
415
+ docstring_mode if docstring_mode is not None else config.docstrings.mode
416
+ )
417
+
418
+ # Create DocstringProcessor if needed
419
+ docstring_processor = None
420
+ if effective_docstring_mode != "off" and config.docstrings.ai_command:
421
+ docstring_processor = DocstringProcessor(
422
+ ai_command=config.docstrings.ai_command,
423
+ mode=effective_docstring_mode,
424
+ )
425
+
426
+ # Use hierarchical processing if requested
427
+ if hierarchical:
428
+ if not quiet:
429
+ console.print("[bold]🎯 Using hierarchical processing (bottom-up)[/bold]")
430
+
431
+ # Import hierarchical processor
432
+ from .hierarchical import scan_directories_hierarchical
433
+
434
+ success = scan_directories_hierarchical(
435
+ root,
436
+ config,
437
+ config.parallel_workers,
438
+ not use_ai, # fallback parameter
439
+ quiet,
440
+ timeout
441
+ )
442
+
443
+ return
444
+
445
+ # Handle JSON output mode (simplified path)
446
+ if output == "json":
447
+ import json
448
+
449
+ # Build directory tree
450
+ tree = DirectoryTree(root, config)
451
+ dirs = tree.get_processing_order()
452
+
453
+ if not dirs:
454
+ # Empty output
455
+ json_output = {
456
+ "success": True,
457
+ "results": [],
458
+ "summary": {
459
+ "total_files": 0,
460
+ "total_symbols": 0,
461
+ "total_imports": 0,
462
+ "errors": 0,
463
+ },
464
+ }
465
+ click.echo(json.dumps(json_output, indent=2, ensure_ascii=False))
466
+ return
467
+
468
+ # Scan and parse all directories
469
+ all_parse_results = []
470
+
471
+ for dir_path in dirs:
472
+ # Scan directory (non-recursive for overview, recursive for detailed)
473
+ level = tree.get_level(dir_path)
474
+ scan_recursive = level != "overview"
475
+ scan_result = scan_directory(dir_path, config, base_path=root, recursive=scan_recursive)
476
+
477
+ if scan_result.files:
478
+ # Parse files
479
+ parse_results = parse_files_parallel(scan_result.files, config, quiet=True)
480
+ all_parse_results.extend(parse_results)
481
+
482
+ # Build JSON output
483
+ json_output = {
484
+ "success": True,
485
+ "results": [r.to_dict() for r in all_parse_results],
486
+ "summary": {
487
+ "total_files": len(all_parse_results),
488
+ "total_symbols": sum(len(r.symbols) for r in all_parse_results),
489
+ "total_imports": sum(len(r.imports) for r in all_parse_results),
490
+ "errors": sum(1 for r in all_parse_results if r.error),
491
+ },
492
+ }
493
+
494
+ # Output to stdout
495
+ click.echo(json.dumps(json_output, indent=2, ensure_ascii=False))
496
+ return
497
+
498
+ # Build directory tree (first pass)
499
+ if not quiet:
500
+ console.print("[bold]🌳 Building directory tree...[/bold]")
501
+
502
+ tree = DirectoryTree(root, config)
503
+ stats = tree.get_stats()
504
+
505
+ if stats["total_directories"] == 0:
506
+ if not quiet:
507
+ console.print("[yellow]No indexable directories found[/yellow]")
508
+ return
509
+
510
+ if not quiet:
511
+ console.print(f"[green]✓ Found {stats['total_directories']} directories[/green]")
512
+ console.print(f" [dim]├── {stats['with_children']} with children (navigation)[/dim]")
513
+ console.print(f" [dim]├── {stats['leaf_directories']} leaf directories (detailed)[/dim]")
514
+ console.print(f" [dim]└── Max depth: {stats['max_depth']}[/dim]")
515
+
516
+ # Get processing order (bottom-up: deepest first)
517
+ dirs = tree.get_processing_order()
518
+
519
+ # ========== Phase 1: SmartWriter parallel generation ==========
520
+ if not quiet:
521
+ console.print("\n[bold]📝 Phase 1: Generating READMEs (SmartWriter)...[/bold]")
522
+ console.print(f"[dim]→ Processing with {config.parallel_workers} parallel workers...[/dim]")
523
+
524
+ # Phase 1: Parallel SmartWriter processing
525
+ phase1_results = {} # dir_path -> (success, msg, size_bytes)
526
+ success_count = 0
527
+
528
+ with concurrent.futures.ThreadPoolExecutor(max_workers=config.parallel_workers) as executor:
529
+ futures = {
530
+ executor.submit(
531
+ _process_directory_with_smartwriter, d, tree, config, docstring_processor
532
+ ): d
533
+ for d in dirs
534
+ }
535
+
536
+ for future in concurrent.futures.as_completed(futures):
537
+ dir_path, success, msg, size_bytes = future.result()
538
+ phase1_results[dir_path] = (success, msg, size_bytes)
539
+ if success:
540
+ success_count += 1
541
+ if not quiet:
542
+ console.print(f"[green]✓[/green] {dir_path.name} ({msg})")
543
+ else:
544
+ if not quiet:
545
+ console.print(f"[red]✗[/red] {dir_path.name}: {msg}")
546
+
547
+ if not quiet:
548
+ console.print(f"[dim]→ Phase 1 complete: {success_count}/{len(dirs)} directories[/dim]")
549
+
550
+ # Phase 1 complete - show summary
551
+ if not quiet:
552
+ msg = f"Completed: {success_count}/{len(dirs)} directories"
553
+ console.print(f"\n[bold]{msg}[/bold]")
554
+
555
+ # Show cost information if requested
556
+ if show_cost and docstring_processor:
557
+ tokens = docstring_processor.total_tokens
558
+ estimated_cost = (tokens / 1_000_000) * 3.0
559
+ console.print(
560
+ f" [dim]→ Docstring processing: {tokens} tokens "
561
+ f"(~${estimated_cost:.4f})[/dim]"
562
+ )
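
The commands above are ordinary Click commands, so they can be exercised without the installed console script. The sketch below is not part of the package: it drives `scan` and `scan_all` through Click's test runner, the directory paths are placeholders, a loadable configuration is assumed, and how these functions are exposed by the installed `codeindex` executable (for example whether `scan_all` is registered as `scan-all`) is defined outside this module.

# Minimal usage sketch, assuming the package is installed and the target
# directories and configuration exist. All paths below are hypothetical.
from click.testing import CliRunner

from codeindex.cli_scan import scan, scan_all

runner = CliRunner()

# Dry run: print a preview of the AI prompt without writing README_AI.md.
result = runner.invoke(scan, ["src/mypkg", "--dry-run"])
print(result.exit_code, result.output)

# Fallback mode: generate a SmartWriter README without invoking the AI CLI.
runner.invoke(scan, ["src/mypkg", "--fallback", "--quiet"])

# JSON mode: parse results are printed to stdout instead of writing a file.
runner.invoke(scan, ["src/mypkg", "--output", "json"])

# Bulk scan from the project root, SmartWriter only, with 4 workers.
runner.invoke(scan_all, ["--root", ".", "--no-ai", "--parallel", "4"])

From the installed entry point the equivalent calls would look something like `codeindex scan src/mypkg --fallback` plus a bulk command wrapping `scan_all` (the "Run 'codeindex init'" hint in the error detail above suggests that executable name), but the exact command registration lives outside this file.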