cicada_mcp-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cicada-mcp might be problematic.

Files changed (48)
  1. cicada/__init__.py +30 -0
  2. cicada/clean.py +297 -0
  3. cicada/command_logger.py +293 -0
  4. cicada/dead_code_analyzer.py +282 -0
  5. cicada/extractors/__init__.py +36 -0
  6. cicada/extractors/base.py +66 -0
  7. cicada/extractors/call.py +176 -0
  8. cicada/extractors/dependency.py +361 -0
  9. cicada/extractors/doc.py +179 -0
  10. cicada/extractors/function.py +246 -0
  11. cicada/extractors/module.py +123 -0
  12. cicada/extractors/spec.py +151 -0
  13. cicada/find_dead_code.py +270 -0
  14. cicada/formatter.py +918 -0
  15. cicada/git_helper.py +646 -0
  16. cicada/indexer.py +629 -0
  17. cicada/install.py +724 -0
  18. cicada/keyword_extractor.py +364 -0
  19. cicada/keyword_search.py +553 -0
  20. cicada/lightweight_keyword_extractor.py +298 -0
  21. cicada/mcp_server.py +1559 -0
  22. cicada/mcp_tools.py +291 -0
  23. cicada/parser.py +124 -0
  24. cicada/pr_finder.py +435 -0
  25. cicada/pr_indexer/__init__.py +20 -0
  26. cicada/pr_indexer/cli.py +62 -0
  27. cicada/pr_indexer/github_api_client.py +431 -0
  28. cicada/pr_indexer/indexer.py +297 -0
  29. cicada/pr_indexer/line_mapper.py +209 -0
  30. cicada/pr_indexer/pr_index_builder.py +253 -0
  31. cicada/setup.py +339 -0
  32. cicada/utils/__init__.py +52 -0
  33. cicada/utils/call_site_formatter.py +95 -0
  34. cicada/utils/function_grouper.py +57 -0
  35. cicada/utils/hash_utils.py +173 -0
  36. cicada/utils/index_utils.py +290 -0
  37. cicada/utils/path_utils.py +240 -0
  38. cicada/utils/signature_builder.py +106 -0
  39. cicada/utils/storage.py +111 -0
  40. cicada/utils/subprocess_runner.py +182 -0
  41. cicada/utils/text_utils.py +90 -0
  42. cicada/version_check.py +116 -0
  43. cicada_mcp-0.1.4.dist-info/METADATA +619 -0
  44. cicada_mcp-0.1.4.dist-info/RECORD +48 -0
  45. cicada_mcp-0.1.4.dist-info/WHEEL +5 -0
  46. cicada_mcp-0.1.4.dist-info/entry_points.txt +8 -0
  47. cicada_mcp-0.1.4.dist-info/licenses/LICENSE +21 -0
  48. cicada_mcp-0.1.4.dist-info/top_level.txt +1 -0
cicada/indexer.py ADDED
@@ -0,0 +1,629 @@
+"""
+Elixir Repository Indexer.
+
+Walks an Elixir repository and indexes all modules and functions.
+"""
+
+import argparse
+import os
+import signal
+import sys
+from datetime import datetime
+from pathlib import Path
+from cicada.parser import ElixirParser
+from cicada.utils import (
+    save_index,
+    load_index,
+    merge_indexes_incremental,
+    validate_index_structure,
+)
+from cicada.utils.hash_utils import (
+    load_file_hashes,
+    save_file_hashes,
+    detect_file_changes,
+    compute_hashes_for_files,
+)
+
+
+class ElixirIndexer:
+    """Indexes Elixir repositories to extract module and function information."""
+
+    # Progress reporting interval - report every N files processed
+    PROGRESS_REPORT_INTERVAL = 10
+
+    def __init__(self, verbose: bool = False):
+        """Initialize the indexer with a parser."""
+        self.parser = ElixirParser()
+        self.verbose = verbose
+        self.excluded_dirs = {
+            "deps",
+            "_build",
+            "node_modules",
+            ".git",
+            "assets",
+            "priv",
+        }
+        self._interrupted = False
+
+    def _handle_interrupt(self, signum, frame):
+        """Handle interrupt signals (Ctrl-C, SIGTERM) gracefully."""
+        print(
+            "\n\n⚠️ Interrupt received. Finishing current file and saving progress..."
+        )
+        print(" Press Ctrl-C again to force quit (may lose progress)\n")
+        self._interrupted = True
+        # Restore default handler so second Ctrl-C will kill immediately
+        signal.signal(signal.SIGINT, signal.SIG_DFL)
+        signal.signal(signal.SIGTERM, signal.SIG_DFL)
+
+    def _check_and_report_interruption(
+        self, files_processed: int, total_files: int
+    ) -> bool:
+        """
+        Check if interrupted and report status.
+
+        Args:
+            files_processed: Number of files processed so far
+            total_files: Total number of files to process
+
+        Returns:
+            True if interrupted, False otherwise
+        """
+        if self._interrupted:
+            print(
+                f"\n⚠️ Interrupted after processing {files_processed}/{total_files} files"
+            )
+            print(" Saving partial progress...")
+            return True
+        return False
+
+    def index_repository(
+        self,
+        repo_path: str,
+        output_path: str = ".cicada/index.json",
+        extract_keywords: bool = False,
+        spacy_model: str = "small",
+    ):
+        """
+        Index an Elixir repository.
+
+        Args:
+            repo_path: Path to the Elixir repository root
+            output_path: Path where the index JSON file will be saved
+            extract_keywords: If True, extract keywords from documentation using NLP
+            spacy_model: Size of spaCy model to use for keyword extraction
+                ('small', 'medium', or 'large'). Default is 'small'.
+
+        Returns:
+            Dictionary containing the index data
+        """
+        repo_path_obj = Path(repo_path).resolve()
+
+        if not repo_path_obj.exists():
+            raise ValueError(f"Repository path does not exist: {repo_path_obj}")
+
+        print(f"Indexing repository: {repo_path_obj}")
+
+        # Set up signal handlers for graceful interruption
+        signal.signal(signal.SIGINT, self._handle_interrupt)
+        signal.signal(signal.SIGTERM, self._handle_interrupt)
+        self._interrupted = False
+
+        # Initialize keyword extractor if requested
+        keyword_extractor = None
+        if extract_keywords:
+            try:
+                from cicada.lightweight_keyword_extractor import (
+                    LightweightKeywordExtractor,
+                )
+
+                keyword_extractor = LightweightKeywordExtractor(
+                    verbose=True, model_size=spacy_model
+                )
+            except Exception as e:
+                print(f"Warning: Could not initialize keyword extractor: {e}")
+                print("Continuing without keyword extraction...")
+                extract_keywords = False
+
+        # Find all Elixir files
+        elixir_files = self._find_elixir_files(repo_path_obj)
+        total_files = len(elixir_files)
+
+        print(f"Found {total_files} Elixir files")
+        if extract_keywords:
+            print("Keyword extraction enabled")
+
+        # Parse all files
+        all_modules = {}
+        total_functions = 0
+        files_processed = 0
+        keyword_extraction_failures = 0
+
+        for file_path in elixir_files:
+            try:
+                modules = self.parser.parse_file(str(file_path))
+
+                if modules:
+                    for module_data in modules:
+                        module_name = module_data["module"]
+                        functions = module_data["functions"]
+
+                        # Calculate stats
+                        public_count = sum(1 for f in functions if f["type"] == "def")
+                        private_count = sum(1 for f in functions if f["type"] == "defp")
+
+                        # Extract keywords if enabled
+                        module_keywords = None
+                        if keyword_extractor and module_data.get("moduledoc"):
+                            try:
+                                module_keywords = (
+                                    keyword_extractor.extract_keywords_simple(
+                                        module_data["moduledoc"], top_n=10
+                                    )
+                                )
+                            except Exception as e:
+                                keyword_extraction_failures += 1
+                                if self.verbose:
+                                    print(
+                                        f"Warning: Keyword extraction failed for module {module_name}: {e}",
+                                        file=sys.stderr,
+                                    )
+
+                        # Extract keywords from function docs
+                        if keyword_extractor:
+                            for func in functions:
+                                if func.get("doc"):
+                                    func_name = func.get("name", "")
+                                    try:
+                                        # Include function name in text for keyword extraction
+                                        # This ensures the function name identifier gets 10x weight
+                                        text_for_keywords = f"{func_name} {func['doc']}"
+                                        func_keywords = (
+                                            keyword_extractor.extract_keywords_simple(
+                                                text_for_keywords, top_n=10
+                                            )
+                                        )
+                                        if func_keywords:
+                                            func["keywords"] = func_keywords
+                                    except Exception as e:
+                                        keyword_extraction_failures += 1
+                                        if self.verbose:
+                                            print(
+                                                f"Warning: Keyword extraction failed for {module_name}.{func_name}: {e}",
+                                                file=sys.stderr,
+                                            )
+
+                        # Store module info
+                        module_info = {
+                            "file": str(file_path.relative_to(repo_path_obj)),
+                            "line": module_data["line"],
+                            "moduledoc": module_data.get("moduledoc"),
+                            "functions": functions,
+                            "total_functions": len(functions),
+                            "public_functions": public_count,
+                            "private_functions": private_count,
+                            "aliases": module_data.get("aliases", {}),
+                            "imports": module_data.get("imports", []),
+                            "requires": module_data.get("requires", []),
+                            "uses": module_data.get("uses", []),
+                            "behaviours": module_data.get("behaviours", []),
+                            "value_mentions": module_data.get("value_mentions", []),
+                            "calls": module_data.get("calls", []),
+                        }
+
+                        # Add module keywords if extracted
+                        if module_keywords:
+                            module_info["keywords"] = module_keywords
+
+                        all_modules[module_name] = module_info
+
+                        total_functions += len(functions)
+
+                files_processed += 1
+
+                # Progress reporting
+                if files_processed % self.PROGRESS_REPORT_INTERVAL == 0:
+                    print(f" Processed {files_processed}/{total_files} files...")
+
+                # Check for interruption after each file
+                if self._check_and_report_interruption(files_processed, total_files):
+                    break
+
+            except Exception as e:
+                print(f" Skipping {file_path}: {e}")
+                # Check for interruption even after error
+                if self._check_and_report_interruption(files_processed, total_files):
+                    break
+                continue
+
+        # Build final index
+        index = {
+            "modules": all_modules,
+            "metadata": {
+                "indexed_at": datetime.now().isoformat(),
+                "total_modules": len(all_modules),
+                "total_functions": total_functions,
+                "repo_path": str(repo_path_obj),
+            },
+        }
+
+        # Save to file
+        output_path_obj = Path(output_path)
+
+        # Check if .cicada directory exists (first run detection)
+        is_first_run = not output_path_obj.parent.exists()
+
+        # On first run, add .cicada/ to .gitignore if it exists
+        if is_first_run:
+            from cicada.utils.path_utils import ensure_gitignore_has_cicada
+
+            if ensure_gitignore_has_cicada(repo_path_obj):
+                print("✓ Added .cicada/ to .gitignore")
+
+        save_index(index, output_path_obj, create_dirs=True)
+
+        # Compute and save hashes for all PROCESSED files for future incremental updates
+        print("Computing file hashes for incremental updates...")
+        # Only hash files that were actually processed
+        processed_files = [
+            str(f.relative_to(repo_path_obj)) for f in elixir_files[:files_processed]
+        ]
+        file_hashes = compute_hashes_for_files(processed_files, str(repo_path_obj))
+        save_file_hashes(str(output_path_obj.parent), file_hashes)
+
+        # Report completion status
+        if self._interrupted:
+            print(f"\n✓ Partial index saved!")
+            print(
+                f" Processed: {files_processed}/{total_files} files ({files_processed/total_files*100:.1f}%)"
+            )
+            print(f" Modules: {len(all_modules)}")
+            print(f" Functions: {total_functions}")
+            print(
+                f"\n💡 Run the command again to continue indexing remaining {total_files - files_processed} file(s)"
+            )
+        else:
+            print(f"\nIndexing complete!")
+            print(f" Modules: {len(all_modules)}")
+            print(f" Functions: {total_functions}")
+
+        # Report keyword extraction failures if any
+        if extract_keywords and keyword_extraction_failures > 0:
+            print(
+                f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
+            )
+            print(" Some documentation may not be indexed for keyword search.")
+
+        print(f"\nIndex saved to: {output_path_obj}")
+        print(f"Hashes saved to: {output_path_obj.parent}/hashes.json")
+
+        return index
+
+    def incremental_index_repository(
+        self,
+        repo_path: str,
+        output_path: str = ".cicada/index.json",
+        extract_keywords: bool = False,
+        spacy_model: str = "small",
+        force_full: bool = False,
+    ):
+        """
+        Incrementally index an Elixir repository using file hashing.
+
+        Only processes files that have been added, modified, or deleted since
+        the last indexing run. Falls back to full indexing if no previous
+        index or hashes exist.
+
+        Args:
+            repo_path: Path to the Elixir repository root
+            output_path: Path where the index JSON file will be saved
+            extract_keywords: If True, extract keywords from documentation using NLP
+            spacy_model: Size of spaCy model to use for keyword extraction
+            force_full: If True, ignore existing hashes and do full reindex
+
+        Returns:
+            Dictionary containing the index data
+        """
+        repo_path_obj = Path(repo_path).resolve()
+        output_path_obj = Path(output_path)
+        cicada_dir = output_path_obj.parent
+
+        if not repo_path_obj.exists():
+            raise ValueError(f"Repository path does not exist: {repo_path_obj}")
+
+        # Load existing index and hashes
+        existing_index = load_index(output_path_obj) if not force_full else None
+        existing_hashes = load_file_hashes(str(cicada_dir)) if not force_full else {}
+
+        # Validate existing index structure if loaded
+        if existing_index:
+            is_valid, error = validate_index_structure(existing_index)
+            if not is_valid:
+                print(
+                    f"Warning: Existing index is corrupted ({error}). Performing full reindex..."
+                )
+                existing_index = None
+
+        # If no existing data, do full index
+        if not existing_index or not existing_hashes:
+            print("No existing index or hashes found. Performing full index...")
+            return self.index_repository(
+                str(repo_path_obj), str(output_path_obj), extract_keywords, spacy_model
+            )
+
+        print(f"Performing incremental index of: {repo_path_obj}")
+
+        # Set up signal handlers for graceful interruption
+        signal.signal(signal.SIGINT, self._handle_interrupt)
+        signal.signal(signal.SIGTERM, self._handle_interrupt)
+        self._interrupted = False
+
+        # Find all current Elixir files
+        elixir_files = self._find_elixir_files(repo_path_obj)
+        # Convert to relative paths
+        relative_files = [str(f.relative_to(repo_path_obj)) for f in elixir_files]
+
+        # Detect file changes
+        print("Detecting file changes...")
+        new_files, modified_files, deleted_files = detect_file_changes(
+            relative_files, existing_hashes, str(repo_path_obj)
+        )
+
+        # Calculate what needs to be processed
+        files_to_process = new_files + modified_files
+        total_changes = len(new_files) + len(modified_files) + len(deleted_files)
+
+        if total_changes == 0:
+            print("No changes detected. Index is up to date.")
+            return existing_index
+
+        print(f"Changes detected:")
+        print(f" New files: {len(new_files)}")
+        print(f" Modified files: {len(modified_files)}")
+        print(f" Deleted files: {len(deleted_files)}")
+
+        if files_to_process:
+            print(f"\nProcessing {len(files_to_process)} changed file(s)...")
+
+        # Initialize keyword extractor if requested
+        keyword_extractor = None
+        if extract_keywords:
+            try:
+                from cicada.lightweight_keyword_extractor import (
+                    LightweightKeywordExtractor,
+                )
+
+                keyword_extractor = LightweightKeywordExtractor(
+                    verbose=True, model_size=spacy_model
+                )
+            except Exception as e:
+                print(f"Warning: Could not initialize keyword extractor: {e}")
+                print("Continuing without keyword extraction...")
+                extract_keywords = False
+
+        # Process changed files
+        all_modules = {}
+        total_functions = 0
+        files_processed = 0
+        keyword_extraction_failures = 0
+
+        for relative_file in files_to_process:
+            file_path = repo_path_obj / relative_file
+            try:
+                modules = self.parser.parse_file(str(file_path))
+
+                if modules:
+                    for module_data in modules:
+                        module_name = module_data["module"]
+                        functions = module_data["functions"]
+
+                        # Calculate stats
+                        public_count = sum(1 for f in functions if f["type"] == "def")
+                        private_count = sum(1 for f in functions if f["type"] == "defp")
+
+                        # Extract keywords if enabled
+                        module_keywords = None
+                        if keyword_extractor and module_data.get("moduledoc"):
+                            try:
+                                module_keywords = (
+                                    keyword_extractor.extract_keywords_simple(
+                                        module_data["moduledoc"], top_n=10
+                                    )
+                                )
+                            except Exception as e:
+                                keyword_extraction_failures += 1
+
+                        # Extract keywords from function docs
+                        if keyword_extractor:
+                            for func in functions:
+                                if func.get("doc"):
+                                    try:
+                                        func_name = func.get("name", "")
+                                        text_for_keywords = f"{func_name} {func['doc']}"
+                                        func_keywords = (
+                                            keyword_extractor.extract_keywords_simple(
+                                                text_for_keywords, top_n=10
+                                            )
+                                        )
+                                        if func_keywords:
+                                            func["keywords"] = func_keywords
+                                    except Exception as e:
+                                        keyword_extraction_failures += 1
+
+                        # Store module info
+                        module_info = {
+                            "file": relative_file,
+                            "line": module_data["line"],
+                            "moduledoc": module_data.get("moduledoc"),
+                            "functions": functions,
+                            "total_functions": len(functions),
+                            "public_functions": public_count,
+                            "private_functions": private_count,
+                            "aliases": module_data.get("aliases", {}),
+                            "imports": module_data.get("imports", []),
+                            "requires": module_data.get("requires", []),
+                            "uses": module_data.get("uses", []),
+                            "behaviours": module_data.get("behaviours", []),
+                            "value_mentions": module_data.get("value_mentions", []),
+                            "calls": module_data.get("calls", []),
+                        }
+
+                        # Add module keywords if extracted
+                        if module_keywords:
+                            module_info["keywords"] = module_keywords
+
+                        all_modules[module_name] = module_info
+                        total_functions += len(functions)
+
+                files_processed += 1
+
+                # Check for interruption after each file
+                if self._check_and_report_interruption(
+                    files_processed, len(files_to_process)
+                ):
+                    break
+
+            except Exception as e:
+                print(f" Skipping {file_path}: {e}")
+                # Check for interruption even after error
+                if self._check_and_report_interruption(
+                    files_processed, len(files_to_process)
+                ):
+                    break
+                continue
+
+        # Build index for changed files
+        new_index = {
+            "modules": all_modules,
+            "metadata": {
+                "indexed_at": datetime.now().isoformat(),
+                "repo_path": str(repo_path_obj),
+            },
+        }
+
+        # Merge with existing index
+        print("\nMerging with existing index...")
+        merged_index = merge_indexes_incremental(
+            existing_index, new_index, deleted_files
+        )
+
+        # Update hashes for all current files
+        print("Updating file hashes...")
+        updated_hashes = dict(existing_hashes)
+
+        # Compute hashes only for files that were actually processed
+        actually_processed = files_to_process[:files_processed]
+        new_hashes = compute_hashes_for_files(actually_processed, str(repo_path_obj))
+        updated_hashes.update(new_hashes)
+
+        # Remove hashes for deleted files
+        for deleted_file in deleted_files:
+            updated_hashes.pop(deleted_file, None)
+
+        # Save index and hashes
+        save_index(merged_index, output_path_obj, create_dirs=True)
+        save_file_hashes(str(cicada_dir), updated_hashes)
+
+        # Report completion status
+        if self._interrupted:
+            remaining = len(files_to_process) - files_processed
+            print(f"\n✓ Partial index saved!")
+            print(
+                f" Processed: {files_processed}/{len(files_to_process)} changed file(s)"
+            )
+            print(f" Total modules: {merged_index['metadata']['total_modules']}")
+            print(f" Total functions: {merged_index['metadata']['total_functions']}")
+            print(f" Files deleted: {len(deleted_files)}")
+            print(
+                f"\n💡 Run the command again to continue indexing remaining {remaining} changed file(s)"
+            )
+        else:
+            print(f"\nIncremental indexing complete!")
+            print(f" Total modules: {merged_index['metadata']['total_modules']}")
+            print(f" Total functions: {merged_index['metadata']['total_functions']}")
+            print(f" Files processed: {files_processed}")
+            print(f" Files deleted: {len(deleted_files)}")
+
+        # Report keyword extraction failures if any
+        if extract_keywords and keyword_extraction_failures > 0:
+            print(
+                f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
+            )
+
+        print(f"\nIndex saved to: {output_path_obj}")
+        print(f"Hashes saved to: {cicada_dir}/hashes.json")
+
+        return merged_index
+
+    def _find_elixir_files(self, repo_path: Path) -> list:
+        """Find all Elixir source files in the repository."""
+        elixir_files = []
+
+        for root, dirs, files in os.walk(repo_path):
+            # Remove excluded directories from the search
+            dirs[:] = [d for d in dirs if d not in self.excluded_dirs]
+
+            # Find .ex and .exs files
+            for file in files:
+                if file.endswith((".ex", ".exs")):
+                    file_path = Path(root) / file
+                    elixir_files.append(file_path)
+
+        return sorted(elixir_files)
+
+
+def main():
+    """Main entry point for the indexer CLI."""
+    from cicada.version_check import check_for_updates
+
+    # Check for updates (non-blocking, fails silently)
+    check_for_updates()
+
+    parser = argparse.ArgumentParser(
+        description="Index current Elixir repository to extract modules and functions"
+    )
+    _ = parser.add_argument(
+        "repo",
+        nargs="?",
+        default=".",
+        help="Path to the Elixir repository to index (default: current directory)",
+    )
+    _ = parser.add_argument(
+        "--output",
+        default=".cicada/index.json",
+        help="Output path for the index file (default: .cicada/index.json)",
+    )
+    parser.add_argument(
+        "--extract-keywords",
+        action="store_true",
+        help="Extract keywords from documentation using NLP (adds ~1-2s per 100 docs)",
+    )
+    parser.add_argument(
+        "--spacy-model",
+        choices=["small", "medium", "large"],
+        default="small",
+        help="Size of spaCy model to use for keyword extraction (default: small). "
+        "Medium and large models provide better accuracy but are slower.",
+    )
+    parser.add_argument(
+        "--full",
+        action="store_true",
+        help="Force full reindex, ignoring existing hashes (default: incremental)",
+    )
+
+    args = parser.parse_args()
+
+    indexer = ElixirIndexer()
+
+    # Use incremental indexing by default (unless --full flag is set)
+    indexer.incremental_index_repository(
+        args.repo,
+        args.output,
+        extract_keywords=args.extract_keywords,
+        spacy_model=args.spacy_model,
+        force_full=args.full,
+    )
+
+
+if __name__ == "__main__":
+    main()
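
For reference, the entry point above can be driven from the command line (python -m cicada.indexer <repo> [--output PATH] [--extract-keywords] [--spacy-model small|medium|large] [--full]) or from Python. Below is a minimal usage sketch based only on the API shown in this diff; the installed console-script name comes from entry_points.txt, whose contents are not shown here, so the module path is assumed instead.

    # Usage sketch: incremental indexing via the Python API shown above.
    # Assumes the wheel is installed and the working directory is an
    # Elixir project root; incremental_index_repository() falls back to
    # a full index when no previous .cicada/index.json or hashes exist.
    from cicada.indexer import ElixirIndexer

    indexer = ElixirIndexer(verbose=True)
    index = indexer.incremental_index_repository(
        ".",
        output_path=".cicada/index.json",  # same default the CLI uses
        extract_keywords=False,            # opt-in NLP keyword extraction
        force_full=False,                  # True mirrors the --full flag
    )
    print(index["metadata"]["total_modules"], "modules indexed")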