cicada-mcp 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. cicada/ascii_art.py +60 -0
  2. cicada/clean.py +195 -60
  3. cicada/cli.py +757 -0
  4. cicada/colors.py +27 -0
  5. cicada/command_logger.py +14 -16
  6. cicada/dead_code_analyzer.py +12 -19
  7. cicada/extractors/__init__.py +6 -6
  8. cicada/extractors/base.py +3 -3
  9. cicada/extractors/call.py +11 -15
  10. cicada/extractors/dependency.py +39 -51
  11. cicada/extractors/doc.py +8 -9
  12. cicada/extractors/function.py +12 -24
  13. cicada/extractors/module.py +11 -15
  14. cicada/extractors/spec.py +8 -12
  15. cicada/find_dead_code.py +15 -39
  16. cicada/formatter.py +37 -91
  17. cicada/git_helper.py +22 -34
  18. cicada/indexer.py +122 -107
  19. cicada/interactive_setup.py +490 -0
  20. cicada/keybert_extractor.py +286 -0
  21. cicada/keyword_search.py +22 -30
  22. cicada/keyword_test.py +127 -0
  23. cicada/lightweight_keyword_extractor.py +5 -13
  24. cicada/mcp_entry.py +683 -0
  25. cicada/mcp_server.py +103 -209
  26. cicada/parser.py +9 -9
  27. cicada/pr_finder.py +15 -19
  28. cicada/pr_indexer/__init__.py +3 -3
  29. cicada/pr_indexer/cli.py +4 -9
  30. cicada/pr_indexer/github_api_client.py +22 -37
  31. cicada/pr_indexer/indexer.py +17 -29
  32. cicada/pr_indexer/line_mapper.py +8 -12
  33. cicada/pr_indexer/pr_index_builder.py +22 -34
  34. cicada/setup.py +189 -87
  35. cicada/utils/__init__.py +9 -9
  36. cicada/utils/call_site_formatter.py +4 -6
  37. cicada/utils/function_grouper.py +4 -4
  38. cicada/utils/hash_utils.py +12 -15
  39. cicada/utils/index_utils.py +15 -15
  40. cicada/utils/path_utils.py +24 -29
  41. cicada/utils/signature_builder.py +3 -3
  42. cicada/utils/subprocess_runner.py +17 -19
  43. cicada/utils/text_utils.py +1 -2
  44. cicada/version_check.py +2 -5
  45. {cicada_mcp-0.1.7.dist-info → cicada_mcp-0.2.0.dist-info}/METADATA +144 -55
  46. cicada_mcp-0.2.0.dist-info/RECORD +53 -0
  47. cicada_mcp-0.2.0.dist-info/entry_points.txt +4 -0
  48. cicada/install.py +0 -741
  49. cicada_mcp-0.1.7.dist-info/RECORD +0 -47
  50. cicada_mcp-0.1.7.dist-info/entry_points.txt +0 -9
  51. {cicada_mcp-0.1.7.dist-info → cicada_mcp-0.2.0.dist-info}/WHEEL +0 -0
  52. {cicada_mcp-0.1.7.dist-info → cicada_mcp-0.2.0.dist-info}/licenses/LICENSE +0 -0
  53. {cicada_mcp-0.1.7.dist-info → cicada_mcp-0.2.0.dist-info}/top_level.txt +0 -0
cicada/git_helper.py CHANGED
@@ -8,12 +8,12 @@ offering comprehensive commit history for files and functions.
8
8
  Author: Cursor(Auto)
9
9
  """
10
10
 
11
- import git
12
11
  import subprocess
13
12
  from datetime import datetime
14
- from typing import List, Dict, Optional
15
13
  from pathlib import Path
16
14
 
15
+ import git
16
+
17
17
 
18
18
  class GitHelper:
19
19
  """Helper class for extracting git commit history"""
@@ -31,7 +31,7 @@ class GitHelper:
31
31
  self.repo = git.Repo(repo_path)
32
32
  self.repo_path = Path(repo_path)
33
33
 
34
- def get_file_history(self, file_path: str, max_commits: int = 10) -> List[Dict]:
34
+ def get_file_history(self, file_path: str, max_commits: int = 10) -> list[dict]:
35
35
  """
36
36
  Get commit history for a specific file
37
37
 
@@ -53,9 +53,7 @@ class GitHelper:
53
53
 
54
54
  try:
55
55
  # Get commits that touched this file
56
- for commit in self.repo.iter_commits(
57
- paths=file_path, max_count=max_commits
58
- ):
56
+ for commit in self.repo.iter_commits(paths=file_path, max_count=max_commits):
59
57
  commits.append(
60
58
  {
61
59
  "sha": commit.hexsha[:8], # Short SHA
@@ -78,7 +76,7 @@ class GitHelper:
78
76
  function_name: str,
79
77
  _line_number: int,
80
78
  max_commits: int = 5,
81
- ) -> List[Dict]:
79
+ ) -> list[dict]:
82
80
  """
83
81
  Get commit history for a specific function using heuristics.
84
82
 
@@ -123,11 +121,11 @@ class GitHelper:
123
121
  def get_function_history_precise(
124
122
  self,
125
123
  file_path: str,
126
- start_line: Optional[int] = None,
127
- end_line: Optional[int] = None,
128
- function_name: Optional[str] = None,
124
+ start_line: int | None = None,
125
+ end_line: int | None = None,
126
+ function_name: str | None = None,
129
127
  max_commits: int = 5,
130
- ) -> List[Dict]:
128
+ ) -> list[dict]:
131
129
  """
132
130
  Get precise commit history for a function using git log -L.
133
131
 
@@ -158,14 +156,13 @@ class GitHelper:
158
156
  - Requires .gitattributes with "*.ex diff=elixir" for function tracking
159
157
  """
160
158
  commits = []
161
- import subprocess
162
159
 
163
160
  # Determine tracking mode
164
161
  use_function_tracking = function_name is not None
165
162
  use_line_tracking = start_line is not None and end_line is not None
166
163
 
167
164
  if not use_function_tracking and not use_line_tracking:
168
- print(f"Error: Must provide either function_name or (start_line, end_line)")
165
+ print("Error: Must provide either function_name or (start_line, end_line)")
169
166
  return []
170
167
 
171
168
  try:
@@ -239,10 +236,10 @@ class GitHelper:
239
236
  def get_function_evolution(
240
237
  self,
241
238
  file_path: str,
242
- start_line: Optional[int] = None,
243
- end_line: Optional[int] = None,
244
- function_name: Optional[str] = None,
245
- ) -> Optional[Dict]:
239
+ start_line: int | None = None,
240
+ end_line: int | None = None,
241
+ function_name: str | None = None,
242
+ ) -> dict | None:
246
243
  """
247
244
  Get evolution metadata for a function (creation, last modification, change count).
248
245
 
@@ -297,9 +294,7 @@ class GitHelper:
297
294
  if days_between > 0:
298
295
  months = days_between / 30.0
299
296
  modification_frequency = (
300
- total_modifications / months
301
- if months > 0
302
- else total_modifications
297
+ total_modifications / months if months > 0 else total_modifications
303
298
  )
304
299
  except Exception:
305
300
  # If date parsing fails, skip frequency calculation
@@ -330,9 +325,7 @@ class GitHelper:
330
325
  print(f"Error getting function evolution for {file_path}: {e}")
331
326
  return None
332
327
 
333
- def get_function_history(
334
- self, file_path: str, start_line: int, end_line: int
335
- ) -> List[Dict]:
328
+ def get_function_history(self, file_path: str, start_line: int, end_line: int) -> list[dict]:
336
329
  """
337
330
  Get line-by-line authorship for a function using git blame.
338
331
 
@@ -357,7 +350,6 @@ class GitHelper:
357
350
  - lines: List of {number, content} for each line
358
351
  """
359
352
  blame_groups = []
360
- import subprocess
361
353
 
362
354
  try:
363
355
  # Use git blame with line range
@@ -401,10 +393,8 @@ class GitHelper:
401
393
  elif line.startswith("author-time "):
402
394
  try:
403
395
  timestamp = int(line[12:])
404
- current_commit["date"] = datetime.fromtimestamp(
405
- timestamp
406
- ).isoformat()
407
- except:
396
+ current_commit["date"] = datetime.fromtimestamp(timestamp).isoformat()
397
+ except (ValueError, OSError):
408
398
  current_commit["date"] = line[12:]
409
399
  # Actual code line (starts with tab)
410
400
  elif line.startswith("\t"):
@@ -469,9 +459,7 @@ class GitHelper:
469
459
 
470
460
  except subprocess.CalledProcessError as e:
471
461
  error_msg = e.stderr if e.stderr else str(e)
472
- print(
473
- f"Warning: git blame failed for {file_path}:{start_line}-{end_line}: {error_msg}"
474
- )
462
+ print(f"Warning: git blame failed for {file_path}:{start_line}-{end_line}: {error_msg}")
475
463
  return []
476
464
  except Exception as e:
477
465
  print(f"Error getting blame for {file_path}: {e}")
@@ -479,7 +467,7 @@ class GitHelper:
479
467
 
480
468
  return blame_groups
481
469
 
482
- def get_recent_commits(self, max_count: int = 20) -> List[Dict]:
470
+ def get_recent_commits(self, max_count: int = 20) -> list[dict]:
483
471
  """
484
472
  Get recent commits in the repository
485
473
 
@@ -512,7 +500,7 @@ class GitHelper:
512
500
 
513
501
  return commits
514
502
 
515
- def get_commit_details(self, commit_sha: str) -> Optional[Dict]:
503
+ def get_commit_details(self, commit_sha: str) -> dict | None:
516
504
  """
517
505
  Get detailed information about a specific commit
518
506
 
@@ -560,7 +548,7 @@ class GitHelper:
560
548
  print(f"Error getting commit {commit_sha}: {e}")
561
549
  return None
562
550
 
563
- def search_commits(self, query: str, max_results: int = 10) -> List[Dict]:
551
+ def search_commits(self, query: str, max_results: int = 10) -> list[dict]:
564
552
  """
565
553
  Search commit messages for a query string
566
554
 
cicada/indexer.py CHANGED
@@ -10,19 +10,56 @@ import signal
10
10
  import sys
11
11
  from datetime import datetime
12
12
  from pathlib import Path
13
+
13
14
  from cicada.parser import ElixirParser
14
15
  from cicada.utils import (
15
- save_index,
16
16
  load_index,
17
17
  merge_indexes_incremental,
18
+ save_index,
18
19
  validate_index_structure,
19
20
  )
20
21
  from cicada.utils.hash_utils import (
22
+ compute_hashes_for_files,
23
+ detect_file_changes,
21
24
  load_file_hashes,
22
25
  save_file_hashes,
23
- detect_file_changes,
24
- compute_hashes_for_files,
25
26
  )
27
+ from cicada.utils.storage import get_config_path
28
+
29
+
30
+ def read_keyword_extraction_config(repo_path: Path) -> tuple[str, str]:
31
+ """
32
+ Read keyword extraction configuration from config.yaml.
33
+
34
+ Args:
35
+ repo_path: Path to the repository
36
+
37
+ Returns:
38
+ tuple[str, str]: (method, tier) where method is 'lemminflect' or 'bert',
39
+ and tier is 'fast', 'regular', or 'max'.
40
+ Returns ('lemminflect', 'regular') as default if config not found.
41
+ """
42
+ try:
43
+ import yaml
44
+
45
+ config_path = get_config_path(repo_path)
46
+ if not config_path.exists():
47
+ # Default to lemminflect if config doesn't exist
48
+ return ("lemminflect", "regular")
49
+
50
+ with open(config_path) as f:
51
+ config = yaml.safe_load(f)
52
+
53
+ if config and "keyword_extraction" in config:
54
+ method = config["keyword_extraction"].get("method", "lemminflect")
55
+ tier = config["keyword_extraction"].get("tier", "regular")
56
+ return (method, tier)
57
+
58
+ # Default to lemminflect if keyword_extraction section not found
59
+ return ("lemminflect", "regular")
60
+ except Exception:
61
+ # If anything goes wrong, default to lemminflect
62
+ return ("lemminflect", "regular")
26
63
 
27
64
 
28
65
  class ElixirIndexer:
@@ -45,20 +82,16 @@ class ElixirIndexer:
45
82
  }
46
83
  self._interrupted = False
47
84
 
48
- def _handle_interrupt(self, signum, frame):
85
+ def _handle_interrupt(self, _signum, _frame):
49
86
  """Handle interrupt signals (Ctrl-C, SIGTERM) gracefully."""
50
- print(
51
- "\n\n⚠️ Interrupt received. Finishing current file and saving progress..."
52
- )
87
+ print("\n\n⚠️ Interrupt received. Finishing current file and saving progress...")
53
88
  print(" Press Ctrl-C again to force quit (may lose progress)\n")
54
89
  self._interrupted = True
55
90
  # Restore default handler so second Ctrl-C will kill immediately
56
91
  signal.signal(signal.SIGINT, signal.SIG_DFL)
57
92
  signal.signal(signal.SIGTERM, signal.SIG_DFL)
58
93
 
59
- def _check_and_report_interruption(
60
- self, files_processed: int, total_files: int
61
- ) -> bool:
94
+ def _check_and_report_interruption(self, files_processed: int, total_files: int) -> bool:
62
95
  """
63
96
  Check if interrupted and report status.
64
97
 
@@ -70,9 +103,7 @@ class ElixirIndexer:
70
103
  True if interrupted, False otherwise
71
104
  """
72
105
  if self._interrupted:
73
- print(
74
- f"\n⚠️ Interrupted after processing {files_processed}/{total_files} files"
75
- )
106
+ print(f"\n⚠️ Interrupted after processing {files_processed}/{total_files} files")
76
107
  print(" Saving partial progress...")
77
108
  return True
78
109
  return False
@@ -80,9 +111,8 @@ class ElixirIndexer:
80
111
  def index_repository(
81
112
  self,
82
113
  repo_path: str,
83
- output_path: str = ".cicada/index.json",
114
+ output_path: str,
84
115
  extract_keywords: bool = False,
85
- spacy_model: str = "small",
86
116
  ):
87
117
  """
88
118
  Index an Elixir repository.
@@ -91,8 +121,6 @@ class ElixirIndexer:
91
121
  repo_path: Path to the Elixir repository root
92
122
  output_path: Path where the index JSON file will be saved
93
123
  extract_keywords: If True, extract keywords from documentation using NLP
94
- spacy_model: Size of spaCy model to use for keyword extraction
95
- ('small', 'medium', or 'large'). Default is 'small'.
96
124
 
97
125
  Returns:
98
126
  Dictionary containing the index data
@@ -104,6 +132,10 @@ class ElixirIndexer:
104
132
 
105
133
  if self.verbose:
106
134
  print(f"Indexing repository: {repo_path_obj}")
135
+ if extract_keywords:
136
+ # Read and display keyword extraction config
137
+ method, tier = read_keyword_extraction_config(repo_path_obj)
138
+ print(f"Keyword extraction: {method.upper()} ({tier})")
107
139
 
108
140
  # Set up signal handlers for graceful interruption
109
141
  signal.signal(signal.SIGINT, self._handle_interrupt)
@@ -114,13 +146,21 @@ class ElixirIndexer:
114
146
  keyword_extractor = None
115
147
  if extract_keywords:
116
148
  try:
117
- from cicada.lightweight_keyword_extractor import (
118
- LightweightKeywordExtractor,
119
- )
149
+ # Read keyword extraction config from config.yaml
150
+ method, tier = read_keyword_extraction_config(repo_path_obj)
120
151
 
121
- keyword_extractor = LightweightKeywordExtractor(
122
- verbose=self.verbose, model_size=spacy_model
123
- )
152
+ if method == "bert":
153
+ # Initialize KeyBERT extractor
154
+ from cicada.keybert_extractor import KeyBERTExtractor
155
+
156
+ keyword_extractor = KeyBERTExtractor(model_tier=tier, verbose=self.verbose)
157
+ else:
158
+ # Initialize lemminflect extractor (default)
159
+ from cicada.lightweight_keyword_extractor import (
160
+ LightweightKeywordExtractor,
161
+ )
162
+
163
+ keyword_extractor = LightweightKeywordExtractor(verbose=self.verbose)
124
164
  except Exception as e:
125
165
  if self.verbose:
126
166
  print(f"Warning: Could not initialize keyword extractor: {e}")
@@ -133,8 +173,6 @@ class ElixirIndexer:
133
173
 
134
174
  if self.verbose:
135
175
  print(f"Found {total_files} Elixir files")
136
- if extract_keywords:
137
- print("Keyword extraction enabled")
138
176
 
139
177
  # Parse all files
140
178
  all_modules = {}
@@ -159,10 +197,8 @@ class ElixirIndexer:
159
197
  module_keywords = None
160
198
  if keyword_extractor and module_data.get("moduledoc"):
161
199
  try:
162
- module_keywords = (
163
- keyword_extractor.extract_keywords_simple(
164
- module_data["moduledoc"], top_n=10
165
- )
200
+ module_keywords = keyword_extractor.extract_keywords_simple(
201
+ module_data["moduledoc"], top_n=10
166
202
  )
167
203
  except Exception as e:
168
204
  keyword_extraction_failures += 1
@@ -181,10 +217,8 @@ class ElixirIndexer:
181
217
  # Include function name in text for keyword extraction
182
218
  # This ensures the function name identifier gets 10x weight
183
219
  text_for_keywords = f"{func_name} {func['doc']}"
184
- func_keywords = (
185
- keyword_extractor.extract_keywords_simple(
186
- text_for_keywords, top_n=10
187
- )
220
+ func_keywords = keyword_extractor.extract_keywords_simple(
221
+ text_for_keywords, top_n=10
188
222
  )
189
223
  if func_keywords:
190
224
  func["keywords"] = func_keywords
@@ -225,10 +259,7 @@ class ElixirIndexer:
225
259
  files_processed += 1
226
260
 
227
261
  # Progress reporting
228
- if (
229
- self.verbose
230
- and files_processed % self.PROGRESS_REPORT_INTERVAL == 0
231
- ):
262
+ if self.verbose and files_processed % self.PROGRESS_REPORT_INTERVAL == 0:
232
263
  print(f" Processed {files_processed}/{total_files} files...")
233
264
 
234
265
  # Check for interruption after each file
@@ -264,9 +295,8 @@ class ElixirIndexer:
264
295
  if is_first_run:
265
296
  from cicada.utils.path_utils import ensure_gitignore_has_cicada
266
297
 
267
- if ensure_gitignore_has_cicada(repo_path_obj):
268
- if self.verbose:
269
- print("✓ Added .cicada/ to .gitignore")
298
+ if ensure_gitignore_has_cicada(repo_path_obj) and self.verbose:
299
+ print("✓ Added .cicada/ to .gitignore")
270
300
 
271
301
  save_index(index, output_path_obj, create_dirs=True)
272
302
 
@@ -278,12 +308,16 @@ class ElixirIndexer:
278
308
  str(f.relative_to(repo_path_obj)) for f in elixir_files[:files_processed]
279
309
  ]
280
310
  file_hashes = compute_hashes_for_files(processed_files, str(repo_path_obj))
281
- save_file_hashes(str(output_path_obj.parent), file_hashes)
311
+ # Save hashes to centralized storage directory
312
+ from cicada.utils import get_storage_dir
313
+
314
+ storage_dir = get_storage_dir(repo_path_obj)
315
+ save_file_hashes(str(storage_dir), file_hashes)
282
316
 
283
317
  # Report completion status
284
318
  if self.verbose:
285
319
  if self._interrupted:
286
- print(f"\n✓ Partial index saved!")
320
+ print("\n✓ Partial index saved!")
287
321
  print(
288
322
  f" Processed: {files_processed}/{total_files} files ({files_processed/total_files*100:.1f}%)"
289
323
  )
@@ -293,7 +327,7 @@ class ElixirIndexer:
293
327
  f"\n💡 Run the command again to continue indexing remaining {total_files - files_processed} file(s)"
294
328
  )
295
329
  else:
296
- print(f"\nIndexing complete!")
330
+ print("\nIndexing complete!")
297
331
  print(f" Modules: {len(all_modules)}")
298
332
  print(f" Functions: {total_functions}")
299
333
 
@@ -312,9 +346,8 @@ class ElixirIndexer:
312
346
  def incremental_index_repository(
313
347
  self,
314
348
  repo_path: str,
315
- output_path: str = ".cicada/index.json",
349
+ output_path: str,
316
350
  extract_keywords: bool = False,
317
- spacy_model: str = "small",
318
351
  force_full: bool = False,
319
352
  ):
320
353
  """
@@ -328,7 +361,6 @@ class ElixirIndexer:
328
361
  repo_path: Path to the Elixir repository root
329
362
  output_path: Path where the index JSON file will be saved
330
363
  extract_keywords: If True, extract keywords from documentation using NLP
331
- spacy_model: Size of spaCy model to use for keyword extraction
332
364
  force_full: If True, ignore existing hashes and do full reindex
333
365
 
334
366
  Returns:
@@ -336,33 +368,39 @@ class ElixirIndexer:
336
368
  """
337
369
  repo_path_obj = Path(repo_path).resolve()
338
370
  output_path_obj = Path(output_path)
339
- cicada_dir = output_path_obj.parent
371
+ # Use centralized storage directory for hashes
372
+ from cicada.utils import get_storage_dir
373
+
374
+ storage_dir = get_storage_dir(repo_path_obj)
340
375
 
341
376
  if not repo_path_obj.exists():
342
377
  raise ValueError(f"Repository path does not exist: {repo_path_obj}")
343
378
 
344
379
  # Load existing index and hashes
345
380
  existing_index = load_index(output_path_obj) if not force_full else None
346
- existing_hashes = load_file_hashes(str(cicada_dir)) if not force_full else {}
381
+ existing_hashes = load_file_hashes(str(storage_dir)) if not force_full else {}
347
382
 
348
383
  # Validate existing index structure if loaded
349
384
  if existing_index:
350
385
  is_valid, error = validate_index_structure(existing_index)
351
386
  if not is_valid:
352
- print(
353
- f"Warning: Existing index is corrupted ({error}). Performing full reindex..."
354
- )
387
+ if self.verbose:
388
+ print(
389
+ f"Warning: Existing index is corrupted ({error}). Performing full reindex..."
390
+ )
355
391
  existing_index = None
356
392
 
357
393
  # If no existing data, do full index
358
394
  if not existing_index or not existing_hashes:
359
- print("No existing index or hashes found. Performing full index...")
360
- return self.index_repository(
361
- str(repo_path_obj), str(output_path_obj), extract_keywords, spacy_model
362
- )
395
+ if self.verbose:
396
+ print("No existing index or hashes found. Performing full index...")
397
+ return self.index_repository(str(repo_path_obj), str(output_path_obj), extract_keywords)
363
398
 
364
399
  if self.verbose:
400
+ # Read and display keyword extraction config
401
+ method, tier = read_keyword_extraction_config(repo_path_obj)
365
402
  print(f"Performing incremental index of: {repo_path_obj}")
403
+ print(f"Keyword extraction: {method.upper()} ({tier})")
366
404
 
367
405
  # Set up signal handlers for graceful interruption
368
406
  signal.signal(signal.SIGINT, self._handle_interrupt)
@@ -390,12 +428,9 @@ class ElixirIndexer:
390
428
  return existing_index
391
429
 
392
430
  if self.verbose:
393
- print(f"Changes detected:")
394
- if self.verbose:
431
+ print("Changes detected:")
395
432
  print(f" New files: {len(new_files)}")
396
- if self.verbose:
397
433
  print(f" Modified files: {len(modified_files)}")
398
- if self.verbose:
399
434
  print(f" Deleted files: {len(deleted_files)}")
400
435
 
401
436
  if files_to_process:
@@ -405,13 +440,21 @@ class ElixirIndexer:
405
440
  keyword_extractor = None
406
441
  if extract_keywords:
407
442
  try:
408
- from cicada.lightweight_keyword_extractor import (
409
- LightweightKeywordExtractor,
410
- )
443
+ # Read keyword extraction config from config.yaml
444
+ method, tier = read_keyword_extraction_config(repo_path_obj)
411
445
 
412
- keyword_extractor = LightweightKeywordExtractor(
413
- verbose=self.verbose, model_size=spacy_model
414
- )
446
+ if method == "bert":
447
+ # Initialize KeyBERT extractor
448
+ from cicada.keybert_extractor import KeyBERTExtractor
449
+
450
+ keyword_extractor = KeyBERTExtractor(model_tier=tier, verbose=self.verbose)
451
+ else:
452
+ # Initialize lemminflect extractor (default)
453
+ from cicada.lightweight_keyword_extractor import (
454
+ LightweightKeywordExtractor,
455
+ )
456
+
457
+ keyword_extractor = LightweightKeywordExtractor(verbose=self.verbose)
415
458
  except Exception as e:
416
459
  print(f"Warning: Could not initialize keyword extractor: {e}")
417
460
  print("Continuing without keyword extraction...")
@@ -441,12 +484,10 @@ class ElixirIndexer:
441
484
  module_keywords = None
442
485
  if keyword_extractor and module_data.get("moduledoc"):
443
486
  try:
444
- module_keywords = (
445
- keyword_extractor.extract_keywords_simple(
446
- module_data["moduledoc"], top_n=10
447
- )
487
+ module_keywords = keyword_extractor.extract_keywords_simple(
488
+ module_data["moduledoc"], top_n=10
448
489
  )
449
- except Exception as e:
490
+ except Exception:
450
491
  keyword_extraction_failures += 1
451
492
 
452
493
  # Extract keywords from function docs
@@ -456,14 +497,12 @@ class ElixirIndexer:
456
497
  try:
457
498
  func_name = func.get("name", "")
458
499
  text_for_keywords = f"{func_name} {func['doc']}"
459
- func_keywords = (
460
- keyword_extractor.extract_keywords_simple(
461
- text_for_keywords, top_n=10
462
- )
500
+ func_keywords = keyword_extractor.extract_keywords_simple(
501
+ text_for_keywords, top_n=10
463
502
  )
464
503
  if func_keywords:
465
504
  func["keywords"] = func_keywords
466
- except Exception as e:
505
+ except Exception:
467
506
  keyword_extraction_failures += 1
468
507
 
469
508
  # Store module info
@@ -494,17 +533,13 @@ class ElixirIndexer:
494
533
  files_processed += 1
495
534
 
496
535
  # Check for interruption after each file
497
- if self._check_and_report_interruption(
498
- files_processed, len(files_to_process)
499
- ):
536
+ if self._check_and_report_interruption(files_processed, len(files_to_process)):
500
537
  break
501
538
 
502
539
  except Exception as e:
503
540
  print(f" Skipping {file_path}: {e}")
504
541
  # Check for interruption even after error
505
- if self._check_and_report_interruption(
506
- files_processed, len(files_to_process)
507
- ):
542
+ if self._check_and_report_interruption(files_processed, len(files_to_process)):
508
543
  break
509
544
  continue
510
545
 
@@ -520,9 +555,7 @@ class ElixirIndexer:
520
555
  # Merge with existing index
521
556
  if self.verbose:
522
557
  print("\nMerging with existing index...")
523
- merged_index = merge_indexes_incremental(
524
- existing_index, new_index, deleted_files
525
- )
558
+ merged_index = merge_indexes_incremental(existing_index, new_index, deleted_files)
526
559
 
527
560
  # Update hashes for all current files
528
561
  if self.verbose:
@@ -540,15 +573,13 @@ class ElixirIndexer:
540
573
 
541
574
  # Save index and hashes
542
575
  save_index(merged_index, output_path_obj, create_dirs=True)
543
- save_file_hashes(str(cicada_dir), updated_hashes)
576
+ save_file_hashes(str(storage_dir), updated_hashes)
544
577
 
545
578
  # Report completion status
546
579
  if self._interrupted:
547
580
  remaining = len(files_to_process) - files_processed
548
- print(f"\n✓ Partial index saved!")
549
- print(
550
- f" Processed: {files_processed}/{len(files_to_process)} changed file(s)"
551
- )
581
+ print("\n✓ Partial index saved!")
582
+ print(f" Processed: {files_processed}/{len(files_to_process)} changed file(s)")
552
583
  print(f" Total modules: {merged_index['metadata']['total_modules']}")
553
584
  print(f" Total functions: {merged_index['metadata']['total_functions']}")
554
585
  print(f" Files deleted: {len(deleted_files)}")
@@ -556,7 +587,7 @@ class ElixirIndexer:
556
587
  f"\n💡 Run the command again to continue indexing remaining {remaining} changed file(s)"
557
588
  )
558
589
  else:
559
- print(f"\nIncremental indexing complete!")
590
+ print("\nIncremental indexing complete!")
560
591
  print(f" Total modules: {merged_index['metadata']['total_modules']}")
561
592
  print(f" Total functions: {merged_index['metadata']['total_functions']}")
562
593
  print(f" Files processed: {files_processed}")
@@ -568,9 +599,6 @@ class ElixirIndexer:
568
599
  f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
569
600
  )
570
601
 
571
- print(f"\nIndex saved to: {output_path_obj}")
572
- print(f"Hashes saved to: {cicada_dir}/hashes.json")
573
-
574
602
  return merged_index
575
603
 
576
604
  def _find_elixir_files(self, repo_path: Path) -> list:
@@ -611,18 +639,6 @@ def main():
611
639
  default=".cicada/index.json",
612
640
  help="Output path for the index file (default: .cicada/index.json)",
613
641
  )
614
- parser.add_argument(
615
- "--extract-keywords",
616
- action="store_true",
617
- help="Extract keywords from documentation using NLP (adds ~1-2s per 100 docs)",
618
- )
619
- parser.add_argument(
620
- "--spacy-model",
621
- choices=["small", "medium", "large"],
622
- default="small",
623
- help="Size of spaCy model to use for keyword extraction (default: small). "
624
- "Medium and large models provide better accuracy but are slower.",
625
- )
626
642
  parser.add_argument(
627
643
  "--full",
628
644
  action="store_true",
@@ -637,8 +653,7 @@ def main():
637
653
  indexer.incremental_index_repository(
638
654
  args.repo,
639
655
  args.output,
640
- extract_keywords=args.extract_keywords,
641
- spacy_model=args.spacy_model,
656
+ extract_keywords=True,
642
657
  force_full=args.full,
643
658
  )
644
659