cicada_mcp-0.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of cicada-mcp might be problematic.
- cicada/__init__.py +30 -0
- cicada/clean.py +297 -0
- cicada/command_logger.py +293 -0
- cicada/dead_code_analyzer.py +282 -0
- cicada/extractors/__init__.py +36 -0
- cicada/extractors/base.py +66 -0
- cicada/extractors/call.py +176 -0
- cicada/extractors/dependency.py +361 -0
- cicada/extractors/doc.py +179 -0
- cicada/extractors/function.py +246 -0
- cicada/extractors/module.py +123 -0
- cicada/extractors/spec.py +151 -0
- cicada/find_dead_code.py +270 -0
- cicada/formatter.py +918 -0
- cicada/git_helper.py +646 -0
- cicada/indexer.py +629 -0
- cicada/install.py +724 -0
- cicada/keyword_extractor.py +364 -0
- cicada/keyword_search.py +553 -0
- cicada/lightweight_keyword_extractor.py +298 -0
- cicada/mcp_server.py +1559 -0
- cicada/mcp_tools.py +291 -0
- cicada/parser.py +124 -0
- cicada/pr_finder.py +435 -0
- cicada/pr_indexer/__init__.py +20 -0
- cicada/pr_indexer/cli.py +62 -0
- cicada/pr_indexer/github_api_client.py +431 -0
- cicada/pr_indexer/indexer.py +297 -0
- cicada/pr_indexer/line_mapper.py +209 -0
- cicada/pr_indexer/pr_index_builder.py +253 -0
- cicada/setup.py +339 -0
- cicada/utils/__init__.py +52 -0
- cicada/utils/call_site_formatter.py +95 -0
- cicada/utils/function_grouper.py +57 -0
- cicada/utils/hash_utils.py +173 -0
- cicada/utils/index_utils.py +290 -0
- cicada/utils/path_utils.py +240 -0
- cicada/utils/signature_builder.py +106 -0
- cicada/utils/storage.py +111 -0
- cicada/utils/subprocess_runner.py +182 -0
- cicada/utils/text_utils.py +90 -0
- cicada/version_check.py +116 -0
- cicada_mcp-0.1.4.dist-info/METADATA +619 -0
- cicada_mcp-0.1.4.dist-info/RECORD +48 -0
- cicada_mcp-0.1.4.dist-info/WHEEL +5 -0
- cicada_mcp-0.1.4.dist-info/entry_points.txt +8 -0
- cicada_mcp-0.1.4.dist-info/licenses/LICENSE +21 -0
- cicada_mcp-0.1.4.dist-info/top_level.txt +1 -0
cicada/indexer.py
ADDED
@@ -0,0 +1,629 @@
"""
Elixir Repository Indexer.

Walks an Elixir repository and indexes all modules and functions.
"""

import argparse
import os
import signal
import sys
from datetime import datetime
from pathlib import Path
from cicada.parser import ElixirParser
from cicada.utils import (
    save_index,
    load_index,
    merge_indexes_incremental,
    validate_index_structure,
)
from cicada.utils.hash_utils import (
    load_file_hashes,
    save_file_hashes,
    detect_file_changes,
    compute_hashes_for_files,
)


class ElixirIndexer:
    """Indexes Elixir repositories to extract module and function information."""

    # Progress reporting interval - report every N files processed
    PROGRESS_REPORT_INTERVAL = 10

    def __init__(self, verbose: bool = False):
        """Initialize the indexer with a parser."""
        self.parser = ElixirParser()
        self.verbose = verbose
        self.excluded_dirs = {
            "deps",
            "_build",
            "node_modules",
            ".git",
            "assets",
            "priv",
        }
        self._interrupted = False

    def _handle_interrupt(self, signum, frame):
        """Handle interrupt signals (Ctrl-C, SIGTERM) gracefully."""
        print(
            "\n\n⚠️ Interrupt received. Finishing current file and saving progress..."
        )
        print(" Press Ctrl-C again to force quit (may lose progress)\n")
        self._interrupted = True
        # Restore default handler so second Ctrl-C will kill immediately
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)

    def _check_and_report_interruption(
        self, files_processed: int, total_files: int
    ) -> bool:
        """
        Check if interrupted and report status.

        Args:
            files_processed: Number of files processed so far
            total_files: Total number of files to process

        Returns:
            True if interrupted, False otherwise
        """
        if self._interrupted:
            print(
                f"\n⚠️ Interrupted after processing {files_processed}/{total_files} files"
            )
            print(" Saving partial progress...")
            return True
        return False

    def index_repository(
        self,
        repo_path: str,
        output_path: str = ".cicada/index.json",
        extract_keywords: bool = False,
        spacy_model: str = "small",
    ):
        """
        Index an Elixir repository.

        Args:
            repo_path: Path to the Elixir repository root
            output_path: Path where the index JSON file will be saved
            extract_keywords: If True, extract keywords from documentation using NLP
            spacy_model: Size of spaCy model to use for keyword extraction
                ('small', 'medium', or 'large'). Default is 'small'.

        Returns:
            Dictionary containing the index data
        """
        repo_path_obj = Path(repo_path).resolve()

        if not repo_path_obj.exists():
            raise ValueError(f"Repository path does not exist: {repo_path_obj}")

        print(f"Indexing repository: {repo_path_obj}")

        # Set up signal handlers for graceful interruption
        signal.signal(signal.SIGINT, self._handle_interrupt)
        signal.signal(signal.SIGTERM, self._handle_interrupt)
        self._interrupted = False

        # Initialize keyword extractor if requested
        keyword_extractor = None
        if extract_keywords:
            try:
                from cicada.lightweight_keyword_extractor import (
                    LightweightKeywordExtractor,
                )

                keyword_extractor = LightweightKeywordExtractor(
                    verbose=True, model_size=spacy_model
                )
            except Exception as e:
                print(f"Warning: Could not initialize keyword extractor: {e}")
                print("Continuing without keyword extraction...")
                extract_keywords = False

        # Find all Elixir files
        elixir_files = self._find_elixir_files(repo_path_obj)
        total_files = len(elixir_files)

        print(f"Found {total_files} Elixir files")
        if extract_keywords:
            print("Keyword extraction enabled")

        # Parse all files
        all_modules = {}
        total_functions = 0
        files_processed = 0
        keyword_extraction_failures = 0

        for file_path in elixir_files:
            try:
                modules = self.parser.parse_file(str(file_path))

                if modules:
                    for module_data in modules:
                        module_name = module_data["module"]
                        functions = module_data["functions"]

                        # Calculate stats
                        public_count = sum(1 for f in functions if f["type"] == "def")
                        private_count = sum(1 for f in functions if f["type"] == "defp")

                        # Extract keywords if enabled
                        module_keywords = None
                        if keyword_extractor and module_data.get("moduledoc"):
                            try:
                                module_keywords = (
                                    keyword_extractor.extract_keywords_simple(
                                        module_data["moduledoc"], top_n=10
                                    )
                                )
                            except Exception as e:
                                keyword_extraction_failures += 1
                                if self.verbose:
                                    print(
                                        f"Warning: Keyword extraction failed for module {module_name}: {e}",
                                        file=sys.stderr,
                                    )

                        # Extract keywords from function docs
                        if keyword_extractor:
                            for func in functions:
                                if func.get("doc"):
                                    func_name = func.get("name", "")
                                    try:
                                        # Include function name in text for keyword extraction
                                        # This ensures the function name identifier gets 10x weight
                                        text_for_keywords = f"{func_name} {func['doc']}"
                                        func_keywords = (
                                            keyword_extractor.extract_keywords_simple(
                                                text_for_keywords, top_n=10
                                            )
                                        )
                                        if func_keywords:
                                            func["keywords"] = func_keywords
                                    except Exception as e:
                                        keyword_extraction_failures += 1
                                        if self.verbose:
                                            print(
                                                f"Warning: Keyword extraction failed for {module_name}.{func_name}: {e}",
                                                file=sys.stderr,
                                            )

                        # Store module info
                        module_info = {
                            "file": str(file_path.relative_to(repo_path_obj)),
                            "line": module_data["line"],
                            "moduledoc": module_data.get("moduledoc"),
                            "functions": functions,
                            "total_functions": len(functions),
                            "public_functions": public_count,
                            "private_functions": private_count,
                            "aliases": module_data.get("aliases", {}),
                            "imports": module_data.get("imports", []),
                            "requires": module_data.get("requires", []),
                            "uses": module_data.get("uses", []),
                            "behaviours": module_data.get("behaviours", []),
                            "value_mentions": module_data.get("value_mentions", []),
                            "calls": module_data.get("calls", []),
                        }

                        # Add module keywords if extracted
                        if module_keywords:
                            module_info["keywords"] = module_keywords

                        all_modules[module_name] = module_info

                        total_functions += len(functions)

                files_processed += 1

                # Progress reporting
                if files_processed % self.PROGRESS_REPORT_INTERVAL == 0:
                    print(f" Processed {files_processed}/{total_files} files...")

                # Check for interruption after each file
                if self._check_and_report_interruption(files_processed, total_files):
                    break

            except Exception as e:
                print(f" Skipping {file_path}: {e}")
                # Check for interruption even after error
                if self._check_and_report_interruption(files_processed, total_files):
                    break
                continue

        # Build final index
        index = {
            "modules": all_modules,
            "metadata": {
                "indexed_at": datetime.now().isoformat(),
                "total_modules": len(all_modules),
                "total_functions": total_functions,
                "repo_path": str(repo_path_obj),
            },
        }

        # Save to file
        output_path_obj = Path(output_path)

        # Check if .cicada directory exists (first run detection)
        is_first_run = not output_path_obj.parent.exists()

        # On first run, add .cicada/ to .gitignore if it exists
        if is_first_run:
            from cicada.utils.path_utils import ensure_gitignore_has_cicada

            if ensure_gitignore_has_cicada(repo_path_obj):
                print("✓ Added .cicada/ to .gitignore")

        save_index(index, output_path_obj, create_dirs=True)

        # Compute and save hashes for all PROCESSED files for future incremental updates
        print("Computing file hashes for incremental updates...")
        # Only hash files that were actually processed
        processed_files = [
            str(f.relative_to(repo_path_obj)) for f in elixir_files[:files_processed]
        ]
        file_hashes = compute_hashes_for_files(processed_files, str(repo_path_obj))
        save_file_hashes(str(output_path_obj.parent), file_hashes)

        # Report completion status
        if self._interrupted:
            print(f"\n✓ Partial index saved!")
            print(
                f" Processed: {files_processed}/{total_files} files ({files_processed/total_files*100:.1f}%)"
            )
            print(f" Modules: {len(all_modules)}")
            print(f" Functions: {total_functions}")
            print(
                f"\n💡 Run the command again to continue indexing remaining {total_files - files_processed} file(s)"
            )
        else:
            print(f"\nIndexing complete!")
            print(f" Modules: {len(all_modules)}")
            print(f" Functions: {total_functions}")

        # Report keyword extraction failures if any
        if extract_keywords and keyword_extraction_failures > 0:
            print(
                f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
            )
            print(" Some documentation may not be indexed for keyword search.")

        print(f"\nIndex saved to: {output_path_obj}")
        print(f"Hashes saved to: {output_path_obj.parent}/hashes.json")

        return index

    def incremental_index_repository(
        self,
        repo_path: str,
        output_path: str = ".cicada/index.json",
        extract_keywords: bool = False,
        spacy_model: str = "small",
        force_full: bool = False,
    ):
        """
        Incrementally index an Elixir repository using file hashing.

        Only processes files that have been added, modified, or deleted since
        the last indexing run. Falls back to full indexing if no previous
        index or hashes exist.

        Args:
            repo_path: Path to the Elixir repository root
            output_path: Path where the index JSON file will be saved
            extract_keywords: If True, extract keywords from documentation using NLP
            spacy_model: Size of spaCy model to use for keyword extraction
            force_full: If True, ignore existing hashes and do full reindex

        Returns:
            Dictionary containing the index data
        """
        repo_path_obj = Path(repo_path).resolve()
        output_path_obj = Path(output_path)
        cicada_dir = output_path_obj.parent

        if not repo_path_obj.exists():
            raise ValueError(f"Repository path does not exist: {repo_path_obj}")

        # Load existing index and hashes
        existing_index = load_index(output_path_obj) if not force_full else None
        existing_hashes = load_file_hashes(str(cicada_dir)) if not force_full else {}

        # Validate existing index structure if loaded
        if existing_index:
            is_valid, error = validate_index_structure(existing_index)
            if not is_valid:
                print(
                    f"Warning: Existing index is corrupted ({error}). Performing full reindex..."
                )
                existing_index = None

        # If no existing data, do full index
        if not existing_index or not existing_hashes:
            print("No existing index or hashes found. Performing full index...")
            return self.index_repository(
                str(repo_path_obj), str(output_path_obj), extract_keywords, spacy_model
            )

        print(f"Performing incremental index of: {repo_path_obj}")

        # Set up signal handlers for graceful interruption
        signal.signal(signal.SIGINT, self._handle_interrupt)
        signal.signal(signal.SIGTERM, self._handle_interrupt)
        self._interrupted = False

        # Find all current Elixir files
        elixir_files = self._find_elixir_files(repo_path_obj)
        # Convert to relative paths
        relative_files = [str(f.relative_to(repo_path_obj)) for f in elixir_files]

        # Detect file changes
        print("Detecting file changes...")
        new_files, modified_files, deleted_files = detect_file_changes(
            relative_files, existing_hashes, str(repo_path_obj)
        )

        # Calculate what needs to be processed
        files_to_process = new_files + modified_files
        total_changes = len(new_files) + len(modified_files) + len(deleted_files)

        if total_changes == 0:
            print("No changes detected. Index is up to date.")
            return existing_index

        print(f"Changes detected:")
        print(f" New files: {len(new_files)}")
        print(f" Modified files: {len(modified_files)}")
        print(f" Deleted files: {len(deleted_files)}")

        if files_to_process:
            print(f"\nProcessing {len(files_to_process)} changed file(s)...")

        # Initialize keyword extractor if requested
        keyword_extractor = None
        if extract_keywords:
            try:
                from cicada.lightweight_keyword_extractor import (
                    LightweightKeywordExtractor,
                )

                keyword_extractor = LightweightKeywordExtractor(
                    verbose=True, model_size=spacy_model
                )
            except Exception as e:
                print(f"Warning: Could not initialize keyword extractor: {e}")
                print("Continuing without keyword extraction...")
                extract_keywords = False

        # Process changed files
        all_modules = {}
        total_functions = 0
        files_processed = 0
        keyword_extraction_failures = 0

        for relative_file in files_to_process:
            file_path = repo_path_obj / relative_file
            try:
                modules = self.parser.parse_file(str(file_path))

                if modules:
                    for module_data in modules:
                        module_name = module_data["module"]
                        functions = module_data["functions"]

                        # Calculate stats
                        public_count = sum(1 for f in functions if f["type"] == "def")
                        private_count = sum(1 for f in functions if f["type"] == "defp")

                        # Extract keywords if enabled
                        module_keywords = None
                        if keyword_extractor and module_data.get("moduledoc"):
                            try:
                                module_keywords = (
                                    keyword_extractor.extract_keywords_simple(
                                        module_data["moduledoc"], top_n=10
                                    )
                                )
                            except Exception as e:
                                keyword_extraction_failures += 1

                        # Extract keywords from function docs
                        if keyword_extractor:
                            for func in functions:
                                if func.get("doc"):
                                    try:
                                        func_name = func.get("name", "")
                                        text_for_keywords = f"{func_name} {func['doc']}"
                                        func_keywords = (
                                            keyword_extractor.extract_keywords_simple(
                                                text_for_keywords, top_n=10
                                            )
                                        )
                                        if func_keywords:
                                            func["keywords"] = func_keywords
                                    except Exception as e:
                                        keyword_extraction_failures += 1

                        # Store module info
                        module_info = {
                            "file": relative_file,
                            "line": module_data["line"],
                            "moduledoc": module_data.get("moduledoc"),
                            "functions": functions,
                            "total_functions": len(functions),
                            "public_functions": public_count,
                            "private_functions": private_count,
                            "aliases": module_data.get("aliases", {}),
                            "imports": module_data.get("imports", []),
                            "requires": module_data.get("requires", []),
                            "uses": module_data.get("uses", []),
                            "behaviours": module_data.get("behaviours", []),
                            "value_mentions": module_data.get("value_mentions", []),
                            "calls": module_data.get("calls", []),
                        }

                        # Add module keywords if extracted
                        if module_keywords:
                            module_info["keywords"] = module_keywords

                        all_modules[module_name] = module_info
                        total_functions += len(functions)

                files_processed += 1

                # Check for interruption after each file
                if self._check_and_report_interruption(
                    files_processed, len(files_to_process)
                ):
                    break

            except Exception as e:
                print(f" Skipping {file_path}: {e}")
                # Check for interruption even after error
                if self._check_and_report_interruption(
                    files_processed, len(files_to_process)
                ):
                    break
                continue

        # Build index for changed files
        new_index = {
            "modules": all_modules,
            "metadata": {
                "indexed_at": datetime.now().isoformat(),
                "repo_path": str(repo_path_obj),
            },
        }

        # Merge with existing index
        print("\nMerging with existing index...")
        merged_index = merge_indexes_incremental(
            existing_index, new_index, deleted_files
        )

        # Update hashes for all current files
        print("Updating file hashes...")
        updated_hashes = dict(existing_hashes)

        # Compute hashes only for files that were actually processed
        actually_processed = files_to_process[:files_processed]
        new_hashes = compute_hashes_for_files(actually_processed, str(repo_path_obj))
        updated_hashes.update(new_hashes)

        # Remove hashes for deleted files
        for deleted_file in deleted_files:
            updated_hashes.pop(deleted_file, None)

        # Save index and hashes
        save_index(merged_index, output_path_obj, create_dirs=True)
        save_file_hashes(str(cicada_dir), updated_hashes)

        # Report completion status
        if self._interrupted:
            remaining = len(files_to_process) - files_processed
            print(f"\n✓ Partial index saved!")
            print(
                f" Processed: {files_processed}/{len(files_to_process)} changed file(s)"
            )
            print(f" Total modules: {merged_index['metadata']['total_modules']}")
            print(f" Total functions: {merged_index['metadata']['total_functions']}")
            print(f" Files deleted: {len(deleted_files)}")
            print(
                f"\n💡 Run the command again to continue indexing remaining {remaining} changed file(s)"
            )
        else:
            print(f"\nIncremental indexing complete!")
            print(f" Total modules: {merged_index['metadata']['total_modules']}")
            print(f" Total functions: {merged_index['metadata']['total_functions']}")
            print(f" Files processed: {files_processed}")
            print(f" Files deleted: {len(deleted_files)}")

        # Report keyword extraction failures if any
        if extract_keywords and keyword_extraction_failures > 0:
            print(
                f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
            )

        print(f"\nIndex saved to: {output_path_obj}")
        print(f"Hashes saved to: {cicada_dir}/hashes.json")

        return merged_index

    def _find_elixir_files(self, repo_path: Path) -> list:
        """Find all Elixir source files in the repository."""
        elixir_files = []

        for root, dirs, files in os.walk(repo_path):
            # Remove excluded directories from the search
            dirs[:] = [d for d in dirs if d not in self.excluded_dirs]

            # Find .ex and .exs files
            for file in files:
                if file.endswith((".ex", ".exs")):
                    file_path = Path(root) / file
                    elixir_files.append(file_path)

        return sorted(elixir_files)


def main():
    """Main entry point for the indexer CLI."""
    from cicada.version_check import check_for_updates

    # Check for updates (non-blocking, fails silently)
    check_for_updates()

    parser = argparse.ArgumentParser(
        description="Index current Elixir repository to extract modules and functions"
    )
    _ = parser.add_argument(
        "repo",
        nargs="?",
        default=".",
        help="Path to the Elixir repository to index (default: current directory)",
    )
    _ = parser.add_argument(
        "--output",
        default=".cicada/index.json",
        help="Output path for the index file (default: .cicada/index.json)",
    )
    parser.add_argument(
        "--extract-keywords",
        action="store_true",
        help="Extract keywords from documentation using NLP (adds ~1-2s per 100 docs)",
    )
    parser.add_argument(
        "--spacy-model",
        choices=["small", "medium", "large"],
        default="small",
        help="Size of spaCy model to use for keyword extraction (default: small). "
        "Medium and large models provide better accuracy but are slower.",
    )
    parser.add_argument(
        "--full",
        action="store_true",
        help="Force full reindex, ignoring existing hashes (default: incremental)",
    )

    args = parser.parse_args()

    indexer = ElixirIndexer()

    # Use incremental indexing by default (unless --full flag is set)
    indexer.incremental_index_repository(
        args.repo,
        args.output,
        extract_keywords=args.extract_keywords,
        spacy_model=args.spacy_model,
        force_full=args.full,
    )


if __name__ == "__main__":
    main()
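For orientation, the indexer above can also be driven directly from Python rather than through the CLI. A minimal usage sketch, assuming cicada-mcp is installed and the working directory is an Elixir project root; the paths and flag values simply mirror the argparse defaults in main(), and nothing below ships in the wheel itself:

# Illustrative usage sketch (not part of the package).
from cicada.indexer import ElixirIndexer

indexer = ElixirIndexer(verbose=True)

# Incremental by default: hashes stored under .cicada/ decide which files
# to re-parse; falls back to a full index when no prior index/hashes exist.
index = indexer.incremental_index_repository(
    repo_path=".",
    output_path=".cicada/index.json",
    extract_keywords=False,  # True enables spaCy keyword extraction
    spacy_model="small",     # "small" | "medium" | "large"
    force_full=False,        # True mirrors the --full CLI flag
)
print(index["metadata"]["total_modules"], "modules indexed")

The same run is available from the shell as `python -m cicada.indexer . --extract-keywords`, since the module guards main() behind `__name__ == "__main__"`.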