cicada-mcp 0.1.5-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/ascii_art.py +60 -0
- cicada/clean.py +195 -60
- cicada/cli.py +757 -0
- cicada/colors.py +27 -0
- cicada/command_logger.py +14 -16
- cicada/dead_code_analyzer.py +12 -19
- cicada/extractors/__init__.py +6 -6
- cicada/extractors/base.py +3 -3
- cicada/extractors/call.py +11 -15
- cicada/extractors/dependency.py +39 -51
- cicada/extractors/doc.py +8 -9
- cicada/extractors/function.py +12 -24
- cicada/extractors/module.py +11 -15
- cicada/extractors/spec.py +8 -12
- cicada/find_dead_code.py +15 -39
- cicada/formatter.py +37 -91
- cicada/git_helper.py +22 -34
- cicada/indexer.py +165 -132
- cicada/interactive_setup.py +490 -0
- cicada/keybert_extractor.py +286 -0
- cicada/keyword_search.py +22 -30
- cicada/keyword_test.py +127 -0
- cicada/lightweight_keyword_extractor.py +5 -13
- cicada/mcp_entry.py +683 -0
- cicada/mcp_server.py +110 -232
- cicada/parser.py +9 -9
- cicada/pr_finder.py +15 -19
- cicada/pr_indexer/__init__.py +3 -3
- cicada/pr_indexer/cli.py +4 -9
- cicada/pr_indexer/github_api_client.py +22 -37
- cicada/pr_indexer/indexer.py +17 -29
- cicada/pr_indexer/line_mapper.py +8 -12
- cicada/pr_indexer/pr_index_builder.py +22 -34
- cicada/setup.py +198 -89
- cicada/utils/__init__.py +9 -9
- cicada/utils/call_site_formatter.py +4 -6
- cicada/utils/function_grouper.py +4 -4
- cicada/utils/hash_utils.py +12 -15
- cicada/utils/index_utils.py +15 -15
- cicada/utils/path_utils.py +24 -29
- cicada/utils/signature_builder.py +3 -3
- cicada/utils/subprocess_runner.py +17 -19
- cicada/utils/text_utils.py +1 -2
- cicada/version_check.py +2 -5
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/METADATA +144 -55
- cicada_mcp-0.2.0.dist-info/RECORD +53 -0
- cicada_mcp-0.2.0.dist-info/entry_points.txt +4 -0
- cicada/install.py +0 -741
- cicada_mcp-0.1.5.dist-info/RECORD +0 -47
- cicada_mcp-0.1.5.dist-info/entry_points.txt +0 -9
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/top_level.txt +0 -0
cicada/indexer.py
CHANGED
```diff
@@ -10,19 +10,56 @@ import signal
 import sys
 from datetime import datetime
 from pathlib import Path
+
 from cicada.parser import ElixirParser
 from cicada.utils import (
-    save_index,
     load_index,
     merge_indexes_incremental,
+    save_index,
     validate_index_structure,
 )
 from cicada.utils.hash_utils import (
+    compute_hashes_for_files,
+    detect_file_changes,
     load_file_hashes,
     save_file_hashes,
-    detect_file_changes,
-    compute_hashes_for_files,
 )
+from cicada.utils.storage import get_config_path
+
+
+def read_keyword_extraction_config(repo_path: Path) -> tuple[str, str]:
+    """
+    Read keyword extraction configuration from config.yaml.
+
+    Args:
+        repo_path: Path to the repository
+
+    Returns:
+        tuple[str, str]: (method, tier) where method is 'lemminflect' or 'bert',
+        and tier is 'fast', 'regular', or 'max'.
+        Returns ('lemminflect', 'regular') as default if config not found.
+    """
+    try:
+        import yaml
+
+        config_path = get_config_path(repo_path)
+        if not config_path.exists():
+            # Default to lemminflect if config doesn't exist
+            return ("lemminflect", "regular")
+
+        with open(config_path) as f:
+            config = yaml.safe_load(f)
+
+        if config and "keyword_extraction" in config:
+            method = config["keyword_extraction"].get("method", "lemminflect")
+            tier = config["keyword_extraction"].get("tier", "regular")
+            return (method, tier)
+
+        # Default to lemminflect if keyword_extraction section not found
+        return ("lemminflect", "regular")
+    except Exception:
+        # If anything goes wrong, default to lemminflect
+        return ("lemminflect", "regular")
 
 
 class ElixirIndexer:
```
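The new module-level helper replaces the old `--spacy-model` CLI plumbing: extractor choice now lives in `config.yaml`. A minimal sketch of the section it parses, fed through the same `yaml.safe_load` path the helper uses (the real file is resolved via `get_config_path`; the inline string here is illustrative, with keys and allowed values taken from the code above):

```python
import yaml

# Hypothetical config.yaml contents; keys and defaults mirror
# read_keyword_extraction_config above.
sample = """
keyword_extraction:
  method: bert     # 'lemminflect' (default) or 'bert'
  tier: regular    # 'fast', 'regular', or 'max'
"""

config = yaml.safe_load(sample)
section = config.get("keyword_extraction", {})
print(section.get("method", "lemminflect"), section.get("tier", "regular"))
# -> bert regular
```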
```diff
@@ -45,20 +82,16 @@ class ElixirIndexer:
         }
         self._interrupted = False
 
-    def _handle_interrupt(self, signum, frame):
+    def _handle_interrupt(self, _signum, _frame):
         """Handle interrupt signals (Ctrl-C, SIGTERM) gracefully."""
-        print(
-            "\n\n⚠️ Interrupt received. Finishing current file and saving progress..."
-        )
+        print("\n\n⚠️ Interrupt received. Finishing current file and saving progress...")
         print(" Press Ctrl-C again to force quit (may lose progress)\n")
         self._interrupted = True
         # Restore default handler so second Ctrl-C will kill immediately
         signal.signal(signal.SIGINT, signal.SIG_DFL)
         signal.signal(signal.SIGTERM, signal.SIG_DFL)
 
-    def _check_and_report_interruption(
-        self, files_processed: int, total_files: int
-    ) -> bool:
+    def _check_and_report_interruption(self, files_processed: int, total_files: int) -> bool:
         """
         Check if interrupted and report status.
 
```
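The handler implements a two-stage interrupt: the first Ctrl-C only sets a flag (checked between files), then reinstalls the default handlers so a second Ctrl-C kills the process immediately. A standalone sketch of the same pattern, outside cicada (names are illustrative):

```python
import signal

interrupted = False

def _handle_interrupt(_signum, _frame):
    global interrupted
    interrupted = True  # request a graceful stop; the work loop checks this flag
    # Restore defaults so a second Ctrl-C force-quits immediately.
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    signal.signal(signal.SIGTERM, signal.SIG_DFL)

signal.signal(signal.SIGINT, _handle_interrupt)
signal.signal(signal.SIGTERM, _handle_interrupt)

for unit_of_work in range(10**7):
    if interrupted:
        break  # finish the current unit, then save partial progress
```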
```diff
@@ -70,9 +103,7 @@
             True if interrupted, False otherwise
         """
         if self._interrupted:
-            print(
-                f"\n⚠️ Interrupted after processing {files_processed}/{total_files} files"
-            )
+            print(f"\n⚠️ Interrupted after processing {files_processed}/{total_files} files")
             print(" Saving partial progress...")
             return True
         return False
@@ -80,9 +111,8 @@
     def index_repository(
         self,
         repo_path: str,
-        output_path: str
+        output_path: str,
         extract_keywords: bool = False,
-        spacy_model: str = "small",
     ):
         """
         Index an Elixir repository.
@@ -91,8 +121,6 @@
             repo_path: Path to the Elixir repository root
             output_path: Path where the index JSON file will be saved
             extract_keywords: If True, extract keywords from documentation using NLP
-            spacy_model: Size of spaCy model to use for keyword extraction
-                ('small', 'medium', or 'large'). Default is 'small'.
 
         Returns:
             Dictionary containing the index data
@@ -102,7 +130,12 @@
         if not repo_path_obj.exists():
             raise ValueError(f"Repository path does not exist: {repo_path_obj}")
 
-        print(f"Indexing repository: {repo_path_obj}")
+        if self.verbose:
+            print(f"Indexing repository: {repo_path_obj}")
+            if extract_keywords:
+                # Read and display keyword extraction config
+                method, tier = read_keyword_extraction_config(repo_path_obj)
+                print(f"Keyword extraction: {method.upper()} ({tier})")
 
         # Set up signal handlers for graceful interruption
         signal.signal(signal.SIGINT, self._handle_interrupt)
@@ -113,25 +146,33 @@
         keyword_extractor = None
         if extract_keywords:
             try:
-                from cicada.lightweight_keyword_extractor import (
-                    LightweightKeywordExtractor,
-                )
+                # Read keyword extraction config from config.yaml
+                method, tier = read_keyword_extraction_config(repo_path_obj)
 
-                keyword_extractor = LightweightKeywordExtractor(
-                    model_size=spacy_model, verbose=True
-                )
+                if method == "bert":
+                    # Initialize KeyBERT extractor
+                    from cicada.keybert_extractor import KeyBERTExtractor
+
+                    keyword_extractor = KeyBERTExtractor(model_tier=tier, verbose=self.verbose)
+                else:
+                    # Initialize lemminflect extractor (default)
+                    from cicada.lightweight_keyword_extractor import (
+                        LightweightKeywordExtractor,
+                    )
+
+                    keyword_extractor = LightweightKeywordExtractor(verbose=self.verbose)
             except Exception as e:
-                print(f"Warning: Could not initialize keyword extractor: {e}")
-                print("Continuing without keyword extraction...")
+                if self.verbose:
+                    print(f"Warning: Could not initialize keyword extractor: {e}")
+                    print("Continuing without keyword extraction...")
                 extract_keywords = False
 
         # Find all Elixir files
         elixir_files = self._find_elixir_files(repo_path_obj)
         total_files = len(elixir_files)
 
-        print(f"Found {total_files} Elixir files")
-        if extract_keywords:
-            print("Keyword extraction enabled")
+        if self.verbose:
+            print(f"Found {total_files} Elixir files")
 
         # Parse all files
         all_modules = {}
@@ -156,10 +197,8 @@
                 module_keywords = None
                 if keyword_extractor and module_data.get("moduledoc"):
                     try:
-                        module_keywords = (
-                            keyword_extractor.extract_keywords_simple(
-                                module_data["moduledoc"], top_n=10
-                            )
+                        module_keywords = keyword_extractor.extract_keywords_simple(
+                            module_data["moduledoc"], top_n=10
                         )
                     except Exception as e:
                         keyword_extraction_failures += 1
@@ -178,10 +217,8 @@
                             # Include function name in text for keyword extraction
                             # This ensures the function name identifier gets 10x weight
                             text_for_keywords = f"{func_name} {func['doc']}"
-                            func_keywords = (
-                                keyword_extractor.extract_keywords_simple(
-                                    text_for_keywords, top_n=10
-                                )
+                            func_keywords = keyword_extractor.extract_keywords_simple(
+                                text_for_keywords, top_n=10
                             )
                             if func_keywords:
                                 func["keywords"] = func_keywords
@@ -222,7 +259,7 @@
                 files_processed += 1
 
                 # Progress reporting
-                if files_processed % self.PROGRESS_REPORT_INTERVAL == 0:
+                if self.verbose and files_processed % self.PROGRESS_REPORT_INTERVAL == 0:
                     print(f" Processed {files_processed}/{total_files} files...")
 
                 # Check for interruption after each file
@@ -230,7 +267,8 @@
                     break
 
             except Exception as e:
-                print(f" Skipping {file_path}: {e}")
+                if self.verbose:
+                    print(f" Skipping {file_path}: {e}")
                 # Check for interruption even after error
                 if self._check_and_report_interruption(files_processed, total_files):
                     break
@@ -257,54 +295,59 @@
         if is_first_run:
             from cicada.utils.path_utils import ensure_gitignore_has_cicada
 
-            if ensure_gitignore_has_cicada(repo_path_obj):
+            if ensure_gitignore_has_cicada(repo_path_obj) and self.verbose:
                 print("✓ Added .cicada/ to .gitignore")
 
         save_index(index, output_path_obj, create_dirs=True)
 
         # Compute and save hashes for all PROCESSED files for future incremental updates
-        print("Computing file hashes for incremental updates...")
+        if self.verbose:
+            print("Computing file hashes for incremental updates...")
         # Only hash files that were actually processed
         processed_files = [
            str(f.relative_to(repo_path_obj)) for f in elixir_files[:files_processed]
         ]
         file_hashes = compute_hashes_for_files(processed_files, str(repo_path_obj))
-        save_file_hashes(str(cicada_dir), file_hashes)
+        # Save hashes to centralized storage directory
+        from cicada.utils import get_storage_dir
+
+        storage_dir = get_storage_dir(repo_path_obj)
+        save_file_hashes(str(storage_dir), file_hashes)
 
         # Report completion status
-        if self._interrupted:
-            print("\n✓ Partial index saved!")
-            print(
-                f" Processed: {files_processed}/{total_files} files ({files_processed/total_files*100:.1f}%)"
-            )
-            print(f" Modules: {len(all_modules)}")
-            print(f" Functions: {total_functions}")
-            print(
-                f"\n💡 Run the command again to continue indexing remaining {total_files - files_processed} file(s)"
-            )
-        else:
-            print("\nIndexing complete!")
-            print(f" Modules: {len(all_modules)}")
-            print(f" Functions: {total_functions}")
+        if self.verbose:
+            if self._interrupted:
+                print("\n✓ Partial index saved!")
+                print(
+                    f" Processed: {files_processed}/{total_files} files ({files_processed/total_files*100:.1f}%)"
+                )
+                print(f" Modules: {len(all_modules)}")
+                print(f" Functions: {total_functions}")
+                print(
+                    f"\n💡 Run the command again to continue indexing remaining {total_files - files_processed} file(s)"
+                )
+            else:
+                print("\nIndexing complete!")
+                print(f" Modules: {len(all_modules)}")
+                print(f" Functions: {total_functions}")
 
-        # Report keyword extraction failures if any
-        if extract_keywords and keyword_extraction_failures > 0:
-            print(
-                f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
-            )
-            print(" Some documentation may not be indexed for keyword search.")
+            # Report keyword extraction failures if any
+            if extract_keywords and keyword_extraction_failures > 0:
+                print(
+                    f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
+                )
+                print(" Some documentation may not be indexed for keyword search.")
 
-        print(f"\nIndex saved to: {output_path_obj}")
-        print(f"Hashes saved to: {cicada_dir}/hashes.json")
+            print(f"\nIndex saved to: {output_path_obj}")
+            print(f"Hashes saved to: {output_path_obj.parent}/hashes.json")
 
         return index
 
     def incremental_index_repository(
         self,
         repo_path: str,
-        output_path: str
+        output_path: str,
         extract_keywords: bool = False,
-        spacy_model: str = "small",
         force_full: bool = False,
     ):
         """
@@ -318,7 +361,6 @@
             repo_path: Path to the Elixir repository root
             output_path: Path where the index JSON file will be saved
             extract_keywords: If True, extract keywords from documentation using NLP
-            spacy_model: Size of spaCy model to use for keyword extraction
             force_full: If True, ignore existing hashes and do full reindex
 
         Returns:
@@ -326,32 +368,39 @@
         """
         repo_path_obj = Path(repo_path).resolve()
         output_path_obj = Path(output_path)
-        cicada_dir = output_path_obj.parent
+        # Use centralized storage directory for hashes
+        from cicada.utils import get_storage_dir
+
+        storage_dir = get_storage_dir(repo_path_obj)
 
         if not repo_path_obj.exists():
             raise ValueError(f"Repository path does not exist: {repo_path_obj}")
 
         # Load existing index and hashes
         existing_index = load_index(output_path_obj) if not force_full else None
-        existing_hashes = load_file_hashes(str(cicada_dir)) if not force_full else {}
+        existing_hashes = load_file_hashes(str(storage_dir)) if not force_full else {}
 
         # Validate existing index structure if loaded
         if existing_index:
             is_valid, error = validate_index_structure(existing_index)
             if not is_valid:
-                print(
-                    f"Warning: Existing index is corrupted ({error}). Performing full reindex..."
-                )
+                if self.verbose:
+                    print(
+                        f"Warning: Existing index is corrupted ({error}). Performing full reindex..."
+                    )
                 existing_index = None
 
         # If no existing data, do full index
         if not existing_index or not existing_hashes:
-            print("No existing index or hashes found. Performing full index...")
-            return self.index_repository(
-                str(repo_path_obj), str(output_path_obj), extract_keywords, spacy_model
-            )
+            if self.verbose:
+                print("No existing index or hashes found. Performing full index...")
+            return self.index_repository(str(repo_path_obj), str(output_path_obj), extract_keywords)
 
-        print(f"Performing incremental index of: {repo_path_obj}")
+        if self.verbose:
+            # Read and display keyword extraction config
+            method, tier = read_keyword_extraction_config(repo_path_obj)
+            print(f"Performing incremental index of: {repo_path_obj}")
+            print(f"Keyword extraction: {method.upper()} ({tier})")
 
         # Set up signal handlers for graceful interruption
         signal.signal(signal.SIGINT, self._handle_interrupt)
@@ -364,7 +413,8 @@ class ElixirIndexer:
         relative_files = [str(f.relative_to(repo_path_obj)) for f in elixir_files]
 
         # Detect file changes
-        print("Detecting file changes...")
+        if self.verbose:
+            print("Detecting file changes...")
         new_files, modified_files, deleted_files = detect_file_changes(
             relative_files, existing_hashes, str(repo_path_obj)
         )
```
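Incremental indexing hinges on `detect_file_changes`, which compares current content hashes against the hashes saved on the previous run and yields `(new, modified, deleted)`. A minimal re-implementation of that interface, assuming SHA-256 over file bytes (cicada's actual hash choice and storage format live in `cicada.utils.hash_utils` and may differ):

```python
import hashlib
from pathlib import Path

def _file_hash(path: Path) -> str:
    return hashlib.sha256(path.read_bytes()).hexdigest()

def detect_changes(
    current_files: list[str], old_hashes: dict[str, str], repo_root: str
) -> tuple[list[str], list[str], list[str]]:
    new, modified = [], []
    for rel in current_files:
        digest = _file_hash(Path(repo_root) / rel)
        if rel not in old_hashes:
            new.append(rel)          # file appeared since the last run
        elif old_hashes[rel] != digest:
            modified.append(rel)     # content changed
    deleted = sorted(set(old_hashes) - set(current_files))  # file removed
    return new, modified, deleted
```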
```diff
@@ -377,10 +427,11 @@
             print("No changes detected. Index is up to date.")
             return existing_index
 
-        print("Changes detected:")
-        print(f" New files: {len(new_files)}")
-        print(f" Modified files: {len(modified_files)}")
-        print(f" Deleted files: {len(deleted_files)}")
+        if self.verbose:
+            print("Changes detected:")
+            print(f" New files: {len(new_files)}")
+            print(f" Modified files: {len(modified_files)}")
+            print(f" Deleted files: {len(deleted_files)}")
 
         if files_to_process:
             print(f"\nProcessing {len(files_to_process)} changed file(s)...")
@@ -389,13 +440,21 @@
             keyword_extractor = None
             if extract_keywords:
                 try:
-                    from cicada.lightweight_keyword_extractor import (
-                        LightweightKeywordExtractor,
-                    )
+                    # Read keyword extraction config from config.yaml
+                    method, tier = read_keyword_extraction_config(repo_path_obj)
 
-                    keyword_extractor = LightweightKeywordExtractor(
-                        model_size=spacy_model, verbose=True
-                    )
+                    if method == "bert":
+                        # Initialize KeyBERT extractor
+                        from cicada.keybert_extractor import KeyBERTExtractor
+
+                        keyword_extractor = KeyBERTExtractor(model_tier=tier, verbose=self.verbose)
+                    else:
+                        # Initialize lemminflect extractor (default)
+                        from cicada.lightweight_keyword_extractor import (
+                            LightweightKeywordExtractor,
+                        )
+
+                        keyword_extractor = LightweightKeywordExtractor(verbose=self.verbose)
                 except Exception as e:
                     print(f"Warning: Could not initialize keyword extractor: {e}")
                     print("Continuing without keyword extraction...")
@@ -425,12 +484,10 @@
                     module_keywords = None
                     if keyword_extractor and module_data.get("moduledoc"):
                         try:
-                            module_keywords = (
-                                keyword_extractor.extract_keywords_simple(
-                                    module_data["moduledoc"], top_n=10
-                                )
+                            module_keywords = keyword_extractor.extract_keywords_simple(
+                                module_data["moduledoc"], top_n=10
                             )
-                        except Exception as e:
+                        except Exception:
                             keyword_extraction_failures += 1
 
                     # Extract keywords from function docs
@@ -440,14 +497,12 @@
                         try:
                             func_name = func.get("name", "")
                             text_for_keywords = f"{func_name} {func['doc']}"
-                            func_keywords = (
-                                keyword_extractor.extract_keywords_simple(
-                                    text_for_keywords, top_n=10
-                                )
+                            func_keywords = keyword_extractor.extract_keywords_simple(
+                                text_for_keywords, top_n=10
                             )
                             if func_keywords:
                                 func["keywords"] = func_keywords
-                        except Exception as e:
+                        except Exception:
                             keyword_extraction_failures += 1
 
                     # Store module info
@@ -478,17 +533,13 @@
                     files_processed += 1
 
                     # Check for interruption after each file
-                    if self._check_and_report_interruption(
-                        files_processed, len(files_to_process)
-                    ):
+                    if self._check_and_report_interruption(files_processed, len(files_to_process)):
                         break
 
                 except Exception as e:
                     print(f" Skipping {file_path}: {e}")
                     # Check for interruption even after error
-                    if self._check_and_report_interruption(
-                        files_processed, len(files_to_process)
-                    ):
+                    if self._check_and_report_interruption(files_processed, len(files_to_process)):
                         break
                     continue
 
@@ -502,13 +553,13 @@
         }
 
         # Merge with existing index
-        print("\nMerging with existing index...")
-        merged_index = merge_indexes_incremental(
-            existing_index, new_index, deleted_files
-        )
+        if self.verbose:
+            print("\nMerging with existing index...")
+        merged_index = merge_indexes_incremental(existing_index, new_index, deleted_files)
 
         # Update hashes for all current files
-        print("Updating file hashes...")
+        if self.verbose:
+            print("Updating file hashes...")
         updated_hashes = dict(existing_hashes)
 
         # Compute hashes only for files that were actually processed
@@ -522,15 +573,13 @@
 
         # Save index and hashes
         save_index(merged_index, output_path_obj, create_dirs=True)
-        save_file_hashes(str(cicada_dir), updated_hashes)
+        save_file_hashes(str(storage_dir), updated_hashes)
 
         # Report completion status
         if self._interrupted:
             remaining = len(files_to_process) - files_processed
-            print(f"\n✓ Partial index saved!")
-            print(
-                f" Processed: {files_processed}/{len(files_to_process)} changed file(s)"
-            )
+            print("\n✓ Partial index saved!")
+            print(f" Processed: {files_processed}/{len(files_to_process)} changed file(s)")
             print(f" Total modules: {merged_index['metadata']['total_modules']}")
             print(f" Total functions: {merged_index['metadata']['total_functions']}")
             print(f" Files deleted: {len(deleted_files)}")
@@ -538,7 +587,7 @@
                 f"\n💡 Run the command again to continue indexing remaining {remaining} changed file(s)"
             )
         else:
-            print(f"\nIncremental indexing complete!")
+            print("\nIncremental indexing complete!")
             print(f" Total modules: {merged_index['metadata']['total_modules']}")
             print(f" Total functions: {merged_index['metadata']['total_functions']}")
             print(f" Files processed: {files_processed}")
@@ -550,9 +599,6 @@
                 f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
             )
 
-        print(f"\nIndex saved to: {output_path_obj}")
-        print(f"Hashes saved to: {cicada_dir}/hashes.json")
-
         return merged_index
 
     def _find_elixir_files(self, repo_path: Path) -> list:
@@ -593,18 +639,6 @@ def main():
         default=".cicada/index.json",
         help="Output path for the index file (default: .cicada/index.json)",
     )
-    parser.add_argument(
-        "--extract-keywords",
-        action="store_true",
-        help="Extract keywords from documentation using NLP (adds ~1-2s per 100 docs)",
-    )
-    parser.add_argument(
-        "--spacy-model",
-        choices=["small", "medium", "large"],
-        default="small",
-        help="Size of spaCy model to use for keyword extraction (default: small). "
-        "Medium and large models provide better accuracy but are slower.",
-    )
     parser.add_argument(
         "--full",
         action="store_true",
@@ -619,8 +653,7 @@ def main():
     indexer.incremental_index_repository(
         args.repo,
         args.output,
-        extract_keywords=args.extract_keywords,
-        spacy_model=args.spacy_model,
+        extract_keywords=True,
         force_full=args.full,
     )
 
```
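Net effect for callers: the `--extract-keywords` and `--spacy-model` flags are gone, keyword extraction is always on in `main()` and configured through `config.yaml`, and all progress output is gated on verbosity. A minimal driving sketch against the 0.2.0 signatures shown above (the `verbose` constructor argument and the paths are assumptions for illustration; the method signature matches the hunks):

```python
from cicada.indexer import ElixirIndexer

# Assumption: the constructor accepts a verbose flag; the methods above
# gate their console output on self.verbose.
indexer = ElixirIndexer(verbose=True)

# Incremental reindex: falls back to a full index when no prior hashes exist,
# selects lemminflect or KeyBERT from config.yaml, and saves hashes to the
# centralized storage directory (get_storage_dir).
index = indexer.incremental_index_repository(
    "path/to/elixir/repo",   # hypothetical repository root
    ".cicada/index.json",    # default output path from main()
    extract_keywords=True,
    force_full=False,
)
print(index["metadata"]["total_modules"])
```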