cicada-mcp 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/ascii_art.py +60 -0
- cicada/clean.py +195 -60
- cicada/cli.py +757 -0
- cicada/colors.py +27 -0
- cicada/command_logger.py +14 -16
- cicada/dead_code_analyzer.py +12 -19
- cicada/extractors/__init__.py +6 -6
- cicada/extractors/base.py +3 -3
- cicada/extractors/call.py +11 -15
- cicada/extractors/dependency.py +39 -51
- cicada/extractors/doc.py +8 -9
- cicada/extractors/function.py +12 -24
- cicada/extractors/module.py +11 -15
- cicada/extractors/spec.py +8 -12
- cicada/find_dead_code.py +15 -39
- cicada/formatter.py +37 -91
- cicada/git_helper.py +22 -34
- cicada/indexer.py +122 -107
- cicada/interactive_setup.py +490 -0
- cicada/keybert_extractor.py +286 -0
- cicada/keyword_search.py +22 -30
- cicada/keyword_test.py +127 -0
- cicada/lightweight_keyword_extractor.py +5 -13
- cicada/mcp_entry.py +683 -0
- cicada/mcp_server.py +103 -209
- cicada/parser.py +9 -9
- cicada/pr_finder.py +15 -19
- cicada/pr_indexer/__init__.py +3 -3
- cicada/pr_indexer/cli.py +4 -9
- cicada/pr_indexer/github_api_client.py +22 -37
- cicada/pr_indexer/indexer.py +17 -29
- cicada/pr_indexer/line_mapper.py +8 -12
- cicada/pr_indexer/pr_index_builder.py +22 -34
- cicada/setup.py +189 -87
- cicada/utils/__init__.py +9 -9
- cicada/utils/call_site_formatter.py +4 -6
- cicada/utils/function_grouper.py +4 -4
- cicada/utils/hash_utils.py +12 -15
- cicada/utils/index_utils.py +15 -15
- cicada/utils/path_utils.py +24 -29
- cicada/utils/signature_builder.py +3 -3
- cicada/utils/subprocess_runner.py +17 -19
- cicada/utils/text_utils.py +1 -2
- cicada/version_check.py +2 -5
- {cicada_mcp-0.1.7.dist-info → cicada_mcp-0.2.0.dist-info}/METADATA +144 -55
- cicada_mcp-0.2.0.dist-info/RECORD +53 -0
- cicada_mcp-0.2.0.dist-info/entry_points.txt +4 -0
- cicada/install.py +0 -741
- cicada_mcp-0.1.7.dist-info/RECORD +0 -47
- cicada_mcp-0.1.7.dist-info/entry_points.txt +0 -9
- {cicada_mcp-0.1.7.dist-info → cicada_mcp-0.2.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.1.7.dist-info → cicada_mcp-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.1.7.dist-info → cicada_mcp-0.2.0.dist-info}/top_level.txt +0 -0
cicada/git_helper.py
CHANGED

@@ -8,12 +8,12 @@ offering comprehensive commit history for files and functions.
 Author: Cursor(Auto)
 """
 
-import git
 import subprocess
 from datetime import datetime
-from typing import List, Dict, Optional
 from pathlib import Path
 
+import git
+
 
 class GitHelper:
     """Helper class for extracting git commit history"""
@@ -31,7 +31,7 @@ class GitHelper:
         self.repo = git.Repo(repo_path)
         self.repo_path = Path(repo_path)
 
-    def get_file_history(self, file_path: str, max_commits: int = 10) -> List[Dict]:
+    def get_file_history(self, file_path: str, max_commits: int = 10) -> list[dict]:
         """
         Get commit history for a specific file
 
@@ -53,9 +53,7 @@
 
         try:
             # Get commits that touched this file
-            for commit in self.repo.iter_commits(
-                paths=file_path, max_count=max_commits
-            ):
+            for commit in self.repo.iter_commits(paths=file_path, max_count=max_commits):
                 commits.append(
                     {
                         "sha": commit.hexsha[:8],  # Short SHA
@@ -78,7 +76,7 @@
         function_name: str,
         _line_number: int,
         max_commits: int = 5,
-    ) -> List[Dict]:
+    ) -> list[dict]:
         """
         Get commit history for a specific function using heuristics.
 
@@ -123,11 +121,11 @@
     def get_function_history_precise(
         self,
         file_path: str,
-        start_line: Optional[int] = None,
-        end_line: Optional[int] = None,
-        function_name: Optional[str] = None,
+        start_line: int | None = None,
+        end_line: int | None = None,
+        function_name: str | None = None,
         max_commits: int = 5,
-    ) -> List[Dict]:
+    ) -> list[dict]:
         """
         Get precise commit history for a function using git log -L.
 
@@ -158,14 +156,13 @@
         - Requires .gitattributes with "*.ex diff=elixir" for function tracking
         """
         commits = []
-        import subprocess
 
         # Determine tracking mode
         use_function_tracking = function_name is not None
         use_line_tracking = start_line is not None and end_line is not None
 
         if not use_function_tracking and not use_line_tracking:
-            print(
+            print("Error: Must provide either function_name or (start_line, end_line)")
             return []
 
         try:
@@ -239,10 +236,10 @@
     def get_function_evolution(
         self,
         file_path: str,
-        start_line: Optional[int] = None,
-        end_line: Optional[int] = None,
-        function_name: Optional[str] = None,
-    ) -> Optional[Dict]:
+        start_line: int | None = None,
+        end_line: int | None = None,
+        function_name: str | None = None,
+    ) -> dict | None:
         """
         Get evolution metadata for a function (creation, last modification, change count).
 
@@ -297,9 +294,7 @@
             if days_between > 0:
                 months = days_between / 30.0
                 modification_frequency = (
-                    total_modifications / months
-                    if months > 0
-                    else total_modifications
+                    total_modifications / months if months > 0 else total_modifications
                 )
         except Exception:
             # If date parsing fails, skip frequency calculation
@@ -330,9 +325,7 @@
             print(f"Error getting function evolution for {file_path}: {e}")
             return None
 
-    def get_function_history(
-        self, file_path: str, start_line: int, end_line: int
-    ) -> List[Dict]:
+    def get_function_history(self, file_path: str, start_line: int, end_line: int) -> list[dict]:
         """
         Get line-by-line authorship for a function using git blame.
 
@@ -357,7 +350,6 @@
         - lines: List of {number, content} for each line
         """
         blame_groups = []
-        import subprocess
 
         try:
             # Use git blame with line range
@@ -401,10 +393,8 @@
                 elif line.startswith("author-time "):
                     try:
                         timestamp = int(line[12:])
-                        current_commit["date"] = datetime.fromtimestamp(
-                            timestamp
-                        ).isoformat()
-                    except:
+                        current_commit["date"] = datetime.fromtimestamp(timestamp).isoformat()
+                    except (ValueError, OSError):
                         current_commit["date"] = line[12:]
                 # Actual code line (starts with tab)
                 elif line.startswith("\t"):
@@ -469,9 +459,7 @@
 
         except subprocess.CalledProcessError as e:
             error_msg = e.stderr if e.stderr else str(e)
-            print(
-                f"Warning: git blame failed for {file_path}:{start_line}-{end_line}: {error_msg}"
-            )
+            print(f"Warning: git blame failed for {file_path}:{start_line}-{end_line}: {error_msg}")
             return []
         except Exception as e:
             print(f"Error getting blame for {file_path}: {e}")
@@ -479,7 +467,7 @@
 
         return blame_groups
 
-    def get_recent_commits(self, max_count: int = 20) -> List[Dict]:
+    def get_recent_commits(self, max_count: int = 20) -> list[dict]:
         """
         Get recent commits in the repository
 
@@ -512,7 +500,7 @@
 
         return commits
 
-    def get_commit_details(self, commit_sha: str) -> Optional[Dict]:
+    def get_commit_details(self, commit_sha: str) -> dict | None:
         """
         Get detailed information about a specific commit
 
@@ -560,7 +548,7 @@
         print(f"Error getting commit {commit_sha}: {e}")
         return None
 
-    def search_commits(self, query: str, max_results: int = 10) -> List[Dict]:
+    def search_commits(self, query: str, max_results: int = 10) -> list[dict]:
        """
        Search commit messages for a query string
 
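A note on the annotation changes above: 0.2.0 drops the `typing.List` / `Dict` / `Optional` aliases in favor of builtin generics (PEP 585, Python 3.9+) and `|` unions (PEP 604, Python 3.10+). A minimal illustration of the equivalence (a standalone sketch, not code from the package):

    from typing import Dict, List, Optional

    # 0.1.7 style: typing aliases
    def get_file_history_old(path: str, max_commits: int = 10) -> List[Dict]: ...
    def get_commit_details_old(sha: str) -> Optional[Dict]: ...

    # 0.2.0 style: builtin generics and union syntax
    def get_file_history_new(path: str, max_commits: int = 10) -> list[dict]: ...
    def get_commit_details_new(sha: str) -> dict | None: ...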
cicada/indexer.py
CHANGED

@@ -10,19 +10,56 @@ import signal
 import sys
 from datetime import datetime
 from pathlib import Path
+
 from cicada.parser import ElixirParser
 from cicada.utils import (
-    save_index,
     load_index,
     merge_indexes_incremental,
+    save_index,
     validate_index_structure,
 )
 from cicada.utils.hash_utils import (
+    compute_hashes_for_files,
+    detect_file_changes,
     load_file_hashes,
     save_file_hashes,
-    detect_file_changes,
-    compute_hashes_for_files,
 )
+from cicada.utils.storage import get_config_path
+
+
+def read_keyword_extraction_config(repo_path: Path) -> tuple[str, str]:
+    """
+    Read keyword extraction configuration from config.yaml.
+
+    Args:
+        repo_path: Path to the repository
+
+    Returns:
+        tuple[str, str]: (method, tier) where method is 'lemminflect' or 'bert',
+            and tier is 'fast', 'regular', or 'max'.
+            Returns ('lemminflect', 'regular') as default if config not found.
+    """
+    try:
+        import yaml
+
+        config_path = get_config_path(repo_path)
+        if not config_path.exists():
+            # Default to lemminflect if config doesn't exist
+            return ("lemminflect", "regular")
+
+        with open(config_path) as f:
+            config = yaml.safe_load(f)
+
+        if config and "keyword_extraction" in config:
+            method = config["keyword_extraction"].get("method", "lemminflect")
+            tier = config["keyword_extraction"].get("tier", "regular")
+            return (method, tier)
+
+        # Default to lemminflect if keyword_extraction section not found
+        return ("lemminflect", "regular")
+    except Exception:
+        # If anything goes wrong, default to lemminflect
+        return ("lemminflect", "regular")
 
 
 class ElixirIndexer:
@@ -45,20 +82,16 @@ class ElixirIndexer:
         }
         self._interrupted = False
 
-    def _handle_interrupt(self, signum, frame):
+    def _handle_interrupt(self, _signum, _frame):
         """Handle interrupt signals (Ctrl-C, SIGTERM) gracefully."""
-        print(
-            "\n\n⚠️ Interrupt received. Finishing current file and saving progress..."
-        )
+        print("\n\n⚠️ Interrupt received. Finishing current file and saving progress...")
         print(" Press Ctrl-C again to force quit (may lose progress)\n")
         self._interrupted = True
         # Restore default handler so second Ctrl-C will kill immediately
         signal.signal(signal.SIGINT, signal.SIG_DFL)
         signal.signal(signal.SIGTERM, signal.SIG_DFL)
 
-    def _check_and_report_interruption(
-        self, files_processed: int, total_files: int
-    ) -> bool:
+    def _check_and_report_interruption(self, files_processed: int, total_files: int) -> bool:
         """
         Check if interrupted and report status.
 
@@ -70,9 +103,7 @@
             True if interrupted, False otherwise
         """
         if self._interrupted:
-            print(
-                f"\n⚠️ Interrupted after processing {files_processed}/{total_files} files"
-            )
+            print(f"\n⚠️ Interrupted after processing {files_processed}/{total_files} files")
             print(" Saving partial progress...")
             return True
         return False
@@ -80,9 +111,8 @@
     def index_repository(
         self,
         repo_path: str,
-        output_path: str
+        output_path: str,
         extract_keywords: bool = False,
-        spacy_model: str = "small",
     ):
         """
         Index an Elixir repository.
@@ -91,8 +121,6 @@
             repo_path: Path to the Elixir repository root
             output_path: Path where the index JSON file will be saved
             extract_keywords: If True, extract keywords from documentation using NLP
-            spacy_model: Size of spaCy model to use for keyword extraction
-                ('small', 'medium', or 'large'). Default is 'small'.
 
         Returns:
             Dictionary containing the index data
@@ -104,6 +132,10 @@
 
         if self.verbose:
             print(f"Indexing repository: {repo_path_obj}")
+            if extract_keywords:
+                # Read and display keyword extraction config
+                method, tier = read_keyword_extraction_config(repo_path_obj)
+                print(f"Keyword extraction: {method.upper()} ({tier})")
 
         # Set up signal handlers for graceful interruption
         signal.signal(signal.SIGINT, self._handle_interrupt)
@@ -114,13 +146,21 @@
         keyword_extractor = None
         if extract_keywords:
             try:
-                from cicada.lightweight_keyword_extractor import (
-                    LightweightKeywordExtractor,
-                )
+                # Read keyword extraction config from config.yaml
+                method, tier = read_keyword_extraction_config(repo_path_obj)
 
-                keyword_extractor = LightweightKeywordExtractor(
-                    model_size=spacy_model, verbose=self.verbose
-                )
+                if method == "bert":
+                    # Initialize KeyBERT extractor
+                    from cicada.keybert_extractor import KeyBERTExtractor
+
+                    keyword_extractor = KeyBERTExtractor(model_tier=tier, verbose=self.verbose)
+                else:
+                    # Initialize lemminflect extractor (default)
+                    from cicada.lightweight_keyword_extractor import (
+                        LightweightKeywordExtractor,
+                    )
+
+                    keyword_extractor = LightweightKeywordExtractor(verbose=self.verbose)
             except Exception as e:
                 if self.verbose:
                     print(f"Warning: Could not initialize keyword extractor: {e}")
@@ -133,8 +173,6 @@
 
         if self.verbose:
             print(f"Found {total_files} Elixir files")
-            if extract_keywords:
-                print("Keyword extraction enabled")
 
         # Parse all files
         all_modules = {}
@@ -159,10 +197,8 @@
                 module_keywords = None
                 if keyword_extractor and module_data.get("moduledoc"):
                     try:
-                        module_keywords = (
-                            keyword_extractor.extract_keywords_simple(
-                                module_data["moduledoc"], top_n=10
-                            )
+                        module_keywords = keyword_extractor.extract_keywords_simple(
+                            module_data["moduledoc"], top_n=10
                         )
                     except Exception as e:
                         keyword_extraction_failures += 1
@@ -181,10 +217,8 @@
                             # Include function name in text for keyword extraction
                             # This ensures the function name identifier gets 10x weight
                             text_for_keywords = f"{func_name} {func['doc']}"
-                            func_keywords = (
-                                keyword_extractor.extract_keywords_simple(
-                                    text_for_keywords, top_n=10
-                                )
+                            func_keywords = keyword_extractor.extract_keywords_simple(
+                                text_for_keywords, top_n=10
                             )
                             if func_keywords:
                                 func["keywords"] = func_keywords
@@ -225,10 +259,7 @@
                 files_processed += 1
 
                 # Progress reporting
-                if (
-                    self.verbose
-                    and files_processed % self.PROGRESS_REPORT_INTERVAL == 0
-                ):
+                if self.verbose and files_processed % self.PROGRESS_REPORT_INTERVAL == 0:
                     print(f"  Processed {files_processed}/{total_files} files...")
 
                 # Check for interruption after each file
@@ -264,9 +295,8 @@
         if is_first_run:
             from cicada.utils.path_utils import ensure_gitignore_has_cicada
 
-            if ensure_gitignore_has_cicada(repo_path_obj):
-                if self.verbose:
-                    print("✓ Added .cicada/ to .gitignore")
+            if ensure_gitignore_has_cicada(repo_path_obj) and self.verbose:
+                print("✓ Added .cicada/ to .gitignore")
 
         save_index(index, output_path_obj, create_dirs=True)
 
@@ -278,12 +308,16 @@
             str(f.relative_to(repo_path_obj)) for f in elixir_files[:files_processed]
         ]
         file_hashes = compute_hashes_for_files(processed_files, str(repo_path_obj))
-        save_file_hashes(str(output_path_obj.parent), file_hashes)
+        # Save hashes to centralized storage directory
+        from cicada.utils import get_storage_dir
+
+        storage_dir = get_storage_dir(repo_path_obj)
+        save_file_hashes(str(storage_dir), file_hashes)
 
         # Report completion status
         if self.verbose:
             if self._interrupted:
-                print(
+                print("\n✓ Partial index saved!")
                 print(
                     f"  Processed: {files_processed}/{total_files} files ({files_processed/total_files*100:.1f}%)"
                 )
@@ -293,7 +327,7 @@
                     f"\n💡 Run the command again to continue indexing remaining {total_files - files_processed} file(s)"
                 )
             else:
-                print(
+                print("\nIndexing complete!")
                 print(f"  Modules: {len(all_modules)}")
                 print(f"  Functions: {total_functions}")
 
@@ -312,9 +346,8 @@
     def incremental_index_repository(
         self,
         repo_path: str,
-        output_path: str
+        output_path: str,
         extract_keywords: bool = False,
-        spacy_model: str = "small",
         force_full: bool = False,
     ):
         """
@@ -328,7 +361,6 @@
             repo_path: Path to the Elixir repository root
             output_path: Path where the index JSON file will be saved
             extract_keywords: If True, extract keywords from documentation using NLP
-            spacy_model: Size of spaCy model to use for keyword extraction
             force_full: If True, ignore existing hashes and do full reindex
 
         Returns:
@@ -336,33 +368,39 @@
         """
         repo_path_obj = Path(repo_path).resolve()
         output_path_obj = Path(output_path)
-
+        # Use centralized storage directory for hashes
+        from cicada.utils import get_storage_dir
+
+        storage_dir = get_storage_dir(repo_path_obj)
 
         if not repo_path_obj.exists():
             raise ValueError(f"Repository path does not exist: {repo_path_obj}")
 
         # Load existing index and hashes
         existing_index = load_index(output_path_obj) if not force_full else None
-        existing_hashes = load_file_hashes(str(cicada_dir)) if not force_full else {}
+        existing_hashes = load_file_hashes(str(storage_dir)) if not force_full else {}
 
         # Validate existing index structure if loaded
         if existing_index:
             is_valid, error = validate_index_structure(existing_index)
             if not is_valid:
-                print(
-                    f"Warning: Existing index is corrupted ({error}). Performing full reindex..."
-                )
+                if self.verbose:
+                    print(
+                        f"Warning: Existing index is corrupted ({error}). Performing full reindex..."
+                    )
                 existing_index = None
 
         # If no existing data, do full index
         if not existing_index or not existing_hashes:
-            print("No existing index or hashes found. Performing full index...")
-            return self.index_repository(
-                str(repo_path_obj), str(output_path_obj), extract_keywords, spacy_model
-            )
+            if self.verbose:
+                print("No existing index or hashes found. Performing full index...")
+            return self.index_repository(str(repo_path_obj), str(output_path_obj), extract_keywords)
 
         if self.verbose:
+            # Read and display keyword extraction config
+            method, tier = read_keyword_extraction_config(repo_path_obj)
             print(f"Performing incremental index of: {repo_path_obj}")
+            print(f"Keyword extraction: {method.upper()} ({tier})")
 
         # Set up signal handlers for graceful interruption
         signal.signal(signal.SIGINT, self._handle_interrupt)
@@ -390,12 +428,9 @@
             return existing_index
 
         if self.verbose:
-            print(
-            if self.verbose:
+            print("Changes detected:")
             print(f"  New files: {len(new_files)}")
-            if self.verbose:
             print(f"  Modified files: {len(modified_files)}")
-            if self.verbose:
             print(f"  Deleted files: {len(deleted_files)}")
 
         if files_to_process:
@@ -405,13 +440,21 @@
             keyword_extractor = None
             if extract_keywords:
                 try:
-                    from cicada.lightweight_keyword_extractor import (
-                        LightweightKeywordExtractor,
-                    )
+                    # Read keyword extraction config from config.yaml
+                    method, tier = read_keyword_extraction_config(repo_path_obj)
 
-                    keyword_extractor = LightweightKeywordExtractor(
-                        model_size=spacy_model, verbose=self.verbose
-                    )
+                    if method == "bert":
+                        # Initialize KeyBERT extractor
+                        from cicada.keybert_extractor import KeyBERTExtractor
+
+                        keyword_extractor = KeyBERTExtractor(model_tier=tier, verbose=self.verbose)
+                    else:
+                        # Initialize lemminflect extractor (default)
+                        from cicada.lightweight_keyword_extractor import (
+                            LightweightKeywordExtractor,
+                        )
+
+                        keyword_extractor = LightweightKeywordExtractor(verbose=self.verbose)
                 except Exception as e:
                     print(f"Warning: Could not initialize keyword extractor: {e}")
                     print("Continuing without keyword extraction...")
@@ -441,12 +484,10 @@
                     module_keywords = None
                     if keyword_extractor and module_data.get("moduledoc"):
                         try:
-                            module_keywords = (
-                                keyword_extractor.extract_keywords_simple(
-                                    module_data["moduledoc"], top_n=10
-                                )
+                            module_keywords = keyword_extractor.extract_keywords_simple(
+                                module_data["moduledoc"], top_n=10
                             )
-                        except Exception as e:
+                        except Exception:
                            keyword_extraction_failures += 1
 
                     # Extract keywords from function docs
@@ -456,14 +497,12 @@
                         try:
                             func_name = func.get("name", "")
                             text_for_keywords = f"{func_name} {func['doc']}"
-                            func_keywords = (
-                                keyword_extractor.extract_keywords_simple(
-                                    text_for_keywords, top_n=10
-                                )
+                            func_keywords = keyword_extractor.extract_keywords_simple(
+                                text_for_keywords, top_n=10
                             )
                             if func_keywords:
                                 func["keywords"] = func_keywords
-                        except Exception as e:
+                        except Exception:
                             keyword_extraction_failures += 1
 
                     # Store module info
@@ -494,17 +533,13 @@
                     files_processed += 1
 
                     # Check for interruption after each file
-                    if self._check_and_report_interruption(
-                        files_processed, len(files_to_process)
-                    ):
+                    if self._check_and_report_interruption(files_processed, len(files_to_process)):
                         break
 
                 except Exception as e:
                     print(f"  Skipping {file_path}: {e}")
                     # Check for interruption even after error
-                    if self._check_and_report_interruption(
-                        files_processed, len(files_to_process)
-                    ):
+                    if self._check_and_report_interruption(files_processed, len(files_to_process)):
                         break
                     continue
 
@@ -520,9 +555,7 @@
         # Merge with existing index
         if self.verbose:
             print("\nMerging with existing index...")
-        merged_index = merge_indexes_incremental(
-            existing_index, new_index, deleted_files
-        )
+        merged_index = merge_indexes_incremental(existing_index, new_index, deleted_files)
 
         # Update hashes for all current files
         if self.verbose:
@@ -540,15 +573,13 @@
 
         # Save index and hashes
         save_index(merged_index, output_path_obj, create_dirs=True)
-        save_file_hashes(str(cicada_dir), updated_hashes)
+        save_file_hashes(str(storage_dir), updated_hashes)
 
         # Report completion status
         if self._interrupted:
             remaining = len(files_to_process) - files_processed
-            print(
-            print(
-                f"  Processed: {files_processed}/{len(files_to_process)} changed file(s)"
-            )
+            print("\n✓ Partial index saved!")
+            print(f"  Processed: {files_processed}/{len(files_to_process)} changed file(s)")
             print(f"  Total modules: {merged_index['metadata']['total_modules']}")
             print(f"  Total functions: {merged_index['metadata']['total_functions']}")
             print(f"  Files deleted: {len(deleted_files)}")
@@ -556,7 +587,7 @@
                 f"\n💡 Run the command again to continue indexing remaining {remaining} changed file(s)"
             )
         else:
-            print(
+            print("\nIncremental indexing complete!")
             print(f"  Total modules: {merged_index['metadata']['total_modules']}")
             print(f"  Total functions: {merged_index['metadata']['total_functions']}")
             print(f"  Files processed: {files_processed}")
@@ -568,9 +599,6 @@
                 f"\n⚠️ Warning: Keyword extraction failed for {keyword_extraction_failures} module(s) or function(s)"
             )
 
-        print(f"\nIndex saved to: {output_path_obj}")
-        print(f"Hashes saved to: {cicada_dir}/hashes.json")
-
         return merged_index
 
     def _find_elixir_files(self, repo_path: Path) -> list:
@@ -611,18 +639,6 @@ def main():
         default=".cicada/index.json",
         help="Output path for the index file (default: .cicada/index.json)",
     )
-    parser.add_argument(
-        "--extract-keywords",
-        action="store_true",
-        help="Extract keywords from documentation using NLP (adds ~1-2s per 100 docs)",
-    )
-    parser.add_argument(
-        "--spacy-model",
-        choices=["small", "medium", "large"],
-        default="small",
-        help="Size of spaCy model to use for keyword extraction (default: small). "
-        "Medium and large models provide better accuracy but are slower.",
-    )
     parser.add_argument(
         "--full",
         action="store_true",
@@ -637,8 +653,7 @@ def main():
     indexer.incremental_index_repository(
         args.repo,
        args.output,
-        extract_keywords=args.extract_keywords,
-        spacy_model=args.spacy_model,
+        extract_keywords=True,
        force_full=args.full,
    )
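The new read_keyword_extraction_config() drives extractor selection in both full and incremental indexing. A hedged usage sketch of the config shape it parses — the keys, allowed values, and defaults come from the function above, while the `.cicada/config.yaml` location is an assumption about what `get_config_path()` resolves to:

    from pathlib import Path

    import yaml

    from cicada.indexer import read_keyword_extraction_config

    repo = Path("example_repo")
    (repo / ".cicada").mkdir(parents=True, exist_ok=True)  # assumed config location
    (repo / ".cicada" / "config.yaml").write_text(
        yaml.safe_dump({"keyword_extraction": {"method": "bert", "tier": "fast"}})
    )

    print(read_keyword_extraction_config(repo))      # ("bert", "fast") if the config is found
    print(read_keyword_extraction_config(Path("/"))) # ("lemminflect", "regular") fallback

Per the indexer diff, `method: bert` makes the indexer construct KeyBERTExtractor(model_tier=tier, ...); any other value falls back to the default LightweightKeywordExtractor, replacing the removed --extract-keywords/--spacy-model CLI flags.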