cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/_version_hash.py +4 -0
- cicada/cli.py +6 -748
- cicada/commands.py +1255 -0
- cicada/dead_code/__init__.py +1 -0
- cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
- cicada/dependency_analyzer.py +147 -0
- cicada/entry_utils.py +92 -0
- cicada/extractors/base.py +9 -9
- cicada/extractors/call.py +17 -20
- cicada/extractors/common.py +64 -0
- cicada/extractors/dependency.py +117 -235
- cicada/extractors/doc.py +2 -49
- cicada/extractors/function.py +10 -14
- cicada/extractors/keybert.py +228 -0
- cicada/extractors/keyword.py +191 -0
- cicada/extractors/module.py +6 -10
- cicada/extractors/spec.py +8 -56
- cicada/format/__init__.py +20 -0
- cicada/{ascii_art.py → format/ascii_art.py} +1 -1
- cicada/format/formatter.py +1145 -0
- cicada/git_helper.py +134 -7
- cicada/indexer.py +322 -89
- cicada/interactive_setup.py +251 -323
- cicada/interactive_setup_helpers.py +302 -0
- cicada/keyword_expander.py +437 -0
- cicada/keyword_search.py +208 -422
- cicada/keyword_test.py +383 -16
- cicada/mcp/__init__.py +10 -0
- cicada/mcp/entry.py +17 -0
- cicada/mcp/filter_utils.py +107 -0
- cicada/mcp/pattern_utils.py +118 -0
- cicada/{mcp_server.py → mcp/server.py} +819 -73
- cicada/mcp/tools.py +473 -0
- cicada/pr_finder.py +2 -3
- cicada/pr_indexer/indexer.py +3 -2
- cicada/setup.py +167 -35
- cicada/tier.py +225 -0
- cicada/utils/__init__.py +9 -2
- cicada/utils/fuzzy_match.py +54 -0
- cicada/utils/index_utils.py +9 -0
- cicada/utils/path_utils.py +18 -0
- cicada/utils/text_utils.py +52 -1
- cicada/utils/tree_utils.py +47 -0
- cicada/version_check.py +99 -0
- cicada/watch_manager.py +320 -0
- cicada/watcher.py +431 -0
- cicada_mcp-0.3.0.dist-info/METADATA +541 -0
- cicada_mcp-0.3.0.dist-info/RECORD +70 -0
- cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
- cicada/formatter.py +0 -864
- cicada/keybert_extractor.py +0 -286
- cicada/lightweight_keyword_extractor.py +0 -290
- cicada/mcp_entry.py +0 -683
- cicada/mcp_tools.py +0 -291
- cicada_mcp-0.2.0.dist-info/METADATA +0 -735
- cicada_mcp-0.2.0.dist-info/RECORD +0 -53
- cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
- /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
- /cicada/{colors.py → format/colors.py} +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
cicada/indexer.py
CHANGED
|
@@ -11,7 +11,14 @@ import sys
|
|
|
11
11
|
from datetime import datetime
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
|
|
14
|
+
from cicada.dependency_analyzer import (
|
|
15
|
+
calculate_function_end_line,
|
|
16
|
+
extract_function_dependencies,
|
|
17
|
+
extract_module_dependencies,
|
|
18
|
+
)
|
|
19
|
+
from cicada.git_helper import GitHelper
|
|
14
20
|
from cicada.parser import ElixirParser
|
|
21
|
+
from cicada.tier import read_keyword_extraction_config
|
|
15
22
|
from cicada.utils import (
|
|
16
23
|
load_index,
|
|
17
24
|
merge_indexes_incremental,
|
|
@@ -24,42 +31,7 @@ from cicada.utils.hash_utils import (
|
|
|
24
31
|
load_file_hashes,
|
|
25
32
|
save_file_hashes,
|
|
26
33
|
)
|
|
27
|
-
from cicada.
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def read_keyword_extraction_config(repo_path: Path) -> tuple[str, str]:
|
|
31
|
-
"""
|
|
32
|
-
Read keyword extraction configuration from config.yaml.
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
repo_path: Path to the repository
|
|
36
|
-
|
|
37
|
-
Returns:
|
|
38
|
-
tuple[str, str]: (method, tier) where method is 'lemminflect' or 'bert',
|
|
39
|
-
and tier is 'fast', 'regular', or 'max'.
|
|
40
|
-
Returns ('lemminflect', 'regular') as default if config not found.
|
|
41
|
-
"""
|
|
42
|
-
try:
|
|
43
|
-
import yaml
|
|
44
|
-
|
|
45
|
-
config_path = get_config_path(repo_path)
|
|
46
|
-
if not config_path.exists():
|
|
47
|
-
# Default to lemminflect if config doesn't exist
|
|
48
|
-
return ("lemminflect", "regular")
|
|
49
|
-
|
|
50
|
-
with open(config_path) as f:
|
|
51
|
-
config = yaml.safe_load(f)
|
|
52
|
-
|
|
53
|
-
if config and "keyword_extraction" in config:
|
|
54
|
-
method = config["keyword_extraction"].get("method", "lemminflect")
|
|
55
|
-
tier = config["keyword_extraction"].get("tier", "regular")
|
|
56
|
-
return (method, tier)
|
|
57
|
-
|
|
58
|
-
# Default to lemminflect if keyword_extraction section not found
|
|
59
|
-
return ("lemminflect", "regular")
|
|
60
|
-
except Exception:
|
|
61
|
-
# If anything goes wrong, default to lemminflect
|
|
62
|
-
return ("lemminflect", "regular")
|
|
34
|
+
from cicada.version_check import get_version_string, version_mismatch
|
|
63
35
|
|
|
64
36
|
|
|
65
37
|
class ElixirIndexer:
|
|
@@ -68,6 +40,10 @@ class ElixirIndexer:
|
|
|
68
40
|
# Progress reporting interval - report every N files processed
|
|
69
41
|
PROGRESS_REPORT_INTERVAL = 10
|
|
70
42
|
|
|
43
|
+
# Keyword expansion parameters
|
|
44
|
+
DEFAULT_EXPANSION_TOP_N = 3
|
|
45
|
+
DEFAULT_EXPANSION_THRESHOLD = 0.2
|
|
46
|
+
|
|
71
47
|
def __init__(self, verbose: bool = False):
|
|
72
48
|
"""Initialize the indexer with a parser."""
|
|
73
49
|
self.parser = ElixirParser()
|
|
@@ -82,6 +58,33 @@ class ElixirIndexer:
|
|
|
82
58
|
}
|
|
83
59
|
self._interrupted = False
|
|
84
60
|
|
|
61
|
+
def _extract_dependencies(self, module_data: dict, functions: list) -> tuple[dict, list]:
|
|
62
|
+
"""
|
|
63
|
+
Extract module and function level dependencies.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
module_data: Parsed module data containing calls, aliases, etc.
|
|
67
|
+
functions: List of function data dictionaries
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Tuple of (module_dependencies, modified_functions_list)
|
|
71
|
+
"""
|
|
72
|
+
# Extract module-level dependencies
|
|
73
|
+
module_dependencies = extract_module_dependencies(module_data)
|
|
74
|
+
|
|
75
|
+
# Extract function-level dependencies
|
|
76
|
+
all_calls = module_data.get("calls", [])
|
|
77
|
+
for i, func in enumerate(functions):
|
|
78
|
+
# Calculate function end line
|
|
79
|
+
next_func_line = functions[i + 1]["line"] if i + 1 < len(functions) else None
|
|
80
|
+
func_end_line = calculate_function_end_line(func, next_func_line)
|
|
81
|
+
|
|
82
|
+
# Extract dependencies for this function
|
|
83
|
+
func_deps = extract_function_dependencies(module_data, func, all_calls, func_end_line)
|
|
84
|
+
func["dependencies"] = func_deps
|
|
85
|
+
|
|
86
|
+
return module_dependencies, functions
|
|
87
|
+
|
|
85
88
|
def _handle_interrupt(self, _signum, _frame):
|
|
86
89
|
"""Handle interrupt signals (Ctrl-C, SIGTERM) gracefully."""
|
|
87
90
|
print("\n\n⚠️ Interrupt received. Finishing current file and saving progress...")
|
|
@@ -113,6 +116,7 @@ class ElixirIndexer:
|
|
|
113
116
|
repo_path: str,
|
|
114
117
|
output_path: str,
|
|
115
118
|
extract_keywords: bool = False,
|
|
119
|
+
compute_timestamps: bool = False,
|
|
116
120
|
):
|
|
117
121
|
"""
|
|
118
122
|
Index an Elixir repository.
|
|
@@ -121,6 +125,7 @@ class ElixirIndexer:
|
|
|
121
125
|
repo_path: Path to the Elixir repository root
|
|
122
126
|
output_path: Path where the index JSON file will be saved
|
|
123
127
|
extract_keywords: If True, extract keywords from documentation using NLP
|
|
128
|
+
compute_timestamps: If True, compute git history timestamps for functions
|
|
124
129
|
|
|
125
130
|
Returns:
|
|
126
131
|
Dictionary containing the index data
|
|
@@ -134,39 +139,61 @@ class ElixirIndexer:
|
|
|
134
139
|
print(f"Indexing repository: {repo_path_obj}")
|
|
135
140
|
if extract_keywords:
|
|
136
141
|
# Read and display keyword extraction config
|
|
137
|
-
|
|
138
|
-
print(
|
|
142
|
+
extraction_method, expansion_method = read_keyword_extraction_config(repo_path_obj)
|
|
143
|
+
print(
|
|
144
|
+
f"Keyword extraction: {extraction_method.upper()} + {expansion_method.upper()}"
|
|
145
|
+
)
|
|
139
146
|
|
|
140
147
|
# Set up signal handlers for graceful interruption
|
|
141
148
|
signal.signal(signal.SIGINT, self._handle_interrupt)
|
|
142
149
|
signal.signal(signal.SIGTERM, self._handle_interrupt)
|
|
143
150
|
self._interrupted = False
|
|
144
151
|
|
|
145
|
-
# Initialize keyword extractor if requested
|
|
152
|
+
# Initialize keyword extractor and expander if requested
|
|
146
153
|
keyword_extractor = None
|
|
154
|
+
keyword_expander = None
|
|
147
155
|
if extract_keywords:
|
|
148
156
|
try:
|
|
149
157
|
# Read keyword extraction config from config.yaml
|
|
150
|
-
|
|
158
|
+
extraction_method, expansion_method = read_keyword_extraction_config(repo_path_obj)
|
|
151
159
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
from cicada.
|
|
160
|
+
# Initialize extraction method
|
|
161
|
+
if extraction_method == "bert":
|
|
162
|
+
from cicada.extractors.keybert import KeyBERTExtractor
|
|
155
163
|
|
|
156
|
-
keyword_extractor = KeyBERTExtractor(
|
|
164
|
+
keyword_extractor = KeyBERTExtractor(verbose=self.verbose)
|
|
157
165
|
else:
|
|
158
|
-
#
|
|
159
|
-
from cicada.
|
|
160
|
-
|
|
161
|
-
)
|
|
166
|
+
# Use regular (TF-based) extractor as default
|
|
167
|
+
from cicada.extractors.keyword import RegularKeywordExtractor
|
|
168
|
+
|
|
169
|
+
keyword_extractor = RegularKeywordExtractor(verbose=self.verbose)
|
|
170
|
+
|
|
171
|
+
# Initialize expansion method
|
|
172
|
+
from cicada.keyword_expander import KeywordExpander
|
|
173
|
+
|
|
174
|
+
keyword_expander = KeywordExpander(
|
|
175
|
+
expansion_type=expansion_method, verbose=self.verbose
|
|
176
|
+
)
|
|
162
177
|
|
|
163
|
-
keyword_extractor = LightweightKeywordExtractor(verbose=self.verbose)
|
|
164
178
|
except Exception as e:
|
|
165
179
|
if self.verbose:
|
|
166
|
-
print(f"Warning: Could not initialize keyword extractor: {e}")
|
|
180
|
+
print(f"Warning: Could not initialize keyword extractor/expander: {e}")
|
|
167
181
|
print("Continuing without keyword extraction...")
|
|
168
182
|
extract_keywords = False
|
|
169
183
|
|
|
184
|
+
# Initialize git helper if timestamps are requested
|
|
185
|
+
git_helper = None
|
|
186
|
+
if compute_timestamps:
|
|
187
|
+
try:
|
|
188
|
+
git_helper = GitHelper(str(repo_path_obj))
|
|
189
|
+
if self.verbose:
|
|
190
|
+
print("Git history tracking enabled - computing function timestamps")
|
|
191
|
+
except Exception as e:
|
|
192
|
+
if self.verbose:
|
|
193
|
+
print(f"Warning: Could not initialize git helper: {e}")
|
|
194
|
+
print("Continuing without timestamp computation...")
|
|
195
|
+
compute_timestamps = False
|
|
196
|
+
|
|
170
197
|
# Find all Elixir files
|
|
171
198
|
elixir_files = self._find_elixir_files(repo_path_obj)
|
|
172
199
|
total_files = len(elixir_files)
|
|
@@ -193,13 +220,48 @@ class ElixirIndexer:
|
|
|
193
220
|
public_count = sum(1 for f in functions if f["type"] == "def")
|
|
194
221
|
private_count = sum(1 for f in functions if f["type"] == "defp")
|
|
195
222
|
|
|
196
|
-
# Extract keywords if enabled
|
|
223
|
+
# Extract and expand keywords if enabled
|
|
197
224
|
module_keywords = None
|
|
198
225
|
if keyword_extractor and module_data.get("moduledoc"):
|
|
199
226
|
try:
|
|
200
|
-
|
|
227
|
+
# Step 1: Extract keywords with scores
|
|
228
|
+
extraction_result = keyword_extractor.extract_keywords(
|
|
201
229
|
module_data["moduledoc"], top_n=10
|
|
202
230
|
)
|
|
231
|
+
extracted_keywords = [
|
|
232
|
+
kw for kw, _ in extraction_result["top_keywords"]
|
|
233
|
+
]
|
|
234
|
+
keyword_scores = {
|
|
235
|
+
kw.lower(): score
|
|
236
|
+
for kw, score in extraction_result["top_keywords"]
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
# Step 2: Expand keywords with scores
|
|
240
|
+
if keyword_expander and extracted_keywords:
|
|
241
|
+
expansion_result = keyword_expander.expand_keywords(
|
|
242
|
+
extracted_keywords,
|
|
243
|
+
top_n=self.DEFAULT_EXPANSION_TOP_N,
|
|
244
|
+
threshold=self.DEFAULT_EXPANSION_THRESHOLD,
|
|
245
|
+
return_scores=True,
|
|
246
|
+
keyword_scores=keyword_scores,
|
|
247
|
+
)
|
|
248
|
+
# Convert to dict: word -> max_score
|
|
249
|
+
module_keywords = {}
|
|
250
|
+
# When return_scores=True, expansion_result is a dict
|
|
251
|
+
if not isinstance(expansion_result, dict):
|
|
252
|
+
raise TypeError(
|
|
253
|
+
"Expected dict from expand_keywords with return_scores=True"
|
|
254
|
+
)
|
|
255
|
+
for item in expansion_result["words"]:
|
|
256
|
+
word = item["word"]
|
|
257
|
+
score = item["score"]
|
|
258
|
+
if (
|
|
259
|
+
word not in module_keywords
|
|
260
|
+
or score > module_keywords[word]
|
|
261
|
+
):
|
|
262
|
+
module_keywords[word] = score
|
|
263
|
+
else:
|
|
264
|
+
module_keywords = keyword_scores
|
|
203
265
|
except Exception as e:
|
|
204
266
|
keyword_extraction_failures += 1
|
|
205
267
|
if self.verbose:
|
|
@@ -208,27 +270,97 @@ class ElixirIndexer:
|
|
|
208
270
|
file=sys.stderr,
|
|
209
271
|
)
|
|
210
272
|
|
|
211
|
-
#
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
273
|
+
# Enrich function metadata (keywords and timestamps)
|
|
274
|
+
for func in functions:
|
|
275
|
+
func_name = func.get("name", "")
|
|
276
|
+
|
|
277
|
+
# Extract and expand keywords from function docs
|
|
278
|
+
if keyword_extractor and func.get("doc"):
|
|
279
|
+
try:
|
|
280
|
+
# Include function name in text for keyword extraction
|
|
281
|
+
# This ensures the function name identifier gets 10x weight
|
|
282
|
+
text_for_keywords = f"{func_name} {func['doc']}"
|
|
283
|
+
# Step 1: Extract keywords with scores
|
|
284
|
+
extraction_result = keyword_extractor.extract_keywords(
|
|
285
|
+
text_for_keywords, top_n=10
|
|
286
|
+
)
|
|
287
|
+
extracted_keywords = [
|
|
288
|
+
kw for kw, _ in extraction_result["top_keywords"]
|
|
289
|
+
]
|
|
290
|
+
keyword_scores = {
|
|
291
|
+
kw.lower(): score
|
|
292
|
+
for kw, score in extraction_result["top_keywords"]
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
# Step 2: Expand keywords with scores
|
|
296
|
+
if keyword_expander and extracted_keywords:
|
|
297
|
+
expansion_result = keyword_expander.expand_keywords(
|
|
298
|
+
extracted_keywords,
|
|
299
|
+
top_n=self.DEFAULT_EXPANSION_TOP_N,
|
|
300
|
+
threshold=self.DEFAULT_EXPANSION_THRESHOLD,
|
|
301
|
+
return_scores=True,
|
|
302
|
+
keyword_scores=keyword_scores,
|
|
222
303
|
)
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
f"Warning: Keyword extraction failed for {module_name}.{func_name}: {e}",
|
|
230
|
-
file=sys.stderr,
|
|
304
|
+
# Convert to dict: word -> max_score
|
|
305
|
+
func_keywords = {}
|
|
306
|
+
# When return_scores=True, expansion_result is a dict
|
|
307
|
+
if not isinstance(expansion_result, dict):
|
|
308
|
+
raise TypeError(
|
|
309
|
+
"Expected dict from expand_keywords with return_scores=True"
|
|
231
310
|
)
|
|
311
|
+
for item in expansion_result["words"]:
|
|
312
|
+
word = item["word"]
|
|
313
|
+
score = item["score"]
|
|
314
|
+
if (
|
|
315
|
+
word not in func_keywords
|
|
316
|
+
or score > func_keywords[word]
|
|
317
|
+
):
|
|
318
|
+
func_keywords[word] = score
|
|
319
|
+
else:
|
|
320
|
+
func_keywords = keyword_scores
|
|
321
|
+
|
|
322
|
+
if func_keywords:
|
|
323
|
+
func["keywords"] = func_keywords
|
|
324
|
+
except Exception as e:
|
|
325
|
+
keyword_extraction_failures += 1
|
|
326
|
+
if self.verbose:
|
|
327
|
+
print(
|
|
328
|
+
f"Warning: Keyword extraction failed for {module_name}.{func_name}: {e}",
|
|
329
|
+
file=sys.stderr,
|
|
330
|
+
)
|
|
331
|
+
|
|
332
|
+
# Compute git history timestamps if enabled
|
|
333
|
+
if git_helper and func_name:
|
|
334
|
+
try:
|
|
335
|
+
# Get function evolution metadata
|
|
336
|
+
evolution = git_helper.get_function_evolution(
|
|
337
|
+
file_path=str(file_path.relative_to(repo_path_obj)),
|
|
338
|
+
function_name=func_name,
|
|
339
|
+
)
|
|
340
|
+
|
|
341
|
+
if evolution:
|
|
342
|
+
# Add timestamp fields to function
|
|
343
|
+
func["created_at"] = evolution["created_at"]["date"]
|
|
344
|
+
func["last_modified_at"] = evolution["last_modified"][
|
|
345
|
+
"date"
|
|
346
|
+
]
|
|
347
|
+
func["last_modified_sha"] = evolution["last_modified"][
|
|
348
|
+
"sha"
|
|
349
|
+
]
|
|
350
|
+
func["modification_count"] = evolution[
|
|
351
|
+
"total_modifications"
|
|
352
|
+
]
|
|
353
|
+
except Exception as e:
|
|
354
|
+
if self.verbose:
|
|
355
|
+
print(
|
|
356
|
+
f"Warning: Could not compute timestamps for {module_name}.{func_name}: {e}",
|
|
357
|
+
file=sys.stderr,
|
|
358
|
+
)
|
|
359
|
+
|
|
360
|
+
# Extract dependencies
|
|
361
|
+
module_dependencies, functions = self._extract_dependencies(
|
|
362
|
+
module_data, functions
|
|
363
|
+
)
|
|
232
364
|
|
|
233
365
|
# Store module info
|
|
234
366
|
module_info = {
|
|
@@ -246,6 +378,7 @@ class ElixirIndexer:
|
|
|
246
378
|
"behaviours": module_data.get("behaviours", []),
|
|
247
379
|
"value_mentions": module_data.get("value_mentions", []),
|
|
248
380
|
"calls": module_data.get("calls", []),
|
|
381
|
+
"dependencies": module_dependencies,
|
|
249
382
|
}
|
|
250
383
|
|
|
251
384
|
# Add module keywords if extracted
|
|
@@ -282,6 +415,7 @@ class ElixirIndexer:
|
|
|
282
415
|
"total_modules": len(all_modules),
|
|
283
416
|
"total_functions": total_functions,
|
|
284
417
|
"repo_path": str(repo_path_obj),
|
|
418
|
+
"cicada_version": get_version_string(),
|
|
285
419
|
},
|
|
286
420
|
}
|
|
287
421
|
|
|
@@ -390,6 +524,20 @@ class ElixirIndexer:
|
|
|
390
524
|
)
|
|
391
525
|
existing_index = None
|
|
392
526
|
|
|
527
|
+
# Check for version mismatch - if cicada version differs, force full reindex
|
|
528
|
+
if existing_index:
|
|
529
|
+
stored_version = existing_index.get("metadata", {}).get("cicada_version")
|
|
530
|
+
current_version = get_version_string()
|
|
531
|
+
if version_mismatch(stored_version, current_version):
|
|
532
|
+
if self.verbose:
|
|
533
|
+
print(
|
|
534
|
+
f"Warning: Cicada version mismatch. "
|
|
535
|
+
f"Index was built with {stored_version}, current version is {current_version}. "
|
|
536
|
+
f"Performing full reindex..."
|
|
537
|
+
)
|
|
538
|
+
existing_index = None
|
|
539
|
+
existing_hashes = {}
|
|
540
|
+
|
|
393
541
|
# If no existing data, do full index
|
|
394
542
|
if not existing_index or not existing_hashes:
|
|
395
543
|
if self.verbose:
|
|
@@ -398,9 +546,9 @@ class ElixirIndexer:
|
|
|
398
546
|
|
|
399
547
|
if self.verbose:
|
|
400
548
|
# Read and display keyword extraction config
|
|
401
|
-
|
|
549
|
+
extraction_method, expansion_method = read_keyword_extraction_config(repo_path_obj)
|
|
402
550
|
print(f"Performing incremental index of: {repo_path_obj}")
|
|
403
|
-
print(f"Keyword extraction: {
|
|
551
|
+
print(f"Keyword extraction: {extraction_method.upper()} + {expansion_method.upper()}")
|
|
404
552
|
|
|
405
553
|
# Set up signal handlers for graceful interruption
|
|
406
554
|
signal.signal(signal.SIGINT, self._handle_interrupt)
|
|
@@ -436,27 +584,34 @@ class ElixirIndexer:
|
|
|
436
584
|
if files_to_process:
|
|
437
585
|
print(f"\nProcessing {len(files_to_process)} changed file(s)...")
|
|
438
586
|
|
|
439
|
-
# Initialize keyword extractor if requested
|
|
587
|
+
# Initialize keyword extractor and expander if requested
|
|
440
588
|
keyword_extractor = None
|
|
589
|
+
keyword_expander = None
|
|
441
590
|
if extract_keywords:
|
|
442
591
|
try:
|
|
443
592
|
# Read keyword extraction config from config.yaml
|
|
444
|
-
|
|
593
|
+
extraction_method, expansion_method = read_keyword_extraction_config(repo_path_obj)
|
|
445
594
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
from cicada.
|
|
595
|
+
# Initialize extraction method
|
|
596
|
+
if extraction_method == "bert":
|
|
597
|
+
from cicada.extractors.keybert import KeyBERTExtractor
|
|
449
598
|
|
|
450
|
-
keyword_extractor = KeyBERTExtractor(
|
|
599
|
+
keyword_extractor = KeyBERTExtractor(verbose=self.verbose)
|
|
451
600
|
else:
|
|
452
|
-
#
|
|
453
|
-
from cicada.
|
|
454
|
-
|
|
455
|
-
)
|
|
601
|
+
# Use regular (TF-based) extractor as default
|
|
602
|
+
from cicada.extractors.keyword import RegularKeywordExtractor
|
|
603
|
+
|
|
604
|
+
keyword_extractor = RegularKeywordExtractor(verbose=self.verbose)
|
|
605
|
+
|
|
606
|
+
# Initialize expansion method
|
|
607
|
+
from cicada.keyword_expander import KeywordExpander
|
|
608
|
+
|
|
609
|
+
keyword_expander = KeywordExpander(
|
|
610
|
+
expansion_type=expansion_method, verbose=self.verbose
|
|
611
|
+
)
|
|
456
612
|
|
|
457
|
-
keyword_extractor = LightweightKeywordExtractor(verbose=self.verbose)
|
|
458
613
|
except Exception as e:
|
|
459
|
-
print(f"Warning: Could not initialize keyword extractor: {e}")
|
|
614
|
+
print(f"Warning: Could not initialize keyword extractor/expander: {e}")
|
|
460
615
|
print("Continuing without keyword extraction...")
|
|
461
616
|
extract_keywords = False
|
|
462
617
|
|
|
@@ -480,31 +635,107 @@ class ElixirIndexer:
|
|
|
480
635
|
public_count = sum(1 for f in functions if f["type"] == "def")
|
|
481
636
|
private_count = sum(1 for f in functions if f["type"] == "defp")
|
|
482
637
|
|
|
483
|
-
# Extract keywords if enabled
|
|
638
|
+
# Extract and expand keywords if enabled
|
|
484
639
|
module_keywords = None
|
|
485
640
|
if keyword_extractor and module_data.get("moduledoc"):
|
|
486
641
|
try:
|
|
487
|
-
|
|
642
|
+
# Step 1: Extract keywords with scores
|
|
643
|
+
extraction_result = keyword_extractor.extract_keywords(
|
|
488
644
|
module_data["moduledoc"], top_n=10
|
|
489
645
|
)
|
|
646
|
+
extracted_keywords = [
|
|
647
|
+
kw for kw, _ in extraction_result["top_keywords"]
|
|
648
|
+
]
|
|
649
|
+
keyword_scores = {
|
|
650
|
+
kw.lower(): score
|
|
651
|
+
for kw, score in extraction_result["top_keywords"]
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
# Step 2: Expand keywords with scores
|
|
655
|
+
if keyword_expander and extracted_keywords:
|
|
656
|
+
expansion_result = keyword_expander.expand_keywords(
|
|
657
|
+
extracted_keywords,
|
|
658
|
+
top_n=self.DEFAULT_EXPANSION_TOP_N,
|
|
659
|
+
threshold=self.DEFAULT_EXPANSION_THRESHOLD,
|
|
660
|
+
return_scores=True,
|
|
661
|
+
keyword_scores=keyword_scores,
|
|
662
|
+
)
|
|
663
|
+
# Convert to dict: word -> max_score
|
|
664
|
+
module_keywords = {}
|
|
665
|
+
# When return_scores=True, expansion_result is a dict
|
|
666
|
+
if not isinstance(expansion_result, dict):
|
|
667
|
+
raise TypeError(
|
|
668
|
+
"Expected dict from expand_keywords with return_scores=True"
|
|
669
|
+
)
|
|
670
|
+
for item in expansion_result["words"]:
|
|
671
|
+
word = item["word"]
|
|
672
|
+
score = item["score"]
|
|
673
|
+
if (
|
|
674
|
+
word not in module_keywords
|
|
675
|
+
or score > module_keywords[word]
|
|
676
|
+
):
|
|
677
|
+
module_keywords[word] = score
|
|
678
|
+
else:
|
|
679
|
+
module_keywords = keyword_scores
|
|
490
680
|
except Exception:
|
|
491
681
|
keyword_extraction_failures += 1
|
|
492
682
|
|
|
493
|
-
# Extract keywords from function docs
|
|
683
|
+
# Extract and expand keywords from function docs
|
|
494
684
|
if keyword_extractor:
|
|
495
685
|
for func in functions:
|
|
496
686
|
if func.get("doc"):
|
|
497
687
|
try:
|
|
498
688
|
func_name = func.get("name", "")
|
|
499
689
|
text_for_keywords = f"{func_name} {func['doc']}"
|
|
500
|
-
|
|
690
|
+
# Step 1: Extract keywords with scores
|
|
691
|
+
extraction_result = keyword_extractor.extract_keywords(
|
|
501
692
|
text_for_keywords, top_n=10
|
|
502
693
|
)
|
|
694
|
+
extracted_keywords = [
|
|
695
|
+
kw for kw, _ in extraction_result["top_keywords"]
|
|
696
|
+
]
|
|
697
|
+
keyword_scores = {
|
|
698
|
+
kw.lower(): score
|
|
699
|
+
for kw, score in extraction_result["top_keywords"]
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
# Step 2: Expand keywords with scores
|
|
703
|
+
if keyword_expander and extracted_keywords:
|
|
704
|
+
expansion_result = keyword_expander.expand_keywords(
|
|
705
|
+
extracted_keywords,
|
|
706
|
+
top_n=self.DEFAULT_EXPANSION_TOP_N,
|
|
707
|
+
threshold=self.DEFAULT_EXPANSION_THRESHOLD,
|
|
708
|
+
return_scores=True,
|
|
709
|
+
keyword_scores=keyword_scores,
|
|
710
|
+
)
|
|
711
|
+
# Convert to dict: word -> max_score
|
|
712
|
+
func_keywords = {}
|
|
713
|
+
# When return_scores=True, expansion_result is a dict
|
|
714
|
+
if not isinstance(expansion_result, dict):
|
|
715
|
+
raise TypeError(
|
|
716
|
+
"Expected dict from expand_keywords with return_scores=True"
|
|
717
|
+
)
|
|
718
|
+
for item in expansion_result["words"]:
|
|
719
|
+
word = item["word"]
|
|
720
|
+
score = item["score"]
|
|
721
|
+
if (
|
|
722
|
+
word not in func_keywords
|
|
723
|
+
or score > func_keywords[word]
|
|
724
|
+
):
|
|
725
|
+
func_keywords[word] = score
|
|
726
|
+
else:
|
|
727
|
+
func_keywords = keyword_scores
|
|
728
|
+
|
|
503
729
|
if func_keywords:
|
|
504
730
|
func["keywords"] = func_keywords
|
|
505
731
|
except Exception:
|
|
506
732
|
keyword_extraction_failures += 1
|
|
507
733
|
|
|
734
|
+
# Extract dependencies
|
|
735
|
+
module_dependencies, functions = self._extract_dependencies(
|
|
736
|
+
module_data, functions
|
|
737
|
+
)
|
|
738
|
+
|
|
508
739
|
# Store module info
|
|
509
740
|
module_info = {
|
|
510
741
|
"file": relative_file,
|
|
@@ -521,6 +752,7 @@ class ElixirIndexer:
|
|
|
521
752
|
"behaviours": module_data.get("behaviours", []),
|
|
522
753
|
"value_mentions": module_data.get("value_mentions", []),
|
|
523
754
|
"calls": module_data.get("calls", []),
|
|
755
|
+
"dependencies": module_dependencies,
|
|
524
756
|
}
|
|
525
757
|
|
|
526
758
|
# Add module keywords if extracted
|
|
@@ -549,6 +781,7 @@ class ElixirIndexer:
|
|
|
549
781
|
"metadata": {
|
|
550
782
|
"indexed_at": datetime.now().isoformat(),
|
|
551
783
|
"repo_path": str(repo_path_obj),
|
|
784
|
+
"cicada_version": get_version_string(),
|
|
552
785
|
},
|
|
553
786
|
}
|
|
554
787
|
|