cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/_version_hash.py +4 -0
- cicada/cli.py +6 -748
- cicada/commands.py +1255 -0
- cicada/dead_code/__init__.py +1 -0
- cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
- cicada/dependency_analyzer.py +147 -0
- cicada/entry_utils.py +92 -0
- cicada/extractors/base.py +9 -9
- cicada/extractors/call.py +17 -20
- cicada/extractors/common.py +64 -0
- cicada/extractors/dependency.py +117 -235
- cicada/extractors/doc.py +2 -49
- cicada/extractors/function.py +10 -14
- cicada/extractors/keybert.py +228 -0
- cicada/extractors/keyword.py +191 -0
- cicada/extractors/module.py +6 -10
- cicada/extractors/spec.py +8 -56
- cicada/format/__init__.py +20 -0
- cicada/{ascii_art.py → format/ascii_art.py} +1 -1
- cicada/format/formatter.py +1145 -0
- cicada/git_helper.py +134 -7
- cicada/indexer.py +322 -89
- cicada/interactive_setup.py +251 -323
- cicada/interactive_setup_helpers.py +302 -0
- cicada/keyword_expander.py +437 -0
- cicada/keyword_search.py +208 -422
- cicada/keyword_test.py +383 -16
- cicada/mcp/__init__.py +10 -0
- cicada/mcp/entry.py +17 -0
- cicada/mcp/filter_utils.py +107 -0
- cicada/mcp/pattern_utils.py +118 -0
- cicada/{mcp_server.py → mcp/server.py} +819 -73
- cicada/mcp/tools.py +473 -0
- cicada/pr_finder.py +2 -3
- cicada/pr_indexer/indexer.py +3 -2
- cicada/setup.py +167 -35
- cicada/tier.py +225 -0
- cicada/utils/__init__.py +9 -2
- cicada/utils/fuzzy_match.py +54 -0
- cicada/utils/index_utils.py +9 -0
- cicada/utils/path_utils.py +18 -0
- cicada/utils/text_utils.py +52 -1
- cicada/utils/tree_utils.py +47 -0
- cicada/version_check.py +99 -0
- cicada/watch_manager.py +320 -0
- cicada/watcher.py +431 -0
- cicada_mcp-0.3.0.dist-info/METADATA +541 -0
- cicada_mcp-0.3.0.dist-info/RECORD +70 -0
- cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
- cicada/formatter.py +0 -864
- cicada/keybert_extractor.py +0 -286
- cicada/lightweight_keyword_extractor.py +0 -290
- cicada/mcp_entry.py +0 -683
- cicada/mcp_tools.py +0 -291
- cicada_mcp-0.2.0.dist-info/METADATA +0 -735
- cicada_mcp-0.2.0.dist-info/RECORD +0 -53
- cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
- /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
- /cicada/{colors.py → format/colors.py} +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
cicada/keyword_test.py
CHANGED
@@ -7,7 +7,333 @@ Provides an interactive REPL for testing keyword extraction methods.
 import sys
 
 
-def
+def run_expansion_interactive(
+    expansion_type: str = "lemmi",
+    extraction_method: str = "regular",
+    extraction_tier: str = "regular",
+    extraction_threshold: float | None = 0.3,
+    expansion_threshold: float = 0.2,
+    min_score: float = 0.5,
+):
+    """
+    Interactive keyword expansion testing mode.
+
+    Shows the full pipeline: Text → Extracted Keywords → Expanded Keywords
+
+    Args:
+        expansion_type: Expansion strategy ('lemmi', 'glove', or 'fasttext')
+        extraction_method: Extraction method ('regular' or 'bert')
+        extraction_tier: Model tier for extraction ('fast', 'regular', or 'max')
+        extraction_threshold: Minimum score for extraction (default: 0.3)
+        expansion_threshold: Minimum similarity score for expansion (default: 0.2)
+        min_score: Minimum score threshold for keywords (default: 0.5)
+    """
+    print(f"\n{'='*70}")
+    print("🔄 Cicada Interactive Keyword Pipeline Test")
+    print(f"{'='*70}")
+
+    # Map extraction method to display name
+    extraction_display = "REGULAR (token-based)" if extraction_method == "regular" else "BERT"
+    print(f"Extraction: {extraction_display} ({extraction_tier})")
+    print(f"Expansion: {expansion_type.upper()}")
+    if extraction_threshold is not None:
+        print(f"Extraction threshold: {extraction_threshold}")
+    if min_score > 0.0:
+        print(f"Min score: {min_score}")
+    print(f"Expansion threshold: {expansion_threshold}")
+
+    # Show strategy description
+    expansion_descriptions = {
+        "lemmi": "Inflected forms only (run → running, runs, ran)",
+        "glove": "GloVe embeddings + inflected forms (128MB download first time)",
+        "fasttext": "FastText embeddings + inflected forms (958MB download first time)",
+    }
+    print(f"Strategy: {expansion_descriptions.get(expansion_type, 'Unknown')}")
+
+    print("\nEnter text, then press Ctrl-D (Unix) or Ctrl-Z+Enter (Windows)")
+    print("Press Ctrl-C to exit.\n")
+    print(f"{'='*70}\n")
+
+    # Initialize keyword extractor
+    try:
+        if extraction_method == "regular":
+            from cicada.extractors.keyword import RegularKeywordExtractor
+
+            extractor = RegularKeywordExtractor(verbose=True)
+        elif extraction_method == "bert":
+            from cicada.extractors.keybert import KeyBERTExtractor
+
+            extractor = KeyBERTExtractor(verbose=True)
+        else:
+            raise ValueError(f"Unknown extraction method: {extraction_method}")
+        print()  # Add newline after initialization
+    except Exception as e:
+        print(f"Error initializing keyword extractor: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Initialize keyword expander
+    try:
+        from cicada.keyword_expander import KeywordExpander
+
+        expander = KeywordExpander(expansion_type=expansion_type, verbose=True)
+
+        # Force-load embedding model if using glove/fasttext
+        if expansion_type in ["glove", "fasttext"]:
+            print(f"\nPreloading {expansion_type} model...")
+            # Trigger model loading with a dummy keyword
+            _ = expander.expand_keywords(["test"], top_n=1, threshold=0.9)
+            print(f"✓ {expansion_type.title()} model ready\n")
+        else:
+            print()  # Add newline after initialization
+    except Exception as e:
+        print(f"Error initializing keyword expander: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Interactive loop
+    stdin_closed = False
+    try:
+        while True:
+            print("📝 Enter text (Ctrl-D or Ctrl-Z+Enter when done):")
+            print("-" * 70)
+
+            # Read multi-line input until EOF
+            lines = []
+            try:
+                while True:
+                    line = input()
+                    lines.append(line)
+            except EOFError:
+                if not lines and stdin_closed:
+                    print("\n👋 No more input available. Exiting.")
+                    return
+                stdin_closed = True
+
+            text = "\n".join(lines)
+
+            if not text.strip():
+                print("\n⚠️ Empty input. Please enter some text.\n")
+                continue
+
+            # Full pipeline display
+            print("\n" + "=" * 70)
+            print("🔄 KEYWORD PIPELINE:")
+            print("=" * 70)
+
+            try:
+                # Step 1: Show input text
+                print("\n1️⃣ INPUT TEXT:")
+                print("-" * 70)
+                preview = text[:200] + "..." if len(text) > 200 else text
+                print(f"{preview}\n")
+
+                # Step 2: Extract keywords
+                print("2️⃣ EXTRACTED KEYWORDS:")
+                print("-" * 70)
+                results = extractor.extract_keywords(text, top_n=15, min_score=min_score)
+                top_keywords = results.get("top_keywords", [])
+
+                # Apply extraction threshold if specified
+                if extraction_threshold is not None and top_keywords:
+                    filtered_keywords = [
+                        item
+                        for item in top_keywords
+                        if isinstance(item, (list, tuple))
+                        and len(item) >= 2
+                        and item[1] >= extraction_threshold
+                    ]
+                    if len(filtered_keywords) < len(top_keywords):
+                        removed_count = len(top_keywords) - len(filtered_keywords)
+                        print(
+                            f"⚠️ Filtered out {removed_count} keywords below threshold {extraction_threshold}\n"
+                        )
+                    top_keywords = filtered_keywords
+
+                extracted_keywords = []
+                extraction_scores = {}  # Map keywords to their extraction scores
+                code_identifiers_lower = [
+                    ident.lower() for ident in results.get("code_identifiers", [])
+                ]
+                code_split_words_lower = [
+                    word.lower() for word in results.get("code_split_words", [])
+                ]
+
+                if top_keywords and isinstance(top_keywords, list):
+                    for i, item in enumerate(top_keywords, 1):
+                        if isinstance(item, (list, tuple)) and len(item) >= 2:
+                            keyword, score = item[0], item[1]
+                            extracted_keywords.append(keyword)
+                            extraction_scores[keyword.lower()] = score  # Store extraction score
+
+                            # Determine if this keyword was boosted
+                            boost_label = ""
+                            if keyword.lower() in code_identifiers_lower:
+                                boost_label = " [10x boost]"
+                            elif keyword.lower() in code_split_words_lower:
+                                boost_label = " [3x boost]"
+
+                            print(f" {i:2}. {keyword:20s} (score: {score:.4f}){boost_label}")
+
+                if not extracted_keywords:
+                    print(" No keywords extracted.")
+                    print("\n" + "=" * 70 + "\n")
+                    continue
+
+                print(f"\nTotal extracted: {len(extracted_keywords)} keywords")
+                if code_identifiers_lower or code_split_words_lower:
+                    print(f" • Code identifiers (10x): {len(code_identifiers_lower)}")
+                    print(f" • Code split words (3x): {len(code_split_words_lower)}")
+                print()
+
+                # Step 3: Expand keywords
+                print("3️⃣ EXPANDED KEYWORDS:")
+                print("-" * 70)
+                print("Note: Expansion scores = extraction score × similarity score")
+                print("Note: Code identifiers are NOT inflected or expanded (kept exact)\n")
+                result = expander.expand_keywords(
+                    extracted_keywords,
+                    top_n=3,
+                    threshold=expansion_threshold,
+                    return_scores=True,
+                    keyword_scores=extraction_scores,
+                    min_score=min_score,
+                    code_identifiers=results.get("code_identifiers", []),
+                )
+
+                # Extract detailed and simple lists
+                if isinstance(result, dict):
+                    expanded_with_scores = result["words"]
+                    expanded = result["simple"]
+                else:
+                    # Fallback if return_scores wasn't supported
+                    expanded = result
+                    expanded_with_scores = []
+
+                # Group by source type
+                by_source = {
+                    "original": [],
+                    "split": [],
+                    "inflection": [],
+                    "embedding": [],
+                    "embedding_inflection": [],
+                }
+
+                for item in expanded_with_scores:
+                    source = item.get("source", "unknown")
+                    by_source.setdefault(source, []).append(item)
+
+                # Display originals
+                if by_source["original"]:
+                    print(f"\nFrom extraction ({len(by_source['original'])}):")
+                    for item in by_source["original"][:20]:
+                        score = item.get("score", 1.0)
+                        print(f" ✓ {item['word']:25s} (score: {score:.3f})")
+
+                # Display splits
+                if by_source["split"]:
+                    print(f"\nFrom splitting ({len(by_source['split'])}):")
+                    for item in by_source["split"][:10]:
+                        parent = item.get("parent", "")
+                        score = item.get("score", 1.0)
+                        print(f" → {item['word']:25s} (split from '{parent}', score: {score:.3f})")
+
+                # Display inflections
+                if by_source["inflection"]:
+                    print(f"\nFrom inflection ({len(by_source['inflection'])}):")
+                    for item in by_source["inflection"][:15]:
+                        parent = item.get("parent", "")
+                        score = item.get("score", 1.0)
+                        print(
+                            f" ~ {item['word']:25s} (inflection of '{parent}', score: {score:.3f})"
+                        )
+                    if len(by_source["inflection"]) > 15:
+                        print(f" ... and {len(by_source['inflection']) - 15} more")
+
+                # Display embeddings (semantic expansion)
+                if by_source["embedding"]:
+                    print(
+                        f"\nFrom semantic expansion ({len(by_source['embedding'])}) [extraction × similarity]:"
+                    )
+                    for item in by_source["embedding"][:15]:
+                        score = item.get("score", 0)
+                        parent = item.get("parent", "")
+                        print(
+                            f" + {item['word']:25s} (similar to '{parent}', final score: {score:.3f})"
+                        )
+                    if len(by_source["embedding"]) > 15:
+                        print(f" ... and {len(by_source['embedding']) - 15} more")
+
+                # Display embedding inflections
+                if by_source["embedding_inflection"]:
+                    print(
+                        f"\nFrom semantic expansion inflections ({len(by_source['embedding_inflection'])}) [inherits final score]:"
+                    )
+                    for item in by_source["embedding_inflection"][:10]:
+                        score = item.get("score", 0)
+                        parent = item.get("parent", "")
+                        print(
+                            f" ≈ {item['word']:25s} (inflection of '{parent}', final score: {score:.3f})"
+                        )
+                    if len(by_source["embedding_inflection"]) > 10:
+                        print(f" ... and {len(by_source['embedding_inflection']) - 10} more")
+
+                # Show statistics
+                print("\n📊 STATISTICS:")
+                print("-" * 70)
+                print(f" • Extracted: {len(extracted_keywords)} keywords")
+                print(f" • Expanded: {len(expanded)} keywords")
+                expansion_ratio = (
+                    len(expanded) / len(extracted_keywords) if extracted_keywords else 0
+                )
+                print(f" • Ratio: {expansion_ratio:.1f}x expansion")
+                print("\n Breakdown by source:")
+                print(f" - Original: {len(by_source['original'])}")
+                print(f" - Split: {len(by_source['split'])}")
+                print(f" - Inflections: {len(by_source['inflection'])}")
+                print(f" - Semantic (embeddings): {len(by_source['embedding'])}")
+                print(f" - Semantic inflections: {len(by_source['embedding_inflection'])}")
+
+                # Show expansion info
+                info = expander.get_expansion_info()
+                if "embedding_vocab_size" in info:
+                    print("\n🧠 Model Info:")
+                    print(f" • Vocabulary size: {info['embedding_vocab_size']:,}")
+                    print(f" • Vector dimensions: {info['embedding_vector_size']}")
+
+                # Show complete sorted list of all keywords with scores
+                if expanded_with_scores:
+                    print("\n📋 ALL EXPANDED KEYWORDS (sorted by score):")
+                    print("-" * 70)
+                    # Sort by score descending
+                    sorted_keywords = sorted(
+                        expanded_with_scores, key=lambda x: x.get("score", 0), reverse=True
+                    )
+                    # Show top 50
+                    for i, item in enumerate(sorted_keywords[:50], 1):
+                        word = item["word"]
+                        score = item.get("score", 0)
+                        print(f" {i:3}. {word:25s} (score: {score:.4f})")
+                    if len(sorted_keywords) > 50:
+                        print(f"\n ... and {len(sorted_keywords) - 50} more keywords")
+                        print(
+                            f" Score range: {sorted_keywords[-1].get('score', 0):.4f} - {sorted_keywords[0].get('score', 0):.4f}"
+                        )
+
+            except Exception as e:
+                print(f"\n❌ Error in pipeline: {e}", file=sys.stderr)
+                import traceback
+
+                traceback.print_exc()
+
+            print("\n" + "=" * 70 + "\n")
+
+    except KeyboardInterrupt:
+        print("\n\n👋 Exiting interactive mode. Goodbye!")
+        sys.exit(0)
+
+
+def run_keywords_interactive(
+    method: str = "regular", tier: str = "regular", extraction_threshold: float | None = None
+):
     """
     Interactive keyword extraction testing mode.
 
@@ -15,28 +341,36 @@ def run_keywords_interactive(method: str = "lemminflect", tier: str = "regular")
     using the specified extraction method.
 
     Args:
-        method: Extraction method ('
+        method: Extraction method ('regular' or 'bert')
         tier: Model tier ('fast', 'regular', or 'max')
+        extraction_threshold: Minimum score for extraction (None = no filtering)
     """
     print(f"\n{'='*70}")
     print("🔍 Cicada Interactive Keyword Extraction Test")
     print(f"{'='*70}")
-
+
+    # Map extraction method to display name
+    method_display = "REGULAR (token-based)" if method == "regular" else "BERT"
+    print(f"Method: {method_display}")
     print(f"Tier: {tier}")
+    if extraction_threshold is not None:
+        print(f"Extraction threshold: {extraction_threshold}")
     print("\nPaste or type text, then press Ctrl-D (Unix) or Ctrl-Z+Enter (Windows)")
     print("to extract keywords. Press Ctrl-C to exit.\n")
     print(f"{'='*70}\n")
 
     # Initialize keyword extractor
    try:
-        if method == "
-            from cicada.
+        if method == "regular":
+            from cicada.extractors.keyword import RegularKeywordExtractor
 
-            extractor =
-
-            from cicada.
+            extractor = RegularKeywordExtractor(verbose=True)
+        elif method == "bert":
+            from cicada.extractors.keybert import KeyBERTExtractor
 
-            extractor =
+            extractor = KeyBERTExtractor(verbose=True)
+        else:
+            raise ValueError(f"Unknown extraction method: {method}")
         print()  # Add newline after initialization
     except Exception as e:
         print(f"Error initializing keyword extractor: {e}", file=sys.stderr)
@@ -84,28 +418,61 @@ def run_keywords_interactive(method: str = "lemminflect", tier: str = "regular")
 
     # Display top keywords with scores
     top_keywords = results.get("top_keywords", [])
+
+    # Apply extraction threshold if specified
+    if extraction_threshold is not None and top_keywords:
+        filtered_keywords = [
+            item
+            for item in top_keywords
+            if isinstance(item, (list, tuple))
+            and len(item) >= 2
+            and item[1] >= extraction_threshold
+        ]
+        if len(filtered_keywords) < len(top_keywords):
+            removed_count = len(top_keywords) - len(filtered_keywords)
+            print(
+                f"\n⚠️ Filtered out {removed_count} keywords below threshold {extraction_threshold}"
+            )
+        top_keywords = filtered_keywords
+
+    # Get code identifiers and split words for boost detection
+    code_identifiers = results.get("code_identifiers", [])
+    code_split_words = results.get("code_split_words", [])
+    code_identifiers_lower = [ident.lower() for ident in code_identifiers]
+    code_split_words_lower = [word.lower() for word in code_split_words]
+
     if top_keywords and isinstance(top_keywords, list):
-        print("\n📊 Top Keywords (with scores):")
+        print("\n📊 Top Keywords (with weighted scores):")
         for i, item in enumerate(top_keywords, 1):
             if isinstance(item, (list, tuple)) and len(item) >= 2:
                 keyword, score = item[0], item[1]
-
+
+                # Determine if this keyword was boosted
+                boost_label = ""
+                if keyword.lower() in code_identifiers_lower:
+                    boost_label = " [10x boost]"
+                elif keyword.lower() in code_split_words_lower:
+                    boost_label = " [3x boost]"
+
+                print(f" {i:2}. {keyword:20s} (score: {score:.4f}){boost_label}")
     else:
         print(" No keywords extracted.")
 
     # Display code identifiers if any
-    code_identifiers = results.get("code_identifiers")
     if code_identifiers and isinstance(code_identifiers, list):
-        print("\n💻 Code Identifiers (10x weight):")
-        for ident in code_identifiers:
+        print(f"\n💻 Code Identifiers ({len(code_identifiers)} found, 10x weight):")
+        for ident in code_identifiers[:10]:
            print(f" • {ident}")
+        if len(code_identifiers) > 10:
+            print(f" ... and {len(code_identifiers) - 10} more")
 
     # Display code split words if any
-    code_split_words = results.get("code_split_words")
     if code_split_words and isinstance(code_split_words, list):
-        print("\n🔤 Code Split Words (3x weight):")
+        print(f"\n🔤 Code Split Words ({len(code_split_words)} found, 3x weight):")
         for word in code_split_words[:10]:  # Limit to 10
             print(f" • {word}")
+        if len(code_split_words) > 10:
+            print(f" ... and {len(code_split_words) - 10} more")
 
     # Display statistics
     stats = results.get("stats")
cicada/mcp/__init__.py
ADDED
@@ -0,0 +1,10 @@
+"""Cicada MCP (Model Context Protocol) Server package.
+
+This package contains the MCP server implementation for Cicada,
+providing Elixir code search and analysis capabilities via the MCP standard.
+"""
+
+# Note: Avoid importing server.py at package level to prevent circular dependencies
+# and optional dependency issues. Users should import directly:
+# from cicada.mcp.server import CicadaServer, async_main, main
+# from cicada.mcp.tools import get_tool_definitions
cicada/mcp/entry.py
ADDED
@@ -0,0 +1,17 @@
+from cicada.entry_utils import run_cli
+
+
+def main() -> None:
+    """Main entry point for cicada-mcp command."""
+    run_cli(
+        prog_name="cicada-mcp",
+        version_prog_name="cicada-mcp",
+        default_on_unknown="server",
+        default_on_none="server",
+        default_on_unknown_args=["--fast"],
+        default_on_none_args=["--fast"],
+    )
+
+
+if __name__ == "__main__":
+    main()
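Usage note: read together with the new cicada/entry_utils.py, the kwargs above appear to route a missing or unrecognized subcommand to `server` with `--fast`. A minimal sketch of that dispatch under those assumptions — run_cli's real logic lives in entry_utils and may differ, and the subcommand set here is hypothetical:

import sys

def dispatch(argv: list[str]) -> tuple[str, list[str]]:
    # Hypothetical subcommand set; the real one is defined by run_cli's callers.
    known = {"server", "index", "version"}
    if not argv:                      # default_on_none="server"
        return "server", ["--fast"]   # default_on_none_args
    if argv[0] not in known:          # default_on_unknown="server"
        return "server", ["--fast"]   # default_on_unknown_args
    return argv[0], argv[1:]

if __name__ == "__main__":
    print(dispatch(sys.argv[1:]))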
cicada/mcp/filter_utils.py
ADDED
@@ -0,0 +1,107 @@
+"""
+Shared filtering utilities for MCP tools.
+
+Provides reusable filtering functions for various MCP tool operations.
+"""
+
+from typing import Any
+
+
+def filter_by_score_threshold(
+    results: list[dict[str, Any]], min_score: float
+) -> list[dict[str, Any]]:
+    """
+    Filter search results by minimum score threshold.
+
+    Args:
+        results: List of search results with 'score' field
+        min_score: Minimum score threshold (0.0 to 1.0)
+
+    Returns:
+        Filtered list of results meeting the score threshold
+    """
+    if not results or min_score <= 0.0:
+        return results
+
+    return [r for r in results if r.get("score", 0.0) >= min_score]
+
+
+def is_test_file(file_path: str) -> bool:
+    """
+    Determine if a file path is a test file.
+
+    Checks for common test file patterns:
+    - Contains 'test' in the path
+    - Located in 'test' directory
+    - Filename starts with 'test_'
+    - Filename ends with '_test.ex' or '_test.exs'
+
+    Args:
+        file_path: Path to check
+
+    Returns:
+        True if the file is a test file
+    """
+    file_lower = file_path.lower()
+
+    # Common test file patterns
+    patterns = [
+        "/test/",
+        "\\test\\",  # Windows paths
+        "test_",
+        "_test.ex",
+        "_test.exs",
+    ]
+
+    return any(pattern in file_lower for pattern in patterns)
+
+
+def classify_usage_type(usage_sites: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
+    """
+    Classify usage sites into test and production categories.
+
+    Args:
+        usage_sites: List of usage sites with 'file' field
+
+    Returns:
+        Dictionary with 'test' and 'production' keys containing categorized sites
+    """
+    test_sites = []
+    production_sites = []
+
+    for site in usage_sites:
+        file_path = site.get("file", "")
+        if is_test_file(file_path):
+            test_sites.append(site)
+        else:
+            production_sites.append(site)
+
+    return {
+        "test": test_sites,
+        "production": production_sites,
+    }
+
+
+def filter_by_file_type(usage_sites: list[dict[str, Any]], usage_type: str) -> list[dict[str, Any]]:
+    """
+    Filter usage sites by file type (test vs production).
+
+    Args:
+        usage_sites: List of usage sites with 'file' field
+        usage_type: One of 'all', 'test_only', 'production_only'
+
+    Returns:
+        Filtered list of usage sites
+    """
+    if usage_type == "all":
+        return usage_sites
+
+    classified = classify_usage_type(usage_sites)
+
+    if usage_type == "test_only":
+        return classified["test"]
+    elif usage_type == "production_only":
+        return classified["production"]
+    else:
+        # Default to all if invalid type
+        return usage_sites