cicada_mcp-0.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cicada-mcp might be problematic.
- cicada/__init__.py +30 -0
- cicada/clean.py +297 -0
- cicada/command_logger.py +293 -0
- cicada/dead_code_analyzer.py +282 -0
- cicada/extractors/__init__.py +36 -0
- cicada/extractors/base.py +66 -0
- cicada/extractors/call.py +176 -0
- cicada/extractors/dependency.py +361 -0
- cicada/extractors/doc.py +179 -0
- cicada/extractors/function.py +246 -0
- cicada/extractors/module.py +123 -0
- cicada/extractors/spec.py +151 -0
- cicada/find_dead_code.py +270 -0
- cicada/formatter.py +918 -0
- cicada/git_helper.py +646 -0
- cicada/indexer.py +629 -0
- cicada/install.py +724 -0
- cicada/keyword_extractor.py +364 -0
- cicada/keyword_search.py +553 -0
- cicada/lightweight_keyword_extractor.py +298 -0
- cicada/mcp_server.py +1559 -0
- cicada/mcp_tools.py +291 -0
- cicada/parser.py +124 -0
- cicada/pr_finder.py +435 -0
- cicada/pr_indexer/__init__.py +20 -0
- cicada/pr_indexer/cli.py +62 -0
- cicada/pr_indexer/github_api_client.py +431 -0
- cicada/pr_indexer/indexer.py +297 -0
- cicada/pr_indexer/line_mapper.py +209 -0
- cicada/pr_indexer/pr_index_builder.py +253 -0
- cicada/setup.py +339 -0
- cicada/utils/__init__.py +52 -0
- cicada/utils/call_site_formatter.py +95 -0
- cicada/utils/function_grouper.py +57 -0
- cicada/utils/hash_utils.py +173 -0
- cicada/utils/index_utils.py +290 -0
- cicada/utils/path_utils.py +240 -0
- cicada/utils/signature_builder.py +106 -0
- cicada/utils/storage.py +111 -0
- cicada/utils/subprocess_runner.py +182 -0
- cicada/utils/text_utils.py +90 -0
- cicada/version_check.py +116 -0
- cicada_mcp-0.1.4.dist-info/METADATA +619 -0
- cicada_mcp-0.1.4.dist-info/RECORD +48 -0
- cicada_mcp-0.1.4.dist-info/WHEEL +5 -0
- cicada_mcp-0.1.4.dist-info/entry_points.txt +8 -0
- cicada_mcp-0.1.4.dist-info/licenses/LICENSE +21 -0
- cicada_mcp-0.1.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,364 @@
"""
Keyword Extraction using spaCy
Advanced NLP-based keyword extraction for programming documentation

DEPRECATED: This module is being replaced by lightweight_keyword_extractor.py
which provides faster performance using lemminflect instead of spaCy.

The spaCy-based extractor has been kept for backward compatibility and for
cases where advanced NLP features are needed. For most use cases, prefer
LightweightKeywordExtractor from cicada.lightweight_keyword_extractor.

Performance comparison:
- LightweightKeywordExtractor: ~0.1s startup time
- KeywordExtractor (spaCy): ~2s startup time

See: cicada.lightweight_keyword_extractor.LightweightKeywordExtractor
"""

from collections import Counter
import re
import sys
import subprocess

from cicada.utils import split_camel_snake_case

# Lazy import spacy only when needed
spacy = None


def _ensure_spacy_imported():
    """Import spacy only when needed."""
    global spacy
    if spacy is None:
        import spacy as spacy_module

        spacy = spacy_module


class KeywordExtractor:
    """Extract keywords from text using spaCy NLP."""

    # spaCy model names for different sizes
    SPACY_MODELS = {
        "small": "en_core_web_sm",
        "medium": "en_core_web_md",
        "large": "en_core_web_lg",
    }

    def __init__(self, verbose: bool = False, model_size: str = "small"):
        """
        Initialize keyword extractor with lazy model loading.

        Args:
            verbose: If True, print status messages during initialization
            model_size: Size of spaCy model to use ('small', 'medium', or 'large')
                Default is 'small'. Medium and large models provide better
                accuracy but are slower and require more memory.
        """
        self.verbose = verbose

        # Validate model size
        if model_size not in self.SPACY_MODELS:
            raise ValueError(
                f"Invalid model size '{model_size}'. "
                f"Must be one of: {', '.join(self.SPACY_MODELS.keys())}"
            )

        self.model_size = model_size
        self.model_name = self.SPACY_MODELS[model_size]
        self.nlp = None  # Lazy-loaded on first use

    def _ensure_model_loaded(self):
        """
        Ensure the spaCy model is loaded, downloading if necessary.
        Only called when model is actually needed (lazy loading).
        """
        if self.nlp is not None:
            return  # Already loaded

        # Ensure spacy is imported
        _ensure_spacy_imported()

        if self.verbose:
            print(f"Loading spaCy model ({self.model_size})...", file=sys.stderr)

        try:
            # Import the model directly as a Python package (fast failure if not installed)
            import importlib

            model_module = importlib.import_module(self.model_name)
            self.nlp = model_module.load()
            if self.verbose:
                print("✓ Model loaded successfully", file=sys.stderr)
        except (ImportError, AttributeError):
            # Model not installed, download it
            if self.verbose:
                print(
                    f"Model '{self.model_name}' not found. Downloading...",
                    file=sys.stderr,
                )

            if not self._download_model():
                raise RuntimeError(
                    f"Failed to download spaCy model '{self.model_name}'. "
                    f"Please install it manually with: python -m spacy download {self.model_name}"
                )

            # Try importing again after download
            try:
                import importlib

                model_module = importlib.import_module(self.model_name)
                self.nlp = model_module.load()
                if self.verbose:
                    print("✓ Model loaded successfully", file=sys.stderr)
            except (ImportError, AttributeError) as e:
                raise RuntimeError(
                    f"Failed to load spaCy model '{self.model_name}' after download. "
                    f"Please try installing it manually: python -m spacy download {self.model_name}"
                ) from e

    def _download_model(self) -> bool:
        """
        Download the spaCy model using uv pip install.

        Returns:
            True if download succeeded, False otherwise
        """
        # Model URLs for direct installation
        model_urls = {
            "en_core_web_sm": "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
            "en_core_web_md": "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl",
            "en_core_web_lg": "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl",
        }

        if self.model_name not in model_urls:
            if self.verbose:
                print(f"Unknown model: {self.model_name}", file=sys.stderr)
            return False

        model_url = model_urls[self.model_name]

        # Use uv pip install (works in uv-managed environments)
        try:
            if self.verbose:
                print(f"Running: uv pip install {model_url}", file=sys.stderr)

            result = subprocess.run(
                ["uv", "pip", "install", model_url],
                capture_output=True,
                text=True,
                check=True,
            )

            if self.verbose and result.stdout:
                print(result.stdout, file=sys.stderr)

            return True
        except FileNotFoundError:
            if self.verbose:
                print(
                    "uv not found. Please install uv or manually install the model:",
                    file=sys.stderr,
                )
                print(f"  uv pip install {model_url}", file=sys.stderr)
            return False
        except subprocess.CalledProcessError as e:
            if self.verbose:
                print(f"uv pip install failed: {e.stderr}", file=sys.stderr)
            return False
        except Exception as e:
            if self.verbose:
                print(f"Unexpected error during download: {e}", file=sys.stderr)
            return False

    def extract_code_identifiers(self, text):
        """
        Extract code-specific identifiers and their split words.

        Returns a tuple of (identifiers, split_words) where:
        - identifiers: original camelCase/PascalCase/snake_case identifiers
        - split_words: individual words extracted from those identifiers
        """
        # Match camelCase, snake_case, PascalCase, and mixed patterns
        patterns = [
            r"\b[a-z]+[A-Z][a-zA-Z]*\b",  # camelCase (e.g., getUserData)
            r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b",  # Uppercase prefix + PascalCase (e.g., HTTPServer, XMLParser)
            r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b",  # PascalCase (e.g., UserController, PostgreSQL)
            r"\b[a-z]+_[a-z_]+\b",  # snake_case (e.g., get_user_data)
            r"\b[A-Z]{2,}\b",  # All UPPERCASE (e.g., HTTP, API, SQL)
        ]

        identifiers = []
        for pattern in patterns:
            matches = re.findall(pattern, text)
            identifiers.extend(matches)

        identifiers = list(set(identifiers))

        # Split identifiers into individual words
        split_words = []
        for identifier in identifiers:
            split_text = split_camel_snake_case(identifier)
            # Extract individual words (lowercase, length > 1)
            words = [
                word.lower()
                for word in split_text.split()
                if len(word) > 1 and word.isalpha()
            ]
            split_words.extend(words)

        return identifiers, list(set(split_words))

    def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
        """
        Extract keywords and return a simple list of keyword strings.

        Args:
            text: Input text to analyze
            top_n: Number of top keywords to return

        Returns:
            List of keyword strings (e.g., ['authentication', 'user', 'validate'])
        """
        if not text or not text.strip():
            return []

        try:
            self._ensure_model_loaded()
            results = self.extract_keywords(text, top_n=top_n)
            # Extract just the keyword strings from top_keywords tuples
            return [keyword for keyword, _ in results["top_keywords"]]
        except Exception as e:
            if self.verbose:
                print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
            return []

    def extract_keywords(self, text, top_n=15):
        """
        Extract keywords using multiple strategies with emphasis on code identifiers.

        Weighting strategy:
        - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
        - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
        - Regular words (nouns, verbs): 1x weight

        Args:
            text: Input text to analyze
            top_n: Number of top keywords to return

        Returns:
            Dictionary with extracted keywords and analysis:
            - top_keywords: List of (keyword, count) tuples, sorted by frequency
            - code_identifiers: Original identifiers (weighted 10x)
            - code_split_words: Words extracted from identifiers (weighted 3x)
            - nouns, verbs, adjectives: Linguistic categories
            - entities: Named entities found
            - tf_scores: Term frequency scores
            - stats: Text statistics
        """
        if not text or not text.strip():
            return {
                "top_keywords": [],
                "nouns": [],
                "verbs": [],
                "adjectives": [],
                "proper_nouns": [],
                "noun_chunks": [],
                "entities": [],
                "code_identifiers": [],
                "tf_scores": {},
                "stats": {
                    "total_tokens": 0,
                    "total_words": 0,
                    "unique_words": 0,
                    "sentences": 0,
                },
            }
        # Ensure model is loaded (lazy loading on first use)
        self._ensure_model_loaded()

        # Process with spaCy
        doc = self.nlp(text)

        # 1. Extract nouns (concepts)
        nouns = [
            token.lemma_.lower()
            for token in doc
            if token.pos_ == "NOUN" and not token.is_stop and len(token.text) > 2
        ]

        # 2. Extract verbs (actions)
        verbs = [
            token.lemma_.lower()
            for token in doc
            if token.pos_ == "VERB" and not token.is_stop and len(token.text) > 2
        ]

        # 3. Extract adjectives (descriptors)
        adjectives = [
            token.lemma_.lower()
            for token in doc
            if token.pos_ == "ADJ" and not token.is_stop
        ]

        # 4. Extract proper nouns (named entities, technologies)
        proper_nouns = [token.text for token in doc if token.pos_ == "PROPN"]

        # 5. Extract noun chunks (multi-word concepts)
        noun_chunks = [
            chunk.text.lower()
            for chunk in doc.noun_chunks
            if len(chunk.text.split()) > 1
        ]

        # 6. Extract named entities
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # 7. Extract code identifiers and their split words
        code_identifiers, code_split_words = self.extract_code_identifiers(text)

        # 8. Calculate keyword frequency (combining nouns, verbs, proper nouns, identifiers, and split code words)
        # Give full code identifiers 10x weight for exact matching
        # Give code split words 3x weight for fuzzy matching
        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
        all_keywords = (
            nouns
            + verbs
            + proper_nouns
            + (code_identifiers_lower * 10)
            + (code_split_words * 3)
        )
        keyword_freq = Counter(all_keywords)
        top_keywords = keyword_freq.most_common(top_n)

        # 9. Calculate TF scores (simple version)
        total_words = len(
            [token for token in doc if not token.is_stop and not token.is_punct]
        )
        tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}

        # Statistics
        stats = {
            "total_tokens": len(doc),
            "total_words": total_words,
            "unique_words": len(set([t.text.lower() for t in doc if not t.is_punct])),
            "sentences": len(list(doc.sents)),
        }

        return {
            "top_keywords": top_keywords,
            "nouns": list(set(nouns))[:20],
            "verbs": list(set(verbs))[:20],
            "adjectives": list(set(adjectives))[:15],
            "proper_nouns": list(set(proper_nouns)),
            "noun_chunks": list(set(noun_chunks))[:15],
            "entities": entities,
            "code_identifiers": code_identifiers,
            "code_split_words": code_split_words,
            "tf_scores": dict(
                sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
            ),
            "stats": stats,
        }
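
For orientation, a minimal usage sketch of the class above (illustrative only, not part of the wheel; the sample text is invented, and it assumes the en_core_web_sm model is already installed so no download is triggered):

from cicada.keyword_extractor import KeywordExtractor

# Default small model; nothing heavy happens until the first extraction call.
extractor = KeywordExtractor(verbose=True, model_size="small")

text = "UserController calls get_user_data to validate requests against the HTTP API."

# Simple form: just the top keyword strings.
print(extractor.extract_keywords_simple(text, top_n=5))
# Full identifiers are counted 10x and their split words 3x, so keywords like
# 'usercontroller', 'get_user_data', 'http', and 'api' should dominate the list.

# Full form: frequency-ranked keywords plus the linguistic breakdown and stats.
results = extractor.extract_keywords(text, top_n=10)
print(results["top_keywords"])      # list of (keyword, count) tuples
print(results["code_identifiers"])  # original-case identifiers found by the regexes
print(results["stats"])             # token/word/sentence counts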