cicada_mcp-0.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of cicada-mcp has been flagged as potentially problematic; see the package page for details.
- cicada/__init__.py +30 -0
- cicada/clean.py +297 -0
- cicada/command_logger.py +293 -0
- cicada/dead_code_analyzer.py +282 -0
- cicada/extractors/__init__.py +36 -0
- cicada/extractors/base.py +66 -0
- cicada/extractors/call.py +176 -0
- cicada/extractors/dependency.py +361 -0
- cicada/extractors/doc.py +179 -0
- cicada/extractors/function.py +246 -0
- cicada/extractors/module.py +123 -0
- cicada/extractors/spec.py +151 -0
- cicada/find_dead_code.py +270 -0
- cicada/formatter.py +918 -0
- cicada/git_helper.py +646 -0
- cicada/indexer.py +629 -0
- cicada/install.py +724 -0
- cicada/keyword_extractor.py +364 -0
- cicada/keyword_search.py +553 -0
- cicada/lightweight_keyword_extractor.py +298 -0
- cicada/mcp_server.py +1559 -0
- cicada/mcp_tools.py +291 -0
- cicada/parser.py +124 -0
- cicada/pr_finder.py +435 -0
- cicada/pr_indexer/__init__.py +20 -0
- cicada/pr_indexer/cli.py +62 -0
- cicada/pr_indexer/github_api_client.py +431 -0
- cicada/pr_indexer/indexer.py +297 -0
- cicada/pr_indexer/line_mapper.py +209 -0
- cicada/pr_indexer/pr_index_builder.py +253 -0
- cicada/setup.py +339 -0
- cicada/utils/__init__.py +52 -0
- cicada/utils/call_site_formatter.py +95 -0
- cicada/utils/function_grouper.py +57 -0
- cicada/utils/hash_utils.py +173 -0
- cicada/utils/index_utils.py +290 -0
- cicada/utils/path_utils.py +240 -0
- cicada/utils/signature_builder.py +106 -0
- cicada/utils/storage.py +111 -0
- cicada/utils/subprocess_runner.py +182 -0
- cicada/utils/text_utils.py +90 -0
- cicada/version_check.py +116 -0
- cicada_mcp-0.1.4.dist-info/METADATA +619 -0
- cicada_mcp-0.1.4.dist-info/RECORD +48 -0
- cicada_mcp-0.1.4.dist-info/WHEEL +5 -0
- cicada_mcp-0.1.4.dist-info/entry_points.txt +8 -0
- cicada_mcp-0.1.4.dist-info/licenses/LICENSE +21 -0
- cicada_mcp-0.1.4.dist-info/top_level.txt +1 -0
cicada/lightweight_keyword_extractor.py
@@ -0,0 +1,298 @@
"""
Lightweight Keyword Extraction using lemminflect
Fast keyword extraction for programming documentation
"""

from collections import Counter
import re
import sys
import warnings

from cicada.utils import split_camel_snake_case


class LightweightKeywordExtractor:
    """Extract keywords from text using lightweight lemmatization."""

    STOPWORDS = {
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
        "for", "of", "with", "by", "from", "as", "is", "are", "was",
        "were", "be", "been", "being", "have", "has", "had", "do",
        "does", "did", "will", "would", "should", "could", "this",
        "that", "these", "those", "it", "its", "they", "them", "their",
        "what", "which", "who", "when", "where", "why", "how", "all",
        "each", "every", "both", "few", "more", "most", "other", "some",
        "such", "no", "nor", "not", "only", "own", "same", "so", "than",
        "too", "very", "can", "just", "up", "out",
    }

    # Pre-compiled regex patterns for code identifier extraction
    CODE_PATTERNS = [
        re.compile(r"\b[a-z]+[A-Z][a-zA-Z]*\b"),  # camelCase
        re.compile(r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b"),  # HTTPServer
        re.compile(r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b"),  # PascalCase
        re.compile(r"\b[a-z]+_[a-z_]+\b"),  # snake_case
        re.compile(r"\b[A-Z]{2,}\b"),  # UPPERCASE
    ]

    # Pre-compiled tokenization pattern
    TOKEN_PATTERN = re.compile(r"\b[a-zA-Z][a-zA-Z0-9_]*\b")

    def __init__(self, verbose: bool = False, model_size: str = "small"):
        """
        Initialize lightweight keyword extractor.

        Args:
            verbose: If True, print status messages during initialization
            model_size: Deprecated parameter kept for backward compatibility.
                This parameter is ignored in the lightweight extractor.
        """
        self.verbose = verbose
        self.model_size = model_size
        self._lemminflect_loaded = False

        # Deprecation warning for model_size parameter
        if model_size != "small":
            warnings.warn(
                "The 'model_size' parameter is deprecated and ignored in LightweightKeywordExtractor. "
                "The lightweight extractor does not use size-based models.",
                DeprecationWarning,
                stacklevel=2,
            )

    def _load_lemminflect(self):
        """Lazy load lemminflect library."""
        if self._lemminflect_loaded:
            return
        try:
            import lemminflect

            self._lemminflect = lemminflect
            self._lemminflect_loaded = True
            if self.verbose:
                print("✓ lemminflect loaded", file=sys.stderr)
        except ImportError as e:
            raise RuntimeError(
                "lemminflect is required but not installed. "
                "Please install it with: uv pip install lemminflect"
            ) from e

    def _tokenize(self, text: str) -> list[str]:
        """Tokenize text into words."""
        tokens = self.TOKEN_PATTERN.findall(text)
        return tokens

    def _lemmatize(self, word: str) -> str:
        """
        Lemmatize a word using lemminflect with fallback.

        Tries lemmatization with VERB, NOUN, and ADJ POS tags.
        Falls back to lowercase if lemmatization fails.

        Args:
            word: Word to lemmatize

        Returns:
            Lemmatized word (lowercase)
        """
        try:
            # Try different POS tags for better coverage
            for pos in ["VERB", "NOUN", "ADJ"]:
                lemma = self._lemminflect.getLemma(word, upos=pos)
                if lemma:
                    return lemma[0].lower()
            # Fallback to lowercase if no lemma found
            return word.lower()
        except Exception:
            # Graceful fallback if lemminflect fails
            return word.lower()

    def extract_code_identifiers(self, text):
        """
        Extract code-specific identifiers and their split words.

        Returns a tuple of (identifiers, split_words) where:
        - identifiers: original camelCase/PascalCase/snake_case identifiers
        - split_words: individual words extracted from those identifiers
        """
        identifiers = []
        for pattern in self.CODE_PATTERNS:
            matches = pattern.findall(text)
            identifiers.extend(matches)
        identifiers = list(set(identifiers))

        split_words = []
        for identifier in identifiers:
            split_text = split_camel_snake_case(identifier)
            words = [
                word.lower()
                for word in split_text.split()
                if len(word) > 1 and word.isalpha()
            ]
            split_words.extend(words)
        return identifiers, list(set(split_words))

    def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
        """
        Extract keywords and return a simple list of keyword strings.

        Args:
            text: Input text to analyze
            top_n: Number of top keywords to return

        Returns:
            List of keyword strings (e.g., ['authentication', 'user', 'validate'])
        """
        if not text or not text.strip():
            return []
        try:
            results = self.extract_keywords(text, top_n=top_n)
            return [keyword for keyword, _ in results["top_keywords"]]
        except Exception as e:
            if self.verbose:
                print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
            return []

    def extract_keywords(self, text, top_n=15):
        """
        Extract keywords using multiple strategies with emphasis on code identifiers.

        Weighting strategy:
        - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
        - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
        - Regular lemmatized words: 1x weight

        Args:
            text: Input text to analyze
            top_n: Number of top keywords to return

        Returns:
            Dictionary with extracted keywords and analysis:
            - top_keywords: List of (keyword, count) tuples, sorted by frequency
            - lemmatized_words: Regular words after lemmatization
            - code_identifiers: Original identifiers (weighted 10x)
            - code_split_words: Words extracted from identifiers (weighted 3x)
            - tf_scores: Term frequency scores
            - stats: Text statistics
        """
        if not text or not text.strip():
            return {
                "top_keywords": [],
                "lemmatized_words": [],
                "code_identifiers": [],
                "code_split_words": [],
                "tf_scores": {},
                "stats": {
                    "total_tokens": 0,
                    "total_words": 0,
                    "unique_words": 0,
                },
            }

        self._load_lemminflect()
        code_identifiers, code_split_words = self.extract_code_identifiers(text)
        tokens = self._tokenize(text)
        lemmatized_words = []
        for word in tokens:
            word_lower = word.lower()
            if len(word) > 2 and word_lower not in self.STOPWORDS:
                lemma = self._lemmatize(word)
                lemmatized_words.append(lemma)

        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
        all_keywords = (
            lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
        )
        keyword_freq = Counter(all_keywords)
        top_keywords = keyword_freq.most_common(top_n)

        # Fix: Calculate TF scores based on all keywords, not just lemmatized_words
        # This ensures weighted keywords are included in the calculation
        total_words = len(all_keywords)
        if total_words > 0:
            tf_scores = {
                word: (freq / total_words) for word, freq in keyword_freq.items()
            }
        else:
            tf_scores = {}

        stats = {
            "total_tokens": len(tokens),
            "total_words": len(lemmatized_words),
            "unique_words": len(set(lemmatized_words)),
        }

        return {
            "top_keywords": top_keywords,
            "lemmatized_words": list(set(lemmatized_words))[:20],
            "code_identifiers": code_identifiers,
            "code_split_words": code_split_words,
            "tf_scores": dict(
                sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
            ),
            "stats": stats,
        }