cicada_mcp-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cicada-mcp might be problematic.

Files changed (48)
  1. cicada/__init__.py +30 -0
  2. cicada/clean.py +297 -0
  3. cicada/command_logger.py +293 -0
  4. cicada/dead_code_analyzer.py +282 -0
  5. cicada/extractors/__init__.py +36 -0
  6. cicada/extractors/base.py +66 -0
  7. cicada/extractors/call.py +176 -0
  8. cicada/extractors/dependency.py +361 -0
  9. cicada/extractors/doc.py +179 -0
  10. cicada/extractors/function.py +246 -0
  11. cicada/extractors/module.py +123 -0
  12. cicada/extractors/spec.py +151 -0
  13. cicada/find_dead_code.py +270 -0
  14. cicada/formatter.py +918 -0
  15. cicada/git_helper.py +646 -0
  16. cicada/indexer.py +629 -0
  17. cicada/install.py +724 -0
  18. cicada/keyword_extractor.py +364 -0
  19. cicada/keyword_search.py +553 -0
  20. cicada/lightweight_keyword_extractor.py +298 -0
  21. cicada/mcp_server.py +1559 -0
  22. cicada/mcp_tools.py +291 -0
  23. cicada/parser.py +124 -0
  24. cicada/pr_finder.py +435 -0
  25. cicada/pr_indexer/__init__.py +20 -0
  26. cicada/pr_indexer/cli.py +62 -0
  27. cicada/pr_indexer/github_api_client.py +431 -0
  28. cicada/pr_indexer/indexer.py +297 -0
  29. cicada/pr_indexer/line_mapper.py +209 -0
  30. cicada/pr_indexer/pr_index_builder.py +253 -0
  31. cicada/setup.py +339 -0
  32. cicada/utils/__init__.py +52 -0
  33. cicada/utils/call_site_formatter.py +95 -0
  34. cicada/utils/function_grouper.py +57 -0
  35. cicada/utils/hash_utils.py +173 -0
  36. cicada/utils/index_utils.py +290 -0
  37. cicada/utils/path_utils.py +240 -0
  38. cicada/utils/signature_builder.py +106 -0
  39. cicada/utils/storage.py +111 -0
  40. cicada/utils/subprocess_runner.py +182 -0
  41. cicada/utils/text_utils.py +90 -0
  42. cicada/version_check.py +116 -0
  43. cicada_mcp-0.1.4.dist-info/METADATA +619 -0
  44. cicada_mcp-0.1.4.dist-info/RECORD +48 -0
  45. cicada_mcp-0.1.4.dist-info/WHEEL +5 -0
  46. cicada_mcp-0.1.4.dist-info/entry_points.txt +8 -0
  47. cicada_mcp-0.1.4.dist-info/licenses/LICENSE +21 -0
  48. cicada_mcp-0.1.4.dist-info/top_level.txt +1 -0
cicada/lightweight_keyword_extractor.py
@@ -0,0 +1,298 @@
+"""
+Lightweight Keyword Extraction using lemminflect
+Fast keyword extraction for programming documentation
+"""
+
+from collections import Counter
+import re
+import sys
+import warnings
+
+from cicada.utils import split_camel_snake_case
+
+
+class LightweightKeywordExtractor:
+    """Extract keywords from text using lightweight lemmatization."""
+
+    STOPWORDS = {
+        "the",
+        "a",
+        "an",
+        "and",
+        "or",
+        "but",
+        "in",
+        "on",
+        "at",
+        "to",
+        "for",
+        "of",
+        "with",
+        "by",
+        "from",
+        "as",
+        "is",
+        "are",
+        "was",
+        "were",
+        "be",
+        "been",
+        "being",
+        "have",
+        "has",
+        "had",
+        "do",
+        "does",
+        "did",
+        "will",
+        "would",
+        "should",
+        "could",
+        "this",
+        "that",
+        "these",
+        "those",
+        "it",
+        "its",
+        "they",
+        "them",
+        "their",
+        "what",
+        "which",
+        "who",
+        "when",
+        "where",
+        "why",
+        "how",
+        "all",
+        "each",
+        "every",
+        "both",
+        "few",
+        "more",
+        "most",
+        "other",
+        "some",
+        "such",
+        "no",
+        "nor",
+        "not",
+        "only",
+        "own",
+        "same",
+        "so",
+        "than",
+        "too",
+        "very",
+        "can",
+        "just",
+        "up",
+        "out",
+    }
+
+    # Pre-compiled regex patterns for code identifier extraction
+    CODE_PATTERNS = [
+        re.compile(r"\b[a-z]+[A-Z][a-zA-Z]*\b"),  # camelCase
+        re.compile(r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b"),  # HTTPServer
+        re.compile(r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b"),  # PascalCase
+        re.compile(r"\b[a-z]+_[a-z_]+\b"),  # snake_case
+        re.compile(r"\b[A-Z]{2,}\b"),  # UPPERCASE
+    ]
+
+    # Pre-compiled tokenization pattern
+    TOKEN_PATTERN = re.compile(r"\b[a-zA-Z][a-zA-Z0-9_]*\b")
+
+    def __init__(self, verbose: bool = False, model_size: str = "small"):
+        """
+        Initialize lightweight keyword extractor.
+
+        Args:
+            verbose: If True, print status messages during initialization
+            model_size: Deprecated parameter kept for backward compatibility.
+                This parameter is ignored in the lightweight extractor.
+        """
+        self.verbose = verbose
+        self.model_size = model_size
+        self._lemminflect_loaded = False
+
+        # Deprecation warning for model_size parameter
+        if model_size != "small":
+            warnings.warn(
+                "The 'model_size' parameter is deprecated and ignored in LightweightKeywordExtractor. "
+                "The lightweight extractor does not use size-based models.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
+    def _load_lemminflect(self):
+        """Lazy load lemminflect library."""
+        if self._lemminflect_loaded:
+            return
+        try:
+            import lemminflect
+
+            self._lemminflect = lemminflect
+            self._lemminflect_loaded = True
+            if self.verbose:
+                print("✓ lemminflect loaded", file=sys.stderr)
+        except ImportError as e:
+            raise RuntimeError(
+                "lemminflect is required but not installed. "
+                "Please install it with: uv pip install lemminflect"
+            ) from e
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Tokenize text into words."""
+        tokens = self.TOKEN_PATTERN.findall(text)
+        return tokens
+
+    def _lemmatize(self, word: str) -> str:
+        """
+        Lemmatize a word using lemminflect with fallback.
+
+        Tries lemmatization with VERB, NOUN, and ADJ POS tags.
+        Falls back to lowercase if lemmatization fails.
+
+        Args:
+            word: Word to lemmatize
+
+        Returns:
+            Lemmatized word (lowercase)
+        """
+        try:
+            # Try different POS tags for better coverage
+            for pos in ["VERB", "NOUN", "ADJ"]:
+                lemma = self._lemminflect.getLemma(word, upos=pos)
+                if lemma:
+                    return lemma[0].lower()
+            # Fallback to lowercase if no lemma found
+            return word.lower()
+        except Exception:
+            # Graceful fallback if lemminflect fails
+            return word.lower()
+
+    def extract_code_identifiers(self, text):
+        """
+        Extract code-specific identifiers and their split words.
+
+        Returns a tuple of (identifiers, split_words) where:
+        - identifiers: original camelCase/PascalCase/snake_case identifiers
+        - split_words: individual words extracted from those identifiers
+        """
+        identifiers = []
+        for pattern in self.CODE_PATTERNS:
+            matches = pattern.findall(text)
+            identifiers.extend(matches)
+        identifiers = list(set(identifiers))
+
+        split_words = []
+        for identifier in identifiers:
+            split_text = split_camel_snake_case(identifier)
+            words = [
+                word.lower()
+                for word in split_text.split()
+                if len(word) > 1 and word.isalpha()
+            ]
+            split_words.extend(words)
+        return identifiers, list(set(split_words))
+
+    def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
+        """
+        Extract keywords and return a simple list of keyword strings.
+
+        Args:
+            text: Input text to analyze
+            top_n: Number of top keywords to return
+
+        Returns:
+            List of keyword strings (e.g., ['authentication', 'user', 'validate'])
+        """
+        if not text or not text.strip():
+            return []
+        try:
+            results = self.extract_keywords(text, top_n=top_n)
+            return [keyword for keyword, _ in results["top_keywords"]]
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
+            return []
+
+    def extract_keywords(self, text, top_n=15):
+        """
+        Extract keywords using multiple strategies with emphasis on code identifiers.
+
+        Weighting strategy:
+        - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
+        - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
+        - Regular lemmatized words: 1x weight
+
+        Args:
+            text: Input text to analyze
+            top_n: Number of top keywords to return
+
+        Returns:
+            Dictionary with extracted keywords and analysis:
+            - top_keywords: List of (keyword, count) tuples, sorted by frequency
+            - lemmatized_words: Regular words after lemmatization
+            - code_identifiers: Original identifiers (weighted 10x)
+            - code_split_words: Words extracted from identifiers (weighted 3x)
+            - tf_scores: Term frequency scores
+            - stats: Text statistics
+        """
+        if not text or not text.strip():
+            return {
+                "top_keywords": [],
+                "lemmatized_words": [],
+                "code_identifiers": [],
+                "code_split_words": [],
+                "tf_scores": {},
+                "stats": {
+                    "total_tokens": 0,
+                    "total_words": 0,
+                    "unique_words": 0,
+                },
+            }
+
+        self._load_lemminflect()
+        code_identifiers, code_split_words = self.extract_code_identifiers(text)
+        tokens = self._tokenize(text)
+        lemmatized_words = []
+        for word in tokens:
+            word_lower = word.lower()
+            if len(word) > 2 and word_lower not in self.STOPWORDS:
+                lemma = self._lemmatize(word)
+                lemmatized_words.append(lemma)
+
+        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
+        all_keywords = (
+            lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
+        )
+        keyword_freq = Counter(all_keywords)
+        top_keywords = keyword_freq.most_common(top_n)
+
+        # Fix: Calculate TF scores based on all keywords, not just lemmatized_words
+        # This ensures weighted keywords are included in the calculation
+        total_words = len(all_keywords)
+        if total_words > 0:
+            tf_scores = {
+                word: (freq / total_words) for word, freq in keyword_freq.items()
+            }
+        else:
+            tf_scores = {}
+
+        stats = {
+            "total_tokens": len(tokens),
+            "total_words": len(lemmatized_words),
+            "unique_words": len(set(lemmatized_words)),
+        }
+
+        return {
+            "top_keywords": top_keywords,
+            "lemmatized_words": list(set(lemmatized_words))[:20],
+            "code_identifiers": code_identifiers,
+            "code_split_words": code_split_words,
+            "tf_scores": dict(
+                sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
+            ),
+            "stats": stats,
+        }
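
Below is a minimal usage sketch of the LightweightKeywordExtractor class added in this diff. It is illustrative only: the class, its module path, and the extract_keywords_simple / extract_keywords methods appear in the diff above, but the sample text, variable names, and the expected output values are hypothetical and assume the wheel and its lemminflect dependency are installed.

from cicada.lightweight_keyword_extractor import LightweightKeywordExtractor

extractor = LightweightKeywordExtractor(verbose=True)

# Simple form: a plain list of keyword strings.
text = "getUserData validates the user session before fetching profile data"
print(extractor.extract_keywords_simple(text, top_n=5))
# Full identifiers such as getUserData are counted 10x and their split words
# (get, user, data) 3x, so a result like ['getuserdata', 'user', 'data', 'get',
# 'session'] would be expected; exact ranking depends on lemmatization.

# Detailed form: frequency counts, TF scores, and token statistics.
details = extractor.extract_keywords(text, top_n=10)
print(details["top_keywords"])  # e.g. [('getuserdata', 11), ('user', 4), ...]
print(details["stats"])         # {'total_tokens': ..., 'total_words': ..., 'unique_words': ...}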