cicada-mcp 0.1.4 (cicada_mcp-0.1.4-py3-none-any.whl)

This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.

Potentially problematic release: this version of cicada-mcp has been flagged as possibly problematic.

Files changed (48)
  1. cicada/__init__.py +30 -0
  2. cicada/clean.py +297 -0
  3. cicada/command_logger.py +293 -0
  4. cicada/dead_code_analyzer.py +282 -0
  5. cicada/extractors/__init__.py +36 -0
  6. cicada/extractors/base.py +66 -0
  7. cicada/extractors/call.py +176 -0
  8. cicada/extractors/dependency.py +361 -0
  9. cicada/extractors/doc.py +179 -0
  10. cicada/extractors/function.py +246 -0
  11. cicada/extractors/module.py +123 -0
  12. cicada/extractors/spec.py +151 -0
  13. cicada/find_dead_code.py +270 -0
  14. cicada/formatter.py +918 -0
  15. cicada/git_helper.py +646 -0
  16. cicada/indexer.py +629 -0
  17. cicada/install.py +724 -0
  18. cicada/keyword_extractor.py +364 -0
  19. cicada/keyword_search.py +553 -0
  20. cicada/lightweight_keyword_extractor.py +298 -0
  21. cicada/mcp_server.py +1559 -0
  22. cicada/mcp_tools.py +291 -0
  23. cicada/parser.py +124 -0
  24. cicada/pr_finder.py +435 -0
  25. cicada/pr_indexer/__init__.py +20 -0
  26. cicada/pr_indexer/cli.py +62 -0
  27. cicada/pr_indexer/github_api_client.py +431 -0
  28. cicada/pr_indexer/indexer.py +297 -0
  29. cicada/pr_indexer/line_mapper.py +209 -0
  30. cicada/pr_indexer/pr_index_builder.py +253 -0
  31. cicada/setup.py +339 -0
  32. cicada/utils/__init__.py +52 -0
  33. cicada/utils/call_site_formatter.py +95 -0
  34. cicada/utils/function_grouper.py +57 -0
  35. cicada/utils/hash_utils.py +173 -0
  36. cicada/utils/index_utils.py +290 -0
  37. cicada/utils/path_utils.py +240 -0
  38. cicada/utils/signature_builder.py +106 -0
  39. cicada/utils/storage.py +111 -0
  40. cicada/utils/subprocess_runner.py +182 -0
  41. cicada/utils/text_utils.py +90 -0
  42. cicada/version_check.py +116 -0
  43. cicada_mcp-0.1.4.dist-info/METADATA +619 -0
  44. cicada_mcp-0.1.4.dist-info/RECORD +48 -0
  45. cicada_mcp-0.1.4.dist-info/WHEEL +5 -0
  46. cicada_mcp-0.1.4.dist-info/entry_points.txt +8 -0
  47. cicada_mcp-0.1.4.dist-info/licenses/LICENSE +21 -0
  48. cicada_mcp-0.1.4.dist-info/top_level.txt +1 -0
cicada/keyword_extractor.py (new file)
@@ -0,0 +1,364 @@
+ """
+ Keyword Extraction using spaCy
+ Advanced NLP-based keyword extraction for programming documentation
+
+ DEPRECATED: This module is being replaced by lightweight_keyword_extractor.py
+ which provides faster performance using lemminflect instead of spaCy.
+
+ The spaCy-based extractor has been kept for backward compatibility and for
+ cases where advanced NLP features are needed. For most use cases, prefer
+ LightweightKeywordExtractor from cicada.lightweight_keyword_extractor.
+
+ Performance comparison:
+ - LightweightKeywordExtractor: ~0.1s startup time
+ - KeywordExtractor (spaCy): ~2s startup time
+
+ See: cicada.lightweight_keyword_extractor.LightweightKeywordExtractor
+ """
+
+ from collections import Counter
+ import re
+ import sys
+ import subprocess
+
+ from cicada.utils import split_camel_snake_case
+
+ # Lazy import spacy only when needed
+ spacy = None
+
+
+ def _ensure_spacy_imported():
+     """Import spacy only when needed."""
+     global spacy
+     if spacy is None:
+         import spacy as spacy_module
+
+         spacy = spacy_module
+
+
+ class KeywordExtractor:
+     """Extract keywords from text using spaCy NLP."""
+
+     # spaCy model names for different sizes
+     SPACY_MODELS = {
+         "small": "en_core_web_sm",
+         "medium": "en_core_web_md",
+         "large": "en_core_web_lg",
+     }
+
+     def __init__(self, verbose: bool = False, model_size: str = "small"):
+         """
+         Initialize keyword extractor with lazy model loading.
+
+         Args:
+             verbose: If True, print status messages during initialization
+             model_size: Size of spaCy model to use ('small', 'medium', or 'large')
+                 Default is 'small'. Medium and large models provide better
+                 accuracy but are slower and require more memory.
+         """
+         self.verbose = verbose
+
+         # Validate model size
+         if model_size not in self.SPACY_MODELS:
+             raise ValueError(
+                 f"Invalid model size '{model_size}'. "
+                 f"Must be one of: {', '.join(self.SPACY_MODELS.keys())}"
+             )
+
+         self.model_size = model_size
+         self.model_name = self.SPACY_MODELS[model_size]
+         self.nlp = None  # Lazy-loaded on first use
+
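
Construction is cheap because no model loads up front; a minimal usage sketch based only on the signature and SPACY_MODELS above:

    # Illustrative sketch; not part of the packaged file.
    extractor = KeywordExtractor()                                # 'small' model, quiet
    chatty = KeywordExtractor(verbose=True, model_size="medium")  # status messages on stderr
    KeywordExtractor(model_size="huge")                           # raises ValueError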
+     def _ensure_model_loaded(self):
+         """
+         Ensure the spaCy model is loaded, downloading if necessary.
+         Only called when model is actually needed (lazy loading).
+         """
+         if self.nlp is not None:
+             return  # Already loaded
+
+         # Ensure spacy is imported
+         _ensure_spacy_imported()
+
+         if self.verbose:
+             print(f"Loading spaCy model ({self.model_size})...", file=sys.stderr)
+
+         try:
+             # Import the model directly as a Python package (fast failure if not installed)
+             import importlib
+
+             model_module = importlib.import_module(self.model_name)
+             self.nlp = model_module.load()
+             if self.verbose:
+                 print("✓ Model loaded successfully", file=sys.stderr)
+         except (ImportError, AttributeError):
+             # Model not installed, download it
+             if self.verbose:
+                 print(
+                     f"Model '{self.model_name}' not found. Downloading...",
+                     file=sys.stderr,
+                 )
+
+             if not self._download_model():
+                 raise RuntimeError(
+                     f"Failed to download spaCy model '{self.model_name}'. "
+                     f"Please install it manually with: python -m spacy download {self.model_name}"
+                 )
+
+             # Try importing again after download
+             try:
+                 import importlib
+
+                 model_module = importlib.import_module(self.model_name)
+                 self.nlp = model_module.load()
+                 if self.verbose:
+                     print("✓ Model loaded successfully", file=sys.stderr)
+             except (ImportError, AttributeError) as e:
+                 raise RuntimeError(
+                     f"Failed to load spaCy model '{self.model_name}' after download. "
+                     f"Please try installing it manually: python -m spacy download {self.model_name}"
+                 ) from e
+
+     def _download_model(self) -> bool:
+         """
+         Download the spaCy model using uv pip install.
+
+         Returns:
+             True if download succeeded, False otherwise
+         """
+         # Model URLs for direct installation
+         model_urls = {
+             "en_core_web_sm": "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
+             "en_core_web_md": "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl",
+             "en_core_web_lg": "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl",
+         }
+
+         if self.model_name not in model_urls:
+             if self.verbose:
+                 print(f"Unknown model: {self.model_name}", file=sys.stderr)
+             return False
+
+         model_url = model_urls[self.model_name]
+
+         # Use uv pip install (works in uv-managed environments)
+         try:
+             if self.verbose:
+                 print(f"Running: uv pip install {model_url}", file=sys.stderr)
+
+             result = subprocess.run(
+                 ["uv", "pip", "install", model_url],
+                 capture_output=True,
+                 text=True,
+                 check=True,
+             )
+
+             if self.verbose and result.stdout:
+                 print(result.stdout, file=sys.stderr)
+
+             return True
+         except FileNotFoundError:
+             if self.verbose:
+                 print(
+                     "uv not found. Please install uv or manually install the model:",
+                     file=sys.stderr,
+                 )
+                 print(f" uv pip install {model_url}", file=sys.stderr)
+             return False
+         except subprocess.CalledProcessError as e:
+             if self.verbose:
+                 print(f"uv pip install failed: {e.stderr}", file=sys.stderr)
+             return False
+         except Exception as e:
+             if self.verbose:
+                 print(f"Unexpected error during download: {e}", file=sys.stderr)
+             return False
+
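
The import-then-download-then-reload flow above runs implicitly on first use; a sketch of forcing it, with the manual fallback quoted from this file's own error messages (small model shown):

    # Illustrative sketch; not part of the packaged file.
    extractor = KeywordExtractor(verbose=True)
    extractor._ensure_model_loaded()  # import -> uv pip install wheel -> re-import
    # If uv is unavailable, this raises RuntimeError suggesting:
    #   python -m spacy download en_core_web_sm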
+     def extract_code_identifiers(self, text):
+         """
+         Extract code-specific identifiers and their split words.
+
+         Returns a tuple of (identifiers, split_words) where:
+         - identifiers: original camelCase/PascalCase/snake_case identifiers
+         - split_words: individual words extracted from those identifiers
+         """
+         # Match camelCase, snake_case, PascalCase, and mixed patterns
+         patterns = [
+             r"\b[a-z]+[A-Z][a-zA-Z]*\b",  # camelCase (e.g., getUserData)
+             r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b",  # Uppercase prefix + PascalCase (e.g., HTTPServer, XMLParser)
+             r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b",  # PascalCase (e.g., UserController, PostgreSQL)
+             r"\b[a-z]+_[a-z_]+\b",  # snake_case (e.g., get_user_data)
+             r"\b[A-Z]{2,}\b",  # All UPPERCASE (e.g., HTTP, API, SQL)
+         ]
+
+         identifiers = []
+         for pattern in patterns:
+             matches = re.findall(pattern, text)
+             identifiers.extend(matches)
+
+         identifiers = list(set(identifiers))
+
+         # Split identifiers into individual words
+         split_words = []
+         for identifier in identifiers:
+             split_text = split_camel_snake_case(identifier)
+             # Extract individual words (lowercase, length > 1)
+             words = [
+                 word.lower()
+                 for word in split_text.split()
+                 if len(word) > 1 and word.isalpha()
+             ]
+             split_words.extend(words)
+
+         return identifiers, list(set(split_words))
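
A worked sketch of what the five patterns yield; hedged, since exact splitting depends on cicada.utils.split_camel_snake_case (not shown in this diff) and set ordering is unspecified:

    # Illustrative sketch; not part of the packaged file.
    extractor = KeywordExtractor()  # regex only; no model load needed here
    ids, words = extractor.extract_code_identifiers(
        "getUserData reads the user_token before any HTTP call"
    )
    # ids   ~ ['getUserData', 'user_token', 'HTTP']   (order unspecified)
    # words ~ ['get', 'user', 'data', 'token', 'http'], assuming
    # split_camel_snake_case splits on case changes and underscores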
+
+     def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
+         """
+         Extract keywords and return a simple list of keyword strings.
+
+         Args:
+             text: Input text to analyze
+             top_n: Number of top keywords to return
+
+         Returns:
+             List of keyword strings (e.g., ['authentication', 'user', 'validate'])
+         """
+         if not text or not text.strip():
+             return []
+
+         try:
+             self._ensure_model_loaded()
+             results = self.extract_keywords(text, top_n=top_n)
+             # Extract just the keyword strings from top_keywords tuples
+             return [keyword for keyword, _ in results["top_keywords"]]
+         except Exception as e:
+             if self.verbose:
+                 print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
+             return []
+
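
Typical call, per the docstring above (keywords shown are illustrative; real output depends on the model and text):

    # Illustrative sketch; not part of the packaged file.
    extractor = KeywordExtractor(verbose=True)
    keywords = extractor.extract_keywords_simple(
        "Validate user authentication tokens on every request.", top_n=5
    )
    # e.g. ['authentication', 'token', 'user', 'validate', 'request'];
    # returns [] if loading or extraction fails, since errors are swallowed.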
+     def extract_keywords(self, text, top_n=15):
+         """
+         Extract keywords using multiple strategies with emphasis on code identifiers.
+
+         Weighting strategy:
+         - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
+         - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
+         - Regular words (nouns, verbs): 1x weight
+
+         Args:
+             text: Input text to analyze
+             top_n: Number of top keywords to return
+
+         Returns:
+             Dictionary with extracted keywords and analysis:
+             - top_keywords: List of (keyword, count) tuples, sorted by frequency
+             - code_identifiers: Original identifiers (weighted 10x)
+             - code_split_words: Words extracted from identifiers (weighted 3x)
+             - nouns, verbs, adjectives: Linguistic categories
+             - entities: Named entities found
+             - tf_scores: Term frequency scores
+             - stats: Text statistics
+         """
+         if not text or not text.strip():
+             return {
+                 "top_keywords": [],
+                 "nouns": [],
+                 "verbs": [],
+                 "adjectives": [],
+                 "proper_nouns": [],
+                 "noun_chunks": [],
+                 "entities": [],
+                 "code_identifiers": [],
+                 "tf_scores": {},
+                 "stats": {
+                     "total_tokens": 0,
+                     "total_words": 0,
+                     "unique_words": 0,
+                     "sentences": 0,
+                 },
+             }
+         # Ensure model is loaded (lazy loading on first use)
+         self._ensure_model_loaded()
+
+         # Process with spaCy
+         doc = self.nlp(text)
+
+         # 1. Extract nouns (concepts)
+         nouns = [
+             token.lemma_.lower()
+             for token in doc
+             if token.pos_ == "NOUN" and not token.is_stop and len(token.text) > 2
+         ]
+
+         # 2. Extract verbs (actions)
+         verbs = [
+             token.lemma_.lower()
+             for token in doc
+             if token.pos_ == "VERB" and not token.is_stop and len(token.text) > 2
+         ]
+
+         # 3. Extract adjectives (descriptors)
+         adjectives = [
+             token.lemma_.lower()
+             for token in doc
+             if token.pos_ == "ADJ" and not token.is_stop
+         ]
+
+         # 4. Extract proper nouns (named entities, technologies)
+         proper_nouns = [token.text for token in doc if token.pos_ == "PROPN"]
+
+         # 5. Extract noun chunks (multi-word concepts)
+         noun_chunks = [
+             chunk.text.lower()
+             for chunk in doc.noun_chunks
+             if len(chunk.text.split()) > 1
+         ]
+
+         # 6. Extract named entities
+         entities = [(ent.text, ent.label_) for ent in doc.ents]
+
+         # 7. Extract code identifiers and their split words
+         code_identifiers, code_split_words = self.extract_code_identifiers(text)
+
+         # 8. Calculate keyword frequency (combining nouns, verbs, proper nouns, identifiers, and split code words)
+         # Give full code identifiers 10x weight for exact matching
+         # Give code split words 3x weight for fuzzy matching
+         code_identifiers_lower = [ident.lower() for ident in code_identifiers]
+         all_keywords = (
+             nouns
+             + verbs
+             + proper_nouns
+             + (code_identifiers_lower * 10)
+             + (code_split_words * 3)
+         )
+         keyword_freq = Counter(all_keywords)
+         top_keywords = keyword_freq.most_common(top_n)
+
+         # 9. Calculate TF scores (simple version)
+         total_words = len(
+             [token for token in doc if not token.is_stop and not token.is_punct]
+         )
+         tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}
+
+         # Statistics
+         stats = {
+             "total_tokens": len(doc),
+             "total_words": total_words,
+             "unique_words": len(set([t.text.lower() for t in doc if not t.is_punct])),
+             "sentences": len(list(doc.sents)),
+         }
+
+         return {
+             "top_keywords": top_keywords,
+             "nouns": list(set(nouns))[:20],
+             "verbs": list(set(verbs))[:20],
+             "adjectives": list(set(adjectives))[:15],
+             "proper_nouns": list(set(proper_nouns)),
+             "noun_chunks": list(set(noun_chunks))[:15],
+             "entities": entities,
+             "code_identifiers": code_identifiers,
+             "code_split_words": code_split_words,
+             "tf_scores": dict(
+                 sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
+             ),
+             "stats": stats,
+         }
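
To make step 8's weighting concrete, a small arithmetic sketch with invented counts (lemmas simplified; only stdlib Counter behavior is assumed):

    # Illustrative sketch; not part of the packaged file.
    from collections import Counter

    nouns = ["user", "token"]                # 1x weight each
    verbs = ["validate"]                     # 1x
    proper_nouns = []
    code_identifiers_lower = ["user_token"]  # full identifier: 10x
    code_split_words = ["user", "token"]     # split words: 3x

    freq = Counter(
        nouns + verbs + proper_nouns
        + code_identifiers_lower * 10
        + code_split_words * 3
    )
    print(freq.most_common(3))
    # [('user_token', 10), ('user', 4), ('token', 4)]
    # The full identifier dominates (exact-match priority), while its split
    # words still outrank ordinary 1x words like 'validate'.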