cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. cicada/_version_hash.py +4 -0
  2. cicada/cli.py +6 -748
  3. cicada/commands.py +1255 -0
  4. cicada/dead_code/__init__.py +1 -0
  5. cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
  6. cicada/dependency_analyzer.py +147 -0
  7. cicada/entry_utils.py +92 -0
  8. cicada/extractors/base.py +9 -9
  9. cicada/extractors/call.py +17 -20
  10. cicada/extractors/common.py +64 -0
  11. cicada/extractors/dependency.py +117 -235
  12. cicada/extractors/doc.py +2 -49
  13. cicada/extractors/function.py +10 -14
  14. cicada/extractors/keybert.py +228 -0
  15. cicada/extractors/keyword.py +191 -0
  16. cicada/extractors/module.py +6 -10
  17. cicada/extractors/spec.py +8 -56
  18. cicada/format/__init__.py +20 -0
  19. cicada/{ascii_art.py → format/ascii_art.py} +1 -1
  20. cicada/format/formatter.py +1145 -0
  21. cicada/git_helper.py +134 -7
  22. cicada/indexer.py +322 -89
  23. cicada/interactive_setup.py +251 -323
  24. cicada/interactive_setup_helpers.py +302 -0
  25. cicada/keyword_expander.py +437 -0
  26. cicada/keyword_search.py +208 -422
  27. cicada/keyword_test.py +383 -16
  28. cicada/mcp/__init__.py +10 -0
  29. cicada/mcp/entry.py +17 -0
  30. cicada/mcp/filter_utils.py +107 -0
  31. cicada/mcp/pattern_utils.py +118 -0
  32. cicada/{mcp_server.py → mcp/server.py} +819 -73
  33. cicada/mcp/tools.py +473 -0
  34. cicada/pr_finder.py +2 -3
  35. cicada/pr_indexer/indexer.py +3 -2
  36. cicada/setup.py +167 -35
  37. cicada/tier.py +225 -0
  38. cicada/utils/__init__.py +9 -2
  39. cicada/utils/fuzzy_match.py +54 -0
  40. cicada/utils/index_utils.py +9 -0
  41. cicada/utils/path_utils.py +18 -0
  42. cicada/utils/text_utils.py +52 -1
  43. cicada/utils/tree_utils.py +47 -0
  44. cicada/version_check.py +99 -0
  45. cicada/watch_manager.py +320 -0
  46. cicada/watcher.py +431 -0
  47. cicada_mcp-0.3.0.dist-info/METADATA +541 -0
  48. cicada_mcp-0.3.0.dist-info/RECORD +70 -0
  49. cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
  50. cicada/formatter.py +0 -864
  51. cicada/keybert_extractor.py +0 -286
  52. cicada/lightweight_keyword_extractor.py +0 -290
  53. cicada/mcp_entry.py +0 -683
  54. cicada/mcp_tools.py +0 -291
  55. cicada_mcp-0.2.0.dist-info/METADATA +0 -735
  56. cicada_mcp-0.2.0.dist-info/RECORD +0 -53
  57. cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
  58. /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
  59. /cicada/{colors.py → format/colors.py} +0 -0
  60. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
  61. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
  62. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,437 @@
1
+ """
2
+ Keyword expansion using lemminflect and word embeddings.
3
+
4
+ This module provides keyword expansion through three strategies:
5
+ 1. Lemmi: Generate inflected forms using lemminflect (e.g., run → running, runs, ran)
6
+ 2. GloVe: Semantic similarity via GloVe + inflected forms
7
+ 3. FastText: Semantic similarity via FastText + inflected forms
8
+
9
+ Example:
10
+ >>> expander = KeywordExpander(expansion_type="glove", verbose=True)
11
+ >>> expanded = expander.expand_keywords(["database"], top_n=3)
12
+ >>> print(expanded)
13
+ ['database', 'databases', 'postgresql', 'mysql', 'storage']
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import Any
19
+
20
+
21
class KeywordExpander:
    """
    Expand seed keywords with inflected forms and, optionally, embedding neighbours.

    Supported strategies (selected with ``expansion_type``):
    - lemmi: inflected forms only (run → running, runs, ran)
    - glove: GloVe neighbours + inflected forms (128MB download)
    - fasttext: FastText neighbours + inflected forms (958MB download)

    The lemminflect module and any loaded embedding models are cached on the
    class, so they are loaded at most once per process and shared by instances.
    """

    # Class-level caches shared by every instance.
    _model_cache: dict[str, Any] = {}  # expansion_type -> loaded embedding model
    _lemminflect_cache: Any = None  # the imported lemminflect module, once loaded

    # Model configurations for word embeddings (name is the gensim-data id).
    EMBEDDING_MODELS: dict[str, dict[str, Any]] = {
        "glove": {
            "name": "glove-wiki-gigaword-100",
            "size_mb": 128,
            "description": "GloVe embeddings (100-dim, fast)",
        },
        "fasttext": {
            "name": "fasttext-wiki-news-subwords-300",
            "size_mb": 958,
            "description": "FastText embeddings (300-dim, better rare words)",
        },
    }

    # Expansion penalty multipliers (reduce noise from derived terms)
    INFLECTION_PENALTY = 0.7  # Penalty for morphological variations (run → runs, running)
    SEMANTIC_EXPANSION_PENALTY = 0.9  # Penalty for embedding-based similar words
    # Kept for backward compatibility only: expand_keywords() now honours its
    # ``top_n`` argument instead of this constant.
    TOP_N = 5

    def __init__(self, expansion_type: str = "lemmi", verbose: bool = False):
        """
        Initialize keyword expander.

        Args:
            expansion_type: Type of expansion to perform.
                - "lemmi": Lemminflect inflections only
                - "glove": GloVe embeddings + lemminflect (128MB)
                - "fasttext": FastText embeddings + lemminflect (958MB)
            verbose: Whether to print progress messages during model loading.

        Raises:
            ValueError: If expansion_type is not supported.
        """
        if expansion_type != "lemmi" and expansion_type not in self.EMBEDDING_MODELS:
            raise ValueError(
                f"Unsupported expansion_type: {expansion_type}. "
                f"Supported types: lemmi, glove, fasttext"
            )

        self.expansion_type = expansion_type
        self.verbose = verbose
        # Loaded lazily on the first embedding-based expand_keywords() call.
        self._embedding_model: Any = None

    def _load_lemminflect(self) -> Any:
        """
        Lazy-load the lemminflect library (cached at class level).

        Returns:
            The lemminflect module.

        Raises:
            ImportError: If lemminflect is not installed.
        """
        if KeywordExpander._lemminflect_cache is not None:
            return KeywordExpander._lemminflect_cache

        try:
            import lemminflect
        except ImportError as e:
            raise ImportError(
                "lemminflect is required for keyword expansion. "
                "Install with: pip install lemminflect"
            ) from e

        KeywordExpander._lemminflect_cache = lemminflect
        if self.verbose:
            print("✓ lemminflect loaded")
        return lemminflect

    def _load_embedding_model(self) -> Any:
        """
        Lazy-load the word embedding model for this expander (glove or fasttext).

        Returns:
            The loaded model, or None when expansion_type is "lemmi".

        Raises:
            ImportError: If gensim is not installed.
            RuntimeError: If the model download or loading fails.
        """
        # Only load embeddings for glove/fasttext (not for lemmi-only)
        if self.expansion_type == "lemmi":
            return None

        # Check class-level cache first
        if self.expansion_type in self._model_cache:
            if self.verbose:
                print(f"✓ Using cached {self.expansion_type} model")
            return self._model_cache[self.expansion_type]

        try:
            import gensim.downloader as api
        except ImportError as e:
            raise ImportError(
                "gensim is required for embedding-based expansion. "
                "Install with: pip install gensim"
            ) from e

        model_config = self.EMBEDDING_MODELS[self.expansion_type]
        model_name: str = model_config["name"]
        size_mb: int = model_config["size_mb"]

        if self.verbose:
            from pathlib import Path

            # gensim stores downloads under BASE_DIR, which honours the
            # GENSIM_DATA_DIR environment variable; fall back to the default
            # location if the attribute is unavailable.
            base_dir = getattr(api, "BASE_DIR", Path.home() / "gensim-data")
            is_cached = (Path(base_dir) / model_name).exists()

            if is_cached:
                print(f"Loading {self.expansion_type} model from cache...")
            else:
                print(f"Loading {self.expansion_type} model...")
                print(f"Downloading {size_mb}MB (first time only)...")
                print("Progress bar will appear during download.")

        try:
            # Load (and download if necessary) via gensim's downloader
            model = api.load(model_name)
        except Exception as e:
            raise RuntimeError(f"Failed to load {self.expansion_type} model: {str(e)}") from e

        # Cache at class level so other instances reuse the loaded model
        self._model_cache[self.expansion_type] = model

        if self.verbose:
            print(f"✓ {self.expansion_type.title()} model loaded successfully")

        return model

    def _get_inflections(self, word: str) -> set[str]:
        """
        Generate inflected forms of a word using lemminflect.

        Args:
            word: Base word to inflect

        Returns:
            Set of lowercased inflected forms (always including the original word)
        """
        lemminflect = self._load_lemminflect()
        inflections = {word.lower()}

        # Try different POS tags to get comprehensive inflections
        for pos in ("VERB", "NOUN", "ADJ"):
            try:
                all_inflections = lemminflect.getAllInflections(word, upos=pos)
            except Exception:
                # Best effort: a failed lookup for one POS tag must not discard
                # the forms found for the others.
                continue
            if all_inflections:
                for form_list in all_inflections.values():
                    inflections.update(form.lower() for form in form_list)

        return inflections

    def expand_keywords(
        self,
        keywords: list[str],
        top_n: int = 3,
        threshold: float = 0.7,
        return_scores: bool = False,
        keyword_scores: dict[str, float] | None = None,
        min_score: float = 0.0,
        code_identifiers: list[str] | None = None,
    ) -> list[str] | dict[str, Any]:
        """
        Expand keywords using the configured expansion strategy.

        Strategy behavior:
        - lemmi: Returns keywords + split components + inflected forms
        - glove/fasttext: Additionally returns semantically similar words and
          their inflected forms
        - Code identifiers are NOT inflected or expanded (kept as exact matches)

        Scores are always tracked internally, so ``keyword_scores`` and
        ``min_score`` behave the same whether or not ``return_scores`` is set.
        When a word is reachable in several ways, the first derivation wins
        (keywords are processed in the order given); an original keyword always
        keeps the "original" source.

        Args:
            keywords: List of seed keywords to expand.
            top_n: Maximum number of similar words to use per keyword (for embeddings).
            threshold: Minimum cosine similarity score for embeddings (0.0 to 1.0).
            return_scores: If True, return detailed dict with scores. If False, return simple list.
            keyword_scores: Optional dict mapping keywords to their extraction scores.
                These scores multiply with expansion penalties/similarities.
                Keywords without an entry score 1.0.
            min_score: Minimum score threshold for expanded keywords; only applied when > 0.
            code_identifiers: List of code identifiers that should NOT be inflected or expanded.

        Returns:
            If return_scores=False: Sorted, deduplicated list of expanded keywords.
            If return_scores=True: Dict with 'words' (list of dicts with
            word/score/source and, for derived words, parent) and 'simple' (flat list).

        Example:
            >>> expander = KeywordExpander(expansion_type="lemmi")
            >>> result = expander.expand_keywords(["run"], return_scores=True, keyword_scores={"run": 0.95})
            >>> # result["words"] contains e.g.
            >>> #   {'word': 'run', 'score': 0.95, 'source': 'original'}
            >>> #   {'word': 'running', 'score': 0.665, 'source': 'inflection', 'parent': 'run'}
        """
        from cicada.utils import split_camel_snake_case

        # Extraction scores keyed by lowercase keyword; missing entries default to 1.0
        scores = {k.lower(): v for k, v in (keyword_scores or {}).items()}

        # Lowercase set for fast "is this an exact-match identifier?" lookups
        exact_terms = {ident.lower() for ident in code_identifiers or ()}

        # word -> {score, source[, parent]}; its keys are the expanded vocabulary
        word_details: dict[str, dict[str, Any]] = {}

        def add_derived(word: str, score: float, source: str, parent: str) -> None:
            """Record a derived word unless an earlier derivation already claimed it."""
            if word not in word_details:
                word_details[word] = {"score": score, "source": source, "parent": parent}

        # Step 1: register the originals and split compound identifiers
        split_keywords: list[str] = []
        for keyword in keywords:
            word_lower = keyword.lower()
            extraction_score = scores.get(word_lower, 1.0)
            # Originals overwrite any earlier "split" entry for the same word
            word_details[word_lower] = {"score": extraction_score, "source": "original"}

            # Split compound identifiers (e.g., get_keys → get, keys)
            parts = [
                part.lower()
                for part in split_camel_snake_case(keyword).split()
                if len(part) > 1 and part.isalpha()
            ]
            split_keywords.extend(parts)
            for part in parts:
                # Split words inherit the parent's extraction score
                add_derived(part, extraction_score, "split", keyword)

        # Order-preserving de-duplication keeps parent/score attribution
        # deterministic (a plain set iterates in hash-randomised order).
        all_keywords = list(dict.fromkeys([*keywords, *split_keywords]))

        # Step 2: inflections for every keyword except exact-match identifiers
        for keyword in all_keywords:
            keyword_lower = keyword.lower()
            if keyword_lower in exact_terms:
                continue

            parent_score = word_details.get(keyword_lower, {}).get("score", 1.0)
            for inflection in self._get_inflections(keyword):
                add_derived(
                    inflection, parent_score * self.INFLECTION_PENALTY, "inflection", keyword
                )

        # Step 3: embedding neighbours (+ their inflections) for glove/fasttext
        if self.expansion_type in self.EMBEDDING_MODELS:
            if self._embedding_model is None:
                self._embedding_model = self._load_embedding_model()
            model = self._embedding_model

            if model is not None and top_n > 0:
                for keyword in all_keywords:
                    keyword_lower = keyword.lower()
                    if keyword_lower in exact_terms:
                        continue

                    parent_score = word_details.get(keyword_lower, {}).get("score", 1.0)

                    try:
                        similar_words = model.most_similar(keyword_lower, topn=top_n)
                    except KeyError:
                        # Keyword not in vocabulary - expected for many code
                        # identifiers, so skip it silently.
                        continue

                    for word, similarity_score in similar_words[:top_n]:
                        if similarity_score < threshold:
                            continue

                        word_lower = word.lower()
                        # Final score = extraction score × similarity × semantic penalty
                        add_derived(
                            word_lower,
                            parent_score
                            * float(similarity_score)
                            * self.SEMANTIC_EXPANSION_PENALTY,
                            "embedding",
                            keyword,
                        )

                        # Inflections inherit whatever score the neighbour ended up
                        # with (it may already have been known from another source).
                        embedding_score = word_details[word_lower]["score"]
                        for inflection in self._get_inflections(word):
                            add_derived(
                                inflection,
                                embedding_score * self.INFLECTION_PENALTY,
                                "embedding_inflection",
                                word_lower,
                            )

        simple_list = sorted(word_details)

        # Apply min_score filter if specified (non-numeric scores are dropped)
        if min_score > 0.0:
            simple_list = [
                word
                for word in simple_list
                if isinstance(word_details[word].get("score"), (int, float))
                and float(word_details[word]["score"]) >= min_score
            ]

        if not return_scores:
            return simple_list

        return {
            "words": [{"word": word, **word_details[word]} for word in simple_list],
            "simple": simple_list,
        }

    def get_expansion_info(self) -> dict[str, Any]:
        """
        Get information about the expansion configuration.

        Returns:
            Dictionary with the expansion "type" and, once an embedding model
            has been loaded, its vocabulary size and vector size.
        """
        info: dict[str, Any] = {
            "type": self.expansion_type,
        }

        if self.expansion_type != "lemmi" and self._embedding_model is not None:
            info["embedding_vocab_size"] = len(self._embedding_model.key_to_index)
            info["embedding_vector_size"] = self._embedding_model.vector_size

        return info