cicada-mcp 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. cicada/ascii_art.py +60 -0
  2. cicada/clean.py +195 -60
  3. cicada/cli.py +757 -0
  4. cicada/colors.py +27 -0
  5. cicada/command_logger.py +14 -16
  6. cicada/dead_code_analyzer.py +12 -19
  7. cicada/extractors/__init__.py +6 -6
  8. cicada/extractors/base.py +3 -3
  9. cicada/extractors/call.py +11 -15
  10. cicada/extractors/dependency.py +39 -51
  11. cicada/extractors/doc.py +8 -9
  12. cicada/extractors/function.py +12 -24
  13. cicada/extractors/module.py +11 -15
  14. cicada/extractors/spec.py +8 -12
  15. cicada/find_dead_code.py +15 -39
  16. cicada/formatter.py +37 -91
  17. cicada/git_helper.py +22 -34
  18. cicada/indexer.py +165 -132
  19. cicada/interactive_setup.py +490 -0
  20. cicada/keybert_extractor.py +286 -0
  21. cicada/keyword_search.py +22 -30
  22. cicada/keyword_test.py +127 -0
  23. cicada/lightweight_keyword_extractor.py +5 -13
  24. cicada/mcp_entry.py +683 -0
  25. cicada/mcp_server.py +110 -232
  26. cicada/parser.py +9 -9
  27. cicada/pr_finder.py +15 -19
  28. cicada/pr_indexer/__init__.py +3 -3
  29. cicada/pr_indexer/cli.py +4 -9
  30. cicada/pr_indexer/github_api_client.py +22 -37
  31. cicada/pr_indexer/indexer.py +17 -29
  32. cicada/pr_indexer/line_mapper.py +8 -12
  33. cicada/pr_indexer/pr_index_builder.py +22 -34
  34. cicada/setup.py +198 -89
  35. cicada/utils/__init__.py +9 -9
  36. cicada/utils/call_site_formatter.py +4 -6
  37. cicada/utils/function_grouper.py +4 -4
  38. cicada/utils/hash_utils.py +12 -15
  39. cicada/utils/index_utils.py +15 -15
  40. cicada/utils/path_utils.py +24 -29
  41. cicada/utils/signature_builder.py +3 -3
  42. cicada/utils/subprocess_runner.py +17 -19
  43. cicada/utils/text_utils.py +1 -2
  44. cicada/version_check.py +2 -5
  45. {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/METADATA +144 -55
  46. cicada_mcp-0.2.0.dist-info/RECORD +53 -0
  47. cicada_mcp-0.2.0.dist-info/entry_points.txt +4 -0
  48. cicada/install.py +0 -741
  49. cicada_mcp-0.1.5.dist-info/RECORD +0 -47
  50. cicada_mcp-0.1.5.dist-info/entry_points.txt +0 -9
  51. {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/WHEEL +0 -0
  52. {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/licenses/LICENSE +0 -0
  53. {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/top_level.txt +0 -0
cicada/keybert_extractor.py ADDED
@@ -0,0 +1,286 @@
+ """
+ Keyword Extraction using KeyBERT
+ Semantic keyword extraction using transformer-based embeddings
+ """
+
+ import re
+ import sys
+ from typing import Any
+
+ from cicada.utils import split_camel_snake_case
+
+
+ class KeyBERTExtractor:
+     """Extract keywords from text using KeyBERT semantic analysis."""
+
+     # Model configurations for different performance tiers
+     KEYBERT_MODELS = {
+         "fast": "all-MiniLM-L6-v2",  # 80MB, fast extraction
+         "regular": "BAAI/bge-small-en-v1.5",  # 133MB, balanced
+         "max": "paraphrase-mpnet-base-v2",  # 420MB, highest quality
+     }
+
+     # Weighting strategy constants for keyword extraction
+     # These control how different types of keywords are prioritized
+     KEYBERT_CANDIDATE_MULTIPLIER = 3  # Extract 3x keywords for weighted reranking
+     CODE_IDENTIFIER_BOOST = 10  # 10x weight for exact code identifiers (e.g., function names)
+     CODE_SPLIT_WORD_BOOST = 3  # 3x weight for identifier components (e.g., "user" from "getUserId")
+     BASE_SCORE_IDENTIFIER = 0.5  # Base score for identifiers not found by BERT
+     BASE_SCORE_SPLIT_WORD = 0.3  # Base score for split words not found by BERT
+
+     # Class variable to hold KeyBERT class (lazily loaded)
+     _KeyBERT: type | None = None
+
+     def __init__(self, verbose: bool = False, model_tier: str | None = None):
+         """
+         Initialize KeyBERT model.
+
+         Args:
+             verbose: If True, print status messages during initialization
+             model_tier: Model tier to use ('fast', 'regular', or 'max').
+                 If None, must be specified via config file.
+
+         Raises:
+             ImportError: If KeyBERT is not installed
+             ValueError: If model_tier is invalid or not specified
+             RuntimeError: If model loading fails
+         """
+         self.verbose = verbose
+
+         # Validate model tier first
+         if model_tier and model_tier not in self.KEYBERT_MODELS:
+             raise ValueError(
+                 f"Invalid model tier '{model_tier}'. "
+                 f"Must be one of: {', '.join(self.KEYBERT_MODELS.keys())}"
+             )
+
+         if model_tier is None:
+             raise ValueError(
+                 "model_tier must be specified. Pass it directly or load from config file."
+             )
+
+         self.model_tier = model_tier
+         self.model_name = self.KEYBERT_MODELS[model_tier]
+
+         # Print message BEFORE the slow import
+         if self.verbose:
+             print(
+                 f"Loading KeyBERT model ({model_tier}: {self.model_name})",
+                 file=sys.stderr,
+             )
+             print("This can take up to a couple of minutes.", file=sys.stderr)
+
+         # Lazy import KeyBERT (only once per class)
+         # This import can take significant time on first load
+         if KeyBERTExtractor._KeyBERT is None:
+             try:
+                 from keybert import KeyBERT
+
+                 KeyBERTExtractor._KeyBERT = KeyBERT
+             except ImportError as e:
+                 raise ImportError(
+                     "KeyBERT is not installed. Install it with:\n"
+                     "  uv add keybert\n"
+                     "or\n"
+                     "  pip install keybert"
+                 ) from e
+
+         # Initialize KeyBERT with the selected model
+         # Assume model is pre-downloaded (user will handle caching separately)
+         try:
+             self.kw_model = KeyBERTExtractor._KeyBERT(model=self.model_name)
+             if self.verbose:
+                 print("✓ Model loaded successfully", file=sys.stderr)
+         except Exception as e:
+             raise RuntimeError(
+                 f"Failed to load KeyBERT model '{self.model_name}'. "
+                 f"Ensure the model is downloaded and available. Error: {e}"
+             ) from e
+
+     def extract_code_identifiers(self, text: str) -> tuple[list[str], list[str]]:
+         """
+         Extract code-specific identifiers and their split words.
+
+         Returns a tuple of (identifiers, split_words) where:
+         - identifiers: original camelCase/PascalCase/snake_case identifiers
+         - split_words: individual words extracted from those identifiers
+         """
+         # Match camelCase, snake_case, PascalCase, and mixed patterns
+         patterns = [
+             r"\b[a-z]+[A-Z][a-zA-Z]*\b",  # camelCase (e.g., getUserData)
+             r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b",  # Uppercase prefix + PascalCase (e.g., HTTPServer, XMLParser)
+             r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b",  # PascalCase (e.g., UserController, PostgreSQL)
+             r"\b[a-z]+_[a-z_]+\b",  # snake_case (e.g., get_user_data)
+             r"\b[A-Z]{2,}\b",  # All UPPERCASE (e.g., HTTP, API, SQL)
+         ]
+
+         identifiers = []
+         for pattern in patterns:
+             matches = re.findall(pattern, text)
+             identifiers.extend(matches)
+
+         identifiers = list(set(identifiers))
+
+         # Split identifiers into individual words
+         split_words = []
+         for identifier in identifiers:
+             split_text = split_camel_snake_case(identifier)
+             # Extract individual words (lowercase, length > 1)
+             words = [
+                 word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
+             ]
+             split_words.extend(words)
+
+         return identifiers, list(set(split_words))
+
+     def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
+         """
+         Extract keywords and return a simple list of keyword strings.
+
+         Args:
+             text: Input text to analyze
+             top_n: Number of top keywords to return
+
+         Returns:
+             List of keyword strings (e.g., ['authentication', 'user', 'validate'])
+         """
+         if not text or not text.strip():
+             return []
+
+         try:
+             results = self.extract_keywords(text, top_n=top_n)
+             # Extract just the keyword strings from top_keywords tuples
+             return [keyword for keyword, _ in results["top_keywords"]]
+         except Exception as e:
+             if self.verbose:
+                 print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
+             return []
+
+     def extract_keywords(self, text: str, top_n: int = 15) -> dict[str, Any]:
+         """
+         Extract keywords using KeyBERT semantic analysis with code identifier emphasis.
+
+         Weighting strategy:
+         - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
+         - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
+         - KeyBERT semantic keywords: Base score from embedding similarity
+
+         Magic numbers explained:
+         - 3x multiplier: For candidate selection (top_n * 3) to have enough keywords
+           before applying weights. This ensures we don't miss important keywords that
+           might rank higher after code identifier boosting.
+         - 0.5 base score: Default confidence for code identifiers not found by KeyBERT.
+           After 10x boost, gives them a score of 5.0, prioritizing them over most
+           regular keywords.
+         - 0.3 base score: Default confidence for code split words not found by KeyBERT.
+           After 3x boost, gives them a score of 0.9, placing them between regular
+           keywords (0.4-0.7) and full identifiers (5.0).
+
+         Args:
+             text: Input text to analyze
+             top_n: Number of top keywords to return
+
+         Returns:
+             Dictionary with extracted keywords and analysis:
+             - top_keywords: List of (keyword, score) tuples, sorted by weighted score
+             - code_identifiers: Original identifiers (weighted 10x)
+             - code_split_words: Words extracted from identifiers (weighted 3x)
+             - noun_chunks: 2-word phrases from KeyBERT (if any)
+             - Other fields (nouns, verbs, etc.) are empty (KeyBERT doesn't do POS tagging)
+             - stats: Basic text statistics
+         """
+         if not text or not text.strip():
+             return {
+                 "top_keywords": [],
+                 "nouns": [],
+                 "verbs": [],
+                 "adjectives": [],
+                 "proper_nouns": [],
+                 "noun_chunks": [],
+                 "entities": [],
+                 "code_identifiers": [],
+                 "code_split_words": [],
+                 "tf_scores": {},
+                 "stats": {
+                     "total_tokens": 0,
+                     "total_words": 0,
+                     "unique_words": 0,
+                     "sentences": 0,
+                 },
+             }
+
+         # 1. Extract code identifiers and their split words
+         code_identifiers, code_split_words = self.extract_code_identifiers(text)
+
+         # 2. Use KeyBERT to extract semantic keywords
+         # Extract more than needed to have candidates for weighting
+         try:
+             # KeyBERT return type can vary, use type ignore for external library
+             keybert_keywords: list[tuple[str, float]] = self.kw_model.extract_keywords(  # type: ignore[assignment]
+                 text,
+                 top_n=top_n * self.KEYBERT_CANDIDATE_MULTIPLIER,
+                 keyphrase_ngram_range=(1, 1),  # Single words only
+             )
+         except Exception as e:
+             if self.verbose:
+                 print(f"Warning: KeyBERT extraction failed: {e}", file=sys.stderr)
+             keybert_keywords = []
+
+         # 3. Build weighted keyword scores
+         keyword_scores: dict[str, float] = {}
+
+         # Add KeyBERT keywords with their semantic similarity scores
+         for keyword, score in keybert_keywords:
+             keyword_lower: str = keyword.lower()
+             keyword_scores[keyword_lower] = score
+
+         # 4. Apply code identifier boosting
+         # Code identifiers get strong boost as they're likely important API/function names
+         code_identifiers_lower = [ident.lower() for ident in code_identifiers]
+         for identifier in code_identifiers_lower:
+             if identifier in keyword_scores:
+                 keyword_scores[identifier] *= self.CODE_IDENTIFIER_BOOST
+             else:
+                 # Add with high base score if not found by KeyBERT
+                 keyword_scores[identifier] = self.BASE_SCORE_IDENTIFIER * self.CODE_IDENTIFIER_BOOST
+
+         # 5. Apply split word boosting (lower than full identifiers)
+         # Split words are components of identifiers, somewhat important but less than full names
+         code_split_words_lower = [word.lower() for word in code_split_words]
+         for word in code_split_words_lower:
+             if word in keyword_scores:
+                 keyword_scores[word] *= self.CODE_SPLIT_WORD_BOOST
+             else:
+                 keyword_scores[word] = self.BASE_SCORE_SPLIT_WORD * self.CODE_SPLIT_WORD_BOOST
+
+         # 5. Sort by weighted score and take top_n
+         top_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
+
+         # 6. No noun_chunks since we're using single words only
+         noun_chunks = []
+
+         # 7. Calculate basic statistics
+         words = text.split()
+         unique_words = {w.lower() for w in words if w.isalpha()}
+         sentences = text.count(".") + text.count("!") + text.count("?")
+
+         stats = {
+             "total_tokens": len(words),
+             "total_words": len([w for w in words if w.isalpha()]),
+             "unique_words": len(unique_words),
+             "sentences": max(1, sentences),  # At least 1 sentence
+         }
+
+         return {
+             "top_keywords": top_keywords,
+             "nouns": [],  # KeyBERT doesn't extract POS tags
+             "verbs": [],
+             "adjectives": [],
+             "proper_nouns": [],
+             "noun_chunks": noun_chunks,
+             "entities": [],  # KeyBERT doesn't do NER
+             "code_identifiers": code_identifiers,
+             "code_split_words": code_split_words,
+             "tf_scores": {},  # Using semantic scores instead
+             "stats": stats,
+         }
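
The public surface visible in this new file is __init__(verbose, model_tier), extract_code_identifiers(), extract_keywords_simple(), and extract_keywords(). A minimal usage sketch follows; it is illustrative only, not part of the package, and assumes keybert and the chosen sentence-transformer model are already installed and downloaded. The sample text is made up.

    # Minimal usage sketch based on the signatures above.
    from cicada.keybert_extractor import KeyBERTExtractor

    extractor = KeyBERTExtractor(verbose=True, model_tier="fast")  # 'fast', 'regular', or 'max'

    doc = "getUserData validates the session token and refreshes the user cache."
    # Simple form: just the keyword strings (contents depend on the model)
    print(extractor.extract_keywords_simple(doc, top_n=5))

    # Detailed form: weighted (keyword, score) tuples plus identifiers and stats
    results = extractor.extract_keywords(doc, top_n=10)
    for keyword, score in results["top_keywords"]:
        print(f"{keyword}: {score:.2f}")
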
cicada/keyword_search.py CHANGED
@@ -9,9 +9,9 @@ Identifier names (function/module names) are given much higher weight than keywo
  Author: Cursor(Auto)
  """

- import re
  import fnmatch
- from typing import List, Dict, Any
+ from typing import Any
+
  from rank_bm25 import BM25Okapi

  from cicada.utils import split_identifier
@@ -24,7 +24,7 @@ class KeywordSearcher:
      # When query keyword matches the function/module name, multiply the score by this
      IDENTIFIER_MATCH_BOOST = 10.0

-     def __init__(self, index: Dict[str, Any]):
+     def __init__(self, index: dict[str, Any]):
          """
          Initialize the keyword searcher.

@@ -35,7 +35,7 @@ class KeywordSearcher:
          self.bm25, self.document_map = self._initialize_bm25()

      @staticmethod
-     def _extract_identifier_name(document_info: Dict[str, Any]) -> str:
+     def _extract_identifier_name(document_info: dict[str, Any]) -> str:
          """
          Extract the core identifier name from document info.

@@ -169,8 +169,8 @@ class KeywordSearcher:
          return fnmatch.fnmatch(text.lower(), pattern.lower())

      def _expand_wildcard_keywords(
-         self, query_keywords: List[str], document_keywords: List[str]
-     ) -> List[str]:
+         self, query_keywords: list[str], document_keywords: list[str]
+     ) -> list[str]:
          """
          Expand wildcard patterns to actual matching keywords from the document.

@@ -190,10 +190,10 @@ class KeywordSearcher:

      def _expand_wildcard_keywords_with_identifier(
          self,
-         query_keywords: List[str],
-         document_keywords: List[str],
+         query_keywords: list[str],
+         document_keywords: list[str],
          identifier_name: str,
-     ) -> List[str]:
+     ) -> list[str]:
          """
          Expand wildcard patterns to actual matching keywords from the document and identifier name.

@@ -214,13 +214,11 @@ class KeywordSearcher:
                      break  # Only add each query keyword once

              # Also check against the full identifier name
-             if query_kw not in matched_keywords and self._match_wildcard(
-                 query_kw, identifier_name
-             ):
+             if query_kw not in matched_keywords and self._match_wildcard(query_kw, identifier_name):
                  matched_keywords.append(query_kw)
          return matched_keywords

-     def _get_wildcard_scores(self, query_keywords: List[str]) -> List[float]:
+     def _get_wildcard_scores(self, query_keywords: list[str]) -> list[float]:
          """
          Calculate BM25-like scores for wildcard matching.

@@ -252,11 +250,11 @@ class KeywordSearcher:

          return scores

-     def _has_wildcards(self, keywords: List[str]) -> bool:
+     def _has_wildcards(self, keywords: list[str]) -> bool:
          """Check if any keywords contain wildcard patterns."""
          return any("*" in keyword for keyword in keywords)

-     def search(self, query_keywords: List[str], top_n: int = 5) -> List[Dict[str, Any]]:
+     def search(self, query_keywords: list[str], top_n: int = 5) -> list[dict[str, Any]]:
          """
          Search for modules and functions matching the given keywords.

@@ -313,9 +311,9 @@ class KeywordSearcher:
                      query_keywords_lower, doc_info["keywords"], identifier_name
                  )
              else:
-                 matched = self._count_matches(
-                     query_keywords_lower, doc_info["keywords"]
-                 )
+                 matched = self._count_matches(query_keywords_lower, doc_info["keywords"])

              # Only include documents that match at least one query keyword
              if matched["score"] > 0:
@@ -368,7 +364,7 @@ class KeywordSearcher:
          return results[:top_n]

      def _apply_identifier_boost(
-         self, bm25_score: float, query_keywords: List[str], doc_info: Dict[str, Any]
+         self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
      ) -> float:
          """
          Apply boost to BM25 score if query keywords match the identifier name.
@@ -399,9 +395,7 @@ class KeywordSearcher:

          return bm25_score

-     def _count_matches(
-         self, query_keywords: List[str], item_keywords: List[str]
-     ) -> Dict[str, Any]:
+     def _count_matches(self, query_keywords: list[str], item_keywords: list[str]) -> dict[str, Any]:
          """
          Count matching keywords between query and item.

@@ -435,10 +429,10 @@ class KeywordSearcher:

      def _count_wildcard_matches(
          self,
-         query_keywords: List[str],
-         item_keywords: List[str],
+         query_keywords: list[str],
+         item_keywords: list[str],
          identifier_name: str | None = None,
-     ) -> Dict[str, Any]:
+     ) -> dict[str, Any]:
          """
          Count matching keywords between query and item using wildcard patterns.

@@ -462,9 +456,7 @@ class KeywordSearcher:
                  query_keywords, item_keywords_lower, identifier_name
              )
          else:
-             matched_keywords = self._expand_wildcard_keywords(
-                 query_keywords, item_keywords_lower
-             )
+             matched_keywords = self._expand_wildcard_keywords(query_keywords, item_keywords_lower)

          score = len(matched_keywords)
          confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
@@ -476,7 +468,7 @@ class KeywordSearcher:
          }

      def _apply_identifier_boost_wildcard(
-         self, bm25_score: float, query_keywords: List[str], doc_info: Dict[str, Any]
+         self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
      ) -> float:
          """
          Apply boost to BM25 score if query keywords match the identifier name using wildcards.
@@ -509,7 +501,7 @@ class KeywordSearcher:
          return bm25_score

      def _calculate_name_coverage_penalty(
-         self, query_keywords: List[str], doc_info: Dict[str, Any]
+         self, query_keywords: list[str], doc_info: dict[str, Any]
      ) -> float:
          """
          Calculate penalty for functions whose names contain words NOT in the query.
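
The changes to keyword_search.py are typing and formatting only (typing.List/Dict replaced by builtin list/dict, plus line re-wraps); the wildcard matching behavior is unchanged. As a standalone illustration of that behavior, the helper below mirrors the one-line fnmatch call shown in the hunks above; it is a sketch, not code from the package.

    # Case-insensitive wildcard matching, as used by KeywordSearcher._match_wildcard.
    import fnmatch

    def match_wildcard(pattern: str, text: str) -> bool:
        return fnmatch.fnmatch(text.lower(), pattern.lower())

    print(match_wildcard("auth*", "Authentication"))       # True
    print(match_wildcard("get_*_data", "get_user_data"))   # True
    print(match_wildcard("user", "username"))              # False: no wildcard, no exact match
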
cicada/keyword_test.py ADDED
@@ -0,0 +1,127 @@
+ """
+ Interactive keyword extraction testing module.
+
+ Provides an interactive REPL for testing keyword extraction methods.
+ """
+
+ import sys
+
+
+ def run_keywords_interactive(method: str = "lemminflect", tier: str = "regular"):
+     """
+     Interactive keyword extraction testing mode.
+
+     Allows users to paste text and see extracted keywords in real-time
+     using the specified extraction method.
+
+     Args:
+         method: Extraction method ('lemminflect' or 'bert')
+         tier: Model tier ('fast', 'regular', or 'max')
+     """
+     print(f"\n{'='*70}")
+     print("🔍 Cicada Interactive Keyword Extraction Test")
+     print(f"{'='*70}")
+     print(f"Method: {method.upper()}")
+     print(f"Tier: {tier}")
+     print("\nPaste or type text, then press Ctrl-D (Unix) or Ctrl-Z+Enter (Windows)")
+     print("to extract keywords. Press Ctrl-C to exit.\n")
+     print(f"{'='*70}\n")
+
+     # Initialize keyword extractor
+     try:
+         if method == "bert":
+             from cicada.keybert_extractor import KeyBERTExtractor
+
+             extractor = KeyBERTExtractor(model_tier=tier, verbose=True)
+         else:
+             from cicada.lightweight_keyword_extractor import LightweightKeywordExtractor
+
+             extractor = LightweightKeywordExtractor(verbose=True)
+         print()  # Add newline after initialization
+     except Exception as e:
+         print(f"Error initializing keyword extractor: {e}", file=sys.stderr)
+         sys.exit(1)
+
+     # Interactive loop
+     stdin_closed = False
+     try:
+         while True:
+             print("📝 Enter text (Ctrl-D or Ctrl-Z+Enter when done):")
+             print("-" * 70)
+
+             # Read multi-line input until EOF
+             lines = []
+             try:
+                 while True:
+                     line = input()
+                     lines.append(line)
+             except EOFError:
+                 # Check if this is the first EOF (stdin just closed)
+                 if not lines and stdin_closed:
+                     # stdin is exhausted and we have no input - exit gracefully
+                     print("\n👋 No more input available. Exiting.")
+                     return
+                 stdin_closed = True
+
+             text = "\n".join(lines)
+
+             if not text.strip():
+                 # If stdin is closed and input is empty, exit
+                 if stdin_closed:
+                     print("\n👋 No more input available. Exiting.")
+                     return
+                 print("\n⚠️ Empty input. Please enter some text.\n")
+                 continue
+
+             # Extract keywords
+             print("\n" + "=" * 70)
+             print("🔑 EXTRACTED KEYWORDS:")
+             print("=" * 70)
+
+             try:
+                 # Get detailed results
+                 results = extractor.extract_keywords(text, top_n=15)
+
+                 # Display top keywords with scores
+                 top_keywords = results.get("top_keywords", [])
+                 if top_keywords and isinstance(top_keywords, list):
+                     print("\n📊 Top Keywords (with scores):")
+                     for i, item in enumerate(top_keywords, 1):
+                         if isinstance(item, (list, tuple)) and len(item) >= 2:
+                             keyword, score = item[0], item[1]
+                             print(f" {i:2}. {keyword:20s} (score: {score:.4f})")
+                 else:
+                     print(" No keywords extracted.")
+
+                 # Display code identifiers if any
+                 code_identifiers = results.get("code_identifiers")
+                 if code_identifiers and isinstance(code_identifiers, list):
+                     print("\n💻 Code Identifiers (10x weight):")
+                     for ident in code_identifiers:
+                         print(f" • {ident}")
+
+                 # Display code split words if any
+                 code_split_words = results.get("code_split_words")
+                 if code_split_words and isinstance(code_split_words, list):
+                     print("\n🔤 Code Split Words (3x weight):")
+                     for word in code_split_words[:10]:  # Limit to 10
+                         print(f" • {word}")
+
+                 # Display statistics
+                 stats = results.get("stats")
+                 if stats and isinstance(stats, dict):
+                     print("\n📈 Statistics:")
+                     print(f" • Total tokens: {stats.get('total_tokens', 0)}")
+                     print(f" • Total words: {stats.get('total_words', 0)}")
+                     print(f" • Unique words: {stats.get('unique_words', 0)}")
+                     if "sentences" in stats:
+                         print(f" • Sentences: {stats['sentences']}")
+
+             except Exception as e:
+                 print(f"\n❌ Error extracting keywords: {e}", file=sys.stderr)
+
+             print("\n" + "=" * 70 + "\n")
+
+     except KeyboardInterrupt:
+         print("\n\n👋 Exiting interactive mode. Goodbye!")
+         sys.exit(0)
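
The function above is a plain stdin REPL. Invoking it directly looks like the sketch below; how the 0.2.0 CLI actually wires it up lives in cicada/cli.py, which is not shown in this diff, so treat this as a hypothetical call rather than the documented entry point.

    # Hypothetical direct invocation; signature taken from the file above.
    from cicada.keyword_test import run_keywords_interactive

    # Reads text from stdin until EOF, prints extracted keywords, exits on Ctrl-C.
    run_keywords_interactive(method="bert", tier="fast")
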
cicada/lightweight_keyword_extractor.py CHANGED
@@ -3,10 +3,10 @@ Lightweight Keyword Extraction using lemminflect
  Fast keyword extraction for programming documentation
  """

- from collections import Counter
  import re
  import sys
  import warnings
+ from collections import Counter

  from cicada.utils import split_camel_snake_case

@@ -189,9 +189,7 @@ class LightweightKeywordExtractor:
          for identifier in identifiers:
              split_text = split_camel_snake_case(identifier)
              words = [
-                 word.lower()
-                 for word in split_text.split()
-                 if len(word) > 1 and word.isalpha()
+                 word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
              ]
              split_words.extend(words)
          return identifiers, list(set(split_words))
@@ -264,9 +262,7 @@ class LightweightKeywordExtractor:
              lemmatized_words.append(lemma)

          code_identifiers_lower = [ident.lower() for ident in code_identifiers]
-         all_keywords = (
-             lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
-         )
+         all_keywords = lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
          keyword_freq = Counter(all_keywords)
          top_keywords = keyword_freq.most_common(top_n)

@@ -274,9 +270,7 @@ class LightweightKeywordExtractor:
          # This ensures weighted keywords are included in the calculation
          total_words = len(all_keywords)
          if total_words > 0:
-             tf_scores = {
-                 word: (freq / total_words) for word, freq in keyword_freq.items()
-             }
+             tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}
          else:
              tf_scores = {}

@@ -291,8 +285,6 @@ class LightweightKeywordExtractor:
              "lemmatized_words": list(set(lemmatized_words))[:20],
              "code_identifiers": code_identifiers,
              "code_split_words": code_split_words,
-             "tf_scores": dict(
-                 sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
-             ),
+             "tf_scores": dict(sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]),
              "stats": stats,
          }
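
Unlike the KeyBERT extractor, which multiplies similarity scores, the lemminflect path weights terms by repetition before counting, as the hunks above show: identifiers are repeated 10x and their split words 3x so they dominate the frequency ranking. A small self-contained sketch of that counting step; the input lists are made-up examples, not package data.

    # Repetition-based weighting, mirroring the expression in the diff above.
    from collections import Counter

    lemmatized_words = ["fetch", "user", "record", "database"]
    code_identifiers_lower = ["get_user_data"]
    code_split_words = ["get", "user", "data"]

    all_keywords = lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
    keyword_freq = Counter(all_keywords)
    print(keyword_freq.most_common(5))
    # [('get_user_data', 10), ('user', 4), ('get', 3), ('data', 3), ('fetch', 1)]
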