cicada-mcp 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/ascii_art.py +60 -0
- cicada/clean.py +195 -60
- cicada/cli.py +757 -0
- cicada/colors.py +27 -0
- cicada/command_logger.py +14 -16
- cicada/dead_code_analyzer.py +12 -19
- cicada/extractors/__init__.py +6 -6
- cicada/extractors/base.py +3 -3
- cicada/extractors/call.py +11 -15
- cicada/extractors/dependency.py +39 -51
- cicada/extractors/doc.py +8 -9
- cicada/extractors/function.py +12 -24
- cicada/extractors/module.py +11 -15
- cicada/extractors/spec.py +8 -12
- cicada/find_dead_code.py +15 -39
- cicada/formatter.py +37 -91
- cicada/git_helper.py +22 -34
- cicada/indexer.py +165 -132
- cicada/interactive_setup.py +490 -0
- cicada/keybert_extractor.py +286 -0
- cicada/keyword_search.py +22 -30
- cicada/keyword_test.py +127 -0
- cicada/lightweight_keyword_extractor.py +5 -13
- cicada/mcp_entry.py +683 -0
- cicada/mcp_server.py +110 -232
- cicada/parser.py +9 -9
- cicada/pr_finder.py +15 -19
- cicada/pr_indexer/__init__.py +3 -3
- cicada/pr_indexer/cli.py +4 -9
- cicada/pr_indexer/github_api_client.py +22 -37
- cicada/pr_indexer/indexer.py +17 -29
- cicada/pr_indexer/line_mapper.py +8 -12
- cicada/pr_indexer/pr_index_builder.py +22 -34
- cicada/setup.py +198 -89
- cicada/utils/__init__.py +9 -9
- cicada/utils/call_site_formatter.py +4 -6
- cicada/utils/function_grouper.py +4 -4
- cicada/utils/hash_utils.py +12 -15
- cicada/utils/index_utils.py +15 -15
- cicada/utils/path_utils.py +24 -29
- cicada/utils/signature_builder.py +3 -3
- cicada/utils/subprocess_runner.py +17 -19
- cicada/utils/text_utils.py +1 -2
- cicada/version_check.py +2 -5
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/METADATA +144 -55
- cicada_mcp-0.2.0.dist-info/RECORD +53 -0
- cicada_mcp-0.2.0.dist-info/entry_points.txt +4 -0
- cicada/install.py +0 -741
- cicada_mcp-0.1.5.dist-info/RECORD +0 -47
- cicada_mcp-0.1.5.dist-info/entry_points.txt +0 -9
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/top_level.txt +0 -0
cicada/keybert_extractor.py ADDED
@@ -0,0 +1,286 @@
+"""
+Keyword Extraction using KeyBERT
+Semantic keyword extraction using transformer-based embeddings
+"""
+
+import re
+import sys
+from typing import Any
+
+from cicada.utils import split_camel_snake_case
+
+
+class KeyBERTExtractor:
+    """Extract keywords from text using KeyBERT semantic analysis."""
+
+    # Model configurations for different performance tiers
+    KEYBERT_MODELS = {
+        "fast": "all-MiniLM-L6-v2",  # 80MB, fast extraction
+        "regular": "BAAI/bge-small-en-v1.5",  # 133MB, balanced
+        "max": "paraphrase-mpnet-base-v2",  # 420MB, highest quality
+    }
+
+    # Weighting strategy constants for keyword extraction
+    # These control how different types of keywords are prioritized
+    KEYBERT_CANDIDATE_MULTIPLIER = 3  # Extract 3x keywords for weighted reranking
+    CODE_IDENTIFIER_BOOST = 10  # 10x weight for exact code identifiers (e.g., function names)
+    CODE_SPLIT_WORD_BOOST = 3  # 3x weight for identifier components (e.g., "user" from "getUserId")
+    BASE_SCORE_IDENTIFIER = 0.5  # Base score for identifiers not found by BERT
+    BASE_SCORE_SPLIT_WORD = 0.3  # Base score for split words not found by BERT
+
+    # Class variable to hold KeyBERT class (lazily loaded)
+    _KeyBERT: type | None = None
+
+    def __init__(self, verbose: bool = False, model_tier: str | None = None):
+        """
+        Initialize KeyBERT model.
+
+        Args:
+            verbose: If True, print status messages during initialization
+            model_tier: Model tier to use ('fast', 'regular', or 'max').
+                If None, must be specified via config file.
+
+        Raises:
+            ImportError: If KeyBERT is not installed
+            ValueError: If model_tier is invalid or not specified
+            RuntimeError: If model loading fails
+        """
+        self.verbose = verbose
+
+        # Validate model tier first
+        if model_tier and model_tier not in self.KEYBERT_MODELS:
+            raise ValueError(
+                f"Invalid model tier '{model_tier}'. "
+                f"Must be one of: {', '.join(self.KEYBERT_MODELS.keys())}"
+            )
+
+        if model_tier is None:
+            raise ValueError(
+                "model_tier must be specified. Pass it directly or load from config file."
+            )
+
+        self.model_tier = model_tier
+        self.model_name = self.KEYBERT_MODELS[model_tier]
+
+        # Print message BEFORE the slow import
+        if self.verbose:
+            print(
+                f"Loading KeyBERT model ({model_tier}: {self.model_name})",
+                file=sys.stderr,
+            )
+            print("This can take up to a couple of minutes.", file=sys.stderr)
+
+        # Lazy import KeyBERT (only once per class)
+        # This import can take significant time on first load
+        if KeyBERTExtractor._KeyBERT is None:
+            try:
+                from keybert import KeyBERT
+
+                KeyBERTExtractor._KeyBERT = KeyBERT
+            except ImportError as e:
+                raise ImportError(
+                    "KeyBERT is not installed. Install it with:\n"
+                    " uv add keybert\n"
+                    "or\n"
+                    " pip install keybert"
+                ) from e
+
+        # Initialize KeyBERT with the selected model
+        # Assume model is pre-downloaded (user will handle caching separately)
+        try:
+            self.kw_model = KeyBERTExtractor._KeyBERT(model=self.model_name)
+            if self.verbose:
+                print("✓ Model loaded successfully", file=sys.stderr)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to load KeyBERT model '{self.model_name}'. "
+                f"Ensure the model is downloaded and available. Error: {e}"
+            ) from e
+
+    def extract_code_identifiers(self, text: str) -> tuple[list[str], list[str]]:
+        """
+        Extract code-specific identifiers and their split words.
+
+        Returns a tuple of (identifiers, split_words) where:
+        - identifiers: original camelCase/PascalCase/snake_case identifiers
+        - split_words: individual words extracted from those identifiers
+        """
+        # Match camelCase, snake_case, PascalCase, and mixed patterns
+        patterns = [
+            r"\b[a-z]+[A-Z][a-zA-Z]*\b",  # camelCase (e.g., getUserData)
+            r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b",  # Uppercase prefix + PascalCase (e.g., HTTPServer, XMLParser)
+            r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b",  # PascalCase (e.g., UserController, PostgreSQL)
+            r"\b[a-z]+_[a-z_]+\b",  # snake_case (e.g., get_user_data)
+            r"\b[A-Z]{2,}\b",  # All UPPERCASE (e.g., HTTP, API, SQL)
+        ]
+
+        identifiers = []
+        for pattern in patterns:
+            matches = re.findall(pattern, text)
+            identifiers.extend(matches)
+
+        identifiers = list(set(identifiers))
+
+        # Split identifiers into individual words
+        split_words = []
+        for identifier in identifiers:
+            split_text = split_camel_snake_case(identifier)
+            # Extract individual words (lowercase, length > 1)
+            words = [
+                word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
+            ]
+            split_words.extend(words)
+
+        return identifiers, list(set(split_words))
+
+    def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
+        """
+        Extract keywords and return a simple list of keyword strings.
+
+        Args:
+            text: Input text to analyze
+            top_n: Number of top keywords to return
+
+        Returns:
+            List of keyword strings (e.g., ['authentication', 'user', 'validate'])
+        """
+        if not text or not text.strip():
+            return []
+
+        try:
+            results = self.extract_keywords(text, top_n=top_n)
+            # Extract just the keyword strings from top_keywords tuples
+            return [keyword for keyword, _ in results["top_keywords"]]
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
+            return []
+
+    def extract_keywords(self, text: str, top_n: int = 15) -> dict[str, Any]:
+        """
+        Extract keywords using KeyBERT semantic analysis with code identifier emphasis.
+
+        Weighting strategy:
+        - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
+        - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
+        - KeyBERT semantic keywords: Base score from embedding similarity
+
+        Magic numbers explained:
+        - 3x multiplier: For candidate selection (top_n * 3) to have enough keywords
+          before applying weights. This ensures we don't miss important keywords that
+          might rank higher after code identifier boosting.
+        - 0.5 base score: Default confidence for code identifiers not found by KeyBERT.
+          After 10x boost, gives them a score of 5.0, prioritizing them over most
+          regular keywords.
+        - 0.3 base score: Default confidence for code split words not found by KeyBERT.
+          After 3x boost, gives them a score of 0.9, placing them between regular
+          keywords (0.4-0.7) and full identifiers (5.0).
+
+        Args:
+            text: Input text to analyze
+            top_n: Number of top keywords to return
+
+        Returns:
+            Dictionary with extracted keywords and analysis:
+            - top_keywords: List of (keyword, score) tuples, sorted by weighted score
+            - code_identifiers: Original identifiers (weighted 10x)
+            - code_split_words: Words extracted from identifiers (weighted 3x)
+            - noun_chunks: 2-word phrases from KeyBERT (if any)
+            - Other fields (nouns, verbs, etc.) are empty (KeyBERT doesn't do POS tagging)
+            - stats: Basic text statistics
+        """
+        if not text or not text.strip():
+            return {
+                "top_keywords": [],
+                "nouns": [],
+                "verbs": [],
+                "adjectives": [],
+                "proper_nouns": [],
+                "noun_chunks": [],
+                "entities": [],
+                "code_identifiers": [],
+                "code_split_words": [],
+                "tf_scores": {},
+                "stats": {
+                    "total_tokens": 0,
+                    "total_words": 0,
+                    "unique_words": 0,
+                    "sentences": 0,
+                },
+            }
+
+        # 1. Extract code identifiers and their split words
+        code_identifiers, code_split_words = self.extract_code_identifiers(text)
+
+        # 2. Use KeyBERT to extract semantic keywords
+        # Extract more than needed to have candidates for weighting
+        try:
+            # KeyBERT return type can vary, use type ignore for external library
+            keybert_keywords: list[tuple[str, float]] = self.kw_model.extract_keywords(  # type: ignore[assignment]
+                text,
+                top_n=top_n * self.KEYBERT_CANDIDATE_MULTIPLIER,
+                keyphrase_ngram_range=(1, 1),  # Single words only
+            )
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: KeyBERT extraction failed: {e}", file=sys.stderr)
+            keybert_keywords = []
+
+        # 3. Build weighted keyword scores
+        keyword_scores: dict[str, float] = {}
+
+        # Add KeyBERT keywords with their semantic similarity scores
+        for keyword, score in keybert_keywords:
+            keyword_lower: str = keyword.lower()
+            keyword_scores[keyword_lower] = score
+
+        # 4. Apply code identifier boosting
+        # Code identifiers get strong boost as they're likely important API/function names
+        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
+        for identifier in code_identifiers_lower:
+            if identifier in keyword_scores:
+                keyword_scores[identifier] *= self.CODE_IDENTIFIER_BOOST
+            else:
+                # Add with high base score if not found by KeyBERT
+                keyword_scores[identifier] = self.BASE_SCORE_IDENTIFIER * self.CODE_IDENTIFIER_BOOST
+
+        # 5. Apply split word boosting (lower than full identifiers)
+        # Split words are components of identifiers, somewhat important but less than full names
+        code_split_words_lower = [word.lower() for word in code_split_words]
+        for word in code_split_words_lower:
+            if word in keyword_scores:
+                keyword_scores[word] *= self.CODE_SPLIT_WORD_BOOST
+            else:
+                keyword_scores[word] = self.BASE_SCORE_SPLIT_WORD * self.CODE_SPLIT_WORD_BOOST
+
+        # 5. Sort by weighted score and take top_n
+        top_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
+
+        # 6. No noun_chunks since we're using single words only
+        noun_chunks = []
+
+        # 7. Calculate basic statistics
+        words = text.split()
+        unique_words = {w.lower() for w in words if w.isalpha()}
+        sentences = text.count(".") + text.count("!") + text.count("?")
+
+        stats = {
+            "total_tokens": len(words),
+            "total_words": len([w for w in words if w.isalpha()]),
+            "unique_words": len(unique_words),
+            "sentences": max(1, sentences),  # At least 1 sentence
+        }
+
+        return {
+            "top_keywords": top_keywords,
+            "nouns": [],  # KeyBERT doesn't extract POS tags
+            "verbs": [],
+            "adjectives": [],
+            "proper_nouns": [],
+            "noun_chunks": noun_chunks,
+            "entities": [],  # KeyBERT doesn't do NER
+            "code_identifiers": code_identifiers,
+            "code_split_words": code_split_words,
+            "tf_scores": {},  # Using semantic scores instead
+            "stats": stats,
+        }
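For orientation, a minimal usage sketch of the new extractor (assumes the keybert package and the chosen sentence-transformers model are already installed and downloaded; the sample text is invented for illustration):

    from cicada.keybert_extractor import KeyBERTExtractor

    # model_tier is required: "fast", "regular", or "max"
    extractor = KeyBERTExtractor(verbose=True, model_tier="fast")

    text = "getUserData validates the user session token"

    # Plain list of keyword strings
    print(extractor.extract_keywords_simple(text, top_n=5))

    # Full analysis: weighted (keyword, score) tuples plus code identifiers and stats
    details = extractor.extract_keywords(text, top_n=15)
    print(details["top_keywords"][:3])
    print(details["code_identifiers"])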
cicada/keyword_search.py CHANGED
@@ -9,9 +9,9 @@ Identifier names (function/module names) are given much higher weight than keywo
 Author: Cursor(Auto)
 """

-import re
 import fnmatch
-from typing import
+from typing import Any
+
 from rank_bm25 import BM25Okapi

 from cicada.utils import split_identifier
@@ -24,7 +24,7 @@ class KeywordSearcher:
     # When query keyword matches the function/module name, multiply the score by this
     IDENTIFIER_MATCH_BOOST = 10.0

-    def __init__(self, index:
+    def __init__(self, index: dict[str, Any]):
         """
         Initialize the keyword searcher.

@@ -35,7 +35,7 @@ class KeywordSearcher:
         self.bm25, self.document_map = self._initialize_bm25()

     @staticmethod
-    def _extract_identifier_name(document_info:
+    def _extract_identifier_name(document_info: dict[str, Any]) -> str:
         """
         Extract the core identifier name from document info.

@@ -169,8 +169,8 @@ class KeywordSearcher:
         return fnmatch.fnmatch(text.lower(), pattern.lower())

     def _expand_wildcard_keywords(
-        self, query_keywords:
-    ) ->
+        self, query_keywords: list[str], document_keywords: list[str]
+    ) -> list[str]:
         """
         Expand wildcard patterns to actual matching keywords from the document.

@@ -190,10 +190,10 @@ class KeywordSearcher:

     def _expand_wildcard_keywords_with_identifier(
         self,
-        query_keywords:
-        document_keywords:
+        query_keywords: list[str],
+        document_keywords: list[str],
         identifier_name: str,
-    ) ->
+    ) -> list[str]:
         """
         Expand wildcard patterns to actual matching keywords from the document and identifier name.

@@ -214,13 +214,11 @@ class KeywordSearcher:
                     break  # Only add each query keyword once

             # Also check against the full identifier name
-            if query_kw not in matched_keywords and self._match_wildcard(
-                query_kw, identifier_name
-            ):
+            if query_kw not in matched_keywords and self._match_wildcard(query_kw, identifier_name):
                 matched_keywords.append(query_kw)
         return matched_keywords

-    def _get_wildcard_scores(self, query_keywords:
+    def _get_wildcard_scores(self, query_keywords: list[str]) -> list[float]:
         """
         Calculate BM25-like scores for wildcard matching.

@@ -252,11 +250,11 @@ class KeywordSearcher:

         return scores

-    def _has_wildcards(self, keywords:
+    def _has_wildcards(self, keywords: list[str]) -> bool:
         """Check if any keywords contain wildcard patterns."""
         return any("*" in keyword for keyword in keywords)

-    def search(self, query_keywords:
+    def search(self, query_keywords: list[str], top_n: int = 5) -> list[dict[str, Any]]:
         """
         Search for modules and functions matching the given keywords.

@@ -313,9 +311,7 @@ class KeywordSearcher:
                 query_keywords_lower, doc_info["keywords"], identifier_name
             )
         else:
-            matched = self._count_matches(
-                query_keywords_lower, doc_info["keywords"]
-            )
+            matched = self._count_matches(query_keywords_lower, doc_info["keywords"])

         # Only include documents that match at least one query keyword
         if matched["score"] > 0:
@@ -368,7 +364,7 @@ class KeywordSearcher:
         return results[:top_n]

     def _apply_identifier_boost(
-        self, bm25_score: float, query_keywords:
+        self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
     ) -> float:
         """
         Apply boost to BM25 score if query keywords match the identifier name.
@@ -399,9 +395,7 @@ class KeywordSearcher:

         return bm25_score

-    def _count_matches(
-        self, query_keywords: List[str], item_keywords: List[str]
-    ) -> Dict[str, Any]:
+    def _count_matches(self, query_keywords: list[str], item_keywords: list[str]) -> dict[str, Any]:
         """
         Count matching keywords between query and item.

@@ -435,10 +429,10 @@ class KeywordSearcher:

     def _count_wildcard_matches(
         self,
-        query_keywords:
-        item_keywords:
+        query_keywords: list[str],
+        item_keywords: list[str],
         identifier_name: str | None = None,
-    ) ->
+    ) -> dict[str, Any]:
         """
         Count matching keywords between query and item using wildcard patterns.

@@ -462,9 +456,7 @@ class KeywordSearcher:
                 query_keywords, item_keywords_lower, identifier_name
             )
         else:
-            matched_keywords = self._expand_wildcard_keywords(
-                query_keywords, item_keywords_lower
-            )
+            matched_keywords = self._expand_wildcard_keywords(query_keywords, item_keywords_lower)

         score = len(matched_keywords)
         confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
@@ -476,7 +468,7 @@ class KeywordSearcher:
         }

     def _apply_identifier_boost_wildcard(
-        self, bm25_score: float, query_keywords:
+        self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
     ) -> float:
         """
         Apply boost to BM25 score if query keywords match the identifier name using wildcards.
@@ -509,7 +501,7 @@ class KeywordSearcher:
         return bm25_score

     def _calculate_name_coverage_penalty(
-        self, query_keywords:
+        self, query_keywords: list[str], doc_info: dict[str, Any]
     ) -> float:
         """
         Calculate penalty for functions whose names contain words NOT in the query.
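The keyword_search.py edits above modernize type hints (typing.List/Dict replaced by builtin list/dict generics, with from typing import Any) and reflow a few long calls; runtime behavior is unchanged. A rough calling sketch under assumptions — the index schema is not shown in this diff, so the document entries below are purely illustrative:

    from cicada.keyword_search import KeywordSearcher

    # Hypothetical index shape; the real index is produced by cicada's indexer
    index = {
        "MyApp.Accounts.get_user": {"keywords": ["get", "user", "account", "fetch"]},
    }

    searcher = KeywordSearcher(index)
    # Wildcards such as "user*" are supported; identifier-name matches get a 10x boost
    results = searcher.search(["user*", "fetch"], top_n=5)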
cicada/keyword_test.py ADDED
@@ -0,0 +1,127 @@
+"""
+Interactive keyword extraction testing module.
+
+Provides an interactive REPL for testing keyword extraction methods.
+"""
+
+import sys
+
+
+def run_keywords_interactive(method: str = "lemminflect", tier: str = "regular"):
+    """
+    Interactive keyword extraction testing mode.
+
+    Allows users to paste text and see extracted keywords in real-time
+    using the specified extraction method.
+
+    Args:
+        method: Extraction method ('lemminflect' or 'bert')
+        tier: Model tier ('fast', 'regular', or 'max')
+    """
+    print(f"\n{'='*70}")
+    print("🔍 Cicada Interactive Keyword Extraction Test")
+    print(f"{'='*70}")
+    print(f"Method: {method.upper()}")
+    print(f"Tier: {tier}")
+    print("\nPaste or type text, then press Ctrl-D (Unix) or Ctrl-Z+Enter (Windows)")
+    print("to extract keywords. Press Ctrl-C to exit.\n")
+    print(f"{'='*70}\n")
+
+    # Initialize keyword extractor
+    try:
+        if method == "bert":
+            from cicada.keybert_extractor import KeyBERTExtractor
+
+            extractor = KeyBERTExtractor(model_tier=tier, verbose=True)
+        else:
+            from cicada.lightweight_keyword_extractor import LightweightKeywordExtractor
+
+            extractor = LightweightKeywordExtractor(verbose=True)
+        print()  # Add newline after initialization
+    except Exception as e:
+        print(f"Error initializing keyword extractor: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Interactive loop
+    stdin_closed = False
+    try:
+        while True:
+            print("📝 Enter text (Ctrl-D or Ctrl-Z+Enter when done):")
+            print("-" * 70)
+
+            # Read multi-line input until EOF
+            lines = []
+            try:
+                while True:
+                    line = input()
+                    lines.append(line)
+            except EOFError:
+                # Check if this is the first EOF (stdin just closed)
+                if not lines and stdin_closed:
+                    # stdin is exhausted and we have no input - exit gracefully
+                    print("\n👋 No more input available. Exiting.")
+                    return
+                stdin_closed = True
+
+            text = "\n".join(lines)
+
+            if not text.strip():
+                # If stdin is closed and input is empty, exit
+                if stdin_closed:
+                    print("\n👋 No more input available. Exiting.")
+                    return
+                print("\n⚠️ Empty input. Please enter some text.\n")
+                continue
+
+            # Extract keywords
+            print("\n" + "=" * 70)
+            print("🔑 EXTRACTED KEYWORDS:")
+            print("=" * 70)
+
+            try:
+                # Get detailed results
+                results = extractor.extract_keywords(text, top_n=15)
+
+                # Display top keywords with scores
+                top_keywords = results.get("top_keywords", [])
+                if top_keywords and isinstance(top_keywords, list):
+                    print("\n📊 Top Keywords (with scores):")
+                    for i, item in enumerate(top_keywords, 1):
+                        if isinstance(item, (list, tuple)) and len(item) >= 2:
+                            keyword, score = item[0], item[1]
+                            print(f" {i:2}. {keyword:20s} (score: {score:.4f})")
+                else:
+                    print(" No keywords extracted.")
+
+                # Display code identifiers if any
+                code_identifiers = results.get("code_identifiers")
+                if code_identifiers and isinstance(code_identifiers, list):
+                    print("\n💻 Code Identifiers (10x weight):")
+                    for ident in code_identifiers:
+                        print(f" • {ident}")
+
+                # Display code split words if any
+                code_split_words = results.get("code_split_words")
+                if code_split_words and isinstance(code_split_words, list):
+                    print("\n🔤 Code Split Words (3x weight):")
+                    for word in code_split_words[:10]:  # Limit to 10
+                        print(f" • {word}")
+
+                # Display statistics
+                stats = results.get("stats")
+                if stats and isinstance(stats, dict):
+                    print("\n📈 Statistics:")
+                    print(f" • Total tokens: {stats.get('total_tokens', 0)}")
+                    print(f" • Total words: {stats.get('total_words', 0)}")
+                    print(f" • Unique words: {stats.get('unique_words', 0)}")
+                    if "sentences" in stats:
+                        print(f" • Sentences: {stats['sentences']}")
+
+            except Exception as e:
+                print(f"\n❌ Error extracting keywords: {e}", file=sys.stderr)
+
+            print("\n" + "=" * 70 + "\n")
+
+    except KeyboardInterrupt:
+        print("\n\n👋 Exiting interactive mode. Goodbye!")
+        sys.exit(0)
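The new module exposes a single entry point, run_keywords_interactive. A minimal sketch of calling it directly (argument values taken from its docstring; the 'bert' path additionally needs KeyBERT installed):

    from cicada.keyword_test import run_keywords_interactive

    # Starts a REPL: paste text, press Ctrl-D (Unix) or Ctrl-Z+Enter (Windows)
    # to extract keywords, Ctrl-C to exit
    run_keywords_interactive(method="bert", tier="fast")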
cicada/lightweight_keyword_extractor.py CHANGED
@@ -3,10 +3,10 @@ Lightweight Keyword Extraction using lemminflect
 Fast keyword extraction for programming documentation
 """

-from collections import Counter
 import re
 import sys
 import warnings
+from collections import Counter

 from cicada.utils import split_camel_snake_case

@@ -189,9 +189,7 @@ class LightweightKeywordExtractor:
         for identifier in identifiers:
             split_text = split_camel_snake_case(identifier)
             words = [
-                word.lower()
-                for word in split_text.split()
-                if len(word) > 1 and word.isalpha()
+                word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
             ]
             split_words.extend(words)
         return identifiers, list(set(split_words))
@@ -264,9 +262,7 @@ class LightweightKeywordExtractor:
             lemmatized_words.append(lemma)

         code_identifiers_lower = [ident.lower() for ident in code_identifiers]
-        all_keywords = (
-            lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
-        )
+        all_keywords = lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
         keyword_freq = Counter(all_keywords)
         top_keywords = keyword_freq.most_common(top_n)

@@ -274,9 +270,7 @@ class LightweightKeywordExtractor:
         # This ensures weighted keywords are included in the calculation
         total_words = len(all_keywords)
         if total_words > 0:
-            tf_scores = {
-                word: (freq / total_words) for word, freq in keyword_freq.items()
-            }
+            tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}
         else:
             tf_scores = {}

@@ -291,8 +285,6 @@ class LightweightKeywordExtractor:
             "lemmatized_words": list(set(lemmatized_words))[:20],
             "code_identifiers": code_identifiers,
             "code_split_words": code_split_words,
-            "tf_scores": dict(
-                sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
-            ),
+            "tf_scores": dict(sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]),
             "stats": stats,
         }
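The reflowed weighting line above keeps the original behavior: repeating a list before counting is what gives full identifiers a 10x and split words a 3x term frequency. A standalone sketch of that effect with toy values (not taken from the package):

    from collections import Counter

    lemmatized_words = ["validate", "session", "user"]
    code_identifiers_lower = ["getuserdata"]
    code_split_words = ["get", "user", "data"]

    # Same shape as the package's weighting: identifiers x10, split words x3
    all_keywords = lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
    print(Counter(all_keywords).most_common(3))
    # [('getuserdata', 10), ('user', 4), ('get', 3)]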