cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/_version_hash.py +4 -0
- cicada/cli.py +6 -748
- cicada/commands.py +1255 -0
- cicada/dead_code/__init__.py +1 -0
- cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
- cicada/dependency_analyzer.py +147 -0
- cicada/entry_utils.py +92 -0
- cicada/extractors/base.py +9 -9
- cicada/extractors/call.py +17 -20
- cicada/extractors/common.py +64 -0
- cicada/extractors/dependency.py +117 -235
- cicada/extractors/doc.py +2 -49
- cicada/extractors/function.py +10 -14
- cicada/extractors/keybert.py +228 -0
- cicada/extractors/keyword.py +191 -0
- cicada/extractors/module.py +6 -10
- cicada/extractors/spec.py +8 -56
- cicada/format/__init__.py +20 -0
- cicada/{ascii_art.py → format/ascii_art.py} +1 -1
- cicada/format/formatter.py +1145 -0
- cicada/git_helper.py +134 -7
- cicada/indexer.py +322 -89
- cicada/interactive_setup.py +251 -323
- cicada/interactive_setup_helpers.py +302 -0
- cicada/keyword_expander.py +437 -0
- cicada/keyword_search.py +208 -422
- cicada/keyword_test.py +383 -16
- cicada/mcp/__init__.py +10 -0
- cicada/mcp/entry.py +17 -0
- cicada/mcp/filter_utils.py +107 -0
- cicada/mcp/pattern_utils.py +118 -0
- cicada/{mcp_server.py → mcp/server.py} +819 -73
- cicada/mcp/tools.py +473 -0
- cicada/pr_finder.py +2 -3
- cicada/pr_indexer/indexer.py +3 -2
- cicada/setup.py +167 -35
- cicada/tier.py +225 -0
- cicada/utils/__init__.py +9 -2
- cicada/utils/fuzzy_match.py +54 -0
- cicada/utils/index_utils.py +9 -0
- cicada/utils/path_utils.py +18 -0
- cicada/utils/text_utils.py +52 -1
- cicada/utils/tree_utils.py +47 -0
- cicada/version_check.py +99 -0
- cicada/watch_manager.py +320 -0
- cicada/watcher.py +431 -0
- cicada_mcp-0.3.0.dist-info/METADATA +541 -0
- cicada_mcp-0.3.0.dist-info/RECORD +70 -0
- cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
- cicada/formatter.py +0 -864
- cicada/keybert_extractor.py +0 -286
- cicada/lightweight_keyword_extractor.py +0 -290
- cicada/mcp_entry.py +0 -683
- cicada/mcp_tools.py +0 -291
- cicada_mcp-0.2.0.dist-info/METADATA +0 -735
- cicada_mcp-0.2.0.dist-info/RECORD +0 -53
- cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
- /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
- /cicada/{colors.py → format/colors.py} +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
cicada/keybert_extractor.py
DELETED
@@ -1,286 +0,0 @@
"""
Keyword Extraction using KeyBERT
Semantic keyword extraction using transformer-based embeddings
"""

import re
import sys
from typing import Any

from cicada.utils import split_camel_snake_case


class KeyBERTExtractor:
    """Extract keywords from text using KeyBERT semantic analysis."""

    # Model configurations for different performance tiers
    KEYBERT_MODELS = {
        "fast": "all-MiniLM-L6-v2",  # 80MB, fast extraction
        "regular": "BAAI/bge-small-en-v1.5",  # 133MB, balanced
        "max": "paraphrase-mpnet-base-v2",  # 420MB, highest quality
    }

    # Weighting strategy constants for keyword extraction
    # These control how different types of keywords are prioritized
    KEYBERT_CANDIDATE_MULTIPLIER = 3  # Extract 3x keywords for weighted reranking
    CODE_IDENTIFIER_BOOST = 10  # 10x weight for exact code identifiers (e.g., function names)
    CODE_SPLIT_WORD_BOOST = 3  # 3x weight for identifier components (e.g., "user" from "getUserId")
    BASE_SCORE_IDENTIFIER = 0.5  # Base score for identifiers not found by BERT
    BASE_SCORE_SPLIT_WORD = 0.3  # Base score for split words not found by BERT

    # Class variable to hold KeyBERT class (lazily loaded)
    _KeyBERT: type | None = None

    def __init__(self, verbose: bool = False, model_tier: str | None = None):
        """
        Initialize KeyBERT model.

        Args:
            verbose: If True, print status messages during initialization
            model_tier: Model tier to use ('fast', 'regular', or 'max').
                If None, must be specified via config file.

        Raises:
            ImportError: If KeyBERT is not installed
            ValueError: If model_tier is invalid or not specified
            RuntimeError: If model loading fails
        """
        self.verbose = verbose

        # Validate model tier first
        if model_tier and model_tier not in self.KEYBERT_MODELS:
            raise ValueError(
                f"Invalid model tier '{model_tier}'. "
                f"Must be one of: {', '.join(self.KEYBERT_MODELS.keys())}"
            )

        if model_tier is None:
            raise ValueError(
                "model_tier must be specified. Pass it directly or load from config file."
            )

        self.model_tier = model_tier
        self.model_name = self.KEYBERT_MODELS[model_tier]

        # Print message BEFORE the slow import
        if self.verbose:
            print(
                f"Loading KeyBERT model ({model_tier}: {self.model_name})",
                file=sys.stderr,
            )
            print("This can take up to a couple of minutes.", file=sys.stderr)

        # Lazy import KeyBERT (only once per class)
        # This import can take significant time on first load
        if KeyBERTExtractor._KeyBERT is None:
            try:
                from keybert import KeyBERT

                KeyBERTExtractor._KeyBERT = KeyBERT
            except ImportError as e:
                raise ImportError(
                    "KeyBERT is not installed. Install it with:\n"
                    "  uv add keybert\n"
                    "or\n"
                    "  pip install keybert"
                ) from e

        # Initialize KeyBERT with the selected model
        # Assume model is pre-downloaded (user will handle caching separately)
        try:
            self.kw_model = KeyBERTExtractor._KeyBERT(model=self.model_name)
            if self.verbose:
                print("✓ Model loaded successfully", file=sys.stderr)
        except Exception as e:
            raise RuntimeError(
                f"Failed to load KeyBERT model '{self.model_name}'. "
                f"Ensure the model is downloaded and available. Error: {e}"
            ) from e

    def extract_code_identifiers(self, text: str) -> tuple[list[str], list[str]]:
        """
        Extract code-specific identifiers and their split words.

        Returns a tuple of (identifiers, split_words) where:
        - identifiers: original camelCase/PascalCase/snake_case identifiers
        - split_words: individual words extracted from those identifiers
        """
        # Match camelCase, snake_case, PascalCase, and mixed patterns
        patterns = [
            r"\b[a-z]+[A-Z][a-zA-Z]*\b",  # camelCase (e.g., getUserData)
            r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b",  # Uppercase prefix + PascalCase (e.g., HTTPServer, XMLParser)
            r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b",  # PascalCase (e.g., UserController, PostgreSQL)
            r"\b[a-z]+_[a-z_]+\b",  # snake_case (e.g., get_user_data)
            r"\b[A-Z]{2,}\b",  # All UPPERCASE (e.g., HTTP, API, SQL)
        ]

        identifiers = []
        for pattern in patterns:
            matches = re.findall(pattern, text)
            identifiers.extend(matches)

        identifiers = list(set(identifiers))

        # Split identifiers into individual words
        split_words = []
        for identifier in identifiers:
            split_text = split_camel_snake_case(identifier)
            # Extract individual words (lowercase, length > 1)
            words = [
                word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
            ]
            split_words.extend(words)

        return identifiers, list(set(split_words))

    def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
        """
        Extract keywords and return a simple list of keyword strings.

        Args:
            text: Input text to analyze
            top_n: Number of top keywords to return

        Returns:
            List of keyword strings (e.g., ['authentication', 'user', 'validate'])
        """
        if not text or not text.strip():
            return []

        try:
            results = self.extract_keywords(text, top_n=top_n)
            # Extract just the keyword strings from top_keywords tuples
            return [keyword for keyword, _ in results["top_keywords"]]
        except Exception as e:
            if self.verbose:
                print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
            return []

    def extract_keywords(self, text: str, top_n: int = 15) -> dict[str, Any]:
        """
        Extract keywords using KeyBERT semantic analysis with code identifier emphasis.

        Weighting strategy:
        - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
        - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
        - KeyBERT semantic keywords: Base score from embedding similarity

        Magic numbers explained:
        - 3x multiplier: For candidate selection (top_n * 3) to have enough keywords
          before applying weights. This ensures we don't miss important keywords that
          might rank higher after code identifier boosting.
        - 0.5 base score: Default confidence for code identifiers not found by KeyBERT.
          After 10x boost, gives them a score of 5.0, prioritizing them over most
          regular keywords.
        - 0.3 base score: Default confidence for code split words not found by KeyBERT.
          After 3x boost, gives them a score of 0.9, placing them between regular
          keywords (0.4-0.7) and full identifiers (5.0).

        Args:
            text: Input text to analyze
            top_n: Number of top keywords to return

        Returns:
            Dictionary with extracted keywords and analysis:
            - top_keywords: List of (keyword, score) tuples, sorted by weighted score
            - code_identifiers: Original identifiers (weighted 10x)
            - code_split_words: Words extracted from identifiers (weighted 3x)
            - noun_chunks: 2-word phrases from KeyBERT (if any)
            - Other fields (nouns, verbs, etc.) are empty (KeyBERT doesn't do POS tagging)
            - stats: Basic text statistics
        """
        if not text or not text.strip():
            return {
                "top_keywords": [],
                "nouns": [],
                "verbs": [],
                "adjectives": [],
                "proper_nouns": [],
                "noun_chunks": [],
                "entities": [],
                "code_identifiers": [],
                "code_split_words": [],
                "tf_scores": {},
                "stats": {
                    "total_tokens": 0,
                    "total_words": 0,
                    "unique_words": 0,
                    "sentences": 0,
                },
            }

        # 1. Extract code identifiers and their split words
        code_identifiers, code_split_words = self.extract_code_identifiers(text)

        # 2. Use KeyBERT to extract semantic keywords
        # Extract more than needed to have candidates for weighting
        try:
            # KeyBERT return type can vary, use type ignore for external library
            keybert_keywords: list[tuple[str, float]] = self.kw_model.extract_keywords(  # type: ignore[assignment]
                text,
                top_n=top_n * self.KEYBERT_CANDIDATE_MULTIPLIER,
                keyphrase_ngram_range=(1, 1),  # Single words only
            )
        except Exception as e:
            if self.verbose:
                print(f"Warning: KeyBERT extraction failed: {e}", file=sys.stderr)
            keybert_keywords = []

        # 3. Build weighted keyword scores
        keyword_scores: dict[str, float] = {}

        # Add KeyBERT keywords with their semantic similarity scores
        for keyword, score in keybert_keywords:
            keyword_lower: str = keyword.lower()
            keyword_scores[keyword_lower] = score

        # 4. Apply code identifier boosting
        # Code identifiers get strong boost as they're likely important API/function names
        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
        for identifier in code_identifiers_lower:
            if identifier in keyword_scores:
                keyword_scores[identifier] *= self.CODE_IDENTIFIER_BOOST
            else:
                # Add with high base score if not found by KeyBERT
                keyword_scores[identifier] = self.BASE_SCORE_IDENTIFIER * self.CODE_IDENTIFIER_BOOST

        # 5. Apply split word boosting (lower than full identifiers)
        # Split words are components of identifiers, somewhat important but less than full names
        code_split_words_lower = [word.lower() for word in code_split_words]
        for word in code_split_words_lower:
            if word in keyword_scores:
                keyword_scores[word] *= self.CODE_SPLIT_WORD_BOOST
            else:
                keyword_scores[word] = self.BASE_SCORE_SPLIT_WORD * self.CODE_SPLIT_WORD_BOOST

        # 6. Sort by weighted score and take top_n
        top_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

        # 7. No noun_chunks since we're using single words only
        noun_chunks = []

        # 8. Calculate basic statistics
        words = text.split()
        unique_words = {w.lower() for w in words if w.isalpha()}
        sentences = text.count(".") + text.count("!") + text.count("?")

        stats = {
            "total_tokens": len(words),
            "total_words": len([w for w in words if w.isalpha()]),
            "unique_words": len(unique_words),
            "sentences": max(1, sentences),  # At least 1 sentence
        }

        return {
            "top_keywords": top_keywords,
            "nouns": [],  # KeyBERT doesn't extract POS tags
            "verbs": [],
            "adjectives": [],
            "proper_nouns": [],
            "noun_chunks": noun_chunks,
            "entities": [],  # KeyBERT doesn't do NER
            "code_identifiers": code_identifiers,
            "code_split_words": code_split_words,
            "tf_scores": {},  # Using semantic scores instead
            "stats": stats,
        }
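
For orientation, here is a minimal usage sketch of the KeyBERTExtractor class above. It is illustrative only: the sample text and expected ranking are hypothetical, the import path is the 0.2.0 module removed in this diff (0.3.0 adds cicada/extractors/keybert.py per the file list above), and it assumes KeyBERT and the selected sentence-transformer model are already installed and downloaded.

    # Hypothetical usage sketch; not part of the package diff itself.
    from cicada.keybert_extractor import KeyBERTExtractor  # 0.2.0 module path

    extractor = KeyBERTExtractor(verbose=False, model_tier="fast")  # loads all-MiniLM-L6-v2
    keywords = extractor.extract_keywords_simple(
        "getUserData validates the session token before login", top_n=5
    )
    # The 10x identifier boost and 3x split-word boost push "getuserdata" and its
    # components ("user", "data", ...) ahead of purely semantic keywords.
    print(keywords)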
cicada/lightweight_keyword_extractor.py
DELETED
@@ -1,290 +0,0 @@
"""
Lightweight Keyword Extraction using lemminflect
Fast keyword extraction for programming documentation
"""

import re
import sys
import warnings
from collections import Counter

from cicada.utils import split_camel_snake_case


class LightweightKeywordExtractor:
    """Extract keywords from text using lightweight lemmatization."""

    STOPWORDS = {
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "as", "is", "are", "was", "were", "be",
        "been", "being", "have", "has", "had", "do", "does", "did", "will",
        "would", "should", "could", "this", "that", "these", "those", "it",
        "its", "they", "them", "their", "what", "which", "who", "when",
        "where", "why", "how", "all", "each", "every", "both", "few", "more",
        "most", "other", "some", "such", "no", "nor", "not", "only", "own",
        "same", "so", "than", "too", "very", "can", "just", "up", "out",
    }

    # Pre-compiled regex patterns for code identifier extraction
    CODE_PATTERNS = [
        re.compile(r"\b[a-z]+[A-Z][a-zA-Z]*\b"),  # camelCase
        re.compile(r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b"),  # HTTPServer
        re.compile(r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b"),  # PascalCase
        re.compile(r"\b[a-z]+_[a-z_]+\b"),  # snake_case
        re.compile(r"\b[A-Z]{2,}\b"),  # UPPERCASE
    ]

    # Pre-compiled tokenization pattern
    TOKEN_PATTERN = re.compile(r"\b[a-zA-Z][a-zA-Z0-9_]*\b")

    def __init__(self, verbose: bool = False, model_size: str = "small"):
        """
        Initialize lightweight keyword extractor.

        Args:
            verbose: If True, print status messages during initialization
            model_size: Deprecated parameter kept for backward compatibility.
                This parameter is ignored in the lightweight extractor.
        """
        self.verbose = verbose
        self.model_size = model_size
        self._lemminflect_loaded = False

        # Deprecation warning for model_size parameter
        if model_size != "small":
            warnings.warn(
                "The 'model_size' parameter is deprecated and ignored in LightweightKeywordExtractor. "
                "The lightweight extractor does not use size-based models.",
                DeprecationWarning,
                stacklevel=2,
            )

    def _load_lemminflect(self):
        """Lazy load lemminflect library."""
        if self._lemminflect_loaded:
            return
        try:
            import lemminflect

            self._lemminflect = lemminflect
            self._lemminflect_loaded = True
            if self.verbose:
                print("✓ lemminflect loaded", file=sys.stderr)
        except ImportError as e:
            raise RuntimeError(
                "lemminflect is required but not installed. "
                "Please install it with: uv pip install lemminflect"
            ) from e

    def _tokenize(self, text: str) -> list[str]:
        """Tokenize text into words."""
        tokens = self.TOKEN_PATTERN.findall(text)
        return tokens

    def _lemmatize(self, word: str) -> str:
        """
        Lemmatize a word using lemminflect with fallback.

        Tries lemmatization with VERB, NOUN, and ADJ POS tags.
        Falls back to lowercase if lemmatization fails.

        Args:
            word: Word to lemmatize

        Returns:
            Lemmatized word (lowercase)
        """
        try:
            # Try different POS tags for better coverage
            for pos in ["VERB", "NOUN", "ADJ"]:
                lemma = self._lemminflect.getLemma(word, upos=pos)
                if lemma:
                    return lemma[0].lower()
            # Fallback to lowercase if no lemma found
            return word.lower()
        except Exception:
            # Graceful fallback if lemminflect fails
            return word.lower()

    def extract_code_identifiers(self, text):
        """
        Extract code-specific identifiers and their split words.

        Returns a tuple of (identifiers, split_words) where:
        - identifiers: original camelCase/PascalCase/snake_case identifiers
        - split_words: individual words extracted from those identifiers
        """
        identifiers = []
        for pattern in self.CODE_PATTERNS:
            matches = pattern.findall(text)
            identifiers.extend(matches)
        identifiers = list(set(identifiers))

        split_words = []
        for identifier in identifiers:
            split_text = split_camel_snake_case(identifier)
            words = [
                word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
            ]
            split_words.extend(words)
        return identifiers, list(set(split_words))

    def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
        """
        Extract keywords and return a simple list of keyword strings.

        Args:
            text: Input text to analyze
            top_n: Number of top keywords to return

        Returns:
            List of keyword strings (e.g., ['authentication', 'user', 'validate'])
        """
        if not text or not text.strip():
            return []
        try:
            results = self.extract_keywords(text, top_n=top_n)
            return [keyword for keyword, _ in results["top_keywords"]]
        except Exception as e:
            if self.verbose:
                print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
            return []

    def extract_keywords(self, text, top_n=15):
        """
        Extract keywords using multiple strategies with emphasis on code identifiers.

        Weighting strategy:
        - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
        - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
        - Regular lemmatized words: 1x weight

        Args:
            text: Input text to analyze
            top_n: Number of top keywords to return

        Returns:
            Dictionary with extracted keywords and analysis:
            - top_keywords: List of (keyword, count) tuples, sorted by frequency
            - lemmatized_words: Regular words after lemmatization
            - code_identifiers: Original identifiers (weighted 10x)
            - code_split_words: Words extracted from identifiers (weighted 3x)
            - tf_scores: Term frequency scores
            - stats: Text statistics
        """
        if not text or not text.strip():
            return {
                "top_keywords": [],
                "lemmatized_words": [],
                "code_identifiers": [],
                "code_split_words": [],
                "tf_scores": {},
                "stats": {
                    "total_tokens": 0,
                    "total_words": 0,
                    "unique_words": 0,
                },
            }

        self._load_lemminflect()
        code_identifiers, code_split_words = self.extract_code_identifiers(text)
        tokens = self._tokenize(text)
        lemmatized_words = []
        for word in tokens:
            word_lower = word.lower()
            if len(word) > 2 and word_lower not in self.STOPWORDS:
                lemma = self._lemmatize(word)
                lemmatized_words.append(lemma)

        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
        all_keywords = lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
        keyword_freq = Counter(all_keywords)
        top_keywords = keyword_freq.most_common(top_n)

        # Fix: Calculate TF scores based on all keywords, not just lemmatized_words
        # This ensures weighted keywords are included in the calculation
        total_words = len(all_keywords)
        if total_words > 0:
            tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}
        else:
            tf_scores = {}

        stats = {
            "total_tokens": len(tokens),
            "total_words": len(lemmatized_words),
            "unique_words": len(set(lemmatized_words)),
        }

        return {
            "top_keywords": top_keywords,
            "lemmatized_words": list(set(lemmatized_words))[:20],
            "code_identifiers": code_identifiers,
            "code_split_words": code_split_words,
            "tf_scores": dict(sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]),
            "stats": stats,
        }