cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/_version_hash.py +4 -0
- cicada/cli.py +6 -748
- cicada/commands.py +1255 -0
- cicada/dead_code/__init__.py +1 -0
- cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
- cicada/dependency_analyzer.py +147 -0
- cicada/entry_utils.py +92 -0
- cicada/extractors/base.py +9 -9
- cicada/extractors/call.py +17 -20
- cicada/extractors/common.py +64 -0
- cicada/extractors/dependency.py +117 -235
- cicada/extractors/doc.py +2 -49
- cicada/extractors/function.py +10 -14
- cicada/extractors/keybert.py +228 -0
- cicada/extractors/keyword.py +191 -0
- cicada/extractors/module.py +6 -10
- cicada/extractors/spec.py +8 -56
- cicada/format/__init__.py +20 -0
- cicada/{ascii_art.py → format/ascii_art.py} +1 -1
- cicada/format/formatter.py +1145 -0
- cicada/git_helper.py +134 -7
- cicada/indexer.py +322 -89
- cicada/interactive_setup.py +251 -323
- cicada/interactive_setup_helpers.py +302 -0
- cicada/keyword_expander.py +437 -0
- cicada/keyword_search.py +208 -422
- cicada/keyword_test.py +383 -16
- cicada/mcp/__init__.py +10 -0
- cicada/mcp/entry.py +17 -0
- cicada/mcp/filter_utils.py +107 -0
- cicada/mcp/pattern_utils.py +118 -0
- cicada/{mcp_server.py → mcp/server.py} +819 -73
- cicada/mcp/tools.py +473 -0
- cicada/pr_finder.py +2 -3
- cicada/pr_indexer/indexer.py +3 -2
- cicada/setup.py +167 -35
- cicada/tier.py +225 -0
- cicada/utils/__init__.py +9 -2
- cicada/utils/fuzzy_match.py +54 -0
- cicada/utils/index_utils.py +9 -0
- cicada/utils/path_utils.py +18 -0
- cicada/utils/text_utils.py +52 -1
- cicada/utils/tree_utils.py +47 -0
- cicada/version_check.py +99 -0
- cicada/watch_manager.py +320 -0
- cicada/watcher.py +431 -0
- cicada_mcp-0.3.0.dist-info/METADATA +541 -0
- cicada_mcp-0.3.0.dist-info/RECORD +70 -0
- cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
- cicada/formatter.py +0 -864
- cicada/keybert_extractor.py +0 -286
- cicada/lightweight_keyword_extractor.py +0 -290
- cicada/mcp_entry.py +0 -683
- cicada/mcp_tools.py +0 -291
- cicada_mcp-0.2.0.dist-info/METADATA +0 -735
- cicada_mcp-0.2.0.dist-info/RECORD +0 -53
- cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
- /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
- /cicada/{colors.py → format/colors.py} +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Keyword expansion using lemminflect and word embeddings.
|
|
3
|
+
|
|
4
|
+
This module provides keyword expansion through three strategies:
|
|
5
|
+
1. Lemmi: Generate inflected forms using lemminflect (e.g., run → running, runs, ran)
|
|
6
|
+
2. GloVe: Semantic similarity via GloVe + inflected forms
|
|
7
|
+
3. FastText: Semantic similarity via FastText + inflected forms
|
|
8
|
+
|
|
9
|
+
Example:
|
|
10
|
+
>>> expander = KeywordExpander(expansion_type="glove", verbose=True)
|
|
11
|
+
>>> expanded = expander.expand_keywords(["database"], top_n=3)
|
|
12
|
+
>>> print(expanded)
|
|
13
|
+
['database', 'databases', 'mysql', 'postgresql', 'storage']
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class KeywordExpander:
    """
    Expands keywords using lemminflect and optionally word embeddings.

    Supports three expansion strategies:
    - lemmi: Inflected forms only (run → running, runs, ran)
    - glove: GloVe embeddings + inflected forms (128MB)
    - fasttext: FastText embeddings + inflected forms (958MB)

    The lemminflect module and any loaded embedding model are cached on the
    class, so several expanders in one process share a single copy.
    """

    # Class-level cache for loaded models (keyed by expansion type)
    _model_cache: dict[str, Any] = {}
    _lemminflect_cache: Any = None

    # Model configurations for word embeddings
    EMBEDDING_MODELS: dict[str, dict[str, Any]] = {
        "glove": {
            "name": "glove-wiki-gigaword-100",
            "size_mb": 128,
            "description": "GloVe embeddings (100-dim, fast)",
        },
        "fasttext": {
            "name": "fasttext-wiki-news-subwords-300",
            "size_mb": 958,
            "description": "FastText embeddings (300-dim, better rare words)",
        },
    }

    # Expansion penalty multipliers (reduce noise from derived terms)
    INFLECTION_PENALTY = 0.7  # Penalty for morphological variations (run → runs, running)
    SEMANTIC_EXPANSION_PENALTY = 0.9  # Penalty for embedding-based similar words
    # Kept so existing references keep working; expand_keywords() limits the
    # number of similar words with its ``top_n`` argument, not this constant.
    TOP_N = 5

    def __init__(self, expansion_type: str = "lemmi", verbose: bool = False):
        """
        Initialize keyword expander.

        Args:
            expansion_type: Type of expansion to perform.
                - "lemmi": Lemminflect inflections only
                - "glove": GloVe embeddings + lemminflect (128MB)
                - "fasttext": FastText embeddings + lemminflect (958MB)
            verbose: Whether to print progress messages during model loading.

        Raises:
            ValueError: If expansion_type is not supported.
        """
        if expansion_type not in ("lemmi", "glove", "fasttext"):
            raise ValueError(
                f"Unsupported expansion_type: {expansion_type}. "
                f"Supported types: lemmi, glove, fasttext"
            )

        self.expansion_type = expansion_type
        self.verbose = verbose
        # Populated lazily on the first embedding-based expansion.
        self._embedding_model: Any = None

    def _load_lemminflect(self) -> Any:
        """
        Lazy-load lemminflect library (cached at class level).

        Returns:
            lemminflect module

        Raises:
            ImportError: If lemminflect is not installed.
        """
        if KeywordExpander._lemminflect_cache is not None:
            return KeywordExpander._lemminflect_cache

        try:
            import lemminflect
        except ImportError as e:
            raise ImportError(
                "lemminflect is required for keyword expansion. "
                "Install with: pip install lemminflect"
            ) from e

        KeywordExpander._lemminflect_cache = lemminflect
        if self.verbose:
            print("✓ lemminflect loaded")
        return lemminflect

    def _load_embedding_model(self) -> Any:
        """
        Lazy-load word embedding model (glove or fasttext).

        Returns:
            Loaded gensim KeyedVectors model, or None when the expander is
            configured for "lemmi" (no embeddings needed).

        Raises:
            ImportError: If gensim is not installed.
            RuntimeError: If model download or loading fails.
        """
        # Only load embeddings for glove/fasttext (not for lemmi-only)
        if self.expansion_type == "lemmi":
            return None

        # Check class-level cache first
        if self.expansion_type in self._model_cache:
            if self.verbose:
                print(f"✓ Using cached {self.expansion_type} model")
            return self._model_cache[self.expansion_type]

        try:
            import gensim.downloader as api
        except ImportError as e:
            raise ImportError(
                "gensim is required for embedding-based expansion. "
                "Install with: pip install gensim"
            ) from e

        model_config = self.EMBEDDING_MODELS[self.expansion_type]
        model_name: str = model_config["name"]
        size_mb: int = model_config["size_mb"]

        if self.verbose:
            from pathlib import Path

            # Ask gensim where it stores downloads (it honours GENSIM_DATA_DIR)
            # instead of assuming ~/gensim-data; fall back to that default if
            # the attribute is unavailable.
            data_root = getattr(api, "BASE_DIR", None) or (Path.home() / "gensim-data")
            if (Path(data_root) / model_name).exists():
                print(f"Loading {self.expansion_type} model from cache...")
            else:
                print(f"Loading {self.expansion_type} model...")
                print(f"Downloading {size_mb}MB (first time only)...")
                print("Progress bar will appear during download.")

        try:
            # Load model using gensim's downloader
            model = api.load(model_name)
        except Exception as e:
            raise RuntimeError(f"Failed to load {self.expansion_type} model: {e}") from e

        # Cache at class level so other instances reuse the loaded model
        self._model_cache[self.expansion_type] = model

        if self.verbose:
            print(f"✓ {self.expansion_type.title()} model loaded successfully")

        return model

    def _get_inflections(self, word: str) -> set[str]:
        """
        Generate inflected forms of a word using lemminflect.

        Args:
            word: Base word to inflect

        Returns:
            Set of lowercased inflected forms (including the original word)
        """
        lemminflect = self._load_lemminflect()
        inflections = {word.lower()}

        # Try different POS tags to get comprehensive inflections
        for pos in ("VERB", "NOUN", "ADJ"):
            try:
                forms_by_tag = lemminflect.getAllInflections(word, upos=pos)
                if forms_by_tag:
                    for forms in forms_by_tag.values():
                        inflections.update(form.lower() for form in forms)
            except Exception:
                # Best effort: a failure for one part of speech must not
                # discard the forms found for the others.
                continue

        return inflections

    def _add_inflections(
        self,
        word_details: dict[str, dict[str, Any]],
        word: str,
        base_score: float,
        source: str,
        parent: str,
    ) -> None:
        """Record inflections of *word* not already known, at a penalised score."""
        for inflection in self._get_inflections(word):
            if inflection not in word_details:
                word_details[inflection] = {
                    "score": base_score * self.INFLECTION_PENALTY,
                    "source": source,
                    "parent": parent,
                }

    @staticmethod
    def _select_similar(
        candidates: Any, top_n: int, threshold: float
    ) -> list[tuple[str, float]]:
        """Keep candidates scoring at least *threshold*, capped at *top_n*, in ranked order."""
        if top_n <= 0:
            return []
        accepted = [(word, score) for word, score in candidates if score >= threshold]
        return accepted[:top_n]

    @staticmethod
    def _meets_min_score(details: dict[str, Any], min_score: float) -> bool:
        """Return True when the entry has a numeric score of at least *min_score*."""
        try:
            return float(details.get("score")) >= min_score  # type: ignore[arg-type]
        except (TypeError, ValueError):
            # Non-numeric scores cannot satisfy a numeric threshold.
            return False

    def expand_keywords(
        self,
        keywords: list[str],
        top_n: int = 3,
        threshold: float = 0.7,
        return_scores: bool = False,
        keyword_scores: dict[str, float] | None = None,
        min_score: float = 0.0,
        code_identifiers: list[str] | None = None,
    ) -> list[str] | dict[str, Any]:
        """
        Expand keywords using the configured expansion strategy.

        Strategy behavior:
        - lemmi: Returns keywords + inflected forms
        - glove/fasttext: Returns keywords + semantically similar words + inflected forms for all
        - Code identifiers are NOT inflected or expanded (kept as exact matches)

        Scoring: original keywords and their split components carry the
        extraction score; inflections carry the parent score times
        INFLECTION_PENALTY; embedding matches carry parent score × similarity ×
        SEMANTIC_EXPANSION_PENALTY. When a word is reachable several ways, the
        first derivation found wins (original keywords always win).

        Args:
            keywords: List of seed keywords to expand.
            top_n: Maximum number of similar words to return per keyword (for embeddings).
            threshold: Minimum cosine similarity score for embeddings (0.0 to 1.0).
            return_scores: If True, return detailed dict with scores. If False, return simple list.
            keyword_scores: Optional dict mapping keywords to their extraction scores.
                           These scores multiply with expansion similarity scores.
            min_score: Minimum score threshold for expanded keywords (filters out
                      low-scoring terms). Applied in both return modes.
            code_identifiers: List of code identifiers that should NOT be inflected or expanded.

        Returns:
            If return_scores=False: Sorted, deduplicated list of expanded keywords (flat list).
            If return_scores=True: Dict with 'words' (list of dicts with
            word/score/source and, for derived words, parent) and 'simple' (flat list).

        Example:
            >>> expander = KeywordExpander(expansion_type="lemmi")
            >>> result = expander.expand_keywords(["run", "database"])
            >>> # Returns a sorted list such as:
            >>> # ['database', 'databases', 'ran', 'run', 'running', 'runs', ...]

            >>> result = expander.expand_keywords(
            ...     ["run"], return_scores=True, keyword_scores={"run": 0.95}
            ... )
            >>> # Returns: {
            >>> #   'words': [
            >>> #     {'word': 'run', 'score': 0.95, 'source': 'original'},
            >>> #     {'word': 'running', 'score': 0.665, 'source': 'inflection', 'parent': 'run'}
            >>> #   ],
            >>> #   'simple': ['run', 'running', ...]
            >>> # }
        """
        from cicada.utils import split_camel_snake_case

        # Default all keyword scores to 1.0 if not provided; keys are lowercased
        if keyword_scores is None:
            extraction_scores = {kw.lower(): 1.0 for kw in keywords}
        else:
            extraction_scores = {k.lower(): v for k, v in keyword_scores.items()}

        # Lowercased set of identifiers that must stay exact matches
        exact_only = {ident.lower() for ident in code_identifiers} if code_identifiers else set()

        # word -> {score, source[, parent]}. Always tracked (not only when
        # return_scores is set) so min_score can filter the flat list as well.
        word_details: dict[str, dict[str, Any]] = {}

        # Step 1: Add each keyword, then split compound identifiers
        # (e.g., get_keys → get, keys) and add the components.
        split_keywords: list[str] = []
        for keyword in keywords:
            word_lower = keyword.lower()
            extraction_score = extraction_scores.get(word_lower, 1.0)
            word_details[word_lower] = {"score": extraction_score, "source": "original"}

            components = [
                part.lower()
                for part in split_camel_snake_case(keyword).split()
                if len(part) > 1 and part.isalpha()
            ]
            split_keywords.extend(components)
            for component in components:
                if component not in word_details:
                    # Split words inherit parent's extraction score
                    word_details[component] = {
                        "score": extraction_score,
                        "source": "split",
                        "parent": keyword,
                    }

        # Deduplicate while preserving order so that score/parent attribution is
        # reproducible between runs (set ordering of strings is hash-randomised).
        all_keywords = list(dict.fromkeys([*keywords, *split_keywords]))

        # Step 2: Generate inflections for all keywords (original + split),
        # skipping code identifiers (they should remain exact).
        for keyword in all_keywords:
            keyword_lower = keyword.lower()
            if keyword_lower in exact_only:
                continue
            parent_score = word_details.get(keyword_lower, {}).get("score", 1.0)
            self._add_inflections(word_details, keyword, parent_score, "inflection", keyword)

        # Step 3: If using embeddings, add semantically similar words + their inflections
        if self.expansion_type in ("glove", "fasttext"):
            # Lazy-load embedding model
            if self._embedding_model is None:
                self._embedding_model = self._load_embedding_model()
            model = self._embedding_model

            if model is not None and top_n > 0:
                for keyword in all_keywords:
                    keyword_lower = keyword.lower()
                    # Skip semantic expansion for code identifiers
                    if keyword_lower in exact_only:
                        continue

                    try:
                        # Over-fetch so that threshold filtering can still
                        # leave up to top_n survivors.
                        candidates = model.most_similar(keyword_lower, topn=top_n * 3)
                    except KeyError:
                        # Keyword not in vocabulary - skip embedding expansion silently
                        # (This is expected for many code identifiers)
                        continue

                    parent_score = word_details.get(keyword_lower, {}).get("score", 1.0)
                    for word, similarity in self._select_similar(candidates, top_n, threshold):
                        word_lower = word.lower()
                        if word_lower not in word_details:
                            # Final score = extraction score × similarity × semantic penalty
                            word_details[word_lower] = {
                                "score": (
                                    parent_score
                                    * float(similarity)
                                    * self.SEMANTIC_EXPANSION_PENALTY
                                ),
                                "source": "embedding",
                                "parent": keyword,
                            }

                        # Inflections of the similar word inherit whatever score
                        # that word ended up with, times the inflection penalty.
                        self._add_inflections(
                            word_details,
                            word,
                            word_details[word_lower]["score"],
                            "embedding_inflection",
                            word_lower,
                        )

        simple_list = sorted(word_details)

        # Apply min_score filter if specified
        if min_score > 0.0:
            simple_list = [
                word
                for word in simple_list
                if self._meets_min_score(word_details[word], min_score)
            ]

        if not return_scores:
            return simple_list

        return {
            "words": [{"word": word, **word_details[word]} for word in simple_list],
            "simple": simple_list,
        }

    def get_expansion_info(self) -> dict[str, Any]:
        """
        Get information about the expansion configuration.

        Returns:
            Dictionary with expansion metadata: always "type"; plus
            "embedding_vocab_size" and "embedding_vector_size" once an
            embedding model has been loaded on this instance.
        """
        info: dict[str, Any] = {
            "type": self.expansion_type,
        }

        if self.expansion_type != "lemmi" and self._embedding_model is not None:
            info["embedding_vocab_size"] = len(self._embedding_model.key_to_index)
            info["embedding_vector_size"] = self._embedding_model.vector_size

        return info
|