cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/_version_hash.py +4 -0
- cicada/cli.py +6 -748
- cicada/commands.py +1255 -0
- cicada/dead_code/__init__.py +1 -0
- cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
- cicada/dependency_analyzer.py +147 -0
- cicada/entry_utils.py +92 -0
- cicada/extractors/base.py +9 -9
- cicada/extractors/call.py +17 -20
- cicada/extractors/common.py +64 -0
- cicada/extractors/dependency.py +117 -235
- cicada/extractors/doc.py +2 -49
- cicada/extractors/function.py +10 -14
- cicada/extractors/keybert.py +228 -0
- cicada/extractors/keyword.py +191 -0
- cicada/extractors/module.py +6 -10
- cicada/extractors/spec.py +8 -56
- cicada/format/__init__.py +20 -0
- cicada/{ascii_art.py → format/ascii_art.py} +1 -1
- cicada/format/formatter.py +1145 -0
- cicada/git_helper.py +134 -7
- cicada/indexer.py +322 -89
- cicada/interactive_setup.py +251 -323
- cicada/interactive_setup_helpers.py +302 -0
- cicada/keyword_expander.py +437 -0
- cicada/keyword_search.py +208 -422
- cicada/keyword_test.py +383 -16
- cicada/mcp/__init__.py +10 -0
- cicada/mcp/entry.py +17 -0
- cicada/mcp/filter_utils.py +107 -0
- cicada/mcp/pattern_utils.py +118 -0
- cicada/{mcp_server.py → mcp/server.py} +819 -73
- cicada/mcp/tools.py +473 -0
- cicada/pr_finder.py +2 -3
- cicada/pr_indexer/indexer.py +3 -2
- cicada/setup.py +167 -35
- cicada/tier.py +225 -0
- cicada/utils/__init__.py +9 -2
- cicada/utils/fuzzy_match.py +54 -0
- cicada/utils/index_utils.py +9 -0
- cicada/utils/path_utils.py +18 -0
- cicada/utils/text_utils.py +52 -1
- cicada/utils/tree_utils.py +47 -0
- cicada/version_check.py +99 -0
- cicada/watch_manager.py +320 -0
- cicada/watcher.py +431 -0
- cicada_mcp-0.3.0.dist-info/METADATA +541 -0
- cicada_mcp-0.3.0.dist-info/RECORD +70 -0
- cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
- cicada/formatter.py +0 -864
- cicada/keybert_extractor.py +0 -286
- cicada/lightweight_keyword_extractor.py +0 -290
- cicada/mcp_entry.py +0 -683
- cicada/mcp_tools.py +0 -291
- cicada_mcp-0.2.0.dist-info/METADATA +0 -735
- cicada_mcp-0.2.0.dist-info/RECORD +0 -53
- cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
- /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
- /cicada/{colors.py → format/colors.py} +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
cicada/extractors/keybert.py
ADDED
@@ -0,0 +1,228 @@
+"""
+Keyword Extraction using KeyBERT
+Semantic keyword extraction using transformer-based embeddings
+"""
+
+import os
+import re
+import sys
+from collections import Counter
+from typing import Any
+
+# Disable tokenizers parallelism to avoid fork warnings
+# Must be set before importing transformers/keybert
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+from cicada.extractors.keyword import BaseKeywordExtractor
+from cicada.utils import extract_code_identifiers
+
+
+class KeyBERTExtractor(BaseKeywordExtractor):
+    """Extract keywords from text using KeyBERT semantic analysis."""
+
+    # Single model configuration
+    MODEL_NAME = "BAAI/bge-small-en-v1.5"  # 133MB, balanced performance
+
+    # Weighting strategy constants for keyword extraction
+    KEYBERT_CANDIDATE_MULTIPLIER = 3  # Extract 3x keywords for weighted reranking
+    CODE_IDENTIFIER_BOOST = 10  # 10x weight for exact code identifiers
+    CODE_SPLIT_WORD_BOOST = 3  # 3x weight for identifier components
+    BASE_SCORE_IDENTIFIER = 0.5  # Base score for identifiers not found by BERT
+    BASE_SCORE_SPLIT_WORD = 0.3  # Base score for split words not found by BERT
+
+    # Class variable to hold KeyBERT class (lazily loaded)
+    _KeyBERT: type | None = None
+
+    def __init__(self, verbose: bool = False):
+        super().__init__(verbose)
+        self.verbose = verbose
+
+        # Print message BEFORE the slow import
+        if self.verbose:
+            print(
+                f"Loading KeyBERT model ({self.MODEL_NAME})",
+                file=sys.stderr,
+            )
+            print("This can take up to a couple of minutes.", file=sys.stderr)
+
+        # Lazy import KeyBERT (only once per class)
+        if KeyBERTExtractor._KeyBERT is None:
+            try:
+                from keybert import KeyBERT
+
+                KeyBERTExtractor._KeyBERT = KeyBERT
+            except ImportError as e:
+                raise ImportError(
+                    "KeyBERT is not installed. Install it with:\n"
+                    " uv add keybert\n"
+                    "or\n"
+                    " pip install keybert"
+                ) from e
+
+        # Initialize KeyBERT with the model
+        try:
+            self.kw_model = KeyBERTExtractor._KeyBERT(model=self.MODEL_NAME)
+            if self.verbose:
+                print("✓ Model loaded successfully", file=sys.stderr)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to load KeyBERT model '{self.MODEL_NAME}'. "
+                f"Ensure the model is downloaded and available. Error: {e}"
+            ) from e
+
+    def _calculate_term_frequencies(self, text: str) -> dict[str, int]:
+        """Calculate term frequencies for all words in the text.
+
+        Args:
+            text: Input text to analyze
+
+        Returns:
+            Dictionary mapping lowercase words to their raw frequency counts
+        """
+        tokens = re.findall(r"\b[a-zA-Z][a-zA-Z0-9_]*\b", text.lower())
+        term_freq = Counter(tokens)
+        return dict(term_freq)
+
+    def _apply_code_boosting(
+        self,
+        keyword_scores: dict[str, float],
+        code_identifiers: list[str],
+        code_split_words: list[str],
+        tf_scores: dict[str, int],
+    ) -> None:
+        """Apply boosting to code identifiers and split words in-place.
+
+        Args:
+            keyword_scores: Dictionary of keyword scores to modify in-place
+            code_identifiers: List of code identifiers to boost
+            code_split_words: List of split words from identifiers to boost
+            tf_scores: Term frequency scores for calculating base scores
+        """
+        # Apply code identifier boosting
+        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
+        for identifier in code_identifiers_lower:
+            if identifier in keyword_scores:
+                keyword_scores[identifier] *= self.CODE_IDENTIFIER_BOOST
+            else:
+                # Add with base score × frequency if not found by KeyBERT
+                freq = tf_scores.get(identifier, 1)
+                keyword_scores[identifier] = (
+                    self.BASE_SCORE_IDENTIFIER * freq * self.CODE_IDENTIFIER_BOOST
+                )
+
+        # Apply split word boosting (but only if not already a code identifier)
+        code_split_words_lower = [word.lower() for word in code_split_words]
+        code_identifiers_set = set(code_identifiers_lower)  # For O(1) lookup
+        for word in code_split_words_lower:
+            # Skip words that are already code identifiers (avoid double-boosting)
+            if word in code_identifiers_set:
+                continue
+            if word in keyword_scores:
+                keyword_scores[word] *= self.CODE_SPLIT_WORD_BOOST
+            else:
+                freq = tf_scores.get(word, 1)
+                keyword_scores[word] = (
+                    self.BASE_SCORE_SPLIT_WORD * freq * self.CODE_SPLIT_WORD_BOOST
+                )
+
+    def _calculate_statistics(self, text: str) -> dict[str, int]:
+        """Calculate basic text statistics.
+
+        Args:
+            text: Input text to analyze
+
+        Returns:
+            Dictionary with basic statistics (total_tokens, total_words, unique_words, sentences)
+        """
+        words = text.split()
+        unique_words = {w.lower() for w in words if w.isalpha()}
+        sentences = text.count(".") + text.count("!") + text.count("?")
+
+        return {
+            "total_tokens": len(words),
+            "total_words": len([w for w in words if w.isalpha()]),
+            "unique_words": len(unique_words),
+            "sentences": max(1, sentences),
+        }
+
+    def extract_keywords(
+        self, text: str, top_n: int = 15, min_score: float = 0.0
+    ) -> dict[str, Any]:
+        """
+        Extract keywords using KeyBERT semantic analysis with code identifier emphasis and frequency weighting.
+
+        Weighting strategy:
+        - Semantic score × raw frequency (repetition increases score, document length doesn't matter)
+        - Full code identifiers (e.g., getUserData, snake_case): 10x weight
+        - Code split words (e.g., get, user, data): 3x weight
+
+        Args:
+            text: Input text to analyze
+            top_n: Number of top keywords to return
+            min_score: Minimum score threshold for keywords (filters out low-scoring terms)
+
+        Returns:
+            Dictionary with extracted keywords and analysis:
+            - top_keywords: List of (keyword, score) tuples, sorted by weighted score
+            - code_identifiers: Original identifiers (weighted 10x)
+            - code_split_words: Words extracted from identifiers (weighted 3x)
+            - stats: Basic text statistics
+        """
+        if not text or not text.strip():
+            return {
+                "top_keywords": [],
+                "code_identifiers": [],
+                "code_split_words": [],
+                "tf_scores": {},
+                "stats": {
+                    "total_tokens": 0,
+                    "total_words": 0,
+                    "unique_words": 0,
+                    "sentences": 0,
+                },
+            }
+
+        # 1. Extract code identifiers and their split words
+        code_identifiers, code_split_words = extract_code_identifiers(text)
+
+        # 2. Calculate term frequencies for all words (raw counts, not normalized)
+        tf_scores = self._calculate_term_frequencies(text)
+
+        # 3. Use KeyBERT to extract semantic keywords
+        try:
+            keybert_keywords: list[tuple[str, float]] = self.kw_model.extract_keywords(  # type: ignore[assignment]
+                text,
+                top_n=top_n * self.KEYBERT_CANDIDATE_MULTIPLIER,
+                keyphrase_ngram_range=(1, 1),  # Single words only
+            )
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: KeyBERT extraction failed: {e}", file=sys.stderr)
+            keybert_keywords = []
+
+        # 4. Build weighted keyword scores (semantic × frequency)
+        keyword_scores: dict[str, float] = {}
+
+        # Add KeyBERT keywords with semantic score × frequency
+        for keyword, semantic_score in keybert_keywords:
+            keyword_lower: str = keyword.lower()
+            freq = tf_scores.get(keyword_lower, 1)  # Default frequency of 1
+            keyword_scores[keyword_lower] = semantic_score * freq
+
+        # 5. Apply code identifier and split word boosting
+        self._apply_code_boosting(keyword_scores, code_identifiers, code_split_words, tf_scores)
+
+        # 6. Filter by minimum score threshold and sort by weighted score
+        filtered_scores = {k: v for k, v in keyword_scores.items() if v >= min_score}
+        top_keywords = sorted(filtered_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
+
+        # 7. Calculate basic statistics
+        stats = self._calculate_statistics(text)
+
+        return {
+            "top_keywords": top_keywords,
+            "code_identifiers": code_identifiers,
+            "code_split_words": code_split_words,
+            "tf_scores": tf_scores,
+            "stats": stats,
+        }
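
A minimal usage sketch of the new extractor follows; the input text and printed fields are illustrative only, and the call assumes the optional keybert dependency is installed. Nothing in this sketch is part of the shipped package.

    from cicada.extractors.keybert import KeyBERTExtractor

    extractor = KeyBERTExtractor(verbose=True)
    result = extractor.extract_keywords(
        "parse_user_data reads the user payload and returns a UserData struct",
        top_n=5,
    )
    # (keyword, weighted_score) pairs; exact identifiers such as parse_user_data
    # receive the 10x CODE_IDENTIFIER_BOOST described in the docstring above.
    print(result["top_keywords"])
    print(result["code_identifiers"])
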
cicada/extractors/keyword.py
ADDED
@@ -0,0 +1,191 @@
+import sys
+from collections import Counter
+from typing import Any
+
+from cicada.utils import extract_code_identifiers as util_extract_code_identifiers
+
+
+class BaseKeywordExtractor:
+    """Base class for keyword extraction."""
+
+    def __init__(self, verbose: bool = False):
+        self.verbose = verbose
+        self.STOPWORDS = {
+            "the",
+            "a",
+            "an",
+            "and",
+            "or",
+            "but",
+            "in",
+            "on",
+            "at",
+            "to",
+            "for",
+            "of",
+            "with",
+            "by",
+            "from",
+            "as",
+            "is",
+            "are",
+            "was",
+            "were",
+            "be",
+            "been",
+            "being",
+            "have",
+            "has",
+            "had",
+            "do",
+            "does",
+            "did",
+            "will",
+            "would",
+            "should",
+            "could",
+            "this",
+            "that",
+            "these",
+            "those",
+            "it",
+            "its",
+            "they",
+            "them",
+            "their",
+            "what",
+            "which",
+            "who",
+            "when",
+            "where",
+            "why",
+            "how",
+            "all",
+            "each",
+            "every",
+            "both",
+            "few",
+            "more",
+            "most",
+            "other",
+            "some",
+            "such",
+            "no",
+            "nor",
+            "not",
+            "only",
+            "own",
+            "same",
+            "so",
+            "than",
+            "too",
+            "very",
+            "can",
+            "just",
+            "up",
+            "out",
+        }
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Tokenize text into words."""
+        import re
+
+        return re.findall(r"\b[a-zA-Z][a-zA-Z0-9_]*\b", text)
+
+    def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
+        if not text or not text.strip():
+            return []
+        try:
+            results = self.extract_keywords(text, top_n=top_n)
+            return [keyword for keyword, _ in results["top_keywords"]]
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
+            return []
+
+    def _extract_keywords(
+        self,
+        words: list[str],
+        code_identifiers: list[str],
+        code_split_words: list[str],
+        top_n: int,
+        total_tokens: int,
+    ) -> tuple[list, dict, dict]:
+        """Extract keywords from a list of words.
+
+        Args:
+            words: Filtered words (after stopword removal)
+            code_identifiers: Extracted code identifiers
+            code_split_words: Words split from code identifiers
+            top_n: Number of top keywords to return
+            total_tokens: Total token count before filtering (for stats)
+        """
+        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
+        all_keywords = words + (code_identifiers_lower * 10) + (code_split_words * 3)
+        keyword_freq = Counter(all_keywords)
+        top_keywords = keyword_freq.most_common(top_n)
+
+        total_words = len(all_keywords)
+        if total_words > 0:
+            tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}
+        else:
+            tf_scores = {}
+
+        stats = {
+            "total_tokens": total_tokens,
+            "total_words": len(words),
+            "unique_words": len(set(words)),
+        }
+
+        return top_keywords, tf_scores, stats
+
+    def extract_keywords(
+        self, text: str, top_n: int = 15, min_score: float = 0.0
+    ) -> dict[str, Any]:
+        raise NotImplementedError
+
+
+class RegularKeywordExtractor(BaseKeywordExtractor):
+    """Extract keywords using basic term frequency (TF) without lemmatization."""
+
+    def extract_keywords(
+        self, text: str, top_n: int = 15, min_score: float = 0.0
+    ) -> dict[str, Any]:
+        if not text or not text.strip():
+            return {
+                "top_keywords": [],
+                "regular_words": [],
+                "code_identifiers": [],
+                "code_split_words": [],
+                "tf_scores": {},
+                "stats": {
+                    "total_tokens": 0,
+                    "total_words": 0,
+                    "unique_words": 0,
+                },
+            }
+
+        code_identifiers, code_split_words = util_extract_code_identifiers(text)
+        tokens = self._tokenize(text)
+        total_tokens = len(tokens)
+        regular_words = []
+        for word in tokens:
+            word_lower = word.lower()
+            if len(word) > 2 and word_lower not in self.STOPWORDS:
+                regular_words.append(word_lower)
+
+        top_keywords, tf_scores, stats = self._extract_keywords(
+            regular_words, code_identifiers, code_split_words, top_n, total_tokens
+        )
+
+        # Filter by minimum score threshold (min_score is a frequency count for RegularKeywordExtractor)
+        filtered_keywords = [(word, score) for word, score in top_keywords if score >= min_score]
+
+        return {
+            "top_keywords": filtered_keywords,
+            "regular_words": list(set(regular_words))[:20],
+            "code_identifiers": code_identifiers,
+            "code_split_words": code_split_words,
+            "tf_scores": dict(sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]),
+            "stats": stats,
+        }
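
The "* 10" and "* 3" list repetitions in _extract_keywords implement the frequency weighting: repeating a token before counting multiplies its raw term frequency by that factor. A standalone illustration with toy data (not taken from the package):

    from collections import Counter

    words = ["fetch", "user", "cache"]        # after stopword filtering
    code_identifiers = ["fetch_user"]         # weighted 10x
    code_split_words = ["fetch", "user"]      # weighted 3x

    all_keywords = words + (code_identifiers * 10) + (code_split_words * 3)
    print(Counter(all_keywords).most_common(3))
    # [('fetch_user', 10), ('fetch', 4), ('user', 4)]
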
cicada/extractors/module.py
CHANGED
@@ -2,6 +2,8 @@
 Module extraction logic.
 """
 
+from cicada.utils import extract_text_from_node
+
 from .base import extract_string_from_arguments
 
 
@@ -31,7 +33,7 @@ def _find_modules_recursive(node, source_code: bytes, modules: list):
 
         # Check if this is a defmodule call
        if target and arguments:
-            target_text = source_code[target.start_byte : target.end_byte].decode("utf-8")
+            target_text = extract_text_from_node(target, source_code)
 
            if target_text == "defmodule":
                # Extract module name from arguments
@@ -39,9 +41,7 @@ def _find_modules_recursive(node, source_code: bytes, modules: list):
 
                for arg_child in arguments.children:
                    if arg_child.type == "alias":
-                        module_name = source_code[arg_child.start_byte : arg_child.end_byte].decode(
-                            "utf-8"
-                        )
+                        module_name = extract_text_from_node(arg_child, source_code)
                        break
 
                if module_name and do_block:
@@ -82,9 +82,7 @@ def _find_moduledoc_recursive(node, source_code: bytes) -> str | None:
            # Check if this is a moduledoc attribute
            for call_child in operand.children:
                if call_child.type == "identifier":
-                    attr_name = source_code[call_child.start_byte : call_child.end_byte].decode(
-                        "utf-8"
-                    )
+                    attr_name = extract_text_from_node(call_child, source_code)
 
                    if attr_name == "moduledoc":
                        # Extract the documentation string from the arguments
@@ -102,9 +100,7 @@ def _find_moduledoc_recursive(node, source_code: bytes) -> str | None:
            is_defmodule = False
            for call_child in child.children:
                if call_child.type == "identifier":
-                    target_text = source_code[call_child.start_byte : call_child.end_byte].decode(
-                        "utf-8"
-                    )
+                    target_text = extract_text_from_node(call_child, source_code)
                    if target_text == "defmodule":
                        is_defmodule = True
                        break
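
Both hunks above swap the repeated byte-slice-and-decode pattern for the shared cicada.utils.extract_text_from_node helper. The helper itself is not shown in this diff; judging by the removed lines it is presumably equivalent to this sketch:

    # Presumed behaviour (assumption, not shown in this diff): decode the byte
    # span covered by a tree-sitter node back into a UTF-8 string.
    def extract_text_from_node(node, source_code: bytes) -> str:
        return source_code[node.start_byte : node.end_byte].decode("utf-8")
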
cicada/extractors/spec.py
CHANGED
@@ -2,62 +2,18 @@
 Type spec extraction logic.
 """
 
+from cicada.utils import extract_text_from_node
+
+from .common import _find_attribute_recursive
+
 
 def extract_specs(node, source_code: bytes) -> dict:
     """Extract all @spec attributes from a module body."""
     specs = {}
-
+    _find_attribute_recursive(node, source_code, specs, "spec", _parse_spec)
     return specs
 
 
-def _find_specs_recursive(node, source_code: bytes, specs: dict):
-    """Recursively find @spec declarations."""
-    # Look for unary_operator nodes (which represent @ attributes)
-    if node.type == "unary_operator":
-        operator = None
-        operand = None
-
-        for child in node.children:
-            if child.type == "@":
-                operator = child
-            elif child.type == "call":
-                operand = child
-
-        if operator and operand:
-            # Check if this is a spec attribute
-            for call_child in operand.children:
-                if call_child.type == "identifier":
-                    attr_name = source_code[call_child.start_byte : call_child.end_byte].decode(
-                        "utf-8"
-                    )
-
-                    if attr_name == "spec":
-                        # Extract the spec definition
-                        spec_info = _parse_spec(operand, source_code)
-                        if spec_info:
-                            key = f"{spec_info['name']}/{spec_info['arity']}"
-                            specs[key] = spec_info
-
-    # Recursively search children
-    for child in node.children:
-        # Don't recurse into nested defmodule or function definitions
-        if child.type == "call":
-            is_defmodule_or_def = False
-            for call_child in child.children:
-                if call_child.type == "identifier":
-                    target_text = source_code[call_child.start_byte : call_child.end_byte].decode(
-                        "utf-8"
-                    )
-                    if target_text in ["defmodule", "def", "defp"]:
-                        is_defmodule_or_def = True
-                        break
-
-            if is_defmodule_or_def:
-                continue
-
-        _find_specs_recursive(child, source_code, specs)
-
-
 def _parse_spec(spec_node, source_code: bytes) -> dict | None:
     """Parse a @spec attribute to extract function name, arity, parameter types, and return type."""
     # @spec is represented as: spec(function_signature)
@@ -81,9 +37,7 @@ def _parse_spec(spec_node, source_code: bytes) -> dict | None:
                found_call = True
            elif found_call and op_child.type not in ["::", "operator"]:
                # This is the return type node (after :: operator)
-                return_type = source_code[
-                    op_child.start_byte : op_child.end_byte
-                ].decode("utf-8")
+                return_type = extract_text_from_node(op_child, source_code)
 
    if func_call:
        func_name = None
@@ -91,9 +45,7 @@ def _parse_spec(spec_node, source_code: bytes) -> dict | None:
 
        for fc_child in func_call.children:
            if fc_child.type == "identifier":
-                func_name = source_code[
-                    fc_child.start_byte : fc_child.end_byte
-                ].decode("utf-8")
+                func_name = extract_text_from_node(fc_child, source_code)
            elif fc_child.type == "arguments":
                param_types = _extract_param_types(fc_child, source_code)
 
@@ -117,7 +69,7 @@ def _extract_param_types(params_node, source_code: bytes) -> list[str]:
            continue
 
        # Get the type as a string
-        type_str = source_code[child.start_byte : child.end_byte].decode("utf-8")
+        type_str = extract_text_from_node(child, source_code)
        param_types.append(type_str)
 
    return param_types
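
extract_specs now delegates traversal to _find_attribute_recursive from the new cicada/extractors/common.py, which is not shown in this diff. Based on the call site and the deleted _find_specs_recursive, the generic walker presumably looks roughly like the sketch below; key construction and the pruning of nested defmodule/def/defp bodies may well differ in the real helper.

    # Hypothetical sketch of the shared attribute walker, modelled on the deleted
    # _find_specs_recursive and parameterised on the attribute name and parser.
    def _find_attribute_recursive(node, source_code: bytes, results: dict, attr_name: str, parse_fn):
        if node.type == "unary_operator":
            operand = next((c for c in node.children if c.type == "call"), None)
            if operand is not None:
                for call_child in operand.children:
                    if call_child.type == "identifier":
                        if extract_text_from_node(call_child, source_code) == attr_name:
                            info = parse_fn(operand, source_code)
                            if info:
                                results[f"{info['name']}/{info['arity']}"] = info
        for child in node.children:
            _find_attribute_recursive(child, source_code, results, attr_name, parse_fn)
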
cicada/format/__init__.py
ADDED
@@ -0,0 +1,20 @@
+"""Format module for Cicada - handles formatting, colors, and ASCII art."""
+
+from .ascii_art import generate_gradient_ascii_art
+from .colors import BOLD, CYAN, GREEN, GREY, PRIMARY, RESET, SELECTED, YELLOW
+from .formatter import JSONFormatter, ModuleFormatter, main
+
+__all__ = [
+    "generate_gradient_ascii_art",
+    "BOLD",
+    "CYAN",
+    "GREEN",
+    "GREY",
+    "PRIMARY",
+    "RESET",
+    "SELECTED",
+    "YELLOW",
+    "ModuleFormatter",
+    "JSONFormatter",
+    "main",
+]
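
With this __init__, call sites can import the formatter and colour constants from the cicada.format package instead of the removed top-level cicada/formatter.py and cicada/colors.py modules. A hypothetical import, based only on the names re-exported above:

    from cicada.format import GREEN, RESET, ModuleFormatter

    print(f"{GREEN}using{RESET} {ModuleFormatter.__name__}")
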