cicada-mcp 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. cicada/_version_hash.py +4 -0
  2. cicada/cli.py +6 -748
  3. cicada/commands.py +1255 -0
  4. cicada/dead_code/__init__.py +1 -0
  5. cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
  6. cicada/dependency_analyzer.py +147 -0
  7. cicada/entry_utils.py +92 -0
  8. cicada/extractors/base.py +9 -9
  9. cicada/extractors/call.py +17 -20
  10. cicada/extractors/common.py +64 -0
  11. cicada/extractors/dependency.py +117 -235
  12. cicada/extractors/doc.py +2 -49
  13. cicada/extractors/function.py +10 -14
  14. cicada/extractors/keybert.py +228 -0
  15. cicada/extractors/keyword.py +191 -0
  16. cicada/extractors/module.py +6 -10
  17. cicada/extractors/spec.py +8 -56
  18. cicada/format/__init__.py +20 -0
  19. cicada/{ascii_art.py → format/ascii_art.py} +1 -1
  20. cicada/format/formatter.py +1145 -0
  21. cicada/git_helper.py +134 -7
  22. cicada/indexer.py +322 -89
  23. cicada/interactive_setup.py +251 -323
  24. cicada/interactive_setup_helpers.py +302 -0
  25. cicada/keyword_expander.py +437 -0
  26. cicada/keyword_search.py +208 -422
  27. cicada/keyword_test.py +383 -16
  28. cicada/mcp/__init__.py +10 -0
  29. cicada/mcp/entry.py +17 -0
  30. cicada/mcp/filter_utils.py +107 -0
  31. cicada/mcp/pattern_utils.py +118 -0
  32. cicada/{mcp_server.py → mcp/server.py} +819 -73
  33. cicada/mcp/tools.py +473 -0
  34. cicada/pr_finder.py +2 -3
  35. cicada/pr_indexer/indexer.py +3 -2
  36. cicada/setup.py +167 -35
  37. cicada/tier.py +225 -0
  38. cicada/utils/__init__.py +9 -2
  39. cicada/utils/fuzzy_match.py +54 -0
  40. cicada/utils/index_utils.py +9 -0
  41. cicada/utils/path_utils.py +18 -0
  42. cicada/utils/text_utils.py +52 -1
  43. cicada/utils/tree_utils.py +47 -0
  44. cicada/version_check.py +99 -0
  45. cicada/watch_manager.py +320 -0
  46. cicada/watcher.py +431 -0
  47. cicada_mcp-0.3.0.dist-info/METADATA +541 -0
  48. cicada_mcp-0.3.0.dist-info/RECORD +70 -0
  49. cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
  50. cicada/formatter.py +0 -864
  51. cicada/keybert_extractor.py +0 -286
  52. cicada/lightweight_keyword_extractor.py +0 -290
  53. cicada/mcp_entry.py +0 -683
  54. cicada/mcp_tools.py +0 -291
  55. cicada_mcp-0.2.0.dist-info/METADATA +0 -735
  56. cicada_mcp-0.2.0.dist-info/RECORD +0 -53
  57. cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
  58. cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
  59. cicada/{colors.py → format/colors.py} +0 -0
  60. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
  61. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
  62. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
@@ -1,286 +0,0 @@
- """
- Keyword Extraction using KeyBERT
- Semantic keyword extraction using transformer-based embeddings
- """
-
- import re
- import sys
- from typing import Any
-
- from cicada.utils import split_camel_snake_case
-
-
- class KeyBERTExtractor:
-     """Extract keywords from text using KeyBERT semantic analysis."""
-
-     # Model configurations for different performance tiers
-     KEYBERT_MODELS = {
-         "fast": "all-MiniLM-L6-v2",  # 80MB, fast extraction
-         "regular": "BAAI/bge-small-en-v1.5",  # 133MB, balanced
-         "max": "paraphrase-mpnet-base-v2",  # 420MB, highest quality
-     }
-
-     # Weighting strategy constants for keyword extraction
-     # These control how different types of keywords are prioritized
-     KEYBERT_CANDIDATE_MULTIPLIER = 3  # Extract 3x keywords for weighted reranking
-     CODE_IDENTIFIER_BOOST = 10  # 10x weight for exact code identifiers (e.g., function names)
-     CODE_SPLIT_WORD_BOOST = 3  # 3x weight for identifier components (e.g., "user" from "getUserId")
-     BASE_SCORE_IDENTIFIER = 0.5  # Base score for identifiers not found by BERT
-     BASE_SCORE_SPLIT_WORD = 0.3  # Base score for split words not found by BERT
-
-     # Class variable to hold KeyBERT class (lazily loaded)
-     _KeyBERT: type | None = None
-
-     def __init__(self, verbose: bool = False, model_tier: str | None = None):
-         """
-         Initialize KeyBERT model.
-
-         Args:
-             verbose: If True, print status messages during initialization
-             model_tier: Model tier to use ('fast', 'regular', or 'max').
-                 If None, must be specified via config file.
-
-         Raises:
-             ImportError: If KeyBERT is not installed
-             ValueError: If model_tier is invalid or not specified
-             RuntimeError: If model loading fails
-         """
-         self.verbose = verbose
-
-         # Validate model tier first
-         if model_tier and model_tier not in self.KEYBERT_MODELS:
-             raise ValueError(
-                 f"Invalid model tier '{model_tier}'. "
-                 f"Must be one of: {', '.join(self.KEYBERT_MODELS.keys())}"
-             )
-
-         if model_tier is None:
-             raise ValueError(
-                 "model_tier must be specified. Pass it directly or load from config file."
-             )
-
-         self.model_tier = model_tier
-         self.model_name = self.KEYBERT_MODELS[model_tier]
-
-         # Print message BEFORE the slow import
-         if self.verbose:
-             print(
-                 f"Loading KeyBERT model ({model_tier}: {self.model_name})",
-                 file=sys.stderr,
-             )
-             print("This can take up to a couple of minutes.", file=sys.stderr)
-
-         # Lazy import KeyBERT (only once per class)
-         # This import can take significant time on first load
-         if KeyBERTExtractor._KeyBERT is None:
-             try:
-                 from keybert import KeyBERT
-
-                 KeyBERTExtractor._KeyBERT = KeyBERT
-             except ImportError as e:
-                 raise ImportError(
-                     "KeyBERT is not installed. Install it with:\n"
-                     "  uv add keybert\n"
-                     "or\n"
-                     "  pip install keybert"
-                 ) from e
-
-         # Initialize KeyBERT with the selected model
-         # Assume model is pre-downloaded (user will handle caching separately)
-         try:
-             self.kw_model = KeyBERTExtractor._KeyBERT(model=self.model_name)
-             if self.verbose:
-                 print("✓ Model loaded successfully", file=sys.stderr)
-         except Exception as e:
-             raise RuntimeError(
-                 f"Failed to load KeyBERT model '{self.model_name}'. "
-                 f"Ensure the model is downloaded and available. Error: {e}"
-             ) from e
-
-     def extract_code_identifiers(self, text: str) -> tuple[list[str], list[str]]:
-         """
-         Extract code-specific identifiers and their split words.
-
-         Returns a tuple of (identifiers, split_words) where:
-         - identifiers: original camelCase/PascalCase/snake_case identifiers
-         - split_words: individual words extracted from those identifiers
-         """
-         # Match camelCase, snake_case, PascalCase, and mixed patterns
-         patterns = [
-             r"\b[a-z]+[A-Z][a-zA-Z]*\b",  # camelCase (e.g., getUserData)
-             r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b",  # Uppercase prefix + PascalCase (e.g., HTTPServer, XMLParser)
-             r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b",  # PascalCase (e.g., UserController, PostgreSQL)
-             r"\b[a-z]+_[a-z_]+\b",  # snake_case (e.g., get_user_data)
-             r"\b[A-Z]{2,}\b",  # All UPPERCASE (e.g., HTTP, API, SQL)
-         ]
-
-         identifiers = []
-         for pattern in patterns:
-             matches = re.findall(pattern, text)
-             identifiers.extend(matches)
-
-         identifiers = list(set(identifiers))
-
-         # Split identifiers into individual words
-         split_words = []
-         for identifier in identifiers:
-             split_text = split_camel_snake_case(identifier)
-             # Extract individual words (lowercase, length > 1)
-             words = [
-                 word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
-             ]
-             split_words.extend(words)
-
-         return identifiers, list(set(split_words))
-
-     def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
-         """
-         Extract keywords and return a simple list of keyword strings.
-
-         Args:
-             text: Input text to analyze
-             top_n: Number of top keywords to return
-
-         Returns:
-             List of keyword strings (e.g., ['authentication', 'user', 'validate'])
-         """
-         if not text or not text.strip():
-             return []
-
-         try:
-             results = self.extract_keywords(text, top_n=top_n)
-             # Extract just the keyword strings from top_keywords tuples
-             return [keyword for keyword, _ in results["top_keywords"]]
-         except Exception as e:
-             if self.verbose:
-                 print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
-             return []
-
-     def extract_keywords(self, text: str, top_n: int = 15) -> dict[str, Any]:
-         """
-         Extract keywords using KeyBERT semantic analysis with code identifier emphasis.
-
-         Weighting strategy:
-         - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
-         - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
-         - KeyBERT semantic keywords: Base score from embedding similarity
-
-         Magic numbers explained:
-         - 3x multiplier: For candidate selection (top_n * 3) to have enough keywords
-           before applying weights. This ensures we don't miss important keywords that
-           might rank higher after code identifier boosting.
-         - 0.5 base score: Default confidence for code identifiers not found by KeyBERT.
-           After 10x boost, gives them a score of 5.0, prioritizing them over most
-           regular keywords.
-         - 0.3 base score: Default confidence for code split words not found by KeyBERT.
-           After 3x boost, gives them a score of 0.9, placing them between regular
-           keywords (0.4-0.7) and full identifiers (5.0).
-
-         Args:
-             text: Input text to analyze
-             top_n: Number of top keywords to return
-
-         Returns:
-             Dictionary with extracted keywords and analysis:
-             - top_keywords: List of (keyword, score) tuples, sorted by weighted score
-             - code_identifiers: Original identifiers (weighted 10x)
-             - code_split_words: Words extracted from identifiers (weighted 3x)
-             - noun_chunks: 2-word phrases from KeyBERT (if any)
-             - Other fields (nouns, verbs, etc.) are empty (KeyBERT doesn't do POS tagging)
-             - stats: Basic text statistics
-         """
-         if not text or not text.strip():
-             return {
-                 "top_keywords": [],
-                 "nouns": [],
-                 "verbs": [],
-                 "adjectives": [],
-                 "proper_nouns": [],
-                 "noun_chunks": [],
-                 "entities": [],
-                 "code_identifiers": [],
-                 "code_split_words": [],
-                 "tf_scores": {},
-                 "stats": {
-                     "total_tokens": 0,
-                     "total_words": 0,
-                     "unique_words": 0,
-                     "sentences": 0,
-                 },
-             }
-
-         # 1. Extract code identifiers and their split words
-         code_identifiers, code_split_words = self.extract_code_identifiers(text)
-
-         # 2. Use KeyBERT to extract semantic keywords
-         # Extract more than needed to have candidates for weighting
-         try:
-             # KeyBERT return type can vary, use type ignore for external library
-             keybert_keywords: list[tuple[str, float]] = self.kw_model.extract_keywords(  # type: ignore[assignment]
-                 text,
-                 top_n=top_n * self.KEYBERT_CANDIDATE_MULTIPLIER,
-                 keyphrase_ngram_range=(1, 1),  # Single words only
-             )
-         except Exception as e:
-             if self.verbose:
-                 print(f"Warning: KeyBERT extraction failed: {e}", file=sys.stderr)
-             keybert_keywords = []
-
-         # 3. Build weighted keyword scores
-         keyword_scores: dict[str, float] = {}
-
-         # Add KeyBERT keywords with their semantic similarity scores
-         for keyword, score in keybert_keywords:
-             keyword_lower: str = keyword.lower()
-             keyword_scores[keyword_lower] = score
-
-         # 4. Apply code identifier boosting
-         # Code identifiers get strong boost as they're likely important API/function names
-         code_identifiers_lower = [ident.lower() for ident in code_identifiers]
-         for identifier in code_identifiers_lower:
-             if identifier in keyword_scores:
-                 keyword_scores[identifier] *= self.CODE_IDENTIFIER_BOOST
-             else:
-                 # Add with high base score if not found by KeyBERT
-                 keyword_scores[identifier] = self.BASE_SCORE_IDENTIFIER * self.CODE_IDENTIFIER_BOOST
-
-         # 5. Apply split word boosting (lower than full identifiers)
-         # Split words are components of identifiers, somewhat important but less than full names
-         code_split_words_lower = [word.lower() for word in code_split_words]
-         for word in code_split_words_lower:
-             if word in keyword_scores:
-                 keyword_scores[word] *= self.CODE_SPLIT_WORD_BOOST
-             else:
-                 keyword_scores[word] = self.BASE_SCORE_SPLIT_WORD * self.CODE_SPLIT_WORD_BOOST
-
-         # 6. Sort by weighted score and take top_n
-         top_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
-
-         # 7. No noun_chunks since we're using single words only
-         noun_chunks = []
-
-         # 8. Calculate basic statistics
-         words = text.split()
-         unique_words = {w.lower() for w in words if w.isalpha()}
-         sentences = text.count(".") + text.count("!") + text.count("?")
-
-         stats = {
-             "total_tokens": len(words),
-             "total_words": len([w for w in words if w.isalpha()]),
-             "unique_words": len(unique_words),
-             "sentences": max(1, sentences),  # At least 1 sentence
-         }
-
-         return {
-             "top_keywords": top_keywords,
-             "nouns": [],  # KeyBERT doesn't extract POS tags
-             "verbs": [],
-             "adjectives": [],
-             "proper_nouns": [],
-             "noun_chunks": noun_chunks,
-             "entities": [],  # KeyBERT doesn't do NER
-             "code_identifiers": code_identifiers,
-             "code_split_words": code_split_words,
-             "tf_scores": {},  # Using semantic scores instead
-             "stats": stats,
-         }
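
For reference, a minimal sketch of how the deleted cicada/keybert_extractor.py module was driven. The class, constructor parameters, method names, and the 5.0 identifier score come from the code above; the sample text and tier choice are illustrative assumptions. In 0.3.0 this module appears to be superseded by cicada/extractors/keybert.py (item 14 in the file list).

    from cicada.keybert_extractor import KeyBERTExtractor  # module removed in 0.3.0

    # Requires the optional keybert dependency and a pre-downloaded model.
    extractor = KeyBERTExtractor(verbose=True, model_tier="fast")

    # Full analysis: weighted (keyword, score) pairs plus identifier metadata.
    analysis = extractor.extract_keywords("getUserData validates the user session", top_n=5)
    print(analysis["top_keywords"])       # identifiers rank first, e.g. ('getuserdata', 5.0)
    print(analysis["code_identifiers"])   # ['getUserData']

    # Convenience wrapper returning only the keyword strings.
    print(extractor.extract_keywords_simple("getUserData validates the user session"))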
@@ -1,290 +0,0 @@
- """
- Lightweight Keyword Extraction using lemminflect
- Fast keyword extraction for programming documentation
- """
-
- import re
- import sys
- import warnings
- from collections import Counter
-
- from cicada.utils import split_camel_snake_case
-
-
- class LightweightKeywordExtractor:
-     """Extract keywords from text using lightweight lemmatization."""
-
-     STOPWORDS = {
-         "the",
-         "a",
-         "an",
-         "and",
-         "or",
-         "but",
-         "in",
-         "on",
-         "at",
-         "to",
-         "for",
-         "of",
-         "with",
-         "by",
-         "from",
-         "as",
-         "is",
-         "are",
-         "was",
-         "were",
-         "be",
-         "been",
-         "being",
-         "have",
-         "has",
-         "had",
-         "do",
-         "does",
-         "did",
-         "will",
-         "would",
-         "should",
-         "could",
-         "this",
-         "that",
-         "these",
-         "those",
-         "it",
-         "its",
-         "they",
-         "them",
-         "their",
-         "what",
-         "which",
-         "who",
-         "when",
-         "where",
-         "why",
-         "how",
-         "all",
-         "each",
-         "every",
-         "both",
-         "few",
-         "more",
-         "most",
-         "other",
-         "some",
-         "such",
-         "no",
-         "nor",
-         "not",
-         "only",
-         "own",
-         "same",
-         "so",
-         "than",
-         "too",
-         "very",
-         "can",
-         "just",
-         "up",
-         "out",
-     }
-
-     # Pre-compiled regex patterns for code identifier extraction
-     CODE_PATTERNS = [
-         re.compile(r"\b[a-z]+[A-Z][a-zA-Z]*\b"),  # camelCase
-         re.compile(r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b"),  # HTTPServer
-         re.compile(r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b"),  # PascalCase
-         re.compile(r"\b[a-z]+_[a-z_]+\b"),  # snake_case
-         re.compile(r"\b[A-Z]{2,}\b"),  # UPPERCASE
-     ]
-
-     # Pre-compiled tokenization pattern
-     TOKEN_PATTERN = re.compile(r"\b[a-zA-Z][a-zA-Z0-9_]*\b")
-
-     def __init__(self, verbose: bool = False, model_size: str = "small"):
-         """
-         Initialize lightweight keyword extractor.
-
-         Args:
-             verbose: If True, print status messages during initialization
-             model_size: Deprecated parameter kept for backward compatibility.
-                 This parameter is ignored in the lightweight extractor.
-         """
-         self.verbose = verbose
-         self.model_size = model_size
-         self._lemminflect_loaded = False
-
-         # Deprecation warning for model_size parameter
-         if model_size != "small":
-             warnings.warn(
-                 "The 'model_size' parameter is deprecated and ignored in LightweightKeywordExtractor. "
-                 "The lightweight extractor does not use size-based models.",
-                 DeprecationWarning,
-                 stacklevel=2,
-             )
-
-     def _load_lemminflect(self):
-         """Lazy load lemminflect library."""
-         if self._lemminflect_loaded:
-             return
-         try:
-             import lemminflect
-
-             self._lemminflect = lemminflect
-             self._lemminflect_loaded = True
-             if self.verbose:
-                 print("✓ lemminflect loaded", file=sys.stderr)
-         except ImportError as e:
-             raise RuntimeError(
-                 "lemminflect is required but not installed. "
-                 "Please install it with: uv pip install lemminflect"
-             ) from e
-
-     def _tokenize(self, text: str) -> list[str]:
-         """Tokenize text into words."""
-         tokens = self.TOKEN_PATTERN.findall(text)
-         return tokens
-
-     def _lemmatize(self, word: str) -> str:
-         """
-         Lemmatize a word using lemminflect with fallback.
-
-         Tries lemmatization with VERB, NOUN, and ADJ POS tags.
-         Falls back to lowercase if lemmatization fails.
-
-         Args:
-             word: Word to lemmatize
-
-         Returns:
-             Lemmatized word (lowercase)
-         """
-         try:
-             # Try different POS tags for better coverage
-             for pos in ["VERB", "NOUN", "ADJ"]:
-                 lemma = self._lemminflect.getLemma(word, upos=pos)
-                 if lemma:
-                     return lemma[0].lower()
-             # Fallback to lowercase if no lemma found
-             return word.lower()
-         except Exception:
-             # Graceful fallback if lemminflect fails
-             return word.lower()
-
-     def extract_code_identifiers(self, text):
-         """
-         Extract code-specific identifiers and their split words.
-
-         Returns a tuple of (identifiers, split_words) where:
-         - identifiers: original camelCase/PascalCase/snake_case identifiers
-         - split_words: individual words extracted from those identifiers
-         """
-         identifiers = []
-         for pattern in self.CODE_PATTERNS:
-             matches = pattern.findall(text)
-             identifiers.extend(matches)
-         identifiers = list(set(identifiers))
-
-         split_words = []
-         for identifier in identifiers:
-             split_text = split_camel_snake_case(identifier)
-             words = [
-                 word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
-             ]
-             split_words.extend(words)
-         return identifiers, list(set(split_words))
-
-     def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
-         """
-         Extract keywords and return a simple list of keyword strings.
-
-         Args:
-             text: Input text to analyze
-             top_n: Number of top keywords to return
-
-         Returns:
-             List of keyword strings (e.g., ['authentication', 'user', 'validate'])
-         """
-         if not text or not text.strip():
-             return []
-         try:
-             results = self.extract_keywords(text, top_n=top_n)
-             return [keyword for keyword, _ in results["top_keywords"]]
-         except Exception as e:
-             if self.verbose:
-                 print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
-             return []
-
-     def extract_keywords(self, text, top_n=15):
-         """
-         Extract keywords using multiple strategies with emphasis on code identifiers.
-
-         Weighting strategy:
-         - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
-         - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
-         - Regular lemmatized words: 1x weight
-
-         Args:
-             text: Input text to analyze
-             top_n: Number of top keywords to return
-
-         Returns:
-             Dictionary with extracted keywords and analysis:
-             - top_keywords: List of (keyword, count) tuples, sorted by frequency
-             - lemmatized_words: Regular words after lemmatization
-             - code_identifiers: Original identifiers (weighted 10x)
-             - code_split_words: Words extracted from identifiers (weighted 3x)
-             - tf_scores: Term frequency scores
-             - stats: Text statistics
-         """
-         if not text or not text.strip():
-             return {
-                 "top_keywords": [],
-                 "lemmatized_words": [],
-                 "code_identifiers": [],
-                 "code_split_words": [],
-                 "tf_scores": {},
-                 "stats": {
-                     "total_tokens": 0,
-                     "total_words": 0,
-                     "unique_words": 0,
-                 },
-             }
-
-         self._load_lemminflect()
-         code_identifiers, code_split_words = self.extract_code_identifiers(text)
-         tokens = self._tokenize(text)
-         lemmatized_words = []
-         for word in tokens:
-             word_lower = word.lower()
-             if len(word) > 2 and word_lower not in self.STOPWORDS:
-                 lemma = self._lemmatize(word)
-                 lemmatized_words.append(lemma)
-
-         code_identifiers_lower = [ident.lower() for ident in code_identifiers]
-         all_keywords = lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
-         keyword_freq = Counter(all_keywords)
-         top_keywords = keyword_freq.most_common(top_n)
-
-         # Fix: Calculate TF scores based on all keywords, not just lemmatized_words
-         # This ensures weighted keywords are included in the calculation
-         total_words = len(all_keywords)
-         if total_words > 0:
-             tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}
-         else:
-             tf_scores = {}
-
-         stats = {
-             "total_tokens": len(tokens),
-             "total_words": len(lemmatized_words),
-             "unique_words": len(set(lemmatized_words)),
-         }
-
-         return {
-             "top_keywords": top_keywords,
-             "lemmatized_words": list(set(lemmatized_words))[:20],
-             "code_identifiers": code_identifiers,
-             "code_split_words": code_split_words,
-             "tf_scores": dict(sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]),
-             "stats": stats,
-         }
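
Likewise, a minimal sketch of the deleted cicada/lightweight_keyword_extractor.py in use. Signatures come from the code above; the sample input is an illustrative assumption. In 0.3.0 this path appears to be superseded by cicada/extractors/keyword.py (item 15 in the file list).

    from cicada.lightweight_keyword_extractor import LightweightKeywordExtractor  # removed in 0.3.0

    # lemminflect is required, but it is only imported lazily on first extraction.
    extractor = LightweightKeywordExtractor(verbose=False)

    result = extractor.extract_keywords("parse_config reads the config file", top_n=5)
    print(result["top_keywords"])       # frequency-weighted; full identifiers dominate (10x repeats)
    print(result["code_identifiers"])   # ['parse_config']
    print(result["tf_scores"])          # top-10 term-frequency scores over the weighted pool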