cicada-mcp 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cicada-mcp might be problematic. Click here for more details.

cicada/install.py CHANGED
@@ -251,7 +251,15 @@ def detect_installation_method():
251
251
  ".local/share/uv/tools" in script_path_str
252
252
  or ".local/bin/cicada-" in script_path_str
253
253
  ):
254
- # Installed via uv tool install
254
+ # Installed via uv tool install - check for cicada-mcp first
255
+ if shutil.which("cicada-mcp"):
256
+ return (
257
+ "cicada-mcp",
258
+ [],
259
+ None,
260
+ "uv tool install (ensure ~/.local/bin is in PATH)",
261
+ )
262
+ # Fall back to cicada-server for backwards compatibility
255
263
  return (
256
264
  "cicada-server",
257
265
  [],
@@ -259,7 +267,11 @@ def detect_installation_method():
259
267
  "uv tool install (ensure ~/.local/bin is in PATH)",
260
268
  )
261
269
 
262
- # Check if cicada-server is in PATH (from uv tool install)
270
+ # Check if cicada-mcp is in PATH first (from uv tool install)
271
+ if shutil.which("cicada-mcp"):
272
+ return ("cicada-mcp", [], None, "uv tool install (permanent, fast)")
273
+
274
+ # Fall back to cicada-server for backwards compatibility
263
275
  if shutil.which("cicada-server"):
264
276
  return ("cicada-server", [], None, "uv tool install (permanent, fast)")
265
277
 
@@ -279,8 +291,13 @@ def check_tools_in_path():
279
291
  """Check if cicada tools are in PATH."""
280
292
  import shutil
281
293
 
282
- tools = ["cicada-server", "cicada-index"]
294
+ # Check for cicada-mcp (new) or cicada-server (backwards compat)
295
+ has_mcp_server = shutil.which("cicada-mcp") or shutil.which("cicada-server")
296
+ tools = ["cicada-index"]
283
297
  visible_tools = [tool for tool in tools if shutil.which(tool)]
298
+ if has_mcp_server:
299
+ visible_tools.insert(0, "cicada-mcp/cicada-server")
300
+ tools.insert(0, "cicada-mcp/cicada-server")
284
301
 
285
302
  if len(visible_tools) == len(tools):
286
303
  return "all_visible"
@@ -351,8 +368,8 @@ def create_mcp_config(repo_path, _cicada_dir, _python_bin):
351
368
  print(f"✓ MCP configuration updated at {mcp_config_path}")
352
369
 
353
370
  # Show what was configured
354
- if command == "cicada-server":
355
- print("✅ Using 'cicada-server' command (fast, no paths needed)")
371
+ if command in ("cicada-mcp", "cicada-server"):
372
+ print(f"✅ Using '{command}' command (fast, no paths needed)")
356
373
  else:
357
374
  print(f"ℹ️ Using Python: {command}")
358
375
 
cicada/setup.py CHANGED
@@ -102,9 +102,15 @@ def get_mcp_config_for_editor(
102
102
  # Detect installation method
103
103
  import shutil
104
104
 
105
+ # Check for cicada-mcp first (new name), fall back to cicada-server (backwards compat)
106
+ has_cicada_mcp = shutil.which("cicada-mcp") is not None
105
107
  has_cicada_server = shutil.which("cicada-server") is not None
106
108
 
107
- if has_cicada_server:
109
+ if has_cicada_mcp:
110
+ command = "cicada-mcp"
111
+ args = []
112
+ cwd = None
113
+ elif has_cicada_server:
108
114
  command = "cicada-server"
109
115
  args = []
110
116
  cwd = None
@@ -275,7 +281,8 @@ def setup(editor: EditorType, repo_path: Path | None = None) -> None:
275
281
  import shutil
276
282
  from cicada import __version__
277
283
 
278
- if not shutil.which("cicada-server"):
284
+ # Check for either cicada-mcp or cicada-server (backwards compat)
285
+ if not (shutil.which("cicada-mcp") or shutil.which("cicada-server")):
279
286
  print("💡 Tip: For best experience, install Cicada permanently:")
280
287
  print(
281
288
  f" uv tool install git+https://github.com/wende/cicada.git@v{__version__}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cicada-mcp
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: An Elixir module search MCP server
5
5
  Author-email: wende <wende@hey.com>
6
6
  Maintainer-email: wende <wende@hey.com>
@@ -138,7 +138,7 @@ cicada claude # or: cicada cursor, cicada vs
138
138
 
139
139
  **Available commands after installation:**
140
140
  - `cicada [claude|cursor|vs]` - One-command setup per project
141
- - `cicada-server` - MCP server (auto-started by editor)
141
+ - `cicada-mcp` - MCP server (auto-started by editor)
142
142
  - `cicada-index` - Re-index code with custom options (medium/large spaCy models)
143
143
  - `cicada-index-pr` - Index pull requests for PR attribution
144
144
  - `cicada-install` - Legacy setup (creates `.cicada/` in repo)
@@ -169,6 +169,33 @@ uvx --from git+https://github.com/wende/cicada.git@latest cicada vs
169
169
 
170
170
  Once you're convinced, install permanently with `uv tool install` above!
171
171
 
172
+ ### Quick Setup for Cursor and Claude Code
173
+
174
+ **For Cursor:**
175
+
176
+ Click the install button at the top of this README or visit:
177
+ [![Install MCP Server](https://cursor.com/deeplink/mcp-install-dark.svg)](https://cursor.com/en-US/install-mcp?name=cicada&config=eyJjb21tYW5kIjoidXZ4IC0tZnJvbSBnaXQraHR0cHM6Ly9naXRodWIuY29tL3dlbmRlL2NpY2FkYS5naXRAbGF0ZXN0IGNpY2FkYS1zZXJ2ZXIgLiJ9)
178
+
179
+ **For Claude Code:**
180
+
181
+ ```bash
182
+ # Option 1: Using claude mcp add command
183
+ claude mcp add cicada -- uvx --from git+https://github.com/wende/cicada.git@latest cicada-mcp ./path/to/your/codebase
184
+
185
+ # Option 2: Using setup script
186
+ uvx --from git+https://github.com/wende/cicada.git@latest cicada claude
187
+ ```
188
+
189
+ **Then for both editors,** run these commands in your codebase to generate keyword lookup and GitHub PR lookup databases:
190
+
191
+ ```bash
192
+ # Generate keyword lookup database
193
+ uvx --from git+https://github.com/wende/cicada.git@latest cicada-index .
194
+
195
+ # Generate GitHub PR lookup database
196
+ uvx --from git+https://github.com/wende/cicada.git@latest cicada-index-pr .
197
+ ```
198
+
172
199
  ---
173
200
 
174
201
  ## Quick Start
@@ -221,7 +248,7 @@ your-project/
221
248
  {
222
249
  "mcpServers": {
223
250
  "cicada": {
224
- "command": "cicada-server",
251
+ "command": "cicada-mcp",
225
252
  "env": {
226
253
  "CICADA_REPO_PATH": "/path/to/project",
227
254
  "CICADA_CONFIG_DIR": "/home/user/.cicada/projects/<hash>"
@@ -6,15 +6,14 @@ cicada/find_dead_code.py,sha256=xCheicrNbYhLvrPGgqVJJBbf_rAm_gXwnfONDWPnNI0,8288
6
6
  cicada/formatter.py,sha256=wwxD1nt1ub7HDeDRGc61JhpmgleNVlp0SfQG9QBgGns,36194
7
7
  cicada/git_helper.py,sha256=zhyqSfk90tCwndWYxhh-LxFmqqXB1Wki91uDkZRr7Js,24303
8
8
  cicada/indexer.py,sha256=gVj6Jwc-sZgcGZnueqpRqcn4Wu451qo6RVfGuQahaZ4,25249
9
- cicada/install.py,sha256=mM8hj1_45CkXUFbJd8ve8dqYyIzNY1HhNbVKbseiJ4s,23214
10
- cicada/keyword_extractor.py,sha256=9oEEU3cwv5prsWYn1P-nNFayArQeXgCFNzx4iaq1qhg,13425
9
+ cicada/install.py,sha256=VU7OI031cM0S-Y7udXVRFs2hluQRI9S6tIm3XBLJL2w,23980
11
10
  cicada/keyword_search.py,sha256=pj5zSsYKX-pOeWyGI53ZRAZm91BnrEMHofGNoenoIqQ,21746
12
11
  cicada/lightweight_keyword_extractor.py,sha256=KtxcOjLPuoY6EjcWNvHvoZswcg9IoryMfG4EM3_LDMg,9172
13
12
  cicada/mcp_server.py,sha256=k_JnwQExgQ-dTAA-MfPTl8G02B9MEmVZJb8fAc_UnPY,60299
14
13
  cicada/mcp_tools.py,sha256=LHNyrpztmY0yk1Ysu3_I-ZE7KmngJJ0ukKd-1OJpenA,13805
15
14
  cicada/parser.py,sha256=uQlzYnQQicUWU-yF9LgvqDK-83xImzGlZOkjPoov8_I,4022
16
15
  cicada/pr_finder.py,sha256=FPSaGe5W4RwPi93VmyoIWcUZIaHLZdHsT7s_WCIvHBM,14214
17
- cicada/setup.py,sha256=n9hFlK4LmPG7ivCvnburXvD-7sWwZjCvz6sdWRD_d_0,9166
16
+ cicada/setup.py,sha256=23TRe2dRQNG2XsTobwN4jpfQ6aOTRXp_cRp1zHqv6CI,9512
18
17
  cicada/version_check.py,sha256=c8BFl--ohKfLZYe_3tX40rKXydTR6FVGWiseGuIvcBk,3181
19
18
  cicada/extractors/__init__.py,sha256=Dnm_jjWMGPvaGmt1aZqcgpS964tak4hys5BFOjbCcg8,890
20
19
  cicada/extractors/base.py,sha256=reenF-Cngpg1LgueWsddYzGcmtHElSuNv1F5OlZRFpI,2487
@@ -40,9 +39,9 @@ cicada/utils/signature_builder.py,sha256=O76JfypSESNncQ_OppCAR7aUDz4ocBNPXEmI9uh
40
39
  cicada/utils/storage.py,sha256=wbw_Ma77v4uevDGTQP06Eu4m5V8IU6GkKUARYWXgj1A,2578
41
40
  cicada/utils/subprocess_runner.py,sha256=fibqu_YCCmQPvtTwaDkGkVyhGVSQ6oX235pBYavQW5M,5168
42
41
  cicada/utils/text_utils.py,sha256=_lt_65BcAVZa36QrTY84GR8v5m5oxvfPY3tr6PoNaxw,2923
43
- cicada_mcp-0.1.4.dist-info/licenses/LICENSE,sha256=ijMI5EAN1o3jl676-BOu0ELzlsBr2FqTRzmha9e1lug,1062
44
- cicada_mcp-0.1.4.dist-info/METADATA,sha256=pj5-4L2Bz3xn6w6r7HJTRwgPCTa-KiLNOUrTPREDLF0,18931
45
- cicada_mcp-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
46
- cicada_mcp-0.1.4.dist-info/entry_points.txt,sha256=DFW2H5na_prQFHRcgcDOkziQCpzykOZuHO5cztoMA2Y,281
47
- cicada_mcp-0.1.4.dist-info/top_level.txt,sha256=xZCtaMDbCi2CKA5PExum99ZU54IJg5iognV-k44a1W0,7
48
- cicada_mcp-0.1.4.dist-info/RECORD,,
42
+ cicada_mcp-0.1.5.dist-info/licenses/LICENSE,sha256=ijMI5EAN1o3jl676-BOu0ELzlsBr2FqTRzmha9e1lug,1062
43
+ cicada_mcp-0.1.5.dist-info/METADATA,sha256=h0oThL5OxMrls5uEubs4LQOJCK9p8yx-MkFQHzaVq7o,19952
44
+ cicada_mcp-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
+ cicada_mcp-0.1.5.dist-info/entry_points.txt,sha256=cwG5-3TwDGwFPiKiOC5gjlfvi9mBFc01EVtX2ZcXpcQ,317
46
+ cicada_mcp-0.1.5.dist-info/top_level.txt,sha256=xZCtaMDbCi2CKA5PExum99ZU54IJg5iognV-k44a1W0,7
47
+ cicada_mcp-0.1.5.dist-info/RECORD,,
@@ -5,4 +5,5 @@ cicada-find-dead-code = cicada.find_dead_code:main
5
5
  cicada-index = cicada.indexer:main
6
6
  cicada-index-pr = cicada.pr_indexer:main
7
7
  cicada-install = cicada.install:main
8
+ cicada-mcp = cicada.mcp_server:main
8
9
  cicada-server = cicada.mcp_server:main
@@ -1,364 +0,0 @@
1
- """
2
- Keyword Extraction using spaCy
3
- Advanced NLP-based keyword extraction for programming documentation
4
-
5
- DEPRECATED: This module is being replaced by lightweight_keyword_extractor.py
6
- which provides faster performance using lemminflect instead of spaCy.
7
-
8
- The spaCy-based extractor has been kept for backward compatibility and for
9
- cases where advanced NLP features are needed. For most use cases, prefer
10
- LightweightKeywordExtractor from cicada.lightweight_keyword_extractor.
11
-
12
- Performance comparison:
13
- - LightweightKeywordExtractor: ~0.1s startup time
14
- - KeywordExtractor (spaCy): ~2s startup time
15
-
16
- See: cicada.lightweight_keyword_extractor.LightweightKeywordExtractor
17
- """
18
-
19
- from collections import Counter
20
- import re
21
- import sys
22
- import subprocess
23
-
24
- from cicada.utils import split_camel_snake_case
25
-
26
- # Lazy import spacy only when needed
27
- spacy = None
28
-
29
-
30
- def _ensure_spacy_imported():
31
- """Import spacy only when needed."""
32
- global spacy
33
- if spacy is None:
34
- import spacy as spacy_module
35
-
36
- spacy = spacy_module
37
-
38
-
39
- class KeywordExtractor:
40
- """Extract keywords from text using spaCy NLP."""
41
-
42
- # spaCy model names for different sizes
43
- SPACY_MODELS = {
44
- "small": "en_core_web_sm",
45
- "medium": "en_core_web_md",
46
- "large": "en_core_web_lg",
47
- }
48
-
49
- def __init__(self, verbose: bool = False, model_size: str = "small"):
50
- """
51
- Initialize keyword extractor with lazy model loading.
52
-
53
- Args:
54
- verbose: If True, print status messages during initialization
55
- model_size: Size of spaCy model to use ('small', 'medium', or 'large')
56
- Default is 'small'. Medium and large models provide better
57
- accuracy but are slower and require more memory.
58
- """
59
- self.verbose = verbose
60
-
61
- # Validate model size
62
- if model_size not in self.SPACY_MODELS:
63
- raise ValueError(
64
- f"Invalid model size '{model_size}'. "
65
- f"Must be one of: {', '.join(self.SPACY_MODELS.keys())}"
66
- )
67
-
68
- self.model_size = model_size
69
- self.model_name = self.SPACY_MODELS[model_size]
70
- self.nlp = None # Lazy-loaded on first use
71
-
72
- def _ensure_model_loaded(self):
73
- """
74
- Ensure the spaCy model is loaded, downloading if necessary.
75
- Only called when model is actually needed (lazy loading).
76
- """
77
- if self.nlp is not None:
78
- return # Already loaded
79
-
80
- # Ensure spacy is imported
81
- _ensure_spacy_imported()
82
-
83
- if self.verbose:
84
- print(f"Loading spaCy model ({self.model_size})...", file=sys.stderr)
85
-
86
- try:
87
- # Import the model directly as a Python package (fast failure if not installed)
88
- import importlib
89
-
90
- model_module = importlib.import_module(self.model_name)
91
- self.nlp = model_module.load()
92
- if self.verbose:
93
- print("✓ Model loaded successfully", file=sys.stderr)
94
- except (ImportError, AttributeError):
95
- # Model not installed, download it
96
- if self.verbose:
97
- print(
98
- f"Model '{self.model_name}' not found. Downloading...",
99
- file=sys.stderr,
100
- )
101
-
102
- if not self._download_model():
103
- raise RuntimeError(
104
- f"Failed to download spaCy model '{self.model_name}'. "
105
- f"Please install it manually with: python -m spacy download {self.model_name}"
106
- )
107
-
108
- # Try importing again after download
109
- try:
110
- import importlib
111
-
112
- model_module = importlib.import_module(self.model_name)
113
- self.nlp = model_module.load()
114
- if self.verbose:
115
- print("✓ Model loaded successfully", file=sys.stderr)
116
- except (ImportError, AttributeError) as e:
117
- raise RuntimeError(
118
- f"Failed to load spaCy model '{self.model_name}' after download. "
119
- f"Please try installing it manually: python -m spacy download {self.model_name}"
120
- ) from e
121
-
122
- def _download_model(self) -> bool:
123
- """
124
- Download the spaCy model using uv pip install.
125
-
126
- Returns:
127
- True if download succeeded, False otherwise
128
- """
129
- # Model URLs for direct installation
130
- model_urls = {
131
- "en_core_web_sm": "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
132
- "en_core_web_md": "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl",
133
- "en_core_web_lg": "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl",
134
- }
135
-
136
- if self.model_name not in model_urls:
137
- if self.verbose:
138
- print(f"Unknown model: {self.model_name}", file=sys.stderr)
139
- return False
140
-
141
- model_url = model_urls[self.model_name]
142
-
143
- # Use uv pip install (works in uv-managed environments)
144
- try:
145
- if self.verbose:
146
- print(f"Running: uv pip install {model_url}", file=sys.stderr)
147
-
148
- result = subprocess.run(
149
- ["uv", "pip", "install", model_url],
150
- capture_output=True,
151
- text=True,
152
- check=True,
153
- )
154
-
155
- if self.verbose and result.stdout:
156
- print(result.stdout, file=sys.stderr)
157
-
158
- return True
159
- except FileNotFoundError:
160
- if self.verbose:
161
- print(
162
- "uv not found. Please install uv or manually install the model:",
163
- file=sys.stderr,
164
- )
165
- print(f" uv pip install {model_url}", file=sys.stderr)
166
- return False
167
- except subprocess.CalledProcessError as e:
168
- if self.verbose:
169
- print(f"uv pip install failed: {e.stderr}", file=sys.stderr)
170
- return False
171
- except Exception as e:
172
- if self.verbose:
173
- print(f"Unexpected error during download: {e}", file=sys.stderr)
174
- return False
175
-
176
- def extract_code_identifiers(self, text):
177
- """
178
- Extract code-specific identifiers and their split words.
179
-
180
- Returns a tuple of (identifiers, split_words) where:
181
- - identifiers: original camelCase/PascalCase/snake_case identifiers
182
- - split_words: individual words extracted from those identifiers
183
- """
184
- # Match camelCase, snake_case, PascalCase, and mixed patterns
185
- patterns = [
186
- r"\b[a-z]+[A-Z][a-zA-Z]*\b", # camelCase (e.g., getUserData)
187
- r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b", # Uppercase prefix + PascalCase (e.g., HTTPServer, XMLParser)
188
- r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b", # PascalCase (e.g., UserController, PostgreSQL)
189
- r"\b[a-z]+_[a-z_]+\b", # snake_case (e.g., get_user_data)
190
- r"\b[A-Z]{2,}\b", # All UPPERCASE (e.g., HTTP, API, SQL)
191
- ]
192
-
193
- identifiers = []
194
- for pattern in patterns:
195
- matches = re.findall(pattern, text)
196
- identifiers.extend(matches)
197
-
198
- identifiers = list(set(identifiers))
199
-
200
- # Split identifiers into individual words
201
- split_words = []
202
- for identifier in identifiers:
203
- split_text = split_camel_snake_case(identifier)
204
- # Extract individual words (lowercase, length > 1)
205
- words = [
206
- word.lower()
207
- for word in split_text.split()
208
- if len(word) > 1 and word.isalpha()
209
- ]
210
- split_words.extend(words)
211
-
212
- return identifiers, list(set(split_words))
213
-
214
- def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
215
- """
216
- Extract keywords and return a simple list of keyword strings.
217
-
218
- Args:
219
- text: Input text to analyze
220
- top_n: Number of top keywords to return
221
-
222
- Returns:
223
- List of keyword strings (e.g., ['authentication', 'user', 'validate'])
224
- """
225
- if not text or not text.strip():
226
- return []
227
-
228
- try:
229
- self._ensure_model_loaded()
230
- results = self.extract_keywords(text, top_n=top_n)
231
- # Extract just the keyword strings from top_keywords tuples
232
- return [keyword for keyword, _ in results["top_keywords"]]
233
- except Exception as e:
234
- if self.verbose:
235
- print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
236
- return []
237
-
238
- def extract_keywords(self, text, top_n=15):
239
- """
240
- Extract keywords using multiple strategies with emphasis on code identifiers.
241
-
242
- Weighting strategy:
243
- - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
244
- - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
245
- - Regular words (nouns, verbs): 1x weight
246
-
247
- Args:
248
- text: Input text to analyze
249
- top_n: Number of top keywords to return
250
-
251
- Returns:
252
- Dictionary with extracted keywords and analysis:
253
- - top_keywords: List of (keyword, count) tuples, sorted by frequency
254
- - code_identifiers: Original identifiers (weighted 10x)
255
- - code_split_words: Words extracted from identifiers (weighted 3x)
256
- - nouns, verbs, adjectives: Linguistic categories
257
- - entities: Named entities found
258
- - tf_scores: Term frequency scores
259
- - stats: Text statistics
260
- """
261
- if not text or not text.strip():
262
- return {
263
- "top_keywords": [],
264
- "nouns": [],
265
- "verbs": [],
266
- "adjectives": [],
267
- "proper_nouns": [],
268
- "noun_chunks": [],
269
- "entities": [],
270
- "code_identifiers": [],
271
- "tf_scores": {},
272
- "stats": {
273
- "total_tokens": 0,
274
- "total_words": 0,
275
- "unique_words": 0,
276
- "sentences": 0,
277
- },
278
- }
279
- # Ensure model is loaded (lazy loading on first use)
280
- self._ensure_model_loaded()
281
-
282
- # Process with spaCy
283
- doc = self.nlp(text)
284
-
285
- # 1. Extract nouns (concepts)
286
- nouns = [
287
- token.lemma_.lower()
288
- for token in doc
289
- if token.pos_ == "NOUN" and not token.is_stop and len(token.text) > 2
290
- ]
291
-
292
- # 2. Extract verbs (actions)
293
- verbs = [
294
- token.lemma_.lower()
295
- for token in doc
296
- if token.pos_ == "VERB" and not token.is_stop and len(token.text) > 2
297
- ]
298
-
299
- # 3. Extract adjectives (descriptors)
300
- adjectives = [
301
- token.lemma_.lower()
302
- for token in doc
303
- if token.pos_ == "ADJ" and not token.is_stop
304
- ]
305
-
306
- # 4. Extract proper nouns (named entities, technologies)
307
- proper_nouns = [token.text for token in doc if token.pos_ == "PROPN"]
308
-
309
- # 5. Extract noun chunks (multi-word concepts)
310
- noun_chunks = [
311
- chunk.text.lower()
312
- for chunk in doc.noun_chunks
313
- if len(chunk.text.split()) > 1
314
- ]
315
-
316
- # 6. Extract named entities
317
- entities = [(ent.text, ent.label_) for ent in doc.ents]
318
-
319
- # 7. Extract code identifiers and their split words
320
- code_identifiers, code_split_words = self.extract_code_identifiers(text)
321
-
322
- # 8. Calculate keyword frequency (combining nouns, verbs, proper nouns, identifiers, and split code words)
323
- # Give full code identifiers 10x weight for exact matching
324
- # Give code split words 3x weight for fuzzy matching
325
- code_identifiers_lower = [ident.lower() for ident in code_identifiers]
326
- all_keywords = (
327
- nouns
328
- + verbs
329
- + proper_nouns
330
- + (code_identifiers_lower * 10)
331
- + (code_split_words * 3)
332
- )
333
- keyword_freq = Counter(all_keywords)
334
- top_keywords = keyword_freq.most_common(top_n)
335
-
336
- # 9. Calculate TF scores (simple version)
337
- total_words = len(
338
- [token for token in doc if not token.is_stop and not token.is_punct]
339
- )
340
- tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}
341
-
342
- # Statistics
343
- stats = {
344
- "total_tokens": len(doc),
345
- "total_words": total_words,
346
- "unique_words": len(set([t.text.lower() for t in doc if not t.is_punct])),
347
- "sentences": len(list(doc.sents)),
348
- }
349
-
350
- return {
351
- "top_keywords": top_keywords,
352
- "nouns": list(set(nouns))[:20],
353
- "verbs": list(set(verbs))[:20],
354
- "adjectives": list(set(adjectives))[:15],
355
- "proper_nouns": list(set(proper_nouns)),
356
- "noun_chunks": list(set(noun_chunks))[:15],
357
- "entities": entities,
358
- "code_identifiers": code_identifiers,
359
- "code_split_words": code_split_words,
360
- "tf_scores": dict(
361
- sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
362
- ),
363
- "stats": stats,
364
- }