cicada-mcp 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cicada-mcp might be problematic. Click here for more details.
- cicada/__init__.py +30 -0
- cicada/clean.py +297 -0
- cicada/command_logger.py +293 -0
- cicada/dead_code_analyzer.py +282 -0
- cicada/extractors/__init__.py +36 -0
- cicada/extractors/base.py +66 -0
- cicada/extractors/call.py +176 -0
- cicada/extractors/dependency.py +361 -0
- cicada/extractors/doc.py +179 -0
- cicada/extractors/function.py +246 -0
- cicada/extractors/module.py +123 -0
- cicada/extractors/spec.py +151 -0
- cicada/find_dead_code.py +270 -0
- cicada/formatter.py +918 -0
- cicada/git_helper.py +646 -0
- cicada/indexer.py +629 -0
- cicada/install.py +724 -0
- cicada/keyword_extractor.py +364 -0
- cicada/keyword_search.py +553 -0
- cicada/lightweight_keyword_extractor.py +298 -0
- cicada/mcp_server.py +1559 -0
- cicada/mcp_tools.py +291 -0
- cicada/parser.py +124 -0
- cicada/pr_finder.py +435 -0
- cicada/pr_indexer/__init__.py +20 -0
- cicada/pr_indexer/cli.py +62 -0
- cicada/pr_indexer/github_api_client.py +431 -0
- cicada/pr_indexer/indexer.py +297 -0
- cicada/pr_indexer/line_mapper.py +209 -0
- cicada/pr_indexer/pr_index_builder.py +253 -0
- cicada/setup.py +339 -0
- cicada/utils/__init__.py +52 -0
- cicada/utils/call_site_formatter.py +95 -0
- cicada/utils/function_grouper.py +57 -0
- cicada/utils/hash_utils.py +173 -0
- cicada/utils/index_utils.py +290 -0
- cicada/utils/path_utils.py +240 -0
- cicada/utils/signature_builder.py +106 -0
- cicada/utils/storage.py +111 -0
- cicada/utils/subprocess_runner.py +182 -0
- cicada/utils/text_utils.py +90 -0
- cicada/version_check.py +116 -0
- cicada_mcp-0.1.4.dist-info/METADATA +619 -0
- cicada_mcp-0.1.4.dist-info/RECORD +48 -0
- cicada_mcp-0.1.4.dist-info/WHEEL +5 -0
- cicada_mcp-0.1.4.dist-info/entry_points.txt +8 -0
- cicada_mcp-0.1.4.dist-info/licenses/LICENSE +21 -0
- cicada_mcp-0.1.4.dist-info/top_level.txt +1 -0
cicada/keyword_search.py
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Keyword-based search for modules and functions.
|
|
3
|
+
|
|
4
|
+
Provides semantic search capabilities by matching query keywords
|
|
5
|
+
against extracted keywords in the index using BM25 ranking.
|
|
6
|
+
|
|
7
|
+
Identifier names (function/module names) are given much higher weight than keywords.
|
|
8
|
+
|
|
9
|
+
Author: Cursor(Auto)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
import fnmatch
|
|
14
|
+
from typing import List, Dict, Any
|
|
15
|
+
from rank_bm25 import BM25Okapi
|
|
16
|
+
|
|
17
|
+
from cicada.utils import split_identifier
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class KeywordSearcher:
|
|
21
|
+
"""Search for modules and functions by keywords using BM25 ranking."""
|
|
22
|
+
|
|
23
|
+
# Boost multiplier for identifier name matches
|
|
24
|
+
# When query keyword matches the function/module name, multiply the score by this
|
|
25
|
+
IDENTIFIER_MATCH_BOOST = 10.0
|
|
26
|
+
|
|
27
|
+
def __init__(self, index: Dict[str, Any]):
|
|
28
|
+
"""
|
|
29
|
+
Initialize the keyword searcher.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
index: The Cicada index dictionary containing modules and metadata
|
|
33
|
+
"""
|
|
34
|
+
self.index = index
|
|
35
|
+
self.bm25, self.document_map = self._initialize_bm25()
|
|
36
|
+
|
|
37
|
+
@staticmethod
|
|
38
|
+
def _extract_identifier_name(document_info: Dict[str, Any]) -> str:
|
|
39
|
+
"""
|
|
40
|
+
Extract the core identifier name from document info.
|
|
41
|
+
|
|
42
|
+
For modules: returns the module name
|
|
43
|
+
For functions: returns the function name (without arity)
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
document_info: Document information dictionary
|
|
47
|
+
|
|
48
|
+
Returns:
|
|
49
|
+
The identifier name
|
|
50
|
+
"""
|
|
51
|
+
if document_info["type"] == "module":
|
|
52
|
+
return document_info["name"]
|
|
53
|
+
else:
|
|
54
|
+
return document_info["function"]
|
|
55
|
+
|
|
56
|
+
def _initialize_bm25(self) -> tuple:
|
|
57
|
+
"""
|
|
58
|
+
Initialize BM25 calculator with all documents in the index.
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
Tuple of (BM25Okapi instance, document_map dict)
|
|
62
|
+
- document_map maps document index to (type, module_name, location_info)
|
|
63
|
+
"""
|
|
64
|
+
documents = []
|
|
65
|
+
document_map = []
|
|
66
|
+
|
|
67
|
+
# Collect all documents (modules and functions with keywords)
|
|
68
|
+
for module_name, module_data in self.index.get("modules", {}).items():
|
|
69
|
+
# Add module as a document
|
|
70
|
+
if module_data.get("keywords"):
|
|
71
|
+
doc_keywords = [kw.lower() for kw in module_data["keywords"]]
|
|
72
|
+
documents.append(doc_keywords)
|
|
73
|
+
document_map.append(
|
|
74
|
+
{
|
|
75
|
+
"type": "module",
|
|
76
|
+
"name": module_name,
|
|
77
|
+
"module": module_name,
|
|
78
|
+
"file": module_data["file"],
|
|
79
|
+
"line": module_data["line"],
|
|
80
|
+
"doc": module_data.get("moduledoc"),
|
|
81
|
+
"keywords": module_data["keywords"],
|
|
82
|
+
}
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# Add functions as documents
|
|
86
|
+
for func in module_data.get("functions", []):
|
|
87
|
+
if func.get("keywords"):
|
|
88
|
+
doc_keywords = [kw.lower() for kw in func["keywords"]]
|
|
89
|
+
documents.append(doc_keywords)
|
|
90
|
+
full_name = f"{module_name}.{func['name']}/{func['arity']}"
|
|
91
|
+
document_map.append(
|
|
92
|
+
{
|
|
93
|
+
"type": "function",
|
|
94
|
+
"name": full_name,
|
|
95
|
+
"module": module_name,
|
|
96
|
+
"function": func["name"],
|
|
97
|
+
"arity": func["arity"],
|
|
98
|
+
"file": module_data["file"],
|
|
99
|
+
"line": func["line"],
|
|
100
|
+
"doc": func.get("doc"),
|
|
101
|
+
"keywords": func["keywords"],
|
|
102
|
+
}
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# If no documents were created (no keywords extracted), create documents using identifier names
|
|
106
|
+
if not documents:
|
|
107
|
+
for module_name, module_data in self.index.get("modules", {}).items():
|
|
108
|
+
# Add module as a document using its name as keywords
|
|
109
|
+
module_keywords = split_identifier(module_name)
|
|
110
|
+
documents.append(module_keywords)
|
|
111
|
+
document_map.append(
|
|
112
|
+
{
|
|
113
|
+
"type": "module",
|
|
114
|
+
"name": module_name,
|
|
115
|
+
"module": module_name,
|
|
116
|
+
"file": module_data["file"],
|
|
117
|
+
"line": module_data["line"],
|
|
118
|
+
"doc": module_data.get("moduledoc"),
|
|
119
|
+
"keywords": module_keywords,
|
|
120
|
+
}
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# Add functions as documents
|
|
124
|
+
for func in module_data.get("functions", []):
|
|
125
|
+
# Use extracted keywords if available, otherwise fall back to split identifier
|
|
126
|
+
if func.get("keywords"):
|
|
127
|
+
func_keywords = [kw.lower() for kw in func["keywords"]]
|
|
128
|
+
else:
|
|
129
|
+
func_keywords = split_identifier(func["name"])
|
|
130
|
+
|
|
131
|
+
documents.append(func_keywords)
|
|
132
|
+
full_name = f"{module_name}.{func['name']}/{func['arity']}"
|
|
133
|
+
document_map.append(
|
|
134
|
+
{
|
|
135
|
+
"type": "function",
|
|
136
|
+
"name": full_name,
|
|
137
|
+
"module": module_name,
|
|
138
|
+
"function": func["name"],
|
|
139
|
+
"arity": func["arity"],
|
|
140
|
+
"file": module_data["file"],
|
|
141
|
+
"line": func["line"],
|
|
142
|
+
"doc": func.get("doc"),
|
|
143
|
+
"keywords": func_keywords,
|
|
144
|
+
}
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
# Initialize BM25 with all documents
|
|
148
|
+
# Use b=0.4 (lower than default 0.75) to reduce length normalization penalty
|
|
149
|
+
# This is appropriate for code search where longer names are more specific, not verbose
|
|
150
|
+
bm25 = BM25Okapi(documents, b=0.4) if documents else None
|
|
151
|
+
return bm25, document_map
|
|
152
|
+
|
|
153
|
+
def _match_wildcard(self, pattern: str, text: str) -> bool:
|
|
154
|
+
"""
|
|
155
|
+
Check if text matches a wildcard pattern.
|
|
156
|
+
|
|
157
|
+
Supports * (matches any characters) only.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
pattern: Wildcard pattern (e.g., "create*", "test_*")
|
|
161
|
+
text: Text to match against
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
True if text matches the pattern
|
|
165
|
+
"""
|
|
166
|
+
# Only support * wildcard, not ?
|
|
167
|
+
if "?" in pattern:
|
|
168
|
+
return False
|
|
169
|
+
return fnmatch.fnmatch(text.lower(), pattern.lower())
|
|
170
|
+
|
|
171
|
+
def _expand_wildcard_keywords(
|
|
172
|
+
self, query_keywords: List[str], document_keywords: List[str]
|
|
173
|
+
) -> List[str]:
|
|
174
|
+
"""
|
|
175
|
+
Expand wildcard patterns to actual matching keywords from the document.
|
|
176
|
+
|
|
177
|
+
Args:
|
|
178
|
+
query_keywords: List of query keywords (may contain wildcards)
|
|
179
|
+
document_keywords: List of keywords from a document
|
|
180
|
+
|
|
181
|
+
Returns:
|
|
182
|
+
List of actual matching keywords found in the document
|
|
183
|
+
"""
|
|
184
|
+
matched_keywords = []
|
|
185
|
+
for query_kw in query_keywords:
|
|
186
|
+
for doc_kw in document_keywords:
|
|
187
|
+
if self._match_wildcard(query_kw, doc_kw):
|
|
188
|
+
matched_keywords.append(query_kw)
|
|
189
|
+
return matched_keywords
|
|
190
|
+
|
|
191
|
+
def _expand_wildcard_keywords_with_identifier(
|
|
192
|
+
self,
|
|
193
|
+
query_keywords: List[str],
|
|
194
|
+
document_keywords: List[str],
|
|
195
|
+
identifier_name: str,
|
|
196
|
+
) -> List[str]:
|
|
197
|
+
"""
|
|
198
|
+
Expand wildcard patterns to actual matching keywords from the document and identifier name.
|
|
199
|
+
|
|
200
|
+
Args:
|
|
201
|
+
query_keywords: List of query keywords (may contain wildcards)
|
|
202
|
+
document_keywords: List of keywords from a document
|
|
203
|
+
identifier_name: The full identifier name (function/module name)
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
List of actual matching keywords found in the document or identifier
|
|
207
|
+
"""
|
|
208
|
+
matched_keywords = []
|
|
209
|
+
for query_kw in query_keywords:
|
|
210
|
+
# Check against individual keywords
|
|
211
|
+
for doc_kw in document_keywords:
|
|
212
|
+
if self._match_wildcard(query_kw, doc_kw):
|
|
213
|
+
matched_keywords.append(query_kw)
|
|
214
|
+
break # Only add each query keyword once
|
|
215
|
+
|
|
216
|
+
# Also check against the full identifier name
|
|
217
|
+
if query_kw not in matched_keywords and self._match_wildcard(
|
|
218
|
+
query_kw, identifier_name
|
|
219
|
+
):
|
|
220
|
+
matched_keywords.append(query_kw)
|
|
221
|
+
return matched_keywords
|
|
222
|
+
|
|
223
|
+
def _get_wildcard_scores(self, query_keywords: List[str]) -> List[float]:
|
|
224
|
+
"""
|
|
225
|
+
Calculate BM25-like scores for wildcard matching.
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
query_keywords: List of query keywords (may contain wildcards)
|
|
229
|
+
|
|
230
|
+
Returns:
|
|
231
|
+
List of scores for each document
|
|
232
|
+
"""
|
|
233
|
+
scores = []
|
|
234
|
+
|
|
235
|
+
for _, doc_info in enumerate(self.document_map):
|
|
236
|
+
doc_keywords = [kw.lower() for kw in doc_info["keywords"]]
|
|
237
|
+
identifier_name = self._extract_identifier_name(doc_info)
|
|
238
|
+
|
|
239
|
+
# Find matching keywords using wildcard patterns
|
|
240
|
+
# Check both individual keywords and full identifier name
|
|
241
|
+
matched_keywords = self._expand_wildcard_keywords_with_identifier(
|
|
242
|
+
query_keywords, doc_keywords, identifier_name
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
if matched_keywords:
|
|
246
|
+
# Calculate a simple score based on number of matches
|
|
247
|
+
# This is a simplified version of BM25 for wildcard matching
|
|
248
|
+
score = len(matched_keywords) / len(query_keywords)
|
|
249
|
+
scores.append(score)
|
|
250
|
+
else:
|
|
251
|
+
scores.append(0.0)
|
|
252
|
+
|
|
253
|
+
return scores
|
|
254
|
+
|
|
255
|
+
def _has_wildcards(self, keywords: List[str]) -> bool:
|
|
256
|
+
"""Check if any keywords contain wildcard patterns."""
|
|
257
|
+
return any("*" in keyword for keyword in keywords)
|
|
258
|
+
|
|
259
|
+
def search(self, query_keywords: List[str], top_n: int = 5) -> List[Dict[str, Any]]:
|
|
260
|
+
"""
|
|
261
|
+
Search for modules and functions matching the given keywords.
|
|
262
|
+
|
|
263
|
+
Uses BM25 ranking to score documents based on keyword relevance.
|
|
264
|
+
Identifier names (function/module names) are boosted significantly
|
|
265
|
+
when they match query keywords.
|
|
266
|
+
|
|
267
|
+
Automatically detects wildcard patterns (* supported) in keywords.
|
|
268
|
+
|
|
269
|
+
Args:
|
|
270
|
+
query_keywords: List of keywords to search for
|
|
271
|
+
top_n: Maximum number of results to return
|
|
272
|
+
|
|
273
|
+
Returns:
|
|
274
|
+
List of result dictionaries sorted by score (descending), each containing:
|
|
275
|
+
- type: 'module' or 'function'
|
|
276
|
+
- name: Full name (e.g., 'MyApp.User' or 'MyApp.User.create/2')
|
|
277
|
+
- module: Module name
|
|
278
|
+
- file: File path
|
|
279
|
+
- line: Line number
|
|
280
|
+
- score: BM25 score (float), boosted if identifier matches query
|
|
281
|
+
- confidence: Percentage of query keywords matched
|
|
282
|
+
- matched_keywords: List of matched keywords
|
|
283
|
+
- doc: Documentation string (if available)
|
|
284
|
+
"""
|
|
285
|
+
if not query_keywords or self.bm25 is None or not self.document_map:
|
|
286
|
+
return []
|
|
287
|
+
|
|
288
|
+
# Normalize query keywords to lowercase
|
|
289
|
+
query_keywords_lower = [kw.lower() for kw in query_keywords]
|
|
290
|
+
|
|
291
|
+
# Check if wildcards are present
|
|
292
|
+
enable_wildcards = self._has_wildcards(query_keywords_lower)
|
|
293
|
+
|
|
294
|
+
# Get BM25 scores for all documents
|
|
295
|
+
if enable_wildcards:
|
|
296
|
+
# For wildcard matching, we need to manually score documents
|
|
297
|
+
bm25_scores = self._get_wildcard_scores(query_keywords_lower)
|
|
298
|
+
else:
|
|
299
|
+
bm25_scores = self.bm25.get_scores(query_keywords_lower)
|
|
300
|
+
|
|
301
|
+
results = []
|
|
302
|
+
|
|
303
|
+
# Build results with scores
|
|
304
|
+
for doc_idx, bm25_score in enumerate(bm25_scores):
|
|
305
|
+
# BM25 can produce negative scores for small corpuses
|
|
306
|
+
# We check matched keywords instead to filter relevance
|
|
307
|
+
doc_info = self.document_map[doc_idx]
|
|
308
|
+
|
|
309
|
+
# Check if there are any matching keywords first
|
|
310
|
+
if enable_wildcards:
|
|
311
|
+
identifier_name = self._extract_identifier_name(doc_info)
|
|
312
|
+
matched = self._count_wildcard_matches(
|
|
313
|
+
query_keywords_lower, doc_info["keywords"], identifier_name
|
|
314
|
+
)
|
|
315
|
+
else:
|
|
316
|
+
matched = self._count_matches(
|
|
317
|
+
query_keywords_lower, doc_info["keywords"]
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
# Only include documents that match at least one query keyword
|
|
321
|
+
if matched["score"] > 0:
|
|
322
|
+
# Apply identifier name boost
|
|
323
|
+
if enable_wildcards:
|
|
324
|
+
final_score = self._apply_identifier_boost_wildcard(
|
|
325
|
+
bm25_score, query_keywords_lower, doc_info
|
|
326
|
+
)
|
|
327
|
+
else:
|
|
328
|
+
final_score = self._apply_identifier_boost(
|
|
329
|
+
bm25_score, query_keywords_lower, doc_info
|
|
330
|
+
)
|
|
331
|
+
|
|
332
|
+
# Apply name coverage penalty (penalize functions with extra words not in query)
|
|
333
|
+
coverage_penalty = self._calculate_name_coverage_penalty(
|
|
334
|
+
query_keywords_lower, doc_info
|
|
335
|
+
)
|
|
336
|
+
# For negative scores, divide by penalty instead of multiply
|
|
337
|
+
# This ensures penalty always makes the score worse regardless of sign
|
|
338
|
+
if final_score < 0 and coverage_penalty < 1.0:
|
|
339
|
+
final_score = final_score / coverage_penalty
|
|
340
|
+
else:
|
|
341
|
+
final_score = final_score * coverage_penalty
|
|
342
|
+
|
|
343
|
+
result = {
|
|
344
|
+
"type": doc_info["type"],
|
|
345
|
+
"name": doc_info["name"],
|
|
346
|
+
"module": doc_info["module"],
|
|
347
|
+
"file": doc_info["file"],
|
|
348
|
+
"line": doc_info["line"],
|
|
349
|
+
"score": round(final_score, 4),
|
|
350
|
+
"confidence": matched["confidence"],
|
|
351
|
+
"matched_keywords": matched["matched_keywords"],
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
# Add type-specific fields
|
|
355
|
+
if doc_info["type"] == "function":
|
|
356
|
+
result["function"] = doc_info["function"]
|
|
357
|
+
result["arity"] = doc_info["arity"]
|
|
358
|
+
|
|
359
|
+
# Add documentation if available
|
|
360
|
+
if doc_info.get("doc"):
|
|
361
|
+
result["doc"] = doc_info["doc"]
|
|
362
|
+
|
|
363
|
+
results.append(result)
|
|
364
|
+
|
|
365
|
+
# Sort by final score (descending), then by name for stable results
|
|
366
|
+
results.sort(key=lambda x: (-x["score"], x["name"]))
|
|
367
|
+
|
|
368
|
+
return results[:top_n]
|
|
369
|
+
|
|
370
|
+
def _apply_identifier_boost(
|
|
371
|
+
self, bm25_score: float, query_keywords: List[str], doc_info: Dict[str, Any]
|
|
372
|
+
) -> float:
|
|
373
|
+
"""
|
|
374
|
+
Apply boost to BM25 score if query keywords match the identifier name.
|
|
375
|
+
|
|
376
|
+
Identifier names (function/module names) are given much higher weight.
|
|
377
|
+
If any query keyword matches a word in the identifier name, the score
|
|
378
|
+
is multiplied by IDENTIFIER_MATCH_BOOST.
|
|
379
|
+
|
|
380
|
+
Args:
|
|
381
|
+
bm25_score: The original BM25 score
|
|
382
|
+
query_keywords: Normalized query keywords (lowercase)
|
|
383
|
+
doc_info: Document information with function/module name
|
|
384
|
+
|
|
385
|
+
Returns:
|
|
386
|
+
Final score (boosted if identifier matches)
|
|
387
|
+
"""
|
|
388
|
+
# Extract the identifier name (module name or function name)
|
|
389
|
+
identifier_name = self._extract_identifier_name(doc_info)
|
|
390
|
+
|
|
391
|
+
# Split identifier into words
|
|
392
|
+
identifier_words = split_identifier(identifier_name)
|
|
393
|
+
|
|
394
|
+
# Check if any query keyword matches any word in the identifier
|
|
395
|
+
for query_kw in query_keywords:
|
|
396
|
+
if query_kw in identifier_words:
|
|
397
|
+
# Apply significant boost for identifier match
|
|
398
|
+
return bm25_score * self.IDENTIFIER_MATCH_BOOST
|
|
399
|
+
|
|
400
|
+
return bm25_score
|
|
401
|
+
|
|
402
|
+
def _count_matches(
|
|
403
|
+
self, query_keywords: List[str], item_keywords: List[str]
|
|
404
|
+
) -> Dict[str, Any]:
|
|
405
|
+
"""
|
|
406
|
+
Count matching keywords between query and item.
|
|
407
|
+
|
|
408
|
+
Args:
|
|
409
|
+
query_keywords: Query keywords (normalized to lowercase)
|
|
410
|
+
item_keywords: Keywords from module/function
|
|
411
|
+
|
|
412
|
+
Returns:
|
|
413
|
+
Dictionary with:
|
|
414
|
+
- score: Number of matching keywords
|
|
415
|
+
- confidence: Percentage match (score / len(query_keywords) * 100)
|
|
416
|
+
- matched_keywords: List of matched keywords
|
|
417
|
+
"""
|
|
418
|
+
# Normalize item keywords to lowercase
|
|
419
|
+
item_keywords_lower = [kw.lower() for kw in item_keywords]
|
|
420
|
+
|
|
421
|
+
# Find matches
|
|
422
|
+
matched_keywords = []
|
|
423
|
+
for query_kw in query_keywords:
|
|
424
|
+
if query_kw in item_keywords_lower:
|
|
425
|
+
matched_keywords.append(query_kw)
|
|
426
|
+
|
|
427
|
+
score = len(matched_keywords)
|
|
428
|
+
confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
|
|
429
|
+
|
|
430
|
+
return {
|
|
431
|
+
"score": score,
|
|
432
|
+
"confidence": round(confidence, 1),
|
|
433
|
+
"matched_keywords": matched_keywords,
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
def _count_wildcard_matches(
|
|
437
|
+
self,
|
|
438
|
+
query_keywords: List[str],
|
|
439
|
+
item_keywords: List[str],
|
|
440
|
+
identifier_name: str | None = None,
|
|
441
|
+
) -> Dict[str, Any]:
|
|
442
|
+
"""
|
|
443
|
+
Count matching keywords between query and item using wildcard patterns.
|
|
444
|
+
|
|
445
|
+
Args:
|
|
446
|
+
query_keywords: Query keywords (may contain wildcards, normalized to lowercase)
|
|
447
|
+
item_keywords: Keywords from module/function
|
|
448
|
+
identifier_name: The full identifier name (function/module name)
|
|
449
|
+
|
|
450
|
+
Returns:
|
|
451
|
+
Dictionary with:
|
|
452
|
+
- score: Number of matching keywords
|
|
453
|
+
- confidence: Percentage match (score / len(query_keywords) * 100)
|
|
454
|
+
- matched_keywords: List of matched keywords
|
|
455
|
+
"""
|
|
456
|
+
# Normalize item keywords to lowercase
|
|
457
|
+
item_keywords_lower = [kw.lower() for kw in item_keywords]
|
|
458
|
+
|
|
459
|
+
# Find matches using wildcard patterns
|
|
460
|
+
if identifier_name:
|
|
461
|
+
matched_keywords = self._expand_wildcard_keywords_with_identifier(
|
|
462
|
+
query_keywords, item_keywords_lower, identifier_name
|
|
463
|
+
)
|
|
464
|
+
else:
|
|
465
|
+
matched_keywords = self._expand_wildcard_keywords(
|
|
466
|
+
query_keywords, item_keywords_lower
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
score = len(matched_keywords)
|
|
470
|
+
confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
|
|
471
|
+
|
|
472
|
+
return {
|
|
473
|
+
"score": score,
|
|
474
|
+
"confidence": round(confidence, 1),
|
|
475
|
+
"matched_keywords": matched_keywords,
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
def _apply_identifier_boost_wildcard(
|
|
479
|
+
self, bm25_score: float, query_keywords: List[str], doc_info: Dict[str, Any]
|
|
480
|
+
) -> float:
|
|
481
|
+
"""
|
|
482
|
+
Apply boost to BM25 score if query keywords match the identifier name using wildcards.
|
|
483
|
+
|
|
484
|
+
Identifier names (function/module names) are given much higher weight.
|
|
485
|
+
If any query keyword matches a word in the identifier name using wildcard patterns,
|
|
486
|
+
the score is multiplied by IDENTIFIER_MATCH_BOOST.
|
|
487
|
+
|
|
488
|
+
Args:
|
|
489
|
+
bm25_score: The original BM25 score
|
|
490
|
+
query_keywords: Query keywords (may contain wildcards, normalized to lowercase)
|
|
491
|
+
doc_info: Document information with function/module name
|
|
492
|
+
|
|
493
|
+
Returns:
|
|
494
|
+
Final score (boosted if identifier matches)
|
|
495
|
+
"""
|
|
496
|
+
# Extract the identifier name (module name or function name)
|
|
497
|
+
identifier_name = self._extract_identifier_name(doc_info)
|
|
498
|
+
|
|
499
|
+
# Split identifier into words
|
|
500
|
+
identifier_words = split_identifier(identifier_name)
|
|
501
|
+
|
|
502
|
+
# Check if any query keyword matches any word in the identifier using wildcards
|
|
503
|
+
for query_kw in query_keywords:
|
|
504
|
+
for identifier_word in identifier_words:
|
|
505
|
+
if self._match_wildcard(query_kw, identifier_word):
|
|
506
|
+
# Apply significant boost for identifier match
|
|
507
|
+
return bm25_score * self.IDENTIFIER_MATCH_BOOST
|
|
508
|
+
|
|
509
|
+
return bm25_score
|
|
510
|
+
|
|
511
|
+
def _calculate_name_coverage_penalty(
|
|
512
|
+
self, query_keywords: List[str], doc_info: Dict[str, Any]
|
|
513
|
+
) -> float:
|
|
514
|
+
"""
|
|
515
|
+
Calculate penalty for functions whose names contain words NOT in the query.
|
|
516
|
+
|
|
517
|
+
This helps rank exact matches higher than functions with extra words in their names.
|
|
518
|
+
For example, searching "create user" should rank "create_user" higher than
|
|
519
|
+
"create_invalid_user" because "invalid" is not in the query.
|
|
520
|
+
|
|
521
|
+
Args:
|
|
522
|
+
query_keywords: Normalized query keywords (lowercase)
|
|
523
|
+
doc_info: Document information with function/module name
|
|
524
|
+
|
|
525
|
+
Returns:
|
|
526
|
+
Penalty multiplier between 0.1 and 1.0:
|
|
527
|
+
- 1.0 = no penalty (exact match or all extra words in query)
|
|
528
|
+
- 0.7 = 1 extra word not in query (30% penalty)
|
|
529
|
+
- 0.4 = 2 extra words not in query (60% penalty)
|
|
530
|
+
- 0.1 = 3+ extra words not in query (90% penalty cap)
|
|
531
|
+
"""
|
|
532
|
+
# Only apply to functions (not modules)
|
|
533
|
+
if doc_info["type"] != "function":
|
|
534
|
+
return 1.0
|
|
535
|
+
|
|
536
|
+
# Get function name and split it
|
|
537
|
+
func_name = doc_info["function"]
|
|
538
|
+
func_words = set(split_identifier(func_name))
|
|
539
|
+
|
|
540
|
+
# Find words in function name that are NOT in query
|
|
541
|
+
query_set = set(query_keywords)
|
|
542
|
+
extra_words = func_words - query_set
|
|
543
|
+
|
|
544
|
+
# No penalty if all function name words are in query (exact match)
|
|
545
|
+
if not extra_words:
|
|
546
|
+
return 1.0
|
|
547
|
+
|
|
548
|
+
# Apply 30% penalty per extra word, with a floor of 0.1 (max 90% penalty)
|
|
549
|
+
# This penalty is strong enough to overcome BM25 length normalization bias
|
|
550
|
+
penalty_per_word = 0.3
|
|
551
|
+
total_penalty = min(len(extra_words) * penalty_per_word, 0.9)
|
|
552
|
+
|
|
553
|
+
return 1.0 - total_penalty
|