cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/_version_hash.py +4 -0
- cicada/cli.py +6 -748
- cicada/commands.py +1255 -0
- cicada/dead_code/__init__.py +1 -0
- cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
- cicada/dependency_analyzer.py +147 -0
- cicada/entry_utils.py +92 -0
- cicada/extractors/base.py +9 -9
- cicada/extractors/call.py +17 -20
- cicada/extractors/common.py +64 -0
- cicada/extractors/dependency.py +117 -235
- cicada/extractors/doc.py +2 -49
- cicada/extractors/function.py +10 -14
- cicada/extractors/keybert.py +228 -0
- cicada/extractors/keyword.py +191 -0
- cicada/extractors/module.py +6 -10
- cicada/extractors/spec.py +8 -56
- cicada/format/__init__.py +20 -0
- cicada/{ascii_art.py → format/ascii_art.py} +1 -1
- cicada/format/formatter.py +1145 -0
- cicada/git_helper.py +134 -7
- cicada/indexer.py +322 -89
- cicada/interactive_setup.py +251 -323
- cicada/interactive_setup_helpers.py +302 -0
- cicada/keyword_expander.py +437 -0
- cicada/keyword_search.py +208 -422
- cicada/keyword_test.py +383 -16
- cicada/mcp/__init__.py +10 -0
- cicada/mcp/entry.py +17 -0
- cicada/mcp/filter_utils.py +107 -0
- cicada/mcp/pattern_utils.py +118 -0
- cicada/{mcp_server.py → mcp/server.py} +819 -73
- cicada/mcp/tools.py +473 -0
- cicada/pr_finder.py +2 -3
- cicada/pr_indexer/indexer.py +3 -2
- cicada/setup.py +167 -35
- cicada/tier.py +225 -0
- cicada/utils/__init__.py +9 -2
- cicada/utils/fuzzy_match.py +54 -0
- cicada/utils/index_utils.py +9 -0
- cicada/utils/path_utils.py +18 -0
- cicada/utils/text_utils.py +52 -1
- cicada/utils/tree_utils.py +47 -0
- cicada/version_check.py +99 -0
- cicada/watch_manager.py +320 -0
- cicada/watcher.py +431 -0
- cicada_mcp-0.3.0.dist-info/METADATA +541 -0
- cicada_mcp-0.3.0.dist-info/RECORD +70 -0
- cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
- cicada/formatter.py +0 -864
- cicada/keybert_extractor.py +0 -286
- cicada/lightweight_keyword_extractor.py +0 -290
- cicada/mcp_entry.py +0 -683
- cicada/mcp_tools.py +0 -291
- cicada_mcp-0.2.0.dist-info/METADATA +0 -735
- cicada_mcp-0.2.0.dist-info/RECORD +0 -53
- cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
- /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
- /cicada/{colors.py → format/colors.py} +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
cicada/keyword_search.py
CHANGED
|
@@ -1,28 +1,19 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Keyword-based search for modules and functions.
|
|
3
3
|
|
|
4
|
-
Provides
|
|
5
|
-
|
|
4
|
+
Provides simple keyword search by summing weights of matched keywords.
|
|
5
|
+
Keywords are pre-weighted during extraction/expansion based on frequency,
|
|
6
|
+
code identifier presence, semantic similarity, etc.
|
|
6
7
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
Author: Cursor(Auto)
|
|
8
|
+
Author: Cicada Team
|
|
10
9
|
"""
|
|
11
10
|
|
|
12
11
|
import fnmatch
|
|
13
12
|
from typing import Any
|
|
14
13
|
|
|
15
|
-
from rank_bm25 import BM25Okapi
|
|
16
|
-
|
|
17
|
-
from cicada.utils import split_identifier
|
|
18
|
-
|
|
19
14
|
|
|
20
15
|
class KeywordSearcher:
|
|
21
|
-
"""Search for modules and functions by keywords using
|
|
22
|
-
|
|
23
|
-
# Boost multiplier for identifier name matches
|
|
24
|
-
# When query keyword matches the function/module name, multiply the score by this
|
|
25
|
-
IDENTIFIER_MATCH_BOOST = 10.0
|
|
16
|
+
"""Search for modules and functions by keywords using pre-weighted keyword scores."""
|
|
26
17
|
|
|
27
18
|
def __init__(self, index: dict[str, Any]):
|
|
28
19
|
"""
|
|
@@ -32,123 +23,85 @@ class KeywordSearcher:
|
|
|
32
23
|
index: The Cicada index dictionary containing modules and metadata
|
|
33
24
|
"""
|
|
34
25
|
self.index = index
|
|
35
|
-
self.
|
|
26
|
+
self.documents = self._build_document_map()
|
|
36
27
|
|
|
37
|
-
|
|
38
|
-
def _extract_identifier_name(document_info: dict[str, Any]) -> str:
|
|
28
|
+
def _build_document_map(self) -> list[dict[str, Any]]:
|
|
39
29
|
"""
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
For modules: returns the module name
|
|
43
|
-
For functions: returns the function name (without arity)
|
|
44
|
-
|
|
45
|
-
Args:
|
|
46
|
-
document_info: Document information dictionary
|
|
30
|
+
Build a list of searchable documents from the index.
|
|
47
31
|
|
|
48
32
|
Returns:
|
|
49
|
-
|
|
50
|
-
"""
|
|
51
|
-
if document_info["type"] == "module":
|
|
52
|
-
return document_info["name"]
|
|
53
|
-
else:
|
|
54
|
-
return document_info["function"]
|
|
55
|
-
|
|
56
|
-
def _initialize_bm25(self) -> tuple:
|
|
57
|
-
"""
|
|
58
|
-
Initialize BM25 calculator with all documents in the index.
|
|
59
|
-
|
|
60
|
-
Returns:
|
|
61
|
-
Tuple of (BM25Okapi instance, document_map dict)
|
|
62
|
-
- document_map maps document index to (type, module_name, location_info)
|
|
33
|
+
List of document dicts with type, name, module, keywords, etc.
|
|
63
34
|
"""
|
|
64
35
|
documents = []
|
|
65
|
-
document_map = []
|
|
66
36
|
|
|
67
|
-
#
|
|
37
|
+
# Process all modules and their functions
|
|
68
38
|
for module_name, module_data in self.index.get("modules", {}).items():
|
|
69
39
|
# Add module as a document
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
documents.append(
|
|
73
|
-
document_map.append(
|
|
74
|
-
{
|
|
75
|
-
"type": "module",
|
|
76
|
-
"name": module_name,
|
|
77
|
-
"module": module_name,
|
|
78
|
-
"file": module_data["file"],
|
|
79
|
-
"line": module_data["line"],
|
|
80
|
-
"doc": module_data.get("moduledoc"),
|
|
81
|
-
"keywords": module_data["keywords"],
|
|
82
|
-
}
|
|
83
|
-
)
|
|
40
|
+
module_doc = self._create_module_document(module_name, module_data)
|
|
41
|
+
if module_doc:
|
|
42
|
+
documents.append(module_doc)
|
|
84
43
|
|
|
85
44
|
# Add functions as documents
|
|
86
45
|
for func in module_data.get("functions", []):
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
documents.append(
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
"line": module_data["line"],
|
|
118
|
-
"doc": module_data.get("moduledoc"),
|
|
119
|
-
"keywords": module_keywords,
|
|
120
|
-
}
|
|
121
|
-
)
|
|
46
|
+
func_doc = self._create_function_document(module_name, module_data, func)
|
|
47
|
+
if func_doc:
|
|
48
|
+
documents.append(func_doc)
|
|
49
|
+
|
|
50
|
+
return documents
|
|
51
|
+
|
|
52
|
+
def _create_module_document(
|
|
53
|
+
self, module_name: str, module_data: dict[str, Any]
|
|
54
|
+
) -> dict[str, Any] | None:
|
|
55
|
+
"""Create a searchable document for a module."""
|
|
56
|
+
if not module_data.get("keywords"):
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
# Keywords can be either dict {word: score} or list [words]
|
|
60
|
+
# If list, convert to dict with uniform scores
|
|
61
|
+
keywords_dict = module_data["keywords"]
|
|
62
|
+
if isinstance(keywords_dict, list):
|
|
63
|
+
keywords_dict = {kw.lower(): 1.0 for kw in keywords_dict}
|
|
64
|
+
else:
|
|
65
|
+
keywords_dict = {k.lower(): v for k, v in keywords_dict.items()}
|
|
66
|
+
|
|
67
|
+
return {
|
|
68
|
+
"type": "module",
|
|
69
|
+
"name": module_name,
|
|
70
|
+
"module": module_name,
|
|
71
|
+
"file": module_data["file"],
|
|
72
|
+
"line": module_data["line"],
|
|
73
|
+
"doc": module_data.get("moduledoc"),
|
|
74
|
+
"keywords": keywords_dict,
|
|
75
|
+
}
|
|
122
76
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
return bm25, document_map
|
|
77
|
+
def _create_function_document(
|
|
78
|
+
self, module_name: str, module_data: dict[str, Any], func: dict[str, Any]
|
|
79
|
+
) -> dict[str, Any] | None:
|
|
80
|
+
"""Create a searchable document for a function."""
|
|
81
|
+
if not func.get("keywords"):
|
|
82
|
+
return None
|
|
83
|
+
|
|
84
|
+
# Keywords can be either dict {word: score} or list [words]
|
|
85
|
+
# If list, convert to dict with uniform scores
|
|
86
|
+
keywords_dict = func["keywords"]
|
|
87
|
+
if isinstance(keywords_dict, list):
|
|
88
|
+
keywords_dict = {kw.lower(): 1.0 for kw in keywords_dict}
|
|
89
|
+
else:
|
|
90
|
+
keywords_dict = {k.lower(): v for k, v in keywords_dict.items()}
|
|
91
|
+
|
|
92
|
+
full_name = f"{module_name}.{func['name']}/{func['arity']}"
|
|
93
|
+
|
|
94
|
+
return {
|
|
95
|
+
"type": "function",
|
|
96
|
+
"name": full_name,
|
|
97
|
+
"module": module_name,
|
|
98
|
+
"function": func["name"],
|
|
99
|
+
"arity": func["arity"],
|
|
100
|
+
"file": module_data["file"],
|
|
101
|
+
"line": func["line"],
|
|
102
|
+
"doc": func.get("doc"),
|
|
103
|
+
"keywords": keywords_dict,
|
|
104
|
+
}
|
|
152
105
|
|
|
153
106
|
def _match_wildcard(self, pattern: str, text: str) -> bool:
|
|
154
107
|
"""
|
|
@@ -168,105 +121,135 @@ class KeywordSearcher:
|
|
|
168
121
|
return False
|
|
169
122
|
return fnmatch.fnmatch(text.lower(), pattern.lower())
|
|
170
123
|
|
|
171
|
-
def
|
|
172
|
-
self,
|
|
173
|
-
|
|
124
|
+
def _calculate_score(
|
|
125
|
+
self,
|
|
126
|
+
query_keywords: list[str],
|
|
127
|
+
keyword_groups: list[int],
|
|
128
|
+
total_terms: int,
|
|
129
|
+
doc_keywords: dict[str, float],
|
|
130
|
+
) -> dict[str, Any]:
|
|
174
131
|
"""
|
|
175
|
-
|
|
132
|
+
Calculate the search score by summing weights of matched keywords.
|
|
176
133
|
|
|
177
134
|
Args:
|
|
178
|
-
query_keywords:
|
|
179
|
-
|
|
135
|
+
query_keywords: Query keywords (normalized to lowercase)
|
|
136
|
+
doc_keywords: Document keywords with their scores
|
|
180
137
|
|
|
181
138
|
Returns:
|
|
182
|
-
|
|
139
|
+
Dictionary with:
|
|
140
|
+
- score: Sum of matched keyword weights
|
|
141
|
+
- matched_keywords: List of matched keywords
|
|
142
|
+
- confidence: Percentage of query keywords that matched
|
|
183
143
|
"""
|
|
184
144
|
matched_keywords = []
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
145
|
+
matched_groups: set[int] = set()
|
|
146
|
+
total_score = 0.0
|
|
147
|
+
|
|
148
|
+
for query_kw, group_idx in zip(query_keywords, keyword_groups, strict=False):
|
|
149
|
+
if query_kw in doc_keywords:
|
|
150
|
+
matched_keywords.append(query_kw)
|
|
151
|
+
matched_groups.add(group_idx)
|
|
152
|
+
total_score += doc_keywords[query_kw]
|
|
153
|
+
|
|
154
|
+
denominator = total_terms if total_terms else len(query_keywords)
|
|
155
|
+
confidence = (len(matched_groups) / denominator * 100) if denominator else 0
|
|
190
156
|
|
|
191
|
-
|
|
157
|
+
return {
|
|
158
|
+
"score": total_score,
|
|
159
|
+
"matched_keywords": matched_keywords,
|
|
160
|
+
"confidence": round(confidence, 1),
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
def _calculate_wildcard_score(
|
|
192
164
|
self,
|
|
193
165
|
query_keywords: list[str],
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
166
|
+
keyword_groups: list[int],
|
|
167
|
+
total_terms: int,
|
|
168
|
+
doc_keywords: dict[str, float],
|
|
169
|
+
) -> dict[str, Any]:
|
|
197
170
|
"""
|
|
198
|
-
|
|
171
|
+
Calculate the search score using wildcard pattern matching.
|
|
199
172
|
|
|
200
173
|
Args:
|
|
201
|
-
query_keywords:
|
|
202
|
-
|
|
203
|
-
identifier_name: The full identifier name (function/module name)
|
|
174
|
+
query_keywords: Query keywords with potential wildcards (normalized to lowercase)
|
|
175
|
+
doc_keywords: Document keywords with their scores
|
|
204
176
|
|
|
205
177
|
Returns:
|
|
206
|
-
|
|
178
|
+
Dictionary with:
|
|
179
|
+
- score: Sum of matched keyword weights
|
|
180
|
+
- matched_keywords: List of matched query patterns
|
|
181
|
+
- confidence: Percentage of query keywords that matched
|
|
207
182
|
"""
|
|
208
183
|
matched_keywords = []
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
184
|
+
matched_groups: set[int] = set()
|
|
185
|
+
total_score = 0.0
|
|
186
|
+
|
|
187
|
+
for query_kw, group_idx in zip(query_keywords, keyword_groups, strict=False):
|
|
188
|
+
# Find all doc keywords matching this pattern
|
|
189
|
+
for doc_kw, weight in doc_keywords.items():
|
|
212
190
|
if self._match_wildcard(query_kw, doc_kw):
|
|
213
|
-
|
|
214
|
-
|
|
191
|
+
# Add query keyword to matched list (not the doc keyword)
|
|
192
|
+
if query_kw not in matched_keywords:
|
|
193
|
+
matched_keywords.append(query_kw)
|
|
194
|
+
matched_groups.add(group_idx)
|
|
195
|
+
# Add the weight only once per query keyword
|
|
196
|
+
total_score += weight
|
|
197
|
+
break
|
|
215
198
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
matched_keywords.append(query_kw)
|
|
219
|
-
return matched_keywords
|
|
199
|
+
denominator = total_terms if total_terms else len(query_keywords)
|
|
200
|
+
confidence = (len(matched_groups) / denominator * 100) if denominator else 0
|
|
220
201
|
|
|
221
|
-
|
|
202
|
+
return {
|
|
203
|
+
"score": total_score,
|
|
204
|
+
"matched_keywords": matched_keywords,
|
|
205
|
+
"confidence": round(confidence, 1),
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
def _has_wildcards(self, keywords: list[str]) -> bool:
|
|
209
|
+
"""Check if any keywords contain wildcard patterns (* or |)."""
|
|
210
|
+
return any("*" in keyword or "|" in keyword for keyword in keywords)
|
|
211
|
+
|
|
212
|
+
def _expand_or_patterns(self, keywords: list[str]) -> tuple[list[str], list[int]]:
|
|
222
213
|
"""
|
|
223
|
-
|
|
214
|
+
Expand OR patterns (|) in keywords.
|
|
224
215
|
|
|
225
216
|
Args:
|
|
226
|
-
|
|
217
|
+
keywords: List of keywords that may contain | for OR logic
|
|
227
218
|
|
|
228
219
|
Returns:
|
|
229
|
-
|
|
220
|
+
Tuple of:
|
|
221
|
+
- Expanded list of keywords with OR patterns split out
|
|
222
|
+
- Parallel list of group indexes mapping each expanded keyword back to the
|
|
223
|
+
original keyword position. This lets us compute confidence using the
|
|
224
|
+
number of user-supplied keywords rather than the expanded variants.
|
|
225
|
+
|
|
226
|
+
Example:
|
|
227
|
+
["create*|update*", "user"] -> (["create*", "update*", "user"], [0, 0, 1])
|
|
230
228
|
"""
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
for
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
if matched_keywords:
|
|
244
|
-
# Calculate a simple score based on number of matches
|
|
245
|
-
# This is a simplified version of BM25 for wildcard matching
|
|
246
|
-
score = len(matched_keywords) / len(query_keywords)
|
|
247
|
-
scores.append(score)
|
|
248
|
-
else:
|
|
249
|
-
scores.append(0.0)
|
|
250
|
-
|
|
251
|
-
return scores
|
|
252
|
-
|
|
253
|
-
def _has_wildcards(self, keywords: list[str]) -> bool:
|
|
254
|
-
"""Check if any keywords contain wildcard patterns."""
|
|
255
|
-
return any("*" in keyword for keyword in keywords)
|
|
256
|
-
|
|
257
|
-
def search(self, query_keywords: list[str], top_n: int = 5) -> list[dict[str, Any]]:
|
|
229
|
+
expanded: list[str] = []
|
|
230
|
+
groups: list[int] = []
|
|
231
|
+
for idx, keyword in enumerate(keywords):
|
|
232
|
+
parts = [p.strip() for p in keyword.split("|")] if "|" in keyword else [keyword]
|
|
233
|
+
for part in parts:
|
|
234
|
+
expanded.append(part)
|
|
235
|
+
groups.append(idx)
|
|
236
|
+
return expanded, groups
|
|
237
|
+
|
|
238
|
+
def search(
|
|
239
|
+
self, query_keywords: list[str], top_n: int = 5, filter_type: str = "all"
|
|
240
|
+
) -> list[dict[str, Any]]:
|
|
258
241
|
"""
|
|
259
242
|
Search for modules and functions matching the given keywords.
|
|
260
243
|
|
|
261
|
-
Uses
|
|
262
|
-
|
|
263
|
-
when they match query keywords.
|
|
244
|
+
Uses pre-weighted keyword scores calculated during extraction/expansion.
|
|
245
|
+
The score for each result is the sum of weights of matched keywords.
|
|
264
246
|
|
|
265
|
-
Automatically detects wildcard patterns (* supported) in keywords.
|
|
247
|
+
Automatically detects wildcard patterns (* supported) and OR patterns (| supported) in keywords.
|
|
266
248
|
|
|
267
249
|
Args:
|
|
268
|
-
query_keywords: List of keywords to search for
|
|
250
|
+
query_keywords: List of keywords to search for (supports "create*|update*" for OR patterns)
|
|
269
251
|
top_n: Maximum number of results to return
|
|
252
|
+
filter_type: Filter results by type ('all', 'modules', 'functions'). Defaults to 'all'.
|
|
270
253
|
|
|
271
254
|
Returns:
|
|
272
255
|
List of result dictionaries sorted by score (descending), each containing:
|
|
@@ -275,271 +258,74 @@ class KeywordSearcher:
|
|
|
275
258
|
- module: Module name
|
|
276
259
|
- file: File path
|
|
277
260
|
- line: Line number
|
|
278
|
-
- score:
|
|
261
|
+
- score: Sum of matched keyword weights (float)
|
|
279
262
|
- confidence: Percentage of query keywords matched
|
|
280
263
|
- matched_keywords: List of matched keywords
|
|
281
264
|
- doc: Documentation string (if available)
|
|
282
265
|
"""
|
|
283
|
-
if not query_keywords or
|
|
266
|
+
if not query_keywords or not self.documents:
|
|
284
267
|
return []
|
|
285
268
|
|
|
286
269
|
# Normalize query keywords to lowercase
|
|
287
270
|
query_keywords_lower = [kw.lower() for kw in query_keywords]
|
|
288
271
|
|
|
289
|
-
#
|
|
290
|
-
|
|
272
|
+
# Expand OR patterns (e.g., "create*|update*" -> ["create*", "update*"])
|
|
273
|
+
query_keywords_expanded, keyword_groups = self._expand_or_patterns(query_keywords_lower)
|
|
291
274
|
|
|
292
|
-
#
|
|
293
|
-
|
|
294
|
-
# For wildcard matching, we need to manually score documents
|
|
295
|
-
bm25_scores = self._get_wildcard_scores(query_keywords_lower)
|
|
296
|
-
else:
|
|
297
|
-
bm25_scores = self.bm25.get_scores(query_keywords_lower)
|
|
275
|
+
# Check if wildcards are present
|
|
276
|
+
enable_wildcards = self._has_wildcards(query_keywords_expanded)
|
|
298
277
|
|
|
299
278
|
results = []
|
|
300
279
|
|
|
301
|
-
#
|
|
302
|
-
for
|
|
303
|
-
#
|
|
304
|
-
# We check matched keywords instead to filter relevance
|
|
305
|
-
doc_info = self.document_map[doc_idx]
|
|
306
|
-
|
|
307
|
-
# Check if there are any matching keywords first
|
|
280
|
+
# Search all documents
|
|
281
|
+
for doc in self.documents:
|
|
282
|
+
# Calculate score
|
|
308
283
|
if enable_wildcards:
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
284
|
+
result_data = self._calculate_wildcard_score(
|
|
285
|
+
query_keywords_expanded,
|
|
286
|
+
keyword_groups,
|
|
287
|
+
len(query_keywords_lower),
|
|
288
|
+
doc["keywords"],
|
|
312
289
|
)
|
|
313
290
|
else:
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
if enable_wildcards:
|
|
320
|
-
final_score = self._apply_identifier_boost_wildcard(
|
|
321
|
-
bm25_score, query_keywords_lower, doc_info
|
|
322
|
-
)
|
|
323
|
-
else:
|
|
324
|
-
final_score = self._apply_identifier_boost(
|
|
325
|
-
bm25_score, query_keywords_lower, doc_info
|
|
326
|
-
)
|
|
327
|
-
|
|
328
|
-
# Apply name coverage penalty (penalize functions with extra words not in query)
|
|
329
|
-
coverage_penalty = self._calculate_name_coverage_penalty(
|
|
330
|
-
query_keywords_lower, doc_info
|
|
291
|
+
result_data = self._calculate_score(
|
|
292
|
+
query_keywords_expanded,
|
|
293
|
+
keyword_groups,
|
|
294
|
+
len(query_keywords_lower),
|
|
295
|
+
doc["keywords"],
|
|
331
296
|
)
|
|
332
|
-
# For negative scores, divide by penalty instead of multiply
|
|
333
|
-
# This ensures penalty always makes the score worse regardless of sign
|
|
334
|
-
if final_score < 0 and coverage_penalty < 1.0:
|
|
335
|
-
final_score = final_score / coverage_penalty
|
|
336
|
-
else:
|
|
337
|
-
final_score = final_score * coverage_penalty
|
|
338
297
|
|
|
298
|
+
# Only include results with at least one matched keyword
|
|
299
|
+
if result_data["score"] > 0:
|
|
339
300
|
result = {
|
|
340
|
-
"type":
|
|
341
|
-
"name":
|
|
342
|
-
"module":
|
|
343
|
-
"file":
|
|
344
|
-
"line":
|
|
345
|
-
"score": round(
|
|
346
|
-
"confidence":
|
|
347
|
-
"matched_keywords":
|
|
301
|
+
"type": doc["type"],
|
|
302
|
+
"name": doc["name"],
|
|
303
|
+
"module": doc["module"],
|
|
304
|
+
"file": doc["file"],
|
|
305
|
+
"line": doc["line"],
|
|
306
|
+
"score": round(result_data["score"], 4),
|
|
307
|
+
"confidence": result_data["confidence"],
|
|
308
|
+
"matched_keywords": result_data["matched_keywords"],
|
|
348
309
|
}
|
|
349
310
|
|
|
350
311
|
# Add type-specific fields
|
|
351
|
-
if
|
|
352
|
-
result["function"] =
|
|
353
|
-
result["arity"] =
|
|
312
|
+
if doc["type"] == "function":
|
|
313
|
+
result["function"] = doc["function"]
|
|
314
|
+
result["arity"] = doc["arity"]
|
|
354
315
|
|
|
355
316
|
# Add documentation if available
|
|
356
|
-
if
|
|
357
|
-
result["doc"] =
|
|
317
|
+
if doc.get("doc"):
|
|
318
|
+
result["doc"] = doc["doc"]
|
|
358
319
|
|
|
359
320
|
results.append(result)
|
|
360
321
|
|
|
361
|
-
#
|
|
322
|
+
# Apply type filter
|
|
323
|
+
if filter_type == "modules":
|
|
324
|
+
results = [r for r in results if r["type"] == "module"]
|
|
325
|
+
elif filter_type == "functions":
|
|
326
|
+
results = [r for r in results if r["type"] == "function"]
|
|
327
|
+
|
|
328
|
+
# Sort by score (descending), then by name for stable results
|
|
362
329
|
results.sort(key=lambda x: (-x["score"], x["name"]))
|
|
363
330
|
|
|
364
331
|
return results[:top_n]
|
|
365
|
-
|
|
366
|
-
def _apply_identifier_boost(
|
|
367
|
-
self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
|
|
368
|
-
) -> float:
|
|
369
|
-
"""
|
|
370
|
-
Apply boost to BM25 score if query keywords match the identifier name.
|
|
371
|
-
|
|
372
|
-
Identifier names (function/module names) are given much higher weight.
|
|
373
|
-
If any query keyword matches a word in the identifier name, the score
|
|
374
|
-
is multiplied by IDENTIFIER_MATCH_BOOST.
|
|
375
|
-
|
|
376
|
-
Args:
|
|
377
|
-
bm25_score: The original BM25 score
|
|
378
|
-
query_keywords: Normalized query keywords (lowercase)
|
|
379
|
-
doc_info: Document information with function/module name
|
|
380
|
-
|
|
381
|
-
Returns:
|
|
382
|
-
Final score (boosted if identifier matches)
|
|
383
|
-
"""
|
|
384
|
-
# Extract the identifier name (module name or function name)
|
|
385
|
-
identifier_name = self._extract_identifier_name(doc_info)
|
|
386
|
-
|
|
387
|
-
# Split identifier into words
|
|
388
|
-
identifier_words = split_identifier(identifier_name)
|
|
389
|
-
|
|
390
|
-
# Check if any query keyword matches any word in the identifier
|
|
391
|
-
for query_kw in query_keywords:
|
|
392
|
-
if query_kw in identifier_words:
|
|
393
|
-
# Apply significant boost for identifier match
|
|
394
|
-
return bm25_score * self.IDENTIFIER_MATCH_BOOST
|
|
395
|
-
|
|
396
|
-
return bm25_score
|
|
397
|
-
|
|
398
|
-
def _count_matches(self, query_keywords: list[str], item_keywords: list[str]) -> dict[str, Any]:
|
|
399
|
-
"""
|
|
400
|
-
Count matching keywords between query and item.
|
|
401
|
-
|
|
402
|
-
Args:
|
|
403
|
-
query_keywords: Query keywords (normalized to lowercase)
|
|
404
|
-
item_keywords: Keywords from module/function
|
|
405
|
-
|
|
406
|
-
Returns:
|
|
407
|
-
Dictionary with:
|
|
408
|
-
- score: Number of matching keywords
|
|
409
|
-
- confidence: Percentage match (score / len(query_keywords) * 100)
|
|
410
|
-
- matched_keywords: List of matched keywords
|
|
411
|
-
"""
|
|
412
|
-
# Normalize item keywords to lowercase
|
|
413
|
-
item_keywords_lower = [kw.lower() for kw in item_keywords]
|
|
414
|
-
|
|
415
|
-
# Find matches
|
|
416
|
-
matched_keywords = []
|
|
417
|
-
for query_kw in query_keywords:
|
|
418
|
-
if query_kw in item_keywords_lower:
|
|
419
|
-
matched_keywords.append(query_kw)
|
|
420
|
-
|
|
421
|
-
score = len(matched_keywords)
|
|
422
|
-
confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
|
|
423
|
-
|
|
424
|
-
return {
|
|
425
|
-
"score": score,
|
|
426
|
-
"confidence": round(confidence, 1),
|
|
427
|
-
"matched_keywords": matched_keywords,
|
|
428
|
-
}
|
|
429
|
-
|
|
430
|
-
def _count_wildcard_matches(
|
|
431
|
-
self,
|
|
432
|
-
query_keywords: list[str],
|
|
433
|
-
item_keywords: list[str],
|
|
434
|
-
identifier_name: str | None = None,
|
|
435
|
-
) -> dict[str, Any]:
|
|
436
|
-
"""
|
|
437
|
-
Count matching keywords between query and item using wildcard patterns.
|
|
438
|
-
|
|
439
|
-
Args:
|
|
440
|
-
query_keywords: Query keywords (may contain wildcards, normalized to lowercase)
|
|
441
|
-
item_keywords: Keywords from module/function
|
|
442
|
-
identifier_name: The full identifier name (function/module name)
|
|
443
|
-
|
|
444
|
-
Returns:
|
|
445
|
-
Dictionary with:
|
|
446
|
-
- score: Number of matching keywords
|
|
447
|
-
- confidence: Percentage match (score / len(query_keywords) * 100)
|
|
448
|
-
- matched_keywords: List of matched keywords
|
|
449
|
-
"""
|
|
450
|
-
# Normalize item keywords to lowercase
|
|
451
|
-
item_keywords_lower = [kw.lower() for kw in item_keywords]
|
|
452
|
-
|
|
453
|
-
# Find matches using wildcard patterns
|
|
454
|
-
if identifier_name:
|
|
455
|
-
matched_keywords = self._expand_wildcard_keywords_with_identifier(
|
|
456
|
-
query_keywords, item_keywords_lower, identifier_name
|
|
457
|
-
)
|
|
458
|
-
else:
|
|
459
|
-
matched_keywords = self._expand_wildcard_keywords(query_keywords, item_keywords_lower)
|
|
460
|
-
|
|
461
|
-
score = len(matched_keywords)
|
|
462
|
-
confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
|
|
463
|
-
|
|
464
|
-
return {
|
|
465
|
-
"score": score,
|
|
466
|
-
"confidence": round(confidence, 1),
|
|
467
|
-
"matched_keywords": matched_keywords,
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
def _apply_identifier_boost_wildcard(
|
|
471
|
-
self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
|
|
472
|
-
) -> float:
|
|
473
|
-
"""
|
|
474
|
-
Apply boost to BM25 score if query keywords match the identifier name using wildcards.
|
|
475
|
-
|
|
476
|
-
Identifier names (function/module names) are given much higher weight.
|
|
477
|
-
If any query keyword matches a word in the identifier name using wildcard patterns,
|
|
478
|
-
the score is multiplied by IDENTIFIER_MATCH_BOOST.
|
|
479
|
-
|
|
480
|
-
Args:
|
|
481
|
-
bm25_score: The original BM25 score
|
|
482
|
-
query_keywords: Query keywords (may contain wildcards, normalized to lowercase)
|
|
483
|
-
doc_info: Document information with function/module name
|
|
484
|
-
|
|
485
|
-
Returns:
|
|
486
|
-
Final score (boosted if identifier matches)
|
|
487
|
-
"""
|
|
488
|
-
# Extract the identifier name (module name or function name)
|
|
489
|
-
identifier_name = self._extract_identifier_name(doc_info)
|
|
490
|
-
|
|
491
|
-
# Split identifier into words
|
|
492
|
-
identifier_words = split_identifier(identifier_name)
|
|
493
|
-
|
|
494
|
-
# Check if any query keyword matches any word in the identifier using wildcards
|
|
495
|
-
for query_kw in query_keywords:
|
|
496
|
-
for identifier_word in identifier_words:
|
|
497
|
-
if self._match_wildcard(query_kw, identifier_word):
|
|
498
|
-
# Apply significant boost for identifier match
|
|
499
|
-
return bm25_score * self.IDENTIFIER_MATCH_BOOST
|
|
500
|
-
|
|
501
|
-
return bm25_score
|
|
502
|
-
|
|
503
|
-
def _calculate_name_coverage_penalty(
|
|
504
|
-
self, query_keywords: list[str], doc_info: dict[str, Any]
|
|
505
|
-
) -> float:
|
|
506
|
-
"""
|
|
507
|
-
Calculate penalty for functions whose names contain words NOT in the query.
|
|
508
|
-
|
|
509
|
-
This helps rank exact matches higher than functions with extra words in their names.
|
|
510
|
-
For example, searching "create user" should rank "create_user" higher than
|
|
511
|
-
"create_invalid_user" because "invalid" is not in the query.
|
|
512
|
-
|
|
513
|
-
Args:
|
|
514
|
-
query_keywords: Normalized query keywords (lowercase)
|
|
515
|
-
doc_info: Document information with function/module name
|
|
516
|
-
|
|
517
|
-
Returns:
|
|
518
|
-
Penalty multiplier between 0.1 and 1.0:
|
|
519
|
-
- 1.0 = no penalty (exact match or all extra words in query)
|
|
520
|
-
- 0.7 = 1 extra word not in query (30% penalty)
|
|
521
|
-
- 0.4 = 2 extra words not in query (60% penalty)
|
|
522
|
-
- 0.1 = 3+ extra words not in query (90% penalty cap)
|
|
523
|
-
"""
|
|
524
|
-
# Only apply to functions (not modules)
|
|
525
|
-
if doc_info["type"] != "function":
|
|
526
|
-
return 1.0
|
|
527
|
-
|
|
528
|
-
# Get function name and split it
|
|
529
|
-
func_name = doc_info["function"]
|
|
530
|
-
func_words = set(split_identifier(func_name))
|
|
531
|
-
|
|
532
|
-
# Find words in function name that are NOT in query
|
|
533
|
-
query_set = set(query_keywords)
|
|
534
|
-
extra_words = func_words - query_set
|
|
535
|
-
|
|
536
|
-
# No penalty if all function name words are in query (exact match)
|
|
537
|
-
if not extra_words:
|
|
538
|
-
return 1.0
|
|
539
|
-
|
|
540
|
-
# Apply 30% penalty per extra word, with a floor of 0.1 (max 90% penalty)
|
|
541
|
-
# This penalty is strong enough to overcome BM25 length normalization bias
|
|
542
|
-
penalty_per_word = 0.3
|
|
543
|
-
total_penalty = min(len(extra_words) * penalty_per_word, 0.9)
|
|
544
|
-
|
|
545
|
-
return 1.0 - total_penalty
|