cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. cicada/_version_hash.py +4 -0
  2. cicada/cli.py +6 -748
  3. cicada/commands.py +1255 -0
  4. cicada/dead_code/__init__.py +1 -0
  5. cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
  6. cicada/dependency_analyzer.py +147 -0
  7. cicada/entry_utils.py +92 -0
  8. cicada/extractors/base.py +9 -9
  9. cicada/extractors/call.py +17 -20
  10. cicada/extractors/common.py +64 -0
  11. cicada/extractors/dependency.py +117 -235
  12. cicada/extractors/doc.py +2 -49
  13. cicada/extractors/function.py +10 -14
  14. cicada/extractors/keybert.py +228 -0
  15. cicada/extractors/keyword.py +191 -0
  16. cicada/extractors/module.py +6 -10
  17. cicada/extractors/spec.py +8 -56
  18. cicada/format/__init__.py +20 -0
  19. cicada/{ascii_art.py → format/ascii_art.py} +1 -1
  20. cicada/format/formatter.py +1145 -0
  21. cicada/git_helper.py +134 -7
  22. cicada/indexer.py +322 -89
  23. cicada/interactive_setup.py +251 -323
  24. cicada/interactive_setup_helpers.py +302 -0
  25. cicada/keyword_expander.py +437 -0
  26. cicada/keyword_search.py +208 -422
  27. cicada/keyword_test.py +383 -16
  28. cicada/mcp/__init__.py +10 -0
  29. cicada/mcp/entry.py +17 -0
  30. cicada/mcp/filter_utils.py +107 -0
  31. cicada/mcp/pattern_utils.py +118 -0
  32. cicada/{mcp_server.py → mcp/server.py} +819 -73
  33. cicada/mcp/tools.py +473 -0
  34. cicada/pr_finder.py +2 -3
  35. cicada/pr_indexer/indexer.py +3 -2
  36. cicada/setup.py +167 -35
  37. cicada/tier.py +225 -0
  38. cicada/utils/__init__.py +9 -2
  39. cicada/utils/fuzzy_match.py +54 -0
  40. cicada/utils/index_utils.py +9 -0
  41. cicada/utils/path_utils.py +18 -0
  42. cicada/utils/text_utils.py +52 -1
  43. cicada/utils/tree_utils.py +47 -0
  44. cicada/version_check.py +99 -0
  45. cicada/watch_manager.py +320 -0
  46. cicada/watcher.py +431 -0
  47. cicada_mcp-0.3.0.dist-info/METADATA +541 -0
  48. cicada_mcp-0.3.0.dist-info/RECORD +70 -0
  49. cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
  50. cicada/formatter.py +0 -864
  51. cicada/keybert_extractor.py +0 -286
  52. cicada/lightweight_keyword_extractor.py +0 -290
  53. cicada/mcp_entry.py +0 -683
  54. cicada/mcp_tools.py +0 -291
  55. cicada_mcp-0.2.0.dist-info/METADATA +0 -735
  56. cicada_mcp-0.2.0.dist-info/RECORD +0 -53
  57. cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
  58. /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
  59. /cicada/{colors.py → format/colors.py} +0 -0
  60. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
  61. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
  62. {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
cicada/keyword_search.py CHANGED
@@ -1,28 +1,19 @@
1
1
  """
2
2
  Keyword-based search for modules and functions.
3
3
 
4
- Provides semantic search capabilities by matching query keywords
5
- against extracted keywords in the index using BM25 ranking.
4
+ Provides simple keyword search by summing weights of matched keywords.
5
+ Keywords are pre-weighted during extraction/expansion based on frequency,
6
+ code identifier presence, semantic similarity, etc.
6
7
 
7
- Identifier names (function/module names) are given much higher weight than keywords.
8
-
9
- Author: Cursor(Auto)
8
+ Author: Cicada Team
10
9
  """
11
10
 
12
11
  import fnmatch
13
12
  from typing import Any
14
13
 
15
- from rank_bm25 import BM25Okapi
16
-
17
- from cicada.utils import split_identifier
18
-
19
14
 
20
15
  class KeywordSearcher:
21
- """Search for modules and functions by keywords using BM25 ranking."""
22
-
23
- # Boost multiplier for identifier name matches
24
- # When query keyword matches the function/module name, multiply the score by this
25
- IDENTIFIER_MATCH_BOOST = 10.0
16
+ """Search for modules and functions by keywords using pre-weighted keyword scores."""
26
17
 
27
18
  def __init__(self, index: dict[str, Any]):
28
19
  """
@@ -32,123 +23,85 @@ class KeywordSearcher:
32
23
  index: The Cicada index dictionary containing modules and metadata
33
24
  """
34
25
  self.index = index
35
- self.bm25, self.document_map = self._initialize_bm25()
26
+ self.documents = self._build_document_map()
36
27
 
37
- @staticmethod
38
- def _extract_identifier_name(document_info: dict[str, Any]) -> str:
28
+ def _build_document_map(self) -> list[dict[str, Any]]:
39
29
  """
40
- Extract the core identifier name from document info.
41
-
42
- For modules: returns the module name
43
- For functions: returns the function name (without arity)
44
-
45
- Args:
46
- document_info: Document information dictionary
30
+ Build a list of searchable documents from the index.
47
31
 
48
32
  Returns:
49
- The identifier name
50
- """
51
- if document_info["type"] == "module":
52
- return document_info["name"]
53
- else:
54
- return document_info["function"]
55
-
56
- def _initialize_bm25(self) -> tuple:
57
- """
58
- Initialize BM25 calculator with all documents in the index.
59
-
60
- Returns:
61
- Tuple of (BM25Okapi instance, document_map dict)
62
- - document_map maps document index to (type, module_name, location_info)
33
+ List of document dicts with type, name, module, keywords, etc.
63
34
  """
64
35
  documents = []
65
- document_map = []
66
36
 
67
- # Collect all documents (modules and functions with keywords)
37
+ # Process all modules and their functions
68
38
  for module_name, module_data in self.index.get("modules", {}).items():
69
39
  # Add module as a document
70
- if module_data.get("keywords"):
71
- doc_keywords = [kw.lower() for kw in module_data["keywords"]]
72
- documents.append(doc_keywords)
73
- document_map.append(
74
- {
75
- "type": "module",
76
- "name": module_name,
77
- "module": module_name,
78
- "file": module_data["file"],
79
- "line": module_data["line"],
80
- "doc": module_data.get("moduledoc"),
81
- "keywords": module_data["keywords"],
82
- }
83
- )
40
+ module_doc = self._create_module_document(module_name, module_data)
41
+ if module_doc:
42
+ documents.append(module_doc)
84
43
 
85
44
  # Add functions as documents
86
45
  for func in module_data.get("functions", []):
87
- if func.get("keywords"):
88
- doc_keywords = [kw.lower() for kw in func["keywords"]]
89
- documents.append(doc_keywords)
90
- full_name = f"{module_name}.{func['name']}/{func['arity']}"
91
- document_map.append(
92
- {
93
- "type": "function",
94
- "name": full_name,
95
- "module": module_name,
96
- "function": func["name"],
97
- "arity": func["arity"],
98
- "file": module_data["file"],
99
- "line": func["line"],
100
- "doc": func.get("doc"),
101
- "keywords": func["keywords"],
102
- }
103
- )
104
-
105
- # If no documents were created (no keywords extracted), create documents using identifier names
106
- if not documents:
107
- for module_name, module_data in self.index.get("modules", {}).items():
108
- # Add module as a document using its name as keywords
109
- module_keywords = split_identifier(module_name)
110
- documents.append(module_keywords)
111
- document_map.append(
112
- {
113
- "type": "module",
114
- "name": module_name,
115
- "module": module_name,
116
- "file": module_data["file"],
117
- "line": module_data["line"],
118
- "doc": module_data.get("moduledoc"),
119
- "keywords": module_keywords,
120
- }
121
- )
46
+ func_doc = self._create_function_document(module_name, module_data, func)
47
+ if func_doc:
48
+ documents.append(func_doc)
49
+
50
+ return documents
51
+
52
+ def _create_module_document(
53
+ self, module_name: str, module_data: dict[str, Any]
54
+ ) -> dict[str, Any] | None:
55
+ """Create a searchable document for a module."""
56
+ if not module_data.get("keywords"):
57
+ return None
58
+
59
+ # Keywords can be either dict {word: score} or list [words]
60
+ # If list, convert to dict with uniform scores
61
+ keywords_dict = module_data["keywords"]
62
+ if isinstance(keywords_dict, list):
63
+ keywords_dict = {kw.lower(): 1.0 for kw in keywords_dict}
64
+ else:
65
+ keywords_dict = {k.lower(): v for k, v in keywords_dict.items()}
66
+
67
+ return {
68
+ "type": "module",
69
+ "name": module_name,
70
+ "module": module_name,
71
+ "file": module_data["file"],
72
+ "line": module_data["line"],
73
+ "doc": module_data.get("moduledoc"),
74
+ "keywords": keywords_dict,
75
+ }
122
76
 
123
- # Add functions as documents
124
- for func in module_data.get("functions", []):
125
- # Use extracted keywords if available, otherwise fall back to split identifier
126
- if func.get("keywords"):
127
- func_keywords = [kw.lower() for kw in func["keywords"]]
128
- else:
129
- func_keywords = split_identifier(func["name"])
130
-
131
- documents.append(func_keywords)
132
- full_name = f"{module_name}.{func['name']}/{func['arity']}"
133
- document_map.append(
134
- {
135
- "type": "function",
136
- "name": full_name,
137
- "module": module_name,
138
- "function": func["name"],
139
- "arity": func["arity"],
140
- "file": module_data["file"],
141
- "line": func["line"],
142
- "doc": func.get("doc"),
143
- "keywords": func_keywords,
144
- }
145
- )
146
-
147
- # Initialize BM25 with all documents
148
- # Use b=0.4 (lower than default 0.75) to reduce length normalization penalty
149
- # This is appropriate for code search where longer names are more specific, not verbose
150
- bm25 = BM25Okapi(documents, b=0.4) if documents else None
151
- return bm25, document_map
77
+ def _create_function_document(
78
+ self, module_name: str, module_data: dict[str, Any], func: dict[str, Any]
79
+ ) -> dict[str, Any] | None:
80
+ """Create a searchable document for a function."""
81
+ if not func.get("keywords"):
82
+ return None
83
+
84
+ # Keywords can be either dict {word: score} or list [words]
85
+ # If list, convert to dict with uniform scores
86
+ keywords_dict = func["keywords"]
87
+ if isinstance(keywords_dict, list):
88
+ keywords_dict = {kw.lower(): 1.0 for kw in keywords_dict}
89
+ else:
90
+ keywords_dict = {k.lower(): v for k, v in keywords_dict.items()}
91
+
92
+ full_name = f"{module_name}.{func['name']}/{func['arity']}"
93
+
94
+ return {
95
+ "type": "function",
96
+ "name": full_name,
97
+ "module": module_name,
98
+ "function": func["name"],
99
+ "arity": func["arity"],
100
+ "file": module_data["file"],
101
+ "line": func["line"],
102
+ "doc": func.get("doc"),
103
+ "keywords": keywords_dict,
104
+ }
152
105
 
153
106
  def _match_wildcard(self, pattern: str, text: str) -> bool:
154
107
  """
@@ -168,105 +121,135 @@ class KeywordSearcher:
168
121
  return False
169
122
  return fnmatch.fnmatch(text.lower(), pattern.lower())
170
123
 
171
- def _expand_wildcard_keywords(
172
- self, query_keywords: list[str], document_keywords: list[str]
173
- ) -> list[str]:
124
+ def _calculate_score(
125
+ self,
126
+ query_keywords: list[str],
127
+ keyword_groups: list[int],
128
+ total_terms: int,
129
+ doc_keywords: dict[str, float],
130
+ ) -> dict[str, Any]:
174
131
  """
175
- Expand wildcard patterns to actual matching keywords from the document.
132
+ Calculate the search score by summing weights of matched keywords.
176
133
 
177
134
  Args:
178
- query_keywords: List of query keywords (may contain wildcards)
179
- document_keywords: List of keywords from a document
135
+ query_keywords: Query keywords (normalized to lowercase)
136
+ doc_keywords: Document keywords with their scores
180
137
 
181
138
  Returns:
182
- List of actual matching keywords found in the document
139
+ Dictionary with:
140
+ - score: Sum of matched keyword weights
141
+ - matched_keywords: List of matched keywords
142
+ - confidence: Percentage of query keywords that matched
183
143
  """
184
144
  matched_keywords = []
185
- for query_kw in query_keywords:
186
- for doc_kw in document_keywords:
187
- if self._match_wildcard(query_kw, doc_kw):
188
- matched_keywords.append(query_kw)
189
- return matched_keywords
145
+ matched_groups: set[int] = set()
146
+ total_score = 0.0
147
+
148
+ for query_kw, group_idx in zip(query_keywords, keyword_groups, strict=False):
149
+ if query_kw in doc_keywords:
150
+ matched_keywords.append(query_kw)
151
+ matched_groups.add(group_idx)
152
+ total_score += doc_keywords[query_kw]
153
+
154
+ denominator = total_terms if total_terms else len(query_keywords)
155
+ confidence = (len(matched_groups) / denominator * 100) if denominator else 0
190
156
 
191
- def _expand_wildcard_keywords_with_identifier(
157
+ return {
158
+ "score": total_score,
159
+ "matched_keywords": matched_keywords,
160
+ "confidence": round(confidence, 1),
161
+ }
162
+
163
+ def _calculate_wildcard_score(
192
164
  self,
193
165
  query_keywords: list[str],
194
- document_keywords: list[str],
195
- identifier_name: str,
196
- ) -> list[str]:
166
+ keyword_groups: list[int],
167
+ total_terms: int,
168
+ doc_keywords: dict[str, float],
169
+ ) -> dict[str, Any]:
197
170
  """
198
- Expand wildcard patterns to actual matching keywords from the document and identifier name.
171
+ Calculate the search score using wildcard pattern matching.
199
172
 
200
173
  Args:
201
- query_keywords: List of query keywords (may contain wildcards)
202
- document_keywords: List of keywords from a document
203
- identifier_name: The full identifier name (function/module name)
174
+ query_keywords: Query keywords with potential wildcards (normalized to lowercase)
175
+ doc_keywords: Document keywords with their scores
204
176
 
205
177
  Returns:
206
- List of actual matching keywords found in the document or identifier
178
+ Dictionary with:
179
+ - score: Sum of matched keyword weights
180
+ - matched_keywords: List of matched query patterns
181
+ - confidence: Percentage of query keywords that matched
207
182
  """
208
183
  matched_keywords = []
209
- for query_kw in query_keywords:
210
- # Check against individual keywords
211
- for doc_kw in document_keywords:
184
+ matched_groups: set[int] = set()
185
+ total_score = 0.0
186
+
187
+ for query_kw, group_idx in zip(query_keywords, keyword_groups, strict=False):
188
+ # Find all doc keywords matching this pattern
189
+ for doc_kw, weight in doc_keywords.items():
212
190
  if self._match_wildcard(query_kw, doc_kw):
213
- matched_keywords.append(query_kw)
214
- break # Only add each query keyword once
191
+ # Add query keyword to matched list (not the doc keyword)
192
+ if query_kw not in matched_keywords:
193
+ matched_keywords.append(query_kw)
194
+ matched_groups.add(group_idx)
195
+ # Add the weight only once per query keyword
196
+ total_score += weight
197
+ break
215
198
 
216
- # Also check against the full identifier name
217
- if query_kw not in matched_keywords and self._match_wildcard(query_kw, identifier_name):
218
- matched_keywords.append(query_kw)
219
- return matched_keywords
199
+ denominator = total_terms if total_terms else len(query_keywords)
200
+ confidence = (len(matched_groups) / denominator * 100) if denominator else 0
220
201
 
221
- def _get_wildcard_scores(self, query_keywords: list[str]) -> list[float]:
202
+ return {
203
+ "score": total_score,
204
+ "matched_keywords": matched_keywords,
205
+ "confidence": round(confidence, 1),
206
+ }
207
+
208
+ def _has_wildcards(self, keywords: list[str]) -> bool:
209
+ """Check if any keywords contain wildcard patterns (* or |)."""
210
+ return any("*" in keyword or "|" in keyword for keyword in keywords)
211
+
212
+ def _expand_or_patterns(self, keywords: list[str]) -> tuple[list[str], list[int]]:
222
213
  """
223
- Calculate BM25-like scores for wildcard matching.
214
+ Expand OR patterns (|) in keywords.
224
215
 
225
216
  Args:
226
- query_keywords: List of query keywords (may contain wildcards)
217
+ keywords: List of keywords that may contain | for OR logic
227
218
 
228
219
  Returns:
229
- List of scores for each document
220
+ Tuple of:
221
+ - Expanded list of keywords with OR patterns split out
222
+ - Parallel list of group indexes mapping each expanded keyword back to the
223
+ original keyword position. This lets us compute confidence using the
224
+ number of user-supplied keywords rather than the expanded variants.
225
+
226
+ Example:
227
+ ["create*|update*", "user"] -> (["create*", "update*", "user"], [0, 0, 1])
230
228
  """
231
- scores = []
232
-
233
- for _, doc_info in enumerate(self.document_map):
234
- doc_keywords = [kw.lower() for kw in doc_info["keywords"]]
235
- identifier_name = self._extract_identifier_name(doc_info)
236
-
237
- # Find matching keywords using wildcard patterns
238
- # Check both individual keywords and full identifier name
239
- matched_keywords = self._expand_wildcard_keywords_with_identifier(
240
- query_keywords, doc_keywords, identifier_name
241
- )
242
-
243
- if matched_keywords:
244
- # Calculate a simple score based on number of matches
245
- # This is a simplified version of BM25 for wildcard matching
246
- score = len(matched_keywords) / len(query_keywords)
247
- scores.append(score)
248
- else:
249
- scores.append(0.0)
250
-
251
- return scores
252
-
253
- def _has_wildcards(self, keywords: list[str]) -> bool:
254
- """Check if any keywords contain wildcard patterns."""
255
- return any("*" in keyword for keyword in keywords)
256
-
257
- def search(self, query_keywords: list[str], top_n: int = 5) -> list[dict[str, Any]]:
229
+ expanded: list[str] = []
230
+ groups: list[int] = []
231
+ for idx, keyword in enumerate(keywords):
232
+ parts = [p.strip() for p in keyword.split("|")] if "|" in keyword else [keyword]
233
+ for part in parts:
234
+ expanded.append(part)
235
+ groups.append(idx)
236
+ return expanded, groups
237
+
238
+ def search(
239
+ self, query_keywords: list[str], top_n: int = 5, filter_type: str = "all"
240
+ ) -> list[dict[str, Any]]:
258
241
  """
259
242
  Search for modules and functions matching the given keywords.
260
243
 
261
- Uses BM25 ranking to score documents based on keyword relevance.
262
- Identifier names (function/module names) are boosted significantly
263
- when they match query keywords.
244
+ Uses pre-weighted keyword scores calculated during extraction/expansion.
245
+ The score for each result is the sum of weights of matched keywords.
264
246
 
265
- Automatically detects wildcard patterns (* supported) in keywords.
247
+ Automatically detects wildcard patterns (* supported) and OR patterns (| supported) in keywords.
266
248
 
267
249
  Args:
268
- query_keywords: List of keywords to search for
250
+ query_keywords: List of keywords to search for (supports "create*|update*" for OR patterns)
269
251
  top_n: Maximum number of results to return
252
+ filter_type: Filter results by type ('all', 'modules', 'functions'). Defaults to 'all'.
270
253
 
271
254
  Returns:
272
255
  List of result dictionaries sorted by score (descending), each containing:
@@ -275,271 +258,74 @@ class KeywordSearcher:
275
258
  - module: Module name
276
259
  - file: File path
277
260
  - line: Line number
278
- - score: BM25 score (float), boosted if identifier matches query
261
+ - score: Sum of matched keyword weights (float)
279
262
  - confidence: Percentage of query keywords matched
280
263
  - matched_keywords: List of matched keywords
281
264
  - doc: Documentation string (if available)
282
265
  """
283
- if not query_keywords or self.bm25 is None or not self.document_map:
266
+ if not query_keywords or not self.documents:
284
267
  return []
285
268
 
286
269
  # Normalize query keywords to lowercase
287
270
  query_keywords_lower = [kw.lower() for kw in query_keywords]
288
271
 
289
- # Check if wildcards are present
290
- enable_wildcards = self._has_wildcards(query_keywords_lower)
272
+ # Expand OR patterns (e.g., "create*|update*" -> ["create*", "update*"])
273
+ query_keywords_expanded, keyword_groups = self._expand_or_patterns(query_keywords_lower)
291
274
 
292
- # Get BM25 scores for all documents
293
- if enable_wildcards:
294
- # For wildcard matching, we need to manually score documents
295
- bm25_scores = self._get_wildcard_scores(query_keywords_lower)
296
- else:
297
- bm25_scores = self.bm25.get_scores(query_keywords_lower)
275
+ # Check if wildcards are present
276
+ enable_wildcards = self._has_wildcards(query_keywords_expanded)
298
277
 
299
278
  results = []
300
279
 
301
- # Build results with scores
302
- for doc_idx, bm25_score in enumerate(bm25_scores):
303
- # BM25 can produce negative scores for small corpuses
304
- # We check matched keywords instead to filter relevance
305
- doc_info = self.document_map[doc_idx]
306
-
307
- # Check if there are any matching keywords first
280
+ # Search all documents
281
+ for doc in self.documents:
282
+ # Calculate score
308
283
  if enable_wildcards:
309
- identifier_name = self._extract_identifier_name(doc_info)
310
- matched = self._count_wildcard_matches(
311
- query_keywords_lower, doc_info["keywords"], identifier_name
284
+ result_data = self._calculate_wildcard_score(
285
+ query_keywords_expanded,
286
+ keyword_groups,
287
+ len(query_keywords_lower),
288
+ doc["keywords"],
312
289
  )
313
290
  else:
314
- matched = self._count_matches(query_keywords_lower, doc_info["keywords"])
315
-
316
- # Only include documents that match at least one query keyword
317
- if matched["score"] > 0:
318
- # Apply identifier name boost
319
- if enable_wildcards:
320
- final_score = self._apply_identifier_boost_wildcard(
321
- bm25_score, query_keywords_lower, doc_info
322
- )
323
- else:
324
- final_score = self._apply_identifier_boost(
325
- bm25_score, query_keywords_lower, doc_info
326
- )
327
-
328
- # Apply name coverage penalty (penalize functions with extra words not in query)
329
- coverage_penalty = self._calculate_name_coverage_penalty(
330
- query_keywords_lower, doc_info
291
+ result_data = self._calculate_score(
292
+ query_keywords_expanded,
293
+ keyword_groups,
294
+ len(query_keywords_lower),
295
+ doc["keywords"],
331
296
  )
332
- # For negative scores, divide by penalty instead of multiply
333
- # This ensures penalty always makes the score worse regardless of sign
334
- if final_score < 0 and coverage_penalty < 1.0:
335
- final_score = final_score / coverage_penalty
336
- else:
337
- final_score = final_score * coverage_penalty
338
297
 
298
+ # Only include results with at least one matched keyword
299
+ if result_data["score"] > 0:
339
300
  result = {
340
- "type": doc_info["type"],
341
- "name": doc_info["name"],
342
- "module": doc_info["module"],
343
- "file": doc_info["file"],
344
- "line": doc_info["line"],
345
- "score": round(final_score, 4),
346
- "confidence": matched["confidence"],
347
- "matched_keywords": matched["matched_keywords"],
301
+ "type": doc["type"],
302
+ "name": doc["name"],
303
+ "module": doc["module"],
304
+ "file": doc["file"],
305
+ "line": doc["line"],
306
+ "score": round(result_data["score"], 4),
307
+ "confidence": result_data["confidence"],
308
+ "matched_keywords": result_data["matched_keywords"],
348
309
  }
349
310
 
350
311
  # Add type-specific fields
351
- if doc_info["type"] == "function":
352
- result["function"] = doc_info["function"]
353
- result["arity"] = doc_info["arity"]
312
+ if doc["type"] == "function":
313
+ result["function"] = doc["function"]
314
+ result["arity"] = doc["arity"]
354
315
 
355
316
  # Add documentation if available
356
- if doc_info.get("doc"):
357
- result["doc"] = doc_info["doc"]
317
+ if doc.get("doc"):
318
+ result["doc"] = doc["doc"]
358
319
 
359
320
  results.append(result)
360
321
 
361
- # Sort by final score (descending), then by name for stable results
322
+ # Apply type filter
323
+ if filter_type == "modules":
324
+ results = [r for r in results if r["type"] == "module"]
325
+ elif filter_type == "functions":
326
+ results = [r for r in results if r["type"] == "function"]
327
+
328
+ # Sort by score (descending), then by name for stable results
362
329
  results.sort(key=lambda x: (-x["score"], x["name"]))
363
330
 
364
331
  return results[:top_n]
365
-
366
- def _apply_identifier_boost(
367
- self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
368
- ) -> float:
369
- """
370
- Apply boost to BM25 score if query keywords match the identifier name.
371
-
372
- Identifier names (function/module names) are given much higher weight.
373
- If any query keyword matches a word in the identifier name, the score
374
- is multiplied by IDENTIFIER_MATCH_BOOST.
375
-
376
- Args:
377
- bm25_score: The original BM25 score
378
- query_keywords: Normalized query keywords (lowercase)
379
- doc_info: Document information with function/module name
380
-
381
- Returns:
382
- Final score (boosted if identifier matches)
383
- """
384
- # Extract the identifier name (module name or function name)
385
- identifier_name = self._extract_identifier_name(doc_info)
386
-
387
- # Split identifier into words
388
- identifier_words = split_identifier(identifier_name)
389
-
390
- # Check if any query keyword matches any word in the identifier
391
- for query_kw in query_keywords:
392
- if query_kw in identifier_words:
393
- # Apply significant boost for identifier match
394
- return bm25_score * self.IDENTIFIER_MATCH_BOOST
395
-
396
- return bm25_score
397
-
398
- def _count_matches(self, query_keywords: list[str], item_keywords: list[str]) -> dict[str, Any]:
399
- """
400
- Count matching keywords between query and item.
401
-
402
- Args:
403
- query_keywords: Query keywords (normalized to lowercase)
404
- item_keywords: Keywords from module/function
405
-
406
- Returns:
407
- Dictionary with:
408
- - score: Number of matching keywords
409
- - confidence: Percentage match (score / len(query_keywords) * 100)
410
- - matched_keywords: List of matched keywords
411
- """
412
- # Normalize item keywords to lowercase
413
- item_keywords_lower = [kw.lower() for kw in item_keywords]
414
-
415
- # Find matches
416
- matched_keywords = []
417
- for query_kw in query_keywords:
418
- if query_kw in item_keywords_lower:
419
- matched_keywords.append(query_kw)
420
-
421
- score = len(matched_keywords)
422
- confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
423
-
424
- return {
425
- "score": score,
426
- "confidence": round(confidence, 1),
427
- "matched_keywords": matched_keywords,
428
- }
429
-
430
- def _count_wildcard_matches(
431
- self,
432
- query_keywords: list[str],
433
- item_keywords: list[str],
434
- identifier_name: str | None = None,
435
- ) -> dict[str, Any]:
436
- """
437
- Count matching keywords between query and item using wildcard patterns.
438
-
439
- Args:
440
- query_keywords: Query keywords (may contain wildcards, normalized to lowercase)
441
- item_keywords: Keywords from module/function
442
- identifier_name: The full identifier name (function/module name)
443
-
444
- Returns:
445
- Dictionary with:
446
- - score: Number of matching keywords
447
- - confidence: Percentage match (score / len(query_keywords) * 100)
448
- - matched_keywords: List of matched keywords
449
- """
450
- # Normalize item keywords to lowercase
451
- item_keywords_lower = [kw.lower() for kw in item_keywords]
452
-
453
- # Find matches using wildcard patterns
454
- if identifier_name:
455
- matched_keywords = self._expand_wildcard_keywords_with_identifier(
456
- query_keywords, item_keywords_lower, identifier_name
457
- )
458
- else:
459
- matched_keywords = self._expand_wildcard_keywords(query_keywords, item_keywords_lower)
460
-
461
- score = len(matched_keywords)
462
- confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
463
-
464
- return {
465
- "score": score,
466
- "confidence": round(confidence, 1),
467
- "matched_keywords": matched_keywords,
468
- }
469
-
470
- def _apply_identifier_boost_wildcard(
471
- self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
472
- ) -> float:
473
- """
474
- Apply boost to BM25 score if query keywords match the identifier name using wildcards.
475
-
476
- Identifier names (function/module names) are given much higher weight.
477
- If any query keyword matches a word in the identifier name using wildcard patterns,
478
- the score is multiplied by IDENTIFIER_MATCH_BOOST.
479
-
480
- Args:
481
- bm25_score: The original BM25 score
482
- query_keywords: Query keywords (may contain wildcards, normalized to lowercase)
483
- doc_info: Document information with function/module name
484
-
485
- Returns:
486
- Final score (boosted if identifier matches)
487
- """
488
- # Extract the identifier name (module name or function name)
489
- identifier_name = self._extract_identifier_name(doc_info)
490
-
491
- # Split identifier into words
492
- identifier_words = split_identifier(identifier_name)
493
-
494
- # Check if any query keyword matches any word in the identifier using wildcards
495
- for query_kw in query_keywords:
496
- for identifier_word in identifier_words:
497
- if self._match_wildcard(query_kw, identifier_word):
498
- # Apply significant boost for identifier match
499
- return bm25_score * self.IDENTIFIER_MATCH_BOOST
500
-
501
- return bm25_score
502
-
503
- def _calculate_name_coverage_penalty(
504
- self, query_keywords: list[str], doc_info: dict[str, Any]
505
- ) -> float:
506
- """
507
- Calculate penalty for functions whose names contain words NOT in the query.
508
-
509
- This helps rank exact matches higher than functions with extra words in their names.
510
- For example, searching "create user" should rank "create_user" higher than
511
- "create_invalid_user" because "invalid" is not in the query.
512
-
513
- Args:
514
- query_keywords: Normalized query keywords (lowercase)
515
- doc_info: Document information with function/module name
516
-
517
- Returns:
518
- Penalty multiplier between 0.1 and 1.0:
519
- - 1.0 = no penalty (exact match or all extra words in query)
520
- - 0.7 = 1 extra word not in query (30% penalty)
521
- - 0.4 = 2 extra words not in query (60% penalty)
522
- - 0.1 = 3+ extra words not in query (90% penalty cap)
523
- """
524
- # Only apply to functions (not modules)
525
- if doc_info["type"] != "function":
526
- return 1.0
527
-
528
- # Get function name and split it
529
- func_name = doc_info["function"]
530
- func_words = set(split_identifier(func_name))
531
-
532
- # Find words in function name that are NOT in query
533
- query_set = set(query_keywords)
534
- extra_words = func_words - query_set
535
-
536
- # No penalty if all function name words are in query (exact match)
537
- if not extra_words:
538
- return 1.0
539
-
540
- # Apply 30% penalty per extra word, with a floor of 0.1 (max 90% penalty)
541
- # This penalty is strong enough to overcome BM25 length normalization bias
542
- penalty_per_word = 0.3
543
- total_penalty = min(len(extra_words) * penalty_per_word, 0.9)
544
-
545
- return 1.0 - total_penalty