cicada-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/_version_hash.py +4 -0
- cicada/cli.py +6 -748
- cicada/commands.py +1255 -0
- cicada/dead_code/__init__.py +1 -0
- cicada/{find_dead_code.py → dead_code/finder.py} +2 -1
- cicada/dependency_analyzer.py +147 -0
- cicada/entry_utils.py +92 -0
- cicada/extractors/base.py +9 -9
- cicada/extractors/call.py +17 -20
- cicada/extractors/common.py +64 -0
- cicada/extractors/dependency.py +117 -235
- cicada/extractors/doc.py +2 -49
- cicada/extractors/function.py +10 -14
- cicada/extractors/keybert.py +228 -0
- cicada/extractors/keyword.py +191 -0
- cicada/extractors/module.py +6 -10
- cicada/extractors/spec.py +8 -56
- cicada/format/__init__.py +20 -0
- cicada/{ascii_art.py → format/ascii_art.py} +1 -1
- cicada/format/formatter.py +1145 -0
- cicada/git_helper.py +134 -7
- cicada/indexer.py +322 -89
- cicada/interactive_setup.py +251 -323
- cicada/interactive_setup_helpers.py +302 -0
- cicada/keyword_expander.py +437 -0
- cicada/keyword_search.py +208 -422
- cicada/keyword_test.py +383 -16
- cicada/mcp/__init__.py +10 -0
- cicada/mcp/entry.py +17 -0
- cicada/mcp/filter_utils.py +107 -0
- cicada/mcp/pattern_utils.py +118 -0
- cicada/{mcp_server.py → mcp/server.py} +819 -73
- cicada/mcp/tools.py +473 -0
- cicada/pr_finder.py +2 -3
- cicada/pr_indexer/indexer.py +3 -2
- cicada/setup.py +167 -35
- cicada/tier.py +225 -0
- cicada/utils/__init__.py +9 -2
- cicada/utils/fuzzy_match.py +54 -0
- cicada/utils/index_utils.py +9 -0
- cicada/utils/path_utils.py +18 -0
- cicada/utils/text_utils.py +52 -1
- cicada/utils/tree_utils.py +47 -0
- cicada/version_check.py +99 -0
- cicada/watch_manager.py +320 -0
- cicada/watcher.py +431 -0
- cicada_mcp-0.3.0.dist-info/METADATA +541 -0
- cicada_mcp-0.3.0.dist-info/RECORD +70 -0
- cicada_mcp-0.3.0.dist-info/entry_points.txt +4 -0
- cicada/formatter.py +0 -864
- cicada/keybert_extractor.py +0 -286
- cicada/lightweight_keyword_extractor.py +0 -290
- cicada/mcp_entry.py +0 -683
- cicada/mcp_tools.py +0 -291
- cicada_mcp-0.2.0.dist-info/METADATA +0 -735
- cicada_mcp-0.2.0.dist-info/RECORD +0 -53
- cicada_mcp-0.2.0.dist-info/entry_points.txt +0 -4
- /cicada/{dead_code_analyzer.py → dead_code/analyzer.py} +0 -0
- /cicada/{colors.py → format/colors.py} +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.2.0.dist-info → cicada_mcp-0.3.0.dist-info}/top_level.txt +0 -0
cicada/setup.py
CHANGED
|
@@ -22,7 +22,7 @@ from cicada.utils import (
|
|
|
22
22
|
get_index_path,
|
|
23
23
|
)
|
|
24
24
|
|
|
25
|
-
EditorType = Literal["claude", "cursor", "vs"]
|
|
25
|
+
EditorType = Literal["claude", "cursor", "vs", "gemini", "codex"]
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def _load_existing_config(config_path: Path) -> dict:
|
|
@@ -74,7 +74,6 @@ def _build_server_config(
|
|
|
74
74
|
server_config["cwd"] = cwd
|
|
75
75
|
|
|
76
76
|
server_config["env"] = {
|
|
77
|
-
"CICADA_REPO_PATH": str(repo_path),
|
|
78
77
|
"CICADA_CONFIG_DIR": str(storage_dir),
|
|
79
78
|
}
|
|
80
79
|
|
|
@@ -118,6 +117,16 @@ def get_mcp_config_for_editor(
|
|
|
118
117
|
"config_key": "mcp.servers",
|
|
119
118
|
"needs_dir": True,
|
|
120
119
|
},
|
|
120
|
+
"gemini": {
|
|
121
|
+
"config_path": repo_path / ".gemini" / "mcp.json",
|
|
122
|
+
"config_key": "mcpServers",
|
|
123
|
+
"needs_dir": True,
|
|
124
|
+
},
|
|
125
|
+
"codex": {
|
|
126
|
+
"config_path": repo_path / ".codex" / "mcp.json",
|
|
127
|
+
"config_key": "mcpServers",
|
|
128
|
+
"needs_dir": True,
|
|
129
|
+
},
|
|
121
130
|
}
|
|
122
131
|
|
|
123
132
|
if editor not in editor_specs:
|
|
@@ -147,8 +156,8 @@ def get_mcp_config_for_editor(
|
|
|
147
156
|
def create_config_yaml(
|
|
148
157
|
repo_path: Path,
|
|
149
158
|
storage_dir: Path,
|
|
150
|
-
|
|
151
|
-
|
|
159
|
+
extraction_method: str | None = None,
|
|
160
|
+
expansion_method: str | None = None,
|
|
152
161
|
verbose: bool = True,
|
|
153
162
|
) -> None:
|
|
154
163
|
"""
|
|
@@ -157,18 +166,18 @@ def create_config_yaml(
|
|
|
157
166
|
Args:
|
|
158
167
|
repo_path: Path to the repository
|
|
159
168
|
storage_dir: Path to the storage directory
|
|
160
|
-
|
|
161
|
-
|
|
169
|
+
extraction_method: Keyword extraction method ('regular' or 'bert'), None for default
|
|
170
|
+
expansion_method: Expansion method ('lemmi', 'glove', or 'fasttext'), None for default
|
|
162
171
|
verbose: If True, print success message. If False, silently create config.
|
|
163
172
|
"""
|
|
164
173
|
config_path = get_config_path(repo_path)
|
|
165
174
|
index_path = get_index_path(repo_path)
|
|
166
175
|
|
|
167
|
-
# Default to
|
|
168
|
-
if
|
|
169
|
-
|
|
170
|
-
if
|
|
171
|
-
|
|
176
|
+
# Default to regular extraction + lemmi expansion
|
|
177
|
+
if extraction_method is None:
|
|
178
|
+
extraction_method = "regular"
|
|
179
|
+
if expansion_method is None:
|
|
180
|
+
expansion_method = "lemmi"
|
|
172
181
|
|
|
173
182
|
config_content = f"""repository:
|
|
174
183
|
path: {repo_path}
|
|
@@ -177,8 +186,10 @@ storage:
|
|
|
177
186
|
index_path: {index_path}
|
|
178
187
|
|
|
179
188
|
keyword_extraction:
|
|
180
|
-
method: {
|
|
181
|
-
|
|
189
|
+
method: {extraction_method}
|
|
190
|
+
|
|
191
|
+
keyword_expansion:
|
|
192
|
+
method: {expansion_method}
|
|
182
193
|
"""
|
|
183
194
|
|
|
184
195
|
with open(config_path, "w") as f:
|
|
@@ -249,11 +260,118 @@ def setup_multiple_editors(
|
|
|
249
260
|
print(f"⚠ Error creating {editor.upper()} config: {e}")
|
|
250
261
|
|
|
251
262
|
|
|
263
|
+
def update_claude_md(repo_path: Path, editor: EditorType | None = None) -> None:
    """Refresh the cicada usage instructions in the repository's agent guide files.

    CLAUDE.md is processed when ``editor`` is None or ``"claude"``; AGENTS.md is
    processed whenever an editor is given. A file that does not exist is skipped.

    Args:
        repo_path: Path to the repository
        editor: Editor type - None (the default) only considers CLAUDE.md,
            which keeps call sites that predate this parameter working
    """
    from cicada.mcp.tools import get_tool_definitions

    wants_claude_md = editor is None or editor == "claude"
    wants_agents_md = editor is not None

    # (should this file be considered?, where it lives) - handled in this order
    targets = (
        (wants_claude_md, repo_path / "CLAUDE.md"),
        (wants_agents_md, repo_path / "AGENTS.md"),
    )
    for wanted, md_path in targets:
        if wanted and md_path.exists():
            _update_md_file(md_path, get_tool_definitions())
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _update_md_file(md_path: Path, tools) -> None:
|
|
285
|
+
"""Update a markdown file with cicada tool instructions.
|
|
286
|
+
|
|
287
|
+
Args:
|
|
288
|
+
md_path: Path to the markdown file (CLAUDE.md or AGENTS.md)
|
|
289
|
+
tools: Tool definitions from get_tool_definitions()
|
|
290
|
+
"""
|
|
291
|
+
import re
|
|
292
|
+
|
|
293
|
+
# Auto-generate tool list from tools
|
|
294
|
+
tool_list: list[str] = []
|
|
295
|
+
|
|
296
|
+
for tool in tools:
|
|
297
|
+
# Skip deprecated tools
|
|
298
|
+
if tool.description and "DEPRECATED" in tool.description:
|
|
299
|
+
continue
|
|
300
|
+
|
|
301
|
+
# Extract first sentence from description (up to first period or newline)
|
|
302
|
+
if tool.description:
|
|
303
|
+
desc = tool.description.split("\n")[0].strip()
|
|
304
|
+
if "." in desc:
|
|
305
|
+
desc = desc.split(".")[0] + "."
|
|
306
|
+
line = f" - {desc} `mcp__cicada__{tool.name}`"
|
|
307
|
+
tool_list.append(line)
|
|
308
|
+
|
|
309
|
+
tool_list_str = "\n".join(tool_list)
|
|
310
|
+
|
|
311
|
+
# Identify the categories of tools
|
|
312
|
+
grep_antipatterns = [
|
|
313
|
+
" - ❌ Searching for module structure",
|
|
314
|
+
" - ❌ Searching for function definitions",
|
|
315
|
+
" - ❌ Searching for module imports/usage",
|
|
316
|
+
]
|
|
317
|
+
grep_antipatterns_str = "\n".join(grep_antipatterns)
|
|
318
|
+
|
|
319
|
+
instruction_content = f"""<cicada>
|
|
320
|
+
**ALWAYS use cicada-mcp tools for Elixir code searches. NEVER use Grep/Find for these tasks.**
|
|
321
|
+
|
|
322
|
+
### Use cicada tools for:
|
|
323
|
+
{tool_list_str}
|
|
324
|
+
|
|
325
|
+
### DO NOT use Grep for:
|
|
326
|
+
{grep_antipatterns_str}
|
|
327
|
+
|
|
328
|
+
### You can still use Grep for:
|
|
329
|
+
- ✓ Non-code files (markdown, JSON, config)
|
|
330
|
+
- ✓ String literal searches
|
|
331
|
+
- ✓ Pattern matching in single line comments
|
|
332
|
+
</cicada>
|
|
333
|
+
"""
|
|
334
|
+
|
|
335
|
+
try:
|
|
336
|
+
# Read existing content
|
|
337
|
+
with open(md_path) as f:
|
|
338
|
+
content = f.read()
|
|
339
|
+
|
|
340
|
+
# Pattern to find existing <cicada>...</cicada> tags
|
|
341
|
+
cicada_pattern = re.compile(r"<cicada>.*?</cicada>", re.DOTALL)
|
|
342
|
+
|
|
343
|
+
# Check if <cicada> tags exist
|
|
344
|
+
if cicada_pattern.search(content):
|
|
345
|
+
# Replace existing content between tags
|
|
346
|
+
new_content = cicada_pattern.sub(instruction_content, content)
|
|
347
|
+
with open(md_path, "w") as f:
|
|
348
|
+
f.write(new_content)
|
|
349
|
+
print(f"✓ Updated <cicada> instructions in {md_path.name}")
|
|
350
|
+
elif "cicada-mcp" in content.lower() or "cicada" in content.lower():
|
|
351
|
+
# Content already mentions cicada, don't add duplication
|
|
352
|
+
# This handles cases where users manually added cicada instructions
|
|
353
|
+
print(f"✓ {md_path.name} already mentions cicada, skipping update")
|
|
354
|
+
else:
|
|
355
|
+
# Append the instruction
|
|
356
|
+
with open(md_path, "a") as f:
|
|
357
|
+
# Add newline if file doesn't end with one
|
|
358
|
+
if content and not content.endswith("\n"):
|
|
359
|
+
f.write("\n")
|
|
360
|
+
|
|
361
|
+
f.write("\n")
|
|
362
|
+
f.write(instruction_content)
|
|
363
|
+
|
|
364
|
+
print(f"✓ Added cicada-mcp usage instructions to {md_path.name}")
|
|
365
|
+
except Exception:
|
|
366
|
+
# Fail silently on any errors
|
|
367
|
+
pass
|
|
368
|
+
|
|
369
|
+
|
|
252
370
|
def setup(
|
|
253
371
|
editor: EditorType,
|
|
254
372
|
repo_path: Path | None = None,
|
|
255
|
-
|
|
256
|
-
|
|
373
|
+
extraction_method: str | None = None,
|
|
374
|
+
expansion_method: str | None = None,
|
|
257
375
|
index_exists: bool = False,
|
|
258
376
|
) -> None:
|
|
259
377
|
"""
|
|
@@ -262,8 +380,8 @@ def setup(
|
|
|
262
380
|
Args:
|
|
263
381
|
editor: Editor type (claude, cursor, vs)
|
|
264
382
|
repo_path: Path to the repository (defaults to current directory)
|
|
265
|
-
|
|
266
|
-
|
|
383
|
+
extraction_method: Keyword extraction method ('regular' or 'bert'), None for default
|
|
384
|
+
expansion_method: Expansion method ('lemmi', 'glove', or 'fasttext'), None for default
|
|
267
385
|
index_exists: If True, skip banner and show condensed output (index already exists)
|
|
268
386
|
"""
|
|
269
387
|
# Determine repository path
|
|
@@ -276,15 +394,19 @@ def setup(
|
|
|
276
394
|
|
|
277
395
|
# Show condensed output if index already exists
|
|
278
396
|
if index_exists:
|
|
279
|
-
# Determine method
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
print(
|
|
397
|
+
# Determine method for display
|
|
398
|
+
display_extraction = extraction_method if extraction_method else "regular"
|
|
399
|
+
display_expansion = expansion_method if expansion_method else "lemmi"
|
|
400
|
+
print(
|
|
401
|
+
f"✓ Found existing index ({display_extraction.upper()} + {display_expansion.upper()})"
|
|
402
|
+
)
|
|
283
403
|
# Skip indexing when index_exists is True - we're just reusing it
|
|
284
404
|
should_index = False
|
|
285
405
|
force_full = False
|
|
286
406
|
# Ensure config.yaml is up to date with current settings
|
|
287
|
-
create_config_yaml(
|
|
407
|
+
create_config_yaml(
|
|
408
|
+
repo_path, storage_dir, extraction_method, expansion_method, verbose=False
|
|
409
|
+
)
|
|
288
410
|
else:
|
|
289
411
|
# Show full banner for new setup
|
|
290
412
|
print("=" * 60)
|
|
@@ -307,20 +429,20 @@ def setup(
|
|
|
307
429
|
try:
|
|
308
430
|
with open(config_path) as f:
|
|
309
431
|
existing_config = yaml.safe_load(f)
|
|
310
|
-
|
|
311
|
-
"method", "
|
|
432
|
+
existing_extraction = existing_config.get("keyword_extraction", {}).get(
|
|
433
|
+
"method", "regular"
|
|
312
434
|
)
|
|
313
|
-
|
|
314
|
-
"
|
|
435
|
+
existing_expansion = existing_config.get("keyword_expansion", {}).get(
|
|
436
|
+
"method", "lemmi"
|
|
315
437
|
)
|
|
316
438
|
|
|
317
|
-
# Determine new
|
|
318
|
-
|
|
319
|
-
|
|
439
|
+
# Determine new methods (default to regular + lemmi if not specified)
|
|
440
|
+
new_extraction = extraction_method if extraction_method else "regular"
|
|
441
|
+
new_expansion = expansion_method if expansion_method else "lemmi"
|
|
320
442
|
|
|
321
443
|
# Check if settings changed
|
|
322
|
-
settings_changed = (
|
|
323
|
-
|
|
444
|
+
settings_changed = (existing_extraction != new_extraction) or (
|
|
445
|
+
existing_expansion != new_expansion
|
|
324
446
|
)
|
|
325
447
|
|
|
326
448
|
if settings_changed:
|
|
@@ -329,9 +451,11 @@ def setup(
|
|
|
329
451
|
print("=" * 60)
|
|
330
452
|
print()
|
|
331
453
|
print(
|
|
332
|
-
f"This repository already has an index with {
|
|
454
|
+
f"This repository already has an index with {existing_extraction.upper()} + {existing_expansion.upper()}."
|
|
455
|
+
)
|
|
456
|
+
print(
|
|
457
|
+
f"You are now switching to {new_extraction.upper()} + {new_expansion.upper()}."
|
|
333
458
|
)
|
|
334
|
-
print(f"You are now switching to {new_method.upper()} ({new_tier}).")
|
|
335
459
|
print()
|
|
336
460
|
print(
|
|
337
461
|
"This will require reindexing the ENTIRE codebase, which may take several minutes."
|
|
@@ -347,7 +471,9 @@ def setup(
|
|
|
347
471
|
force_full = True # Force full reindex when settings change
|
|
348
472
|
else:
|
|
349
473
|
# Settings unchanged - just use existing index
|
|
350
|
-
print(
|
|
474
|
+
print(
|
|
475
|
+
f"✓ Using existing index ({existing_extraction.upper()} + {existing_expansion.upper()})"
|
|
476
|
+
)
|
|
351
477
|
print()
|
|
352
478
|
should_index = False
|
|
353
479
|
except Exception:
|
|
@@ -355,13 +481,19 @@ def setup(
|
|
|
355
481
|
pass
|
|
356
482
|
|
|
357
483
|
# Create/update config.yaml BEFORE indexing (indexer reads this to determine keyword method)
|
|
358
|
-
create_config_yaml(
|
|
484
|
+
create_config_yaml(
|
|
485
|
+
repo_path, storage_dir, extraction_method, expansion_method, verbose=False
|
|
486
|
+
)
|
|
359
487
|
|
|
360
488
|
# Index repository if needed
|
|
361
489
|
if should_index:
|
|
362
490
|
index_repository(repo_path, force_full=force_full)
|
|
363
491
|
print()
|
|
364
492
|
|
|
493
|
+
# Update CLAUDE.md with cicada instructions (only for Claude Code editor)
|
|
494
|
+
if editor == "claude":
|
|
495
|
+
update_claude_md(repo_path)
|
|
496
|
+
|
|
365
497
|
# Create MCP config for the editor
|
|
366
498
|
config_path, config_content = get_mcp_config_for_editor(editor, repo_path, storage_dir)
|
|
367
499
|
|
cicada/tier.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tier Configuration Module - Centralized tier resolution and conversion logic.
|
|
3
|
+
|
|
4
|
+
This module provides a single source of truth for:
|
|
5
|
+
- Tier validation (fast, regular, max)
|
|
6
|
+
- Tier resolution from arguments or config files
|
|
7
|
+
- Tier <-> (extraction_method, expansion_method) conversions
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
# Tier to methods mapping: tier name -> (extraction_method, expansion_method)
TIER_METHODS: dict[str, tuple[str, str]] = {
    "fast": ("regular", "lemmi"),
    "regular": ("bert", "glove"),
    "max": ("bert", "fasttext"),
}

# Default methods if no configuration exists
# (same pair as the "fast" tier above)
DEFAULT_METHODS: tuple[str, str] = ("regular", "lemmi")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def validate_tier_flags(args: argparse.Namespace, *, require_force: bool = False) -> None:
    """Validate that at most one tier flag is specified.

    Every flag is read with ``getattr(..., False)``, so a namespace produced by
    a parser that does not define some of the flags is treated as not having
    them set.

    Args:
        args: Parsed command-line arguments; the ``fast``, ``regular``, ``max``
            and ``force`` attributes are consulted when present
        require_force: Whether a tier flag and --force must be given together

    Raises:
        SystemExit: With status 1 if more than one tier flag is set. With
            status 2 (only when ``require_force`` is True) if --force is given
            without a tier flag, or a tier flag is given without --force.
    """
    tier_count = sum(bool(getattr(args, flag, False)) for flag in ("fast", "regular", "max"))

    if tier_count > 1:
        print(
            "Error: Can only specify one tier flag (--fast, --regular, or --max)",
            file=sys.stderr,
        )
        sys.exit(1)

    if not require_force:
        return

    # Only a literal True counts as --force being enabled
    force_enabled = getattr(args, "force", False) is True
    tier_specified = tier_count == 1

    if force_enabled and not tier_specified:
        print(
            "Error: --force requires specifying a tier flag (--fast, --regular, or --max).",
            file=sys.stderr,
        )
        sys.exit(2)

    if tier_specified and not force_enabled:
        print(
            "Error: Tier flags now require --force to override the configured tier.",
            file=sys.stderr,
        )
        print(
            "Run 'cicada index --force --fast|--regular|--max' to select a tier.",
            file=sys.stderr,
        )
        sys.exit(2)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def tier_flag_specified(args: argparse.Namespace) -> bool:
    """Return True when any tier flag (fast, regular, max) is set.

    Flags missing from ``args`` are treated as not set.
    """
    return any(getattr(args, flag, False) for flag in ("fast", "regular", "max"))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_tier_from_args(args: argparse.Namespace) -> str | None:
|
|
76
|
+
"""Extract tier from command-line arguments.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
args: Parsed command-line arguments with fast, regular, and max attributes
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
Tier string ("fast", "regular", or "max"), or None if no tier flag specified
|
|
83
|
+
"""
|
|
84
|
+
if args.fast:
|
|
85
|
+
return "fast"
|
|
86
|
+
if args.max:
|
|
87
|
+
return "max"
|
|
88
|
+
if getattr(args, "regular", False):
|
|
89
|
+
return "regular"
|
|
90
|
+
return None
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def tier_to_methods(tier: str) -> tuple[str, str]:
    """Look up the (extraction_method, expansion_method) pair for a tier.

    Args:
        tier: Tier string ("fast", "regular", or "max")

    Returns:
        Tuple of (extraction_method, expansion_method) taken from TIER_METHODS:
        - fast: regular extraction + lemmi expansion
        - regular: bert extraction + glove expansion
        - max: bert extraction + fasttext expansion

        A tier that is not in TIER_METHODS yields DEFAULT_METHODS.
    """
    if tier in TIER_METHODS:
        return TIER_METHODS[tier]
    return DEFAULT_METHODS
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def methods_to_tier(extraction_method: str, expansion_method: str) -> str:
    """Map an (extraction_method, expansion_method) pair back to a tier name.

    Args:
        extraction_method: 'regular' or 'bert'
        expansion_method: 'lemmi', 'glove', or 'fasttext'

    Returns:
        Tier string: "fast", "regular", or "max". A pair listed in
        TIER_METHODS maps to its tier; any other pair is classified by the
        fallback rules below.
    """
    wanted = (extraction_method, expansion_method)

    # An exact entry in the mapping wins
    exact = next((name for name, pair in TIER_METHODS.items() if pair == wanted), None)
    if exact is not None:
        return exact

    # Partial matches: regular extraction is always the fast tier ...
    if extraction_method == "regular":
        return "fast"

    # ... and bert extraction is "max" only when paired with fasttext
    if extraction_method == "bert" and expansion_method == "fasttext":
        return "max"

    # Remaining bert pairs and unknown combinations
    return "regular"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def read_keyword_extraction_config(repo_path: Path) -> tuple[str, str]:
    """Read keyword extraction configuration from config.yaml.

    Each method is resolved independently: a missing, empty or malformed
    ``keyword_extraction`` / ``keyword_expansion`` section only resets that
    one method to its default, it does not discard the other section.

    Args:
        repo_path: Path to the repository

    Returns:
        tuple[str, str]: (extraction_method, expansion_method) where:
            - extraction_method is 'regular' or 'bert'
            - expansion_method is 'lemmi', 'glove', or 'fasttext'
        Returns DEFAULT_METHODS if the config file is not found, cannot be
        loaded, or does not contain a mapping.
    """
    try:
        import yaml

        from cicada.utils.storage import get_config_path

        config_path = get_config_path(repo_path)
        if not config_path.exists():
            return DEFAULT_METHODS

        with open(config_path) as f:
            config = yaml.safe_load(f)
    except Exception:
        # Deliberate catch-all: if the config cannot be located or parsed
        # for any reason, fall back to the defaults
        return DEFAULT_METHODS

    # An empty file loads as None; a scalar or list is not a usable config
    if not isinstance(config, dict):
        return DEFAULT_METHODS

    def _section_method(section: str, default: str) -> str:
        """Return config[section]['method'], or default when absent or empty."""
        values = config.get(section)
        # "section:" with no value loads as None rather than a mapping
        if not isinstance(values, dict):
            return default
        return values.get("method") or default

    return (
        _section_method("keyword_extraction", DEFAULT_METHODS[0]),
        _section_method("keyword_expansion", DEFAULT_METHODS[1]),
    )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def determine_tier(args: argparse.Namespace, repo_path: Path | None = None) -> str:
    """Determine indexing tier from args or existing config.

    This is the main function for tier resolution. It:
    1. Checks command-line arguments first (--fast, --regular, --max)
    2. Otherwise, when repo_path is given, converts the methods read from
       that repository's config.yaml into a tier. If no config is found the
       reader returns DEFAULT_METHODS, which corresponds to the "fast" tier.
    3. Returns "regular" only when neither a tier flag nor a repo_path is given

    Args:
        args: Parsed command-line arguments with fast, regular, and max attributes
        repo_path: Optional repository path to read config from

    Returns:
        Tier string: "fast", "regular", or "max"
    """
    # An explicit flag always wins
    tier = get_tier_from_args(args)
    if tier is not None:
        return tier

    # No flag and nothing to read a config from
    if repo_path is None:
        return "regular"

    # No flag: derive the tier from the stored (or default) methods
    extraction_method, expansion_method = read_keyword_extraction_config(repo_path)
    return methods_to_tier(extraction_method, expansion_method)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def get_extraction_expansion_methods(
    args: argparse.Namespace,
) -> tuple[str | None, str | None]:
    """Translate the tier flag in ``args`` into its extraction/expansion methods.

    Convenience wrapper kept for backward compatibility. The (None, None)
    result lets callers tell "no tier specified" apart from "default tier".

    Args:
        args: Parsed command-line arguments with fast, regular, and max attributes

    Returns:
        Tuple of (extraction_method, expansion_method), or (None, None) if no tier flag
    """
    selected = get_tier_from_args(args)
    return (None, None) if selected is None else tier_to_methods(selected)
|
cicada/utils/__init__.py
CHANGED
|
@@ -7,13 +7,14 @@ code duplication and improve maintainability.
|
|
|
7
7
|
|
|
8
8
|
from .call_site_formatter import CallSiteFormatter
|
|
9
9
|
from .function_grouper import FunctionGrouper
|
|
10
|
+
from .fuzzy_match import find_similar_names
|
|
10
11
|
from .index_utils import (
|
|
11
12
|
load_index,
|
|
12
13
|
merge_indexes_incremental,
|
|
13
14
|
save_index,
|
|
14
15
|
validate_index_structure,
|
|
15
16
|
)
|
|
16
|
-
from .path_utils import normalize_file_path, resolve_to_repo_root
|
|
17
|
+
from .path_utils import is_git_repository, normalize_file_path, resolve_to_repo_root
|
|
17
18
|
from .signature_builder import SignatureBuilder
|
|
18
19
|
from .storage import (
|
|
19
20
|
create_storage_dir,
|
|
@@ -25,7 +26,8 @@ from .storage import (
|
|
|
25
26
|
get_storage_dir,
|
|
26
27
|
)
|
|
27
28
|
from .subprocess_runner import SubprocessRunner, run_gh_command, run_git_command
|
|
28
|
-
from .text_utils import split_camel_snake_case, split_identifier
|
|
29
|
+
from .text_utils import extract_code_identifiers, split_camel_snake_case, split_identifier
|
|
30
|
+
from .tree_utils import extract_text_from_node, is_function_definition_call
|
|
29
31
|
|
|
30
32
|
__all__ = [
|
|
31
33
|
"SubprocessRunner",
|
|
@@ -33,6 +35,7 @@ __all__ = [
|
|
|
33
35
|
"run_gh_command",
|
|
34
36
|
"normalize_file_path",
|
|
35
37
|
"resolve_to_repo_root",
|
|
38
|
+
"is_git_repository",
|
|
36
39
|
"load_index",
|
|
37
40
|
"save_index",
|
|
38
41
|
"merge_indexes_incremental",
|
|
@@ -40,8 +43,10 @@ __all__ = [
|
|
|
40
43
|
"FunctionGrouper",
|
|
41
44
|
"CallSiteFormatter",
|
|
42
45
|
"SignatureBuilder",
|
|
46
|
+
"find_similar_names",
|
|
43
47
|
"split_identifier",
|
|
44
48
|
"split_camel_snake_case",
|
|
49
|
+
"extract_code_identifiers",
|
|
45
50
|
"get_repo_hash",
|
|
46
51
|
"get_storage_dir",
|
|
47
52
|
"create_storage_dir",
|
|
@@ -49,4 +54,6 @@ __all__ = [
|
|
|
49
54
|
"get_config_path",
|
|
50
55
|
"get_hashes_path",
|
|
51
56
|
"get_pr_index_path",
|
|
57
|
+
"extract_text_from_node",
|
|
58
|
+
"is_function_definition_call",
|
|
52
59
|
]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fuzzy matching utilities for finding similar names.
|
|
3
|
+
|
|
4
|
+
This module provides utilities for finding similar module and function names
|
|
5
|
+
using fuzzy string matching algorithms.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from difflib import SequenceMatcher
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def find_similar_names(
    query: str, candidates: list[str], max_suggestions: int = 5, threshold: float = 0.4
) -> list[tuple[str, float]]:
    """
    Find similar names using fuzzy matching.

    Matching is case-insensitive. A candidate equal to the query short-circuits
    the search. Otherwise each candidate gets its SequenceMatcher ratio, raised
    to at least 0.7 when the whole query is a substring of the candidate and to
    at least 0.6 when any non-empty dot-separated component of the query is.

    Args:
        query: The query string to match
        candidates: List of candidate names to match against
        max_suggestions: Maximum number of suggestions to return
        threshold: Similarity score (0.0-1.0) a result must exceed to be included

    Returns:
        List of (name, similarity_score) tuples, sorted by similarity (descending).
        ``[(candidate, 1.0)]`` for the first exact match; an empty list when the
        query is empty and nothing matches it exactly.
    """
    query_lower = query.lower()

    # Early exit for exact match (checked against the full candidate list)
    for candidate in candidates:
        if query_lower == candidate.lower():
            return [(candidate, 1.0)]

    # An empty string is a substring of everything, so an empty query would
    # otherwise "match" every candidate via the substring boost below
    if not query_lower:
        return []

    # Dot-separated components of the query, e.g. "MyApp.User" -> myapp, user.
    # Empty components (from "User." or "A..B") are dropped for the same
    # reason as above.
    query_parts = [part for part in query_lower.split(".") if part]

    # Limit search space for very large indices to prevent performance issues
    search_candidates = candidates[:500]

    similarities: list[tuple[str, float]] = []
    for candidate in search_candidates:
        candidate_lower = candidate.lower()

        # Calculate base similarity score
        similarity = SequenceMatcher(None, query_lower, candidate_lower).ratio()

        # Boost score for substring matches
        if query_lower in candidate_lower:
            similarity = max(similarity, 0.7)

        # Boost score for partial component matches (e.g., "User" matches "MyApp.User")
        if any(part in candidate_lower for part in query_parts):
            similarity = max(similarity, 0.6)

        similarities.append((candidate, similarity))

    # Sort by similarity (descending) and return top matches above threshold
    similarities.sort(key=lambda item: item[1], reverse=True)
    return [(name, score) for name, score in similarities[:max_suggestions] if score > threshold]
|
cicada/utils/index_utils.py
CHANGED
|
@@ -274,12 +274,21 @@ def merge_indexes_incremental(
|
|
|
274
274
|
if "modules" in new_index:
|
|
275
275
|
merged["modules"].update(new_index["modules"])
|
|
276
276
|
|
|
277
|
+
# Preserve original cicada_version from old_index if it exists
|
|
278
|
+
original_version = None
|
|
279
|
+
if "metadata" in old_index:
|
|
280
|
+
original_version = old_index["metadata"].get("cicada_version")
|
|
281
|
+
|
|
277
282
|
# Merge metadata - take from new_index if available, else old_index
|
|
278
283
|
if "metadata" in new_index:
|
|
279
284
|
merged["metadata"].update(new_index["metadata"])
|
|
280
285
|
elif "metadata" in old_index:
|
|
281
286
|
merged["metadata"].update(old_index["metadata"])
|
|
282
287
|
|
|
288
|
+
# Restore original version if it existed (don't overwrite with new version)
|
|
289
|
+
if original_version:
|
|
290
|
+
merged["metadata"]["cicada_version"] = original_version
|
|
291
|
+
|
|
283
292
|
# Update module and function counts
|
|
284
293
|
stats = get_index_stats(merged)
|
|
285
294
|
merged["metadata"]["total_modules"] = stats["total_modules"]
|