cicada-mcp 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cicada/ascii_art.py +60 -0
- cicada/clean.py +195 -60
- cicada/cli.py +757 -0
- cicada/colors.py +27 -0
- cicada/command_logger.py +14 -16
- cicada/dead_code_analyzer.py +12 -19
- cicada/extractors/__init__.py +6 -6
- cicada/extractors/base.py +3 -3
- cicada/extractors/call.py +11 -15
- cicada/extractors/dependency.py +39 -51
- cicada/extractors/doc.py +8 -9
- cicada/extractors/function.py +12 -24
- cicada/extractors/module.py +11 -15
- cicada/extractors/spec.py +8 -12
- cicada/find_dead_code.py +15 -39
- cicada/formatter.py +37 -91
- cicada/git_helper.py +22 -34
- cicada/indexer.py +165 -132
- cicada/interactive_setup.py +490 -0
- cicada/keybert_extractor.py +286 -0
- cicada/keyword_search.py +22 -30
- cicada/keyword_test.py +127 -0
- cicada/lightweight_keyword_extractor.py +5 -13
- cicada/mcp_entry.py +683 -0
- cicada/mcp_server.py +110 -232
- cicada/parser.py +9 -9
- cicada/pr_finder.py +15 -19
- cicada/pr_indexer/__init__.py +3 -3
- cicada/pr_indexer/cli.py +4 -9
- cicada/pr_indexer/github_api_client.py +22 -37
- cicada/pr_indexer/indexer.py +17 -29
- cicada/pr_indexer/line_mapper.py +8 -12
- cicada/pr_indexer/pr_index_builder.py +22 -34
- cicada/setup.py +198 -89
- cicada/utils/__init__.py +9 -9
- cicada/utils/call_site_formatter.py +4 -6
- cicada/utils/function_grouper.py +4 -4
- cicada/utils/hash_utils.py +12 -15
- cicada/utils/index_utils.py +15 -15
- cicada/utils/path_utils.py +24 -29
- cicada/utils/signature_builder.py +3 -3
- cicada/utils/subprocess_runner.py +17 -19
- cicada/utils/text_utils.py +1 -2
- cicada/version_check.py +2 -5
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/METADATA +144 -55
- cicada_mcp-0.2.0.dist-info/RECORD +53 -0
- cicada_mcp-0.2.0.dist-info/entry_points.txt +4 -0
- cicada/install.py +0 -741
- cicada_mcp-0.1.5.dist-info/RECORD +0 -47
- cicada_mcp-0.1.5.dist-info/entry_points.txt +0 -9
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/WHEEL +0 -0
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {cicada_mcp-0.1.5.dist-info → cicada_mcp-0.2.0.dist-info}/top_level.txt +0 -0
cicada/keybert_extractor.py ADDED
@@ -0,0 +1,286 @@
+"""
+Keyword Extraction using KeyBERT
+Semantic keyword extraction using transformer-based embeddings
+"""
+
+import re
+import sys
+from typing import Any
+
+from cicada.utils import split_camel_snake_case
+
+
+class KeyBERTExtractor:
+    """Extract keywords from text using KeyBERT semantic analysis."""
+
+    # Model configurations for different performance tiers
+    KEYBERT_MODELS = {
+        "fast": "all-MiniLM-L6-v2",  # 80MB, fast extraction
+        "regular": "BAAI/bge-small-en-v1.5",  # 133MB, balanced
+        "max": "paraphrase-mpnet-base-v2",  # 420MB, highest quality
+    }
+
+    # Weighting strategy constants for keyword extraction
+    # These control how different types of keywords are prioritized
+    KEYBERT_CANDIDATE_MULTIPLIER = 3  # Extract 3x keywords for weighted reranking
+    CODE_IDENTIFIER_BOOST = 10  # 10x weight for exact code identifiers (e.g., function names)
+    CODE_SPLIT_WORD_BOOST = 3  # 3x weight for identifier components (e.g., "user" from "getUserId")
+    BASE_SCORE_IDENTIFIER = 0.5  # Base score for identifiers not found by BERT
+    BASE_SCORE_SPLIT_WORD = 0.3  # Base score for split words not found by BERT
+
+    # Class variable to hold KeyBERT class (lazily loaded)
+    _KeyBERT: type | None = None
+
+    def __init__(self, verbose: bool = False, model_tier: str | None = None):
+        """
+        Initialize KeyBERT model.
+
+        Args:
+            verbose: If True, print status messages during initialization
+            model_tier: Model tier to use ('fast', 'regular', or 'max').
+                If None, must be specified via config file.
+
+        Raises:
+            ImportError: If KeyBERT is not installed
+            ValueError: If model_tier is invalid or not specified
+            RuntimeError: If model loading fails
+        """
+        self.verbose = verbose
+
+        # Validate model tier first
+        if model_tier and model_tier not in self.KEYBERT_MODELS:
+            raise ValueError(
+                f"Invalid model tier '{model_tier}'. "
+                f"Must be one of: {', '.join(self.KEYBERT_MODELS.keys())}"
+            )
+
+        if model_tier is None:
+            raise ValueError(
+                "model_tier must be specified. Pass it directly or load from config file."
+            )
+
+        self.model_tier = model_tier
+        self.model_name = self.KEYBERT_MODELS[model_tier]
+
+        # Print message BEFORE the slow import
+        if self.verbose:
+            print(
+                f"Loading KeyBERT model ({model_tier}: {self.model_name})",
+                file=sys.stderr,
+            )
+            print("This can take up to a couple of minutes.", file=sys.stderr)
+
+        # Lazy import KeyBERT (only once per class)
+        # This import can take significant time on first load
+        if KeyBERTExtractor._KeyBERT is None:
+            try:
+                from keybert import KeyBERT
+
+                KeyBERTExtractor._KeyBERT = KeyBERT
+            except ImportError as e:
+                raise ImportError(
+                    "KeyBERT is not installed. Install it with:\n"
+                    " uv add keybert\n"
+                    "or\n"
+                    " pip install keybert"
+                ) from e
+
+        # Initialize KeyBERT with the selected model
+        # Assume model is pre-downloaded (user will handle caching separately)
+        try:
+            self.kw_model = KeyBERTExtractor._KeyBERT(model=self.model_name)
+            if self.verbose:
+                print("✓ Model loaded successfully", file=sys.stderr)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to load KeyBERT model '{self.model_name}'. "
+                f"Ensure the model is downloaded and available. Error: {e}"
+            ) from e
+
+    def extract_code_identifiers(self, text: str) -> tuple[list[str], list[str]]:
+        """
+        Extract code-specific identifiers and their split words.
+
+        Returns a tuple of (identifiers, split_words) where:
+        - identifiers: original camelCase/PascalCase/snake_case identifiers
+        - split_words: individual words extracted from those identifiers
+        """
+        # Match camelCase, snake_case, PascalCase, and mixed patterns
+        patterns = [
+            r"\b[a-z]+[A-Z][a-zA-Z]*\b",  # camelCase (e.g., getUserData)
+            r"\b[A-Z]{2,}[a-z]+[a-zA-Z]*\b",  # Uppercase prefix + PascalCase (e.g., HTTPServer, XMLParser)
+            r"\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b",  # PascalCase (e.g., UserController, PostgreSQL)
+            r"\b[a-z]+_[a-z_]+\b",  # snake_case (e.g., get_user_data)
+            r"\b[A-Z]{2,}\b",  # All UPPERCASE (e.g., HTTP, API, SQL)
+        ]
+
+        identifiers = []
+        for pattern in patterns:
+            matches = re.findall(pattern, text)
+            identifiers.extend(matches)
+
+        identifiers = list(set(identifiers))
+
+        # Split identifiers into individual words
+        split_words = []
+        for identifier in identifiers:
+            split_text = split_camel_snake_case(identifier)
+            # Extract individual words (lowercase, length > 1)
+            words = [
+                word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
+            ]
+            split_words.extend(words)
+
+        return identifiers, list(set(split_words))
+
+    def extract_keywords_simple(self, text: str, top_n: int = 10) -> list[str]:
+        """
+        Extract keywords and return a simple list of keyword strings.
+
+        Args:
+            text: Input text to analyze
+            top_n: Number of top keywords to return
+
+        Returns:
+            List of keyword strings (e.g., ['authentication', 'user', 'validate'])
+        """
+        if not text or not text.strip():
+            return []
+
+        try:
+            results = self.extract_keywords(text, top_n=top_n)
+            # Extract just the keyword strings from top_keywords tuples
+            return [keyword for keyword, _ in results["top_keywords"]]
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: Keyword extraction failed: {e}", file=sys.stderr)
+            return []
+
+    def extract_keywords(self, text: str, top_n: int = 15) -> dict[str, Any]:
+        """
+        Extract keywords using KeyBERT semantic analysis with code identifier emphasis.
+
+        Weighting strategy:
+        - Full code identifiers (e.g., getUserData, snake_case): 10x weight (exact match priority)
+        - Code split words (e.g., get, user, data): 3x weight (fuzzy match support)
+        - KeyBERT semantic keywords: Base score from embedding similarity
+
+        Magic numbers explained:
+        - 3x multiplier: For candidate selection (top_n * 3) to have enough keywords
+          before applying weights. This ensures we don't miss important keywords that
+          might rank higher after code identifier boosting.
+        - 0.5 base score: Default confidence for code identifiers not found by KeyBERT.
+          After 10x boost, gives them a score of 5.0, prioritizing them over most
+          regular keywords.
+        - 0.3 base score: Default confidence for code split words not found by KeyBERT.
+          After 3x boost, gives them a score of 0.9, placing them between regular
+          keywords (0.4-0.7) and full identifiers (5.0).
+
+        Args:
+            text: Input text to analyze
+            top_n: Number of top keywords to return
+
+        Returns:
+            Dictionary with extracted keywords and analysis:
+            - top_keywords: List of (keyword, score) tuples, sorted by weighted score
+            - code_identifiers: Original identifiers (weighted 10x)
+            - code_split_words: Words extracted from identifiers (weighted 3x)
+            - noun_chunks: 2-word phrases from KeyBERT (if any)
+            - Other fields (nouns, verbs, etc.) are empty (KeyBERT doesn't do POS tagging)
+            - stats: Basic text statistics
+        """
+        if not text or not text.strip():
+            return {
+                "top_keywords": [],
+                "nouns": [],
+                "verbs": [],
+                "adjectives": [],
+                "proper_nouns": [],
+                "noun_chunks": [],
+                "entities": [],
+                "code_identifiers": [],
+                "code_split_words": [],
+                "tf_scores": {},
+                "stats": {
+                    "total_tokens": 0,
+                    "total_words": 0,
+                    "unique_words": 0,
+                    "sentences": 0,
+                },
+            }
+
+        # 1. Extract code identifiers and their split words
+        code_identifiers, code_split_words = self.extract_code_identifiers(text)
+
+        # 2. Use KeyBERT to extract semantic keywords
+        # Extract more than needed to have candidates for weighting
+        try:
+            # KeyBERT return type can vary, use type ignore for external library
+            keybert_keywords: list[tuple[str, float]] = self.kw_model.extract_keywords(  # type: ignore[assignment]
+                text,
+                top_n=top_n * self.KEYBERT_CANDIDATE_MULTIPLIER,
+                keyphrase_ngram_range=(1, 1),  # Single words only
+            )
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: KeyBERT extraction failed: {e}", file=sys.stderr)
+            keybert_keywords = []
+
+        # 3. Build weighted keyword scores
+        keyword_scores: dict[str, float] = {}
+
+        # Add KeyBERT keywords with their semantic similarity scores
+        for keyword, score in keybert_keywords:
+            keyword_lower: str = keyword.lower()
+            keyword_scores[keyword_lower] = score
+
+        # 4. Apply code identifier boosting
+        # Code identifiers get strong boost as they're likely important API/function names
+        code_identifiers_lower = [ident.lower() for ident in code_identifiers]
+        for identifier in code_identifiers_lower:
+            if identifier in keyword_scores:
+                keyword_scores[identifier] *= self.CODE_IDENTIFIER_BOOST
+            else:
+                # Add with high base score if not found by KeyBERT
+                keyword_scores[identifier] = self.BASE_SCORE_IDENTIFIER * self.CODE_IDENTIFIER_BOOST
+
+        # 5. Apply split word boosting (lower than full identifiers)
+        # Split words are components of identifiers, somewhat important but less than full names
+        code_split_words_lower = [word.lower() for word in code_split_words]
+        for word in code_split_words_lower:
+            if word in keyword_scores:
+                keyword_scores[word] *= self.CODE_SPLIT_WORD_BOOST
+            else:
+                keyword_scores[word] = self.BASE_SCORE_SPLIT_WORD * self.CODE_SPLIT_WORD_BOOST
+
+        # 5. Sort by weighted score and take top_n
+        top_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
+
+        # 6. No noun_chunks since we're using single words only
+        noun_chunks = []
+
+        # 7. Calculate basic statistics
+        words = text.split()
+        unique_words = {w.lower() for w in words if w.isalpha()}
+        sentences = text.count(".") + text.count("!") + text.count("?")
+
+        stats = {
+            "total_tokens": len(words),
+            "total_words": len([w for w in words if w.isalpha()]),
+            "unique_words": len(unique_words),
+            "sentences": max(1, sentences),  # At least 1 sentence
+        }
+
+        return {
+            "top_keywords": top_keywords,
+            "nouns": [],  # KeyBERT doesn't extract POS tags
+            "verbs": [],
+            "adjectives": [],
+            "proper_nouns": [],
+            "noun_chunks": noun_chunks,
+            "entities": [],  # KeyBERT doesn't do NER
+            "code_identifiers": code_identifiers,
+            "code_split_words": code_split_words,
+            "tf_scores": {},  # Using semantic scores instead
+            "stats": stats,
+        }
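For orientation, a minimal usage sketch of the new extractor (assumes the keybert package and the chosen sentence-transformers model are already installed and downloaded; the sample text is invented for illustration):

    from cicada.keybert_extractor import KeyBERTExtractor

    # model_tier is required: "fast", "regular", or "max"
    extractor = KeyBERTExtractor(verbose=True, model_tier="fast")

    text = "getUserData validates the user session token"

    # Plain list of keyword strings
    print(extractor.extract_keywords_simple(text, top_n=5))

    # Full analysis: weighted (keyword, score) tuples plus code identifiers and stats
    details = extractor.extract_keywords(text, top_n=15)
    print(details["top_keywords"][:3])
    print(details["code_identifiers"])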
cicada/keyword_search.py CHANGED
@@ -9,9 +9,9 @@ Identifier names (function/module names) are given much higher weight than keywo
 Author: Cursor(Auto)
 """

-import re
 import fnmatch
-from typing import
+from typing import Any
+
 from rank_bm25 import BM25Okapi

 from cicada.utils import split_identifier
@@ -24,7 +24,7 @@ class KeywordSearcher:
     # When query keyword matches the function/module name, multiply the score by this
     IDENTIFIER_MATCH_BOOST = 10.0

-    def __init__(self, index:
+    def __init__(self, index: dict[str, Any]):
         """
         Initialize the keyword searcher.

@@ -35,7 +35,7 @@ class KeywordSearcher:
         self.bm25, self.document_map = self._initialize_bm25()

     @staticmethod
-    def _extract_identifier_name(document_info:
+    def _extract_identifier_name(document_info: dict[str, Any]) -> str:
         """
         Extract the core identifier name from document info.

@@ -169,8 +169,8 @@ class KeywordSearcher:
         return fnmatch.fnmatch(text.lower(), pattern.lower())

     def _expand_wildcard_keywords(
-        self, query_keywords:
-    ) ->
+        self, query_keywords: list[str], document_keywords: list[str]
+    ) -> list[str]:
         """
         Expand wildcard patterns to actual matching keywords from the document.

@@ -190,10 +190,10 @@ class KeywordSearcher:

     def _expand_wildcard_keywords_with_identifier(
         self,
-        query_keywords:
-        document_keywords:
+        query_keywords: list[str],
+        document_keywords: list[str],
         identifier_name: str,
-    ) ->
+    ) -> list[str]:
         """
         Expand wildcard patterns to actual matching keywords from the document and identifier name.

@@ -214,13 +214,11 @@ class KeywordSearcher:
                     break  # Only add each query keyword once

             # Also check against the full identifier name
-            if query_kw not in matched_keywords and self._match_wildcard(
-                query_kw, identifier_name
-            ):
+            if query_kw not in matched_keywords and self._match_wildcard(query_kw, identifier_name):
                 matched_keywords.append(query_kw)
         return matched_keywords

-    def _get_wildcard_scores(self, query_keywords:
+    def _get_wildcard_scores(self, query_keywords: list[str]) -> list[float]:
         """
         Calculate BM25-like scores for wildcard matching.

@@ -252,11 +250,11 @@ class KeywordSearcher:

         return scores

-    def _has_wildcards(self, keywords:
+    def _has_wildcards(self, keywords: list[str]) -> bool:
         """Check if any keywords contain wildcard patterns."""
         return any("*" in keyword for keyword in keywords)

-    def search(self, query_keywords:
+    def search(self, query_keywords: list[str], top_n: int = 5) -> list[dict[str, Any]]:
         """
         Search for modules and functions matching the given keywords.

@@ -313,9 +311,7 @@ class KeywordSearcher:
                 query_keywords_lower, doc_info["keywords"], identifier_name
             )
         else:
-            matched = self._count_matches(
-                query_keywords_lower, doc_info["keywords"]
-            )
+            matched = self._count_matches(query_keywords_lower, doc_info["keywords"])

         # Only include documents that match at least one query keyword
         if matched["score"] > 0:
@@ -368,7 +364,7 @@ class KeywordSearcher:
         return results[:top_n]

     def _apply_identifier_boost(
-        self, bm25_score: float, query_keywords:
+        self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
     ) -> float:
         """
         Apply boost to BM25 score if query keywords match the identifier name.
@@ -399,9 +395,7 @@ class KeywordSearcher:

         return bm25_score

-    def _count_matches(
-        self, query_keywords: List[str], item_keywords: List[str]
-    ) -> Dict[str, Any]:
+    def _count_matches(self, query_keywords: list[str], item_keywords: list[str]) -> dict[str, Any]:
         """
         Count matching keywords between query and item.

@@ -435,10 +429,10 @@ class KeywordSearcher:

     def _count_wildcard_matches(
         self,
-        query_keywords:
-        item_keywords:
+        query_keywords: list[str],
+        item_keywords: list[str],
         identifier_name: str | None = None,
-    ) ->
+    ) -> dict[str, Any]:
         """
         Count matching keywords between query and item using wildcard patterns.

@@ -462,9 +456,7 @@ class KeywordSearcher:
                 query_keywords, item_keywords_lower, identifier_name
             )
         else:
-            matched_keywords = self._expand_wildcard_keywords(
-                query_keywords, item_keywords_lower
-            )
+            matched_keywords = self._expand_wildcard_keywords(query_keywords, item_keywords_lower)

         score = len(matched_keywords)
         confidence = (score / len(query_keywords)) * 100 if query_keywords else 0
@@ -476,7 +468,7 @@ class KeywordSearcher:
         }

     def _apply_identifier_boost_wildcard(
-        self, bm25_score: float, query_keywords:
+        self, bm25_score: float, query_keywords: list[str], doc_info: dict[str, Any]
     ) -> float:
         """
         Apply boost to BM25 score if query keywords match the identifier name using wildcards.
@@ -509,7 +501,7 @@ class KeywordSearcher:
         return bm25_score

     def _calculate_name_coverage_penalty(
-        self, query_keywords:
+        self, query_keywords: list[str], doc_info: dict[str, Any]
     ) -> float:
         """
         Calculate penalty for functions whose names contain words NOT in the query.
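The keyword_search.py edits above modernize type hints (typing.List/Dict replaced by builtin list/dict generics, with from typing import Any) and reflow a few long calls; runtime behavior is unchanged. A rough calling sketch under assumptions — the index schema is not shown in this diff, so the document entries below are purely illustrative:

    from cicada.keyword_search import KeywordSearcher

    # Hypothetical index shape; the real index is produced by cicada's indexer
    index = {
        "MyApp.Accounts.get_user": {"keywords": ["get", "user", "account", "fetch"]},
    }

    searcher = KeywordSearcher(index)
    # Wildcards such as "user*" are supported; identifier-name matches get a 10x boost
    results = searcher.search(["user*", "fetch"], top_n=5)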
cicada/keyword_test.py ADDED
@@ -0,0 +1,127 @@
+"""
+Interactive keyword extraction testing module.
+
+Provides an interactive REPL for testing keyword extraction methods.
+"""
+
+import sys
+
+
+def run_keywords_interactive(method: str = "lemminflect", tier: str = "regular"):
+    """
+    Interactive keyword extraction testing mode.
+
+    Allows users to paste text and see extracted keywords in real-time
+    using the specified extraction method.
+
+    Args:
+        method: Extraction method ('lemminflect' or 'bert')
+        tier: Model tier ('fast', 'regular', or 'max')
+    """
+    print(f"\n{'='*70}")
+    print("🔍 Cicada Interactive Keyword Extraction Test")
+    print(f"{'='*70}")
+    print(f"Method: {method.upper()}")
+    print(f"Tier: {tier}")
+    print("\nPaste or type text, then press Ctrl-D (Unix) or Ctrl-Z+Enter (Windows)")
+    print("to extract keywords. Press Ctrl-C to exit.\n")
+    print(f"{'='*70}\n")
+
+    # Initialize keyword extractor
+    try:
+        if method == "bert":
+            from cicada.keybert_extractor import KeyBERTExtractor
+
+            extractor = KeyBERTExtractor(model_tier=tier, verbose=True)
+        else:
+            from cicada.lightweight_keyword_extractor import LightweightKeywordExtractor
+
+            extractor = LightweightKeywordExtractor(verbose=True)
+        print()  # Add newline after initialization
+    except Exception as e:
+        print(f"Error initializing keyword extractor: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Interactive loop
+    stdin_closed = False
+    try:
+        while True:
+            print("📝 Enter text (Ctrl-D or Ctrl-Z+Enter when done):")
+            print("-" * 70)
+
+            # Read multi-line input until EOF
+            lines = []
+            try:
+                while True:
+                    line = input()
+                    lines.append(line)
+            except EOFError:
+                # Check if this is the first EOF (stdin just closed)
+                if not lines and stdin_closed:
+                    # stdin is exhausted and we have no input - exit gracefully
+                    print("\n👋 No more input available. Exiting.")
+                    return
+                stdin_closed = True
+
+            text = "\n".join(lines)
+
+            if not text.strip():
+                # If stdin is closed and input is empty, exit
+                if stdin_closed:
+                    print("\n👋 No more input available. Exiting.")
+                    return
+                print("\n⚠️ Empty input. Please enter some text.\n")
+                continue
+
+            # Extract keywords
+            print("\n" + "=" * 70)
+            print("🔑 EXTRACTED KEYWORDS:")
+            print("=" * 70)
+
+            try:
+                # Get detailed results
+                results = extractor.extract_keywords(text, top_n=15)
+
+                # Display top keywords with scores
+                top_keywords = results.get("top_keywords", [])
+                if top_keywords and isinstance(top_keywords, list):
+                    print("\n📊 Top Keywords (with scores):")
+                    for i, item in enumerate(top_keywords, 1):
+                        if isinstance(item, (list, tuple)) and len(item) >= 2:
+                            keyword, score = item[0], item[1]
+                            print(f" {i:2}. {keyword:20s} (score: {score:.4f})")
+                else:
+                    print(" No keywords extracted.")
+
+                # Display code identifiers if any
+                code_identifiers = results.get("code_identifiers")
+                if code_identifiers and isinstance(code_identifiers, list):
+                    print("\n💻 Code Identifiers (10x weight):")
+                    for ident in code_identifiers:
+                        print(f" • {ident}")
+
+                # Display code split words if any
+                code_split_words = results.get("code_split_words")
+                if code_split_words and isinstance(code_split_words, list):
+                    print("\n🔤 Code Split Words (3x weight):")
+                    for word in code_split_words[:10]:  # Limit to 10
+                        print(f" • {word}")
+
+                # Display statistics
+                stats = results.get("stats")
+                if stats and isinstance(stats, dict):
+                    print("\n📈 Statistics:")
+                    print(f" • Total tokens: {stats.get('total_tokens', 0)}")
+                    print(f" • Total words: {stats.get('total_words', 0)}")
+                    print(f" • Unique words: {stats.get('unique_words', 0)}")
+                    if "sentences" in stats:
+                        print(f" • Sentences: {stats['sentences']}")
+
+            except Exception as e:
+                print(f"\n❌ Error extracting keywords: {e}", file=sys.stderr)
+
+            print("\n" + "=" * 70 + "\n")
+
+    except KeyboardInterrupt:
+        print("\n\n👋 Exiting interactive mode. Goodbye!")
+        sys.exit(0)
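The new module exposes a single entry point, run_keywords_interactive. A minimal sketch of calling it directly (argument values taken from its docstring; the 'bert' path additionally needs KeyBERT installed):

    from cicada.keyword_test import run_keywords_interactive

    # Starts a REPL: paste text, press Ctrl-D (Unix) or Ctrl-Z+Enter (Windows)
    # to extract keywords, Ctrl-C to exit
    run_keywords_interactive(method="bert", tier="fast")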
cicada/lightweight_keyword_extractor.py CHANGED
@@ -3,10 +3,10 @@ Lightweight Keyword Extraction using lemminflect
 Fast keyword extraction for programming documentation
 """

-from collections import Counter
 import re
 import sys
 import warnings
+from collections import Counter

 from cicada.utils import split_camel_snake_case

@@ -189,9 +189,7 @@ class LightweightKeywordExtractor:
         for identifier in identifiers:
             split_text = split_camel_snake_case(identifier)
             words = [
-                word.lower()
-                for word in split_text.split()
-                if len(word) > 1 and word.isalpha()
+                word.lower() for word in split_text.split() if len(word) > 1 and word.isalpha()
             ]
             split_words.extend(words)
         return identifiers, list(set(split_words))
@@ -264,9 +262,7 @@ class LightweightKeywordExtractor:
             lemmatized_words.append(lemma)

         code_identifiers_lower = [ident.lower() for ident in code_identifiers]
-        all_keywords = (
-            lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
-        )
+        all_keywords = lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
         keyword_freq = Counter(all_keywords)
         top_keywords = keyword_freq.most_common(top_n)

@@ -274,9 +270,7 @@ class LightweightKeywordExtractor:
         # This ensures weighted keywords are included in the calculation
         total_words = len(all_keywords)
         if total_words > 0:
-            tf_scores = {
-                word: (freq / total_words) for word, freq in keyword_freq.items()
-            }
+            tf_scores = {word: (freq / total_words) for word, freq in keyword_freq.items()}
         else:
             tf_scores = {}

@@ -291,8 +285,6 @@ class LightweightKeywordExtractor:
             "lemmatized_words": list(set(lemmatized_words))[:20],
             "code_identifiers": code_identifiers,
             "code_split_words": code_split_words,
-            "tf_scores": dict(
-                sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
-            ),
+            "tf_scores": dict(sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:10]),
             "stats": stats,
         }
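The reflowed weighting line above keeps the original behavior: repeating a list before counting is what gives full identifiers a 10x and split words a 3x term frequency. A standalone sketch of that effect with toy values (not taken from the package):

    from collections import Counter

    lemmatized_words = ["validate", "session", "user"]
    code_identifiers_lower = ["getuserdata"]
    code_split_words = ["get", "user", "data"]

    # Same shape as the package's weighting: identifiers x10, split words x3
    all_keywords = lemmatized_words + (code_identifiers_lower * 10) + (code_split_words * 3)
    print(Counter(all_keywords).most_common(3))
    # [('getuserdata', 10), ('user', 4), ('get', 3)]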