PyPI - ha-mcp-dev - Versions diffs - 7.2.0.dev350__tar.gz → 7.2.0.dev351__tar.gz - Mend

ha-mcp-dev 7.2.0.dev350.tar.gz → 7.2.0.dev351.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (100) hide show

{ha_mcp_dev-7.2.0.dev350/src/ha_mcp_dev.egg-info → ha_mcp_dev-7.2.0.dev351}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ha-mcp-dev
-Version: 7.2.0.dev350
+Version: 7.2.0.dev351
 Summary: Home Assistant MCP Server - Complete control of Home Assistant through MCP
 Author-email: Julien <github@qc-h.net>
 License: MIT

{ha_mcp_dev-7.2.0.dev350 → ha_mcp_dev-7.2.0.dev351}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "ha-mcp-dev"
-version = "7.2.0.dev350"
+version = "7.2.0.dev351"
 description = "Home Assistant MCP Server - Complete control of Home Assistant through MCP"
 readme = "README.md"
 requires-python = ">=3.13,<3.14"

{ha_mcp_dev-7.2.0.dev350 → ha_mcp_dev-7.2.0.dev351}/src/ha_mcp/tools/smart_search.py RENAMED Viewed

@@ -11,7 +11,13 @@ from typing import Any
 from ..client.rest_client import HomeAssistantClient
 from ..config import get_global_settings
-from ..utils.fuzzy_search import calculate_partial_ratio, create_fuzzy_searcher
+from ..utils.fuzzy_search import (
+    BM25Scorer,
+    calculate_partial_ratio,
+    calculate_ratio,
+    create_fuzzy_searcher,
+    tokenize,
+)
 from .helpers import exception_to_structured_error
 logger = logging.getLogger(__name__)
@@ -1429,53 +1435,109 @@ class SmartSearchTools:
         query: str,
         exact_match: bool = False,
     ) -> int:
-        """
-        Recursively search for query string in nested dictionary/list structures.
+        """Search for query in nested dictionary/list structures.
         When exact_match is True, uses substring matching (returns 100 if found, 0 if not).
-        When exact_match is False, uses fuzzy matching with partial ratio scoring.
+        When exact_match is False, collects all string leaves, tokenizes them into a
+        single BM25 document, and scores against the query tokens.  Falls back to
+        token-level SequenceMatcher if BM25 returns 0 (typo correction).
         """
-        max_score = 0
+        if exact_match:
+            return self._search_in_dict_exact(data, query)
+        # Fuzzy path: collect all string leaves, build a single tokenised document
+        leaves: list[str] = []
+        self._collect_string_leaves(data, leaves)
+        if not leaves:
+            return 0
+        query_tokens = tokenize(query)
+        if not query_tokens:
+            return 0
+        # Build a single flat token list from all leaves
+        doc_tokens: list[str] = []
+        for leaf in leaves:
+            doc_tokens.extend(tokenize(leaf))
+        if not doc_tokens:
+            return 0
+        # Use BM25 with a 1-document corpus (the config dict as a single doc)
+        scorer = BM25Scorer()
+        scorer.fit([doc_tokens])
+        raw = scorer.score(query_tokens, 0)
+        if raw > 0:
+            # Normalise against the theoretical max (sum of IDF per query
+            # token). With a 1-document corpus every token's IDF is identical
+            # (~0.288 with smoothing), so the ratio effectively measures how
+            # many query tokens the config contains. Cap at 100 for the edge
+            # case where high TF pushes raw above the sum-of-IDFs baseline.
+            max_possible = scorer.max_possible_score(query_tokens)
+            if max_possible > 0:
+                return min(100, round(raw / max_possible * 100))
+            logger.warning(
+                "BM25 scored > 0 but max_possible IDF is 0; "
+                "query_tokens=%s, doc_tokens_len=%d",
+                query_tokens,
+                len(doc_tokens),
+            )
+            return 100
+        # Tier-3 fallback: token-level SequenceMatcher for typos
+        logger.debug(
+            "BM25 returned 0 for query_tokens=%s; "
+            "falling back to SequenceMatcher typo scoring over %d unique tokens",
+            query_tokens,
+            len(set(doc_tokens)),
+        )
+        best = 0
+        for qt in query_tokens:
+            for dt in set(doc_tokens):
+                best = max(best, calculate_ratio(qt, dt))
+        return best if best >= 70 else 0
+    @staticmethod
+    def _collect_string_leaves(
+        data: dict[str, Any] | list[Any] | Any, out: list[str]
+    ) -> None:
+        """Recursively collect all string representations from nested data."""
         if isinstance(data, dict):
             for key, value in data.items():
-                if exact_match:
-                    if query in str(key).lower():
-                        return 100
-                else:
-                    key_score = calculate_partial_ratio(query, str(key).lower())
-                    max_score = max(max_score, key_score)
+                out.append(str(key))
+                SmartSearchTools._collect_string_leaves(value, out)
+        elif isinstance(data, list):
+            for item in data:
+                SmartSearchTools._collect_string_leaves(item, out)
+        elif isinstance(data, str):
+            out.append(data)
+        elif data is not None:
+            out.append(str(data))
-                value_score = self._search_in_dict(value, query, exact_match)
-                max_score = max(max_score, value_score)
-                if exact_match and max_score >= 100:
+    @staticmethod
+    def _search_in_dict_exact(
+        data: dict[str, Any] | list[Any] | Any,
+        query: str,
+    ) -> int:
+        """Exact substring search in nested structures (returns 100 or 0)."""
+        if isinstance(data, dict):
+            for key, value in data.items():
+                if query in str(key).lower():
+                    return 100
+                if SmartSearchTools._search_in_dict_exact(value, query) >= 100:
                     return 100
         elif isinstance(data, list):
             for item in data:
-                item_score = self._search_in_dict(item, query, exact_match)
-                max_score = max(max_score, item_score)
-                if exact_match and max_score >= 100:
+                if SmartSearchTools._search_in_dict_exact(item, query) >= 100:
                     return 100
         elif isinstance(data, str):
-            if exact_match:
-                if query in data.lower():
-                    return 100
-            else:
-                max_score = max(max_score, calculate_partial_ratio(query, data.lower()))
+            if query in data.lower():
+                return 100
         elif data is not None:
-            if exact_match:
-                if query in str(data).lower():
-                    return 100
-            else:
-                max_score = max(
-                    max_score,
-                    calculate_partial_ratio(query, str(data).lower()),
-                )
-        return max_score
+            if query in str(data).lower():
+                return 100
+        return 0
 def create_smart_search_tools(

{ha_mcp_dev-7.2.0.dev350 → ha_mcp_dev-7.2.0.dev351}/src/ha_mcp/utils/fuzzy_search.py RENAMED Viewed

@@ -1,20 +1,139 @@
 """
 Fuzzy entity search utilities for Home Assistant MCP server.
-This module uses Python's built-in difflib for string similarity calculations,
-eliminating the need for external dependencies like textdistance and numpy.
+This module provides two search strategies:
+- BM25 keyword search (primary fuzzy path): tokenized scoring with IDF term weighting,
+  effective for multi-word queries and short entity-name corpora.
+- SequenceMatcher (tier-3 fallback): character-level similarity for single-token typo
+  correction when BM25 returns nothing.
+See issue #851 for background on the BM25 migration.
 """
 import logging
+import math
+import re
 from collections.abc import Iterable
 from difflib import SequenceMatcher
 from typing import Any
 logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Tokenizer for HA entity IDs and friendly names
+# ---------------------------------------------------------------------------
+_SPLIT_RE = re.compile(r"[._\-\s]+")
+def tokenize(text: str) -> list[str]:
+    """Split text on `.`, `_`, `-`, and whitespace, lowercase, drop empties."""
+    return [t for t in _SPLIT_RE.split(text.lower()) if t]
+# ---------------------------------------------------------------------------
+# BM25 scorer – lightweight, zero-dependency
+# ---------------------------------------------------------------------------
+class BM25Scorer:
+    """BM25 (Okapi) scorer tuned for short HA entity-name documents.
+    Parameters are set conservatively for corpora of 2-5 token documents:
+      k1=1.2  - moderate term-frequency saturation
+      b=0.5   - reduced length-normalization (entity names are uniformly short)
+    """
+    def __init__(self, k1: float = 1.2, b: float = 0.5) -> None:
+        self.k1 = k1
+        self.b = b
+        # Populated by fit()
+        self._idf: dict[str, float] = {}
+        self._doc_tokens: list[list[str]] = []
+        self._doc_lens: list[int] = []
+        self._avgdl: float = 0.0
+    # -- corpus building ----------------------------------------------------
+    def fit(self, corpus: list[list[str]]) -> None:
+        """Build IDF table from a pre-tokenized corpus."""
+        self._doc_tokens = corpus
+        n = len(corpus)
+        if n == 0:
+            return
+        self._doc_lens = [len(doc) for doc in corpus]
+        self._avgdl = sum(self._doc_lens) / n
+        # Guard against all-empty corpora: avoids nan from 0/0 in length normalization
+        if self._avgdl == 0.0:
+            self._avgdl = 1.0
+        # document frequency per token
+        df: dict[str, int] = {}
+        for doc in corpus:
+            seen: set[str] = set()
+            for token in doc:
+                if token not in seen:
+                    df[token] = df.get(token, 0) + 1
+                    seen.add(token)
+        # IDF with smoothing (Robertson variant)
+        self._idf = {
+            token: math.log((n - freq + 0.5) / (freq + 0.5) + 1.0)
+            for token, freq in df.items()
+        }
+    # -- scoring ------------------------------------------------------------
+    def score(self, query_tokens: list[str], doc_index: int) -> float:
+        """Return the BM25 score for *query_tokens* against document at *doc_index*."""
+        doc = self._doc_tokens[doc_index]
+        dl = self._doc_lens[doc_index]
+        # term frequency in this document
+        tf: dict[str, int] = {}
+        for t in doc:
+            tf[t] = tf.get(t, 0) + 1
+        total = 0.0
+        for qt in query_tokens:
+            idf = self._idf.get(qt, 0.0)
+            f = tf.get(qt, 0)
+            if f == 0:
+                continue
+            numer = f * (self.k1 + 1)
+            denom = f + self.k1 * (1 - self.b + self.b * dl / self._avgdl)
+            total += idf * numer / denom
+        return total
+    def score_all(self, query_tokens: list[str]) -> list[float]:
+        """Return BM25 scores for every document in the fitted corpus."""
+        return [self.score(query_tokens, i) for i in range(len(self._doc_tokens))]
+    def max_possible_score(self, query_tokens: list[str]) -> float:
+        """Return the theoretical maximum BM25 score for *query_tokens*.
+        Used for absolute normalization: dividing a raw score by this produces
+        a 0-1 ratio representing how close a document is to a perfect match.
+        Query tokens absent from the corpus contribute the corpus's maximum
+        IDF as a penalty — this prevents partial matches from scoring as
+        perfect matches when the other query tokens simply do not exist in
+        the corpus.
+        """
+        if not self._idf:
+            return 0.0
+        max_idf = max(self._idf.values())
+        return sum(self._idf.get(t, max_idf) for t in query_tokens)
+# ---------------------------------------------------------------------------
+# FuzzyEntitySearcher – now BM25-primary with SequenceMatcher fallback
+# ---------------------------------------------------------------------------
 class FuzzyEntitySearcher:
-    """Advanced fuzzy entity search with AI-optimized scoring."""
+    """Entity search with BM25 keyword scoring and SequenceMatcher fallback."""
     def __init__(self, threshold: int = 60):
         """Initialize with fuzzy matching threshold."""
@@ -24,14 +143,13 @@ class FuzzyEntitySearcher:
     def search_entities(
         self, entities: list[dict[str, Any]], query: str, limit: int = 10, offset: int = 0
     ) -> tuple[list[dict[str, Any]], int]:
-        """
-        Search entities with fuzzy matching and intelligent scoring.
+        """Search entities using BM25 scoring with SequenceMatcher typo fallback.
-        Args:
-            entities: List of Home Assistant entity states
-            query: Search query (can be partial, with typos)
-            limit: Maximum number of results
-            offset: Number of results to skip for pagination
+        Strategy:
+          1. Tokenize every entity (entity_id + friendly_name) into a BM25 corpus.
+          2. Score all documents with BM25.  Keep results above a positive threshold.
+          3. If BM25 returns nothing, fall back to token-level SequenceMatcher on
+             query tokens vs document tokens (catches single-character typos).
         Returns:
             Tuple of (paginated results list, total match count)
@@ -39,44 +157,110 @@ class FuzzyEntitySearcher:
         if not query or not entities:
             return [], 0
-        matches = []
         query_lower = query.lower().strip()
+        query_tokens = tokenize(query_lower)
+        if not query_tokens:
+            return [], 0
+        # Build per-entity document: tokens from entity_id + friendly_name
+        docs: list[list[str]] = []
+        meta: list[tuple[str, str, str, dict[str, Any], str]] = []  # eid, name, domain, attrs, state
         for entity in entities:
             entity_id = entity.get("entity_id", "")
             attributes = entity.get("attributes", {})
             friendly_name = attributes.get("friendly_name", entity_id)
             domain = entity_id.split(".")[0] if "." in entity_id else ""
+            state = entity.get("state", "unknown")
+            tokens = tokenize(entity_id) + tokenize(friendly_name)
+            docs.append(tokens)
+            meta.append((entity_id, friendly_name, domain, attributes, state))
+        # Fit BM25
+        scorer = BM25Scorer()
+        scorer.fit(docs)
+        raw_scores = scorer.score_all(query_tokens)
+        # Normalise against theoretical max (sum of IDFs) to produce absolute
+        # scores in the 0-100 range. Empirical-max normalization would always
+        # inflate the best match to 100 regardless of actual relevance, which
+        # defeats the purpose of a threshold-based quality gate.
+        theoretical_max = scorer.max_possible_score(query_tokens)
+        matches: list[dict[str, Any]] = []
+        if theoretical_max > 0:
+            for i, raw in enumerate(raw_scores):
+                if raw <= 0:
+                    continue
+                score = min(100, round(raw / theoretical_max * 100))
+                if score < self.threshold:
+                    continue
+                eid, fname, domain, attrs, state = meta[i]
+                matches.append({
+                    "entity_id": eid,
+                    "friendly_name": fname,
+                    "domain": domain,
+                    "state": state,
+                    "attributes": attrs,
+                    "score": score,
+                    "match_type": self._get_match_type(eid, fname, domain, query_lower),
+                })
+        # Tier-3 fallback: token-level SequenceMatcher only if BM25 scored
+        # every document at zero. Firing the fallback when BM25 found valid
+        # partial matches (just below threshold) would allow a character-level
+        # match on the same token to inflate the score to 100, re-introducing
+        # exactly the noise floor the new absolute normalization is fixing.
+        bm25_found_any = any(raw > 0 for raw in raw_scores)
+        if not matches and not bm25_found_any:
+            matches = self._typo_fallback(query_tokens, query_lower, docs, meta)
-            # Calculate comprehensive score
-            score = self._calculate_entity_score(
-                entity_id, friendly_name, domain, query_lower
-            )
-            if score >= self.threshold:
-                matches.append(
-                    {
-                        "entity_id": entity_id,
-                        "friendly_name": friendly_name,
-                        "domain": domain,
-                        "state": entity.get("state", "unknown"),
-                        "attributes": attributes,
-                        "score": score,
-                        "match_type": self._get_match_type(
-                            entity_id, friendly_name, domain, query_lower
-                        ),
-                    }
-                )
-        # Sort by score descending
         matches.sort(key=lambda x: x["score"], reverse=True)
         total_matches = len(matches)
         return matches[offset:offset + limit], total_matches
+    # -- private helpers -----------------------------------------------------
+    def _typo_fallback(
+        self,
+        query_tokens: list[str],
+        query_lower: str,
+        docs: list[list[str]],
+        meta: list[tuple[str, str, str, dict[str, Any], str]],
+    ) -> list[dict[str, Any]]:
+        """Token-level SequenceMatcher fallback for typo correction."""
+        results: list[dict[str, Any]] = []
+        for i, doc_tokens in enumerate(docs):
+            best_token_score = 0
+            for qt in query_tokens:
+                for dt in doc_tokens:
+                    ratio = calculate_ratio(qt, dt)
+                    best_token_score = max(best_token_score, ratio)
+            if best_token_score >= 75:  # stricter threshold for typo fallback
+                eid, fname, domain, attrs, state = meta[i]
+                results.append({
+                    "entity_id": eid,
+                    "friendly_name": fname,
+                    "domain": domain,
+                    "state": state,
+                    "attributes": attrs,
+                    "score": best_token_score,
+                    "match_type": "typo_fallback",
+                })
+        return results
     def _calculate_entity_score(
         self, entity_id: str, friendly_name: str, domain: str, query: str
     ) -> int:
-        """Calculate comprehensive fuzzy score for an entity."""
+        """Calculate a comprehensive fuzzy score for an entity name/domain.
+        Actively used by ``ha_deep_search`` name scoring (automation, script,
+        helper phases) to produce a score comparable to the legacy additive
+        output those paths already rely on. Do not remove without migrating
+        the deep-search callers to a BM25-based scheme.
+        """
         score = 0
         # Exact matches get highest scores

{ha_mcp_dev-7.2.0.dev350 → ha_mcp_dev-7.2.0.dev351/src/ha_mcp_dev.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ha-mcp-dev
-Version: 7.2.0.dev350
+Version: 7.2.0.dev351
 Summary: Home Assistant MCP Server - Complete control of Home Assistant through MCP
 Author-email: Julien <github@qc-h.net>
 License: MIT