PyPI - vouch - Versions diffs - 0.2.0__py3-none-any.whl - Mend

vouch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

vouch/__init__.py +59 -0
vouch/_lang.py +264 -0
vouch/_llm.py +181 -0
vouch/adapters/__init__.py +68 -0
vouch/adapters/base.py +28 -0
vouch/adapters/browser.py +299 -0
vouch/adapters/browser_pool.py +240 -0
vouch/adapters/http.py +183 -0
vouch/adapters/stealth.py +40 -0
vouch/captcha/__init__.py +25 -0
vouch/captcha/solver.py +245 -0
vouch/captcha/tesseract.py +147 -0
vouch/catalog.py +306 -0
vouch/cli.py +364 -0
vouch/config.py +95 -0
vouch/discovery/__init__.py +9 -0
vouch/discovery/cache.py +163 -0
vouch/discovery/humanize.py +57 -0
vouch/discovery/probe.py +58 -0
vouch/discovery/search_bar.py +136 -0
vouch/dns_resolver.py +103 -0
vouch/engine.py +693 -0
vouch/exceptions.py +45 -0
vouch/extraction/__init__.py +7 -0
vouch/extraction/css_selectors.py +366 -0
vouch/extraction/llm.py +41 -0
vouch/extraction/llm_extract.py +509 -0
vouch/extraction/pdf.py +32 -0
vouch/extraction/trafilatura.py +243 -0
vouch/integrations/__init__.py +9 -0
vouch/integrations/_common.py +30 -0
vouch/integrations/crewai.py +52 -0
vouch/integrations/langchain.py +55 -0
vouch/integrations/mcp.py +90 -0
vouch/integrations/pydantic_ai.py +45 -0
vouch/models.py +129 -0
vouch/monitor/__init__.py +8 -0
vouch/monitor/notify.py +58 -0
vouch/monitor/watcher.py +141 -0
vouch/plugins.py +169 -0
vouch/profiles/__init__.py +28 -0
vouch/profiles/builtin.yaml +169 -0
vouch/profiles/registry.py +114 -0
vouch/profiles/update.py +168 -0
vouch/router/__init__.py +33 -0
vouch/router/all_router.py +12 -0
vouch/router/base.py +40 -0
vouch/router/embedding_router.py +91 -0
vouch/router/llm_router.py +109 -0
vouch/router/tag_router.py +48 -0
vouch/server.py +168 -0
vouch-0.2.0.dist-info/METADATA +1074 -0
vouch-0.2.0.dist-info/RECORD +56 -0
vouch-0.2.0.dist-info/WHEEL +4 -0
vouch-0.2.0.dist-info/entry_points.txt +2 -0
vouch-0.2.0.dist-info/licenses/LICENSE +21 -0

vouch/__init__.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""vouch — curated AI search for agents.
+Public API:
+    search          — one-shot search function (Level 1)
+    SearchEngine    — orchestrator with persistent catalog (Level 2/3)
+    Site            — declarative source descriptor
+    Catalog         — SQLite-backed registry of sites
+    Monitor         — change tracking (optional)
+See README.md for the full guide.
+"""
+from __future__ import annotations
+from .catalog import Catalog, Site
+from .engine import SearchEngine, search
+from .exceptions import (
+    AdapterError,
+    BlockedError,
+    CatalogError,
+    CurioError,  # back-compat alias (deprecated, will be removed in v1.0)
+    DiscoveryError,
+    RouterError,
+    VouchError,
+)
+from .models import Chunk, RouteDecision, SearchResult
+from .profiles import ProfileRegistry, get_profile, list_profiles
+# Package version string; it is also exported through ``__all__`` below.
+__version__ = "0.2.0"
+# Names exported by ``from vouch import *``. ``Monitor`` is not listed here:
+# it is resolved on attribute access by the module-level ``__getattr__``.
+__all__ = [
+    "AdapterError",
+    "BlockedError",
+    "Catalog",
+    "CatalogError",
+    "Chunk",
+    "CurioError",
+    "DiscoveryError",
+    "ProfileRegistry",
+    "RouteDecision",
+    "RouterError",
+    "SearchEngine",
+    "SearchResult",
+    "Site",
+    "VouchError",
+    "__version__",
+    "get_profile",
+    "list_profiles",
+    "search",
+]
+def __getattr__(name: str):
+    # Lazy import for optional Monitor (requires apscheduler).
+    if name == "Monitor":
+        from .monitor.watcher import Monitor
+        return Monitor
+    raise AttributeError(f"module 'vouch' has no attribute {name!r}")

vouch/_lang.py ADDED Viewed

@@ -0,0 +1,264 @@
+"""Lightweight language detection — heuristic, zero deps, fast.
+Used to set Accept-Language headers and to bias the router slightly toward
+sites tagged with the same language. Not perfect, just useful: distinguishes
+PT / ES / EN / FR / DE / IT well enough for routing.
+"""
+from __future__ import annotations
+import re
+import unicodedata
+from collections import Counter
+# Portuguese marker words (lowercase). detect_language() counts how many of
+# these appear among the query's tokens to build the "pt" score.
+# Several entries ("no", "que", "para", "como", "está", "decreto",
+# "presidente") are also present in _ES_MARKERS.
+_PT_MARKERS = {
+    # function words and common content words
+    "de",
+    "do",
+    "da",
+    "dos",
+    "das",
+    "no",
+    "na",
+    "nos",
+    "nas",
+    "que",
+    "uma",
+    "uns",
+    "para",
+    "como",
+    "com",
+    "mais",
+    "ou",
+    "não",
+    "está",
+    "são",
+    "também",
+    "você",
+    "três",
+    "depois",
+    "muito",
+    "isso",
+    "esse",
+    "essa",
+    "fazer",
+    "tem",
+    "porque",
+    "através",
+    # very-PT content words / domain
+    "história",
+    "biografia",
+    "geografia",
+    "política",
+    "saúde",
+    "imposto",
+    "renda",
+    "tributação",
+    "tributário",
+    "fiscal",
+    "previdenciário",
+    "regulação",
+    "código",
+    "lei",
+    "ministério",
+    "brasil",
+    "brasileira",
+    "brasileiro",
+    "português",
+    "decreto",
+    "decisão",
+    "secretaria",
+    "ministra",
+    "presidente",
+    "futebol",
+    "samba",
+    "feijoada",
+    "açúcar",
+    "pão",
+    "começar",
+    "começo",
+}
+# Spanish marker words (lowercase). detect_language() counts how many of
+# these appear among the query's tokens to build the "es" score.
+# Some entries overlap with other tables: "no", "que", "para", "como",
+# "está", "decreto" and "presidente" are also in _PT_MARKERS, and "la" is
+# also in _FR_MARKERS.
+_ES_MARKERS = {
+    # function words
+    "el",
+    "la",
+    "los",
+    "las",
+    "un",
+    "una",
+    "y",
+    "que",
+    "para",
+    "como",
+    "más",
+    "también",
+    "del",
+    "al",
+    "se",
+    "no",
+    # diacritics-only ES
+    "español",
+    "españa",
+    "está",
+    "qué",
+    "cómo",
+    "cuándo",
+    "dónde",
+    "quién",
+    "según",
+    "después",
+    "días",
+    "años",
+    # domain
+    "tributación",
+    "renta",
+    "modelo",
+    "hacienda",
+    "autónomo",
+    "ahorro",
+    "cuenta",
+    "cuota",
+    "tarifa",
+    "vivienda",
+    "deducción",
+    "criptomonedas",
+    "presidente",
+    "ministro",
+    "decreto",
+    "historia",
+    "biografía",
+    "literatura",
+    "música",
+    "fútbol",
+    "paella",
+    "tortilla",
+    "flamenco",
+}
+# English marker words (lowercase). detect_language() counts how many of
+# these appear among the query's tokens to build the "en" score.
+_EN_MARKERS = {
+    # function words
+    "the",
+    "and",
+    "of",
+    "for",
+    "with",
+    "to",
+    "in",
+    "on",
+    "at",
+    "by",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "been",
+    "this",
+    "that",
+    "what",
+    "how",
+    "why",
+    "where",
+    "when",
+    "who",
+    "you",
+    "your",
+    "us",
+    "we",
+    "they",
+    "their",
+    "all",
+    "any",
+    "some",
+    "more",
+    "most",
+    "best",
+    "good",
+    "tutorial",
+    "explained",
+    "guide",
+    "review",
+    "comparison",
+    # domain
+    "tax",
+    "deduction",
+    "irs",
+    "hmrc",
+    "vat",
+    "ein",
+    "401k",
+    "history",
+    "biography",
+    "anatomy",
+    "evolution",
+    "recipe",
+    "workout",
+    "wine",
+    "league",
+    "season",
+    "transformer",
+    "paper",
+    "model",
+    "framework",
+}
+# Small marker tables for French, Italian and German. They are matched the
+# same way as the larger tables: by exact equality against whole tokens.
+# NOTE(review): the "ç" entry in _FR_MARKERS can only match a token that is
+# exactly "ç"; words merely containing "ç" (e.g. "français") match via their
+# own entry, not via this one. Confirm whether a substring check was intended.
+# "la" also appears in _ES_MARKERS and "das" also appears in _PT_MARKERS.
+_FR_MARKERS = {"le", "la", "les", "des", "ç", "déclaration", "impôt", "français"}
+_IT_MARKERS = {"il", "gli", "tasse", "imposte", "dichiarazione", "italiano"}
+_DE_MARKERS = {"der", "die", "das", "und", "steuer", "erklärung", "deutsch"}
+# Tokenizer: runs of Unicode word characters (letters, digits, underscore).
+_TOKEN = re.compile(r"\w+", re.UNICODE)
+def detect_language(text: str) -> str:
+    """Return ``"pt"``, ``"es"``, ``"en"``, ``"fr"``, ``"de"``, ``"it"``, or ``"unknown"``."""
+    text = (text or "").strip().lower()
+    if not text:
+        return "unknown"
+    # Diacritic-based fast path.
+    if "ção" in text or "ções" in text or "ões" in text:
+        return "pt"
+    if "ñ" in text:
+        return "es"
+    tokens = set(_TOKEN.findall(text))
+    score = Counter()
+    score["pt"] = len(tokens & _PT_MARKERS) + (2 if any("ç" in t for t in tokens) else 0)
+    score["es"] = len(tokens & _ES_MARKERS)
+    score["en"] = len(tokens & _EN_MARKERS)
+    score["fr"] = len(tokens & _FR_MARKERS)
+    score["it"] = len(tokens & _IT_MARKERS)
+    score["de"] = len(tokens & _DE_MARKERS)
+    # Tie-break: ASCII-only with EN markers → en. With unicode accents → pt or es.
+    has_diacritics = any(
+        unicodedata.category(c) == "Mn" for c in unicodedata.normalize("NFD", text)
+    )
+    if not has_diacritics and score["en"] > 0 and score["en"] >= score.most_common(1)[0][1]:
+        return "en"
+    best, n = score.most_common(1)[0]
+    if n == 0:
+        return "unknown"
+    return best
+# Accept-Language header value for each language code that detect_language()
+# can return. The "unknown" entry is the fallback used for unrecognized codes.
+_ACCEPT_LANG = {
+    "pt": "pt-BR,pt;q=0.9,en;q=0.5",
+    "es": "es-ES,es;q=0.9,en;q=0.5",
+    "en": "en-US,en;q=0.9",
+    "fr": "fr-FR,fr;q=0.9,en;q=0.5",
+    "de": "de-DE,de;q=0.9,en;q=0.5",
+    "it": "it-IT,it;q=0.9,en;q=0.5",
+    "unknown": "en-US,en;q=0.9,pt;q=0.6,es;q=0.6",
+}
+def accept_language_for(text_or_lang: str) -> str:
+    """Return an Accept-Language header value for a query string OR a language code."""
+    val = text_or_lang
+    if len(val) > 5 or " " in val:
+        val = detect_language(val)
+    return _ACCEPT_LANG.get(val, _ACCEPT_LANG["unknown"])
+__all__ = ["accept_language_for", "detect_language"]

vouch/_llm.py ADDED Viewed

@@ -0,0 +1,181 @@
+"""Thin wrapper over LiteLLM with retries, fallback chains, and cost tracking."""
+from __future__ import annotations
+import json
+import logging
+import os
+import re
+from typing import Any
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
+from .exceptions import LLMError
+from .models import TokenUsage
+log = logging.getLogger("vouch.llm")
+def _apply_keys(api_keys: dict[str, str] | None) -> None:
+    if not api_keys:
+        return
+    mapping = {
+        "anthropic": "ANTHROPIC_API_KEY",
+        "openai": "OPENAI_API_KEY",
+        "gemini": "GEMINI_API_KEY",
+        "google": "GEMINI_API_KEY",
+        "groq": "GROQ_API_KEY",
+        "mistral": "MISTRAL_API_KEY",
+        "cohere": "COHERE_API_KEY",
+        "azure": "AZURE_API_KEY",
+        "deepseek": "DEEPSEEK_API_KEY",
+    }
+    for provider, key in api_keys.items():
+        env = mapping.get(provider.lower(), provider.upper() + "_API_KEY")
+        os.environ.setdefault(env, key)
+# Approximate per-million-token pricing — used only for estimates, not billing.
+# Maps a model-name fragment to an (input, output) price pair, per million
+# tokens. estimate_cost() matches these keys as substrings of the model name
+# and uses the "ollama" entry (zero cost) when none matches.
+_PRICE = {
+    "claude-haiku-4-5": (1.0, 5.0),
+    "claude-sonnet-4-6": (3.0, 15.0),
+    "claude-opus-4-7": (15.0, 75.0),
+    "gpt-4.1-mini": (0.15, 0.6),
+    "gpt-4o-mini": (0.15, 0.6),
+    "gpt-4.1": (5.0, 15.0),
+    "gemini-2.5-flash": (0.075, 0.3),
+    "gemini-2.5-flash-lite": (0.04, 0.15),
+    "gemini-2.5-pro": (1.25, 5.0),
+    "qwen2.5": (0.0, 0.0),
+    "llama3": (0.0, 0.0),
+    "ollama": (0.0, 0.0),
+}
+def estimate_cost(model: str, tokens_in: int, tokens_out: int) -> float:
+    key = next((k for k in _PRICE if k in model), "ollama")
+    pin, pout = _PRICE[key]
+    return (tokens_in * pin + tokens_out * pout) / 1_000_000
+class LLMClient:
+    """Wraps litellm.completion with fallback chain + token accounting."""
+    def __init__(self, model: str | list[str], api_keys: dict[str, str] | None = None):
+        self.models = [model] if isinstance(model, str) else list(model)
+        if not self.models:
+            raise ValueError("LLMClient requires at least one model")
+        _apply_keys(api_keys)
+        self.tokens = TokenUsage()
+        self.cost_usd = 0.0
+    @retry(
+        retry=retry_if_exception_type(Exception),
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=1, max=8),
+        reraise=True,
+    )
+    def _call_one(self, model: str, messages: list[dict], **kwargs) -> dict:
+        import litellm
+        # Avoid noisy "Provider List" logs.
+        litellm.suppress_debug_info = True
+        return litellm.completion(model=model, messages=messages, **kwargs)
+    def chat(
+        self,
+        messages: list[dict],
+        *,
+        temperature: float = 0.0,
+        max_tokens: int | None = None,
+        response_format: dict | None = None,
+        timeout: float | None = 60,
+        **kwargs,
+    ) -> str:
+        last_err: Exception | None = None
+        for model in self.models:
+            try:
+                resp = self._call_one(
+                    model,
+                    messages,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    response_format=response_format,
+                    timeout=timeout,
+                    **kwargs,
+                )
+                txt, usage = _extract(resp)
+                self.tokens = self.tokens.add(usage)
+                self.cost_usd += estimate_cost(model, usage.input, usage.output)
+                return txt
+            except Exception as e:
+                log.warning("LLM call to %s failed: %s", model, e)
+                last_err = e
+        raise LLMError(f"All LLM models failed. Last error: {last_err}")
+    def chat_json(self, messages: list[dict], **kwargs) -> Any:
+        """Force JSON output and parse. Falls back to first JSON-looking block."""
+        kwargs.setdefault("response_format", {"type": "json_object"})
+        kwargs.setdefault("temperature", 0.0)
+        try:
+            txt = self.chat(messages, **kwargs)
+        except LLMError:
+            kwargs.pop("response_format", None)
+            txt = self.chat(messages, **kwargs)
+        return _parse_json_loose(txt)
+    def vision(
+        self,
+        prompt: str,
+        image_b64: str,
+        *,
+        mime: str = "image/png",
+        **kwargs,
+    ) -> str:
+        msg = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:{mime};base64,{image_b64}"},
+                    },
+                ],
+            }
+        ]
+        return self.chat(msg, **kwargs)
+def _extract(resp) -> tuple[str, TokenUsage]:
+    try:
+        msg = resp.choices[0].message
+        text = msg.content or ""
+    except Exception as e:
+        raise LLMError(f"Could not extract message from LLM response: {e}") from e
+    usage = getattr(resp, "usage", None) or {}
+    pin = getattr(usage, "prompt_tokens", None) or usage.get("prompt_tokens", 0) if usage else 0
+    pout = (
+        getattr(usage, "completion_tokens", None) or usage.get("completion_tokens", 0)
+        if usage
+        else 0
+    )
+    return text, TokenUsage(input=int(pin or 0), output=int(pout or 0))
+_JSON_BLOCK = re.compile(r"\{.*\}|\[.*\]", re.DOTALL)
+def _parse_json_loose(text: str) -> Any:
+    text = text.strip()
+    # Strip markdown fences.
+    if text.startswith("```"):
+        text = re.sub(r"^```[a-zA-Z]*\n?", "", text)
+        text = re.sub(r"\n?```$", "", text)
+    try:
+        return json.loads(text)
+    except Exception:
+        m = _JSON_BLOCK.search(text)
+        if not m:
+            raise LLMError(f"Could not parse JSON from LLM output: {text[:200]!r}") from None
+        return json.loads(m.group(0))

vouch/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Per-site search executors."""
+from __future__ import annotations
+from .base import AdapterContext, SiteAdapter
+from .http import HTTPAdapter
+__all__ = ["AdapterContext", "HTTPAdapter", "SiteAdapter", "build_adapter"]
+def build_adapter(
+    site, config, *, llm=None, selector_cache=None, pool=None, stealth_pool=None
+) -> SiteAdapter:
+    """Pick an adapter for a site based on its declared behavior + capabilities.
+    Order of preference:
+      0. Third-party plugin registered for this domain via ``vouch.adapters``
+         entry points (e.g. ``vouch-adapter-arxiv`` published on PyPI).
+      1. ``behavior="external"`` → HTTPAdapter (commercial-bypass plug point).
+      2. Has ``search_url_template`` and not stealth → HTTPAdapter (10x faster, no Chromium).
+      3. ``behavior="stealth"`` → patchright BrowserAdapter.
+      4. Default → playwright BrowserAdapter, falling back to HTTP if not installed.
+    The engine passes its shared ``pool`` (and optionally ``stealth_pool``) so
+    every BrowserAdapter the engine constructs reuses a single Chromium
+    instance instead of launching one per call.
+    """
+    # Tier 0: check for a third-party plugin registered for this host.
+    try:
+        from ..plugins import find_adapter_factory
+        factory = find_adapter_factory(site.url)
+        if factory is not None:
+            try:
+                return factory(
+                    site=site,
+                    config=config,
+                    llm=llm,
+                    selector_cache=selector_cache,
+                    pool=pool,
+                    stealth_pool=stealth_pool,
+                )
+            except TypeError:
+                # Older plugins may not accept all kwargs — be permissive.
+                return factory(config=config, llm=llm, selector_cache=selector_cache)
+    except Exception:
+        pass  # plugin path is best-effort; never block on it
+    behavior = site.behavior
+    if behavior == "external":
+        return HTTPAdapter(config=config, llm=llm, selector_cache=selector_cache)
+    if site.search_url_template and behavior != "stealth":
+        return HTTPAdapter(config=config, llm=llm, selector_cache=selector_cache)
+    if behavior == "stealth":
+        try:
+            from .stealth import StealthBrowserAdapter
+            return StealthBrowserAdapter(
+                config=config, llm=llm, selector_cache=selector_cache, pool=stealth_pool
+            )
+        except ImportError:
+            pass
+    try:
+        from .browser import BrowserAdapter
+        return BrowserAdapter(config=config, llm=llm, selector_cache=selector_cache, pool=pool)
+    except ImportError:
+        return HTTPAdapter(config=config)

vouch/adapters/base.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""SiteAdapter Protocol — the contract for per-site search execution."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Protocol, runtime_checkable
+from ..catalog import Site
+from ..models import Chunk
+@dataclass
+class AdapterContext:
+    """Bundle of inputs passed to ``SiteAdapter.search`` for one search call."""
+    # Catalog entry describing the site to search.
+    site: Site
+    # Query text for the search.
+    query: str
+    # NOTE(review): meaning of depth is not visible in this module — confirm
+    # against the adapters that consume it.
+    depth: int
+    # Presumably an upper bound on returned chunks — confirm against adapters.
+    max_results: int = 10
+    # Presumably seconds — confirm against the adapters that consume it.
+    timeout: float = 60.0
+    # Free-form extra data; default_factory gives each instance its own dict.
+    extra: dict = field(default_factory=dict)
+@runtime_checkable
+class SiteAdapter(Protocol):
+    """Contract for any per-site search executor.
+    Structural (duck-typed) interface: any object providing ``search`` and
+    ``close`` satisfies it, and ``runtime_checkable`` allows ``isinstance``
+    checks (which verify only that the methods exist, not their signatures).
+    """
+    # Run the search described by ctx and return the resulting chunks.
+    def search(self, ctx: AdapterContext) -> list[Chunk]: ...
+    # NOTE(review): presumably releases resources held by the adapter —
+    # confirm against the concrete adapters.
+    def close(self) -> None: ...