vouch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. vouch/__init__.py +59 -0
  2. vouch/_lang.py +264 -0
  3. vouch/_llm.py +181 -0
  4. vouch/adapters/__init__.py +68 -0
  5. vouch/adapters/base.py +28 -0
  6. vouch/adapters/browser.py +299 -0
  7. vouch/adapters/browser_pool.py +240 -0
  8. vouch/adapters/http.py +183 -0
  9. vouch/adapters/stealth.py +40 -0
  10. vouch/captcha/__init__.py +25 -0
  11. vouch/captcha/solver.py +245 -0
  12. vouch/captcha/tesseract.py +147 -0
  13. vouch/catalog.py +306 -0
  14. vouch/cli.py +364 -0
  15. vouch/config.py +95 -0
  16. vouch/discovery/__init__.py +9 -0
  17. vouch/discovery/cache.py +163 -0
  18. vouch/discovery/humanize.py +57 -0
  19. vouch/discovery/probe.py +58 -0
  20. vouch/discovery/search_bar.py +136 -0
  21. vouch/dns_resolver.py +103 -0
  22. vouch/engine.py +693 -0
  23. vouch/exceptions.py +45 -0
  24. vouch/extraction/__init__.py +7 -0
  25. vouch/extraction/css_selectors.py +366 -0
  26. vouch/extraction/llm.py +41 -0
  27. vouch/extraction/llm_extract.py +509 -0
  28. vouch/extraction/pdf.py +32 -0
  29. vouch/extraction/trafilatura.py +243 -0
  30. vouch/integrations/__init__.py +9 -0
  31. vouch/integrations/_common.py +30 -0
  32. vouch/integrations/crewai.py +52 -0
  33. vouch/integrations/langchain.py +55 -0
  34. vouch/integrations/mcp.py +90 -0
  35. vouch/integrations/pydantic_ai.py +45 -0
  36. vouch/models.py +129 -0
  37. vouch/monitor/__init__.py +8 -0
  38. vouch/monitor/notify.py +58 -0
  39. vouch/monitor/watcher.py +141 -0
  40. vouch/plugins.py +169 -0
  41. vouch/profiles/__init__.py +28 -0
  42. vouch/profiles/builtin.yaml +169 -0
  43. vouch/profiles/registry.py +114 -0
  44. vouch/profiles/update.py +168 -0
  45. vouch/router/__init__.py +33 -0
  46. vouch/router/all_router.py +12 -0
  47. vouch/router/base.py +40 -0
  48. vouch/router/embedding_router.py +91 -0
  49. vouch/router/llm_router.py +109 -0
  50. vouch/router/tag_router.py +48 -0
  51. vouch/server.py +168 -0
  52. vouch-0.2.0.dist-info/METADATA +1074 -0
  53. vouch-0.2.0.dist-info/RECORD +56 -0
  54. vouch-0.2.0.dist-info/WHEEL +4 -0
  55. vouch-0.2.0.dist-info/entry_points.txt +2 -0
  56. vouch-0.2.0.dist-info/licenses/LICENSE +21 -0
vouch/__init__.py ADDED
@@ -0,0 +1,59 @@
1
+ """vouch — curated AI search for agents.
2
+
3
+ Public API:
4
+ search — one-shot search function (Level 1)
5
+ SearchEngine — orchestrator with persistent catalog (Level 2/3)
6
+ Site — declarative source descriptor
7
+ Catalog — SQLite-backed registry of sites
8
+ Monitor — change tracking (optional)
9
+
10
+ See README.md for the full guide.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from .catalog import Catalog, Site
16
+ from .engine import SearchEngine, search
17
+ from .exceptions import (
18
+ AdapterError,
19
+ BlockedError,
20
+ CatalogError,
21
+ CurioError, # back-compat alias (deprecated, will be removed in v1.0)
22
+ DiscoveryError,
23
+ RouterError,
24
+ VouchError,
25
+ )
26
+ from .models import Chunk, RouteDecision, SearchResult
27
+ from .profiles import ProfileRegistry, get_profile, list_profiles
28
+
29
+ __version__ = "0.2.0"
30
+
31
+ __all__ = [
32
+ "AdapterError",
33
+ "BlockedError",
34
+ "Catalog",
35
+ "CatalogError",
36
+ "Chunk",
37
+ "CurioError",
38
+ "DiscoveryError",
39
+ "ProfileRegistry",
40
+ "RouteDecision",
41
+ "RouterError",
42
+ "SearchEngine",
43
+ "SearchResult",
44
+ "Site",
45
+ "VouchError",
46
+ "__version__",
47
+ "get_profile",
48
+ "list_profiles",
49
+ "search",
50
+ ]
51
+
52
+
53
def __getattr__(name: str):
    """Resolve lazily-loaded package attributes (module-level ``__getattr__``).

    Only ``Monitor`` is resolved here. Its import is deferred until first
    access because it requires the optional ``apscheduler`` dependency.

    Raises:
        AttributeError: for any name other than ``"Monitor"``.
    """
    if name != "Monitor":
        raise AttributeError(f"module 'vouch' has no attribute {name!r}")

    # Imported here (not at module top) so a plain ``import vouch`` never
    # needs the optional dependency.
    from .monitor.watcher import Monitor

    return Monitor
vouch/_lang.py ADDED
@@ -0,0 +1,264 @@
1
+ """Lightweight language detection — heuristic, zero deps, fast.
2
+
3
+ Used to set Accept-Language headers and to bias the router slightly toward
4
+ sites tagged with the same language. Not perfect, just useful: distinguishes
5
+ PT / ES / EN / FR / DE / IT well enough for routing.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ import unicodedata
12
+ from collections import Counter
13
+
14
# Marker vocabularies: lower-case whole-word tokens whose presence counts as
# one point of evidence for the language. Some words deliberately appear in
# more than one set (e.g. "que", "para", "das"); the scoring below sorts it out.
_PT_MARKERS = {
    # function words and common content words
    "de", "do", "da", "dos", "das", "no", "na", "nos", "nas", "que",
    "uma", "uns", "para", "como", "com", "mais", "ou", "não", "está", "são",
    "também", "você", "três", "depois", "muito", "isso", "esse", "essa",
    "fazer", "tem", "porque", "através",
    # very-PT content words / domain
    "história", "biografia", "geografia", "política", "saúde", "imposto",
    "renda", "tributação", "tributário", "fiscal", "previdenciário",
    "regulação", "código", "lei", "ministério", "brasil", "brasileira",
    "brasileiro", "português", "decreto", "decisão", "secretaria", "ministra",
    "presidente", "futebol", "samba", "feijoada", "açúcar", "pão", "começar",
    "começo",
}
_ES_MARKERS = {
    # function words
    "el", "la", "los", "las", "un", "una", "y", "que", "para", "como",
    "más", "también", "del", "al", "se", "no",
    # diacritics-only ES
    "español", "españa", "está", "qué", "cómo", "cuándo", "dónde", "quién",
    "según", "después", "días", "años",
    # domain
    "tributación", "renta", "modelo", "hacienda", "autónomo", "ahorro",
    "cuenta", "cuota", "tarifa", "vivienda", "deducción", "criptomonedas",
    "presidente", "ministro", "decreto", "historia", "biografía", "literatura",
    "música", "fútbol", "paella", "tortilla", "flamenco",
}
_EN_MARKERS = {
    # function words
    "the", "and", "of", "for", "with", "to", "in", "on", "at", "by",
    "is", "are", "was", "were", "be", "been", "this", "that", "what", "how",
    "why", "where", "when", "who", "you", "your", "us", "we", "they", "their",
    "all", "any", "some", "more", "most", "best", "good", "tutorial",
    "explained", "guide", "review", "comparison",
    # domain
    "tax", "deduction", "irs", "hmrc", "vat", "ein", "401k", "history",
    "biography", "anatomy", "evolution", "recipe", "workout", "wine", "league",
    "season", "transformer", "paper", "model", "framework",
}
# NOTE: the bare "ç" entry only matches a stand-alone "ç" token; cedillas
# inside words are handled by the character-level bonus in detect_language().
_FR_MARKERS = {"le", "la", "les", "des", "ç", "déclaration", "impôt", "français"}
_IT_MARKERS = {"il", "gli", "tasse", "imposte", "dichiarazione", "italiano"}
_DE_MARKERS = {"der", "die", "das", "und", "steuer", "erklärung", "deutsch"}

_TOKEN = re.compile(r"\w+", re.UNICODE)


def detect_language(text: str) -> str:
    """Return ``"pt"``, ``"es"``, ``"en"``, ``"fr"``, ``"de"``, ``"it"``, or ``"unknown"``.

    Heuristic only: a couple of character-sequence fast paths, then a count of
    marker words per language. Empty/``None`` input yields ``"unknown"``.
    """
    # Compose to NFC first so that decomposed input ("c" + combining cedilla)
    # still hits the substring fast paths and is not split at combining marks
    # by the ``\w+`` tokenizer.
    text = unicodedata.normalize("NFC", text or "").strip().lower()
    if not text:
        return "unknown"

    # Diacritic-based fast path.
    if "ção" in text or "ções" in text or "ões" in text:
        return "pt"
    if "ñ" in text:
        return "es"

    tokens = set(_TOKEN.findall(text))

    # Insertion order matters: Counter.most_common() breaks ties in favour of
    # the language inserted first, so keep pt, es, en, fr, it, de.
    score = Counter()
    score["pt"] = len(tokens & _PT_MARKERS)
    score["es"] = len(tokens & _ES_MARKERS)
    score["en"] = len(tokens & _EN_MARKERS)
    score["fr"] = len(tokens & _FR_MARKERS)
    score["it"] = len(tokens & _IT_MARKERS)
    score["de"] = len(tokens & _DE_MARKERS)

    # A cedilla is evidence for Portuguese *or* French. Give the +2 bonus to
    # French only when the word evidence already favours French; otherwise it
    # goes to Portuguese as before (so a lone "ç" word still reads as PT).
    if any("ç" in t for t in tokens):
        score["fr" if score["fr"] > score["pt"] else "pt"] += 2

    # Tie-break: ASCII-only with EN markers → en. With unicode accents → pt or es.
    has_diacritics = any(
        unicodedata.category(c) == "Mn" for c in unicodedata.normalize("NFD", text)
    )
    best, n = score.most_common(1)[0]
    if not has_diacritics and score["en"] > 0 and score["en"] >= n:
        return "en"

    if n == 0:
        return "unknown"
    return best
243
+
244
+
245
# Accept-Language header value per detected language; "unknown" is the
# catch-all used when nothing could be detected.
_ACCEPT_LANG = {
    "pt": "pt-BR,pt;q=0.9,en;q=0.5",
    "es": "es-ES,es;q=0.9,en;q=0.5",
    "en": "en-US,en;q=0.9",
    "fr": "fr-FR,fr;q=0.9,en;q=0.5",
    "de": "de-DE,de;q=0.9,en;q=0.5",
    "it": "it-IT,it;q=0.9,en;q=0.5",
    "unknown": "en-US,en;q=0.9,pt;q=0.6,es;q=0.6",
}

# Two-letter language + two-letter region, e.g. "pt-br" (after normalisation).
_LANG_REGION_TAG = re.compile(r"[a-z]{2}-[a-z]{2}")


def accept_language_for(text_or_lang: str) -> str:
    """Return an Accept-Language header value for a query string OR a language code.

    Resolution order:
      1. Empty/``None`` input → the ``"unknown"`` header.
      2. A known language code, case-insensitively (``"pt"``, ``"EN"``).
      3. A ``xx-YY`` / ``xx_YY`` tag whose language part is known (``"pt-BR"``).
      4. Anything else is treated as query text and run through
         ``detect_language``; this includes short queries such as ``"tax"``,
         which were previously mistaken for (unknown) language codes.
    """
    val = (text_or_lang or "").strip()
    if not val:
        return _ACCEPT_LANG["unknown"]

    code = val.lower().replace("_", "-")
    if code in _ACCEPT_LANG:
        return _ACCEPT_LANG[code]
    if _LANG_REGION_TAG.fullmatch(code) and code[:2] in _ACCEPT_LANG:
        return _ACCEPT_LANG[code[:2]]

    return _ACCEPT_LANG.get(detect_language(val), _ACCEPT_LANG["unknown"])
262
+
263
+
264
+ __all__ = ["accept_language_for", "detect_language"]
vouch/_llm.py ADDED
@@ -0,0 +1,181 @@
1
+ """Thin wrapper over LiteLLM with retries, fallback chains, and cost tracking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ import re
9
+ from typing import Any
10
+
11
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
12
+
13
+ from .exceptions import LLMError
14
+ from .models import TokenUsage
15
+
16
+ log = logging.getLogger("vouch.llm")
17
+
18
+
19
def _apply_keys(api_keys: dict[str, str] | None) -> None:
    """Export provider API keys into ``os.environ`` without overriding existing values.

    Provider names are matched case-insensitively against a table of known
    providers; an unlisted provider ``foo`` maps to ``FOO_API_KEY``. A
    variable that is already set in the environment is left untouched.
    """
    if not api_keys:
        return

    env_names = {
        "anthropic": "ANTHROPIC_API_KEY",
        "openai": "OPENAI_API_KEY",
        "gemini": "GEMINI_API_KEY",
        "google": "GEMINI_API_KEY",
        "groq": "GROQ_API_KEY",
        "mistral": "MISTRAL_API_KEY",
        "cohere": "COHERE_API_KEY",
        "azure": "AZURE_API_KEY",
        "deepseek": "DEEPSEEK_API_KEY",
    }
    for provider, key in api_keys.items():
        env_var = env_names.get(provider.lower())
        if env_var is None:
            env_var = provider.upper() + "_API_KEY"
        # Environment wins: only fill in variables that are not set yet.
        if env_var not in os.environ:
            os.environ[env_var] = key
36
+
37
+
38
# Approximate per-million-token pricing — used only for estimates, not billing.
# Values are (input price, output price) in USD per one million tokens.
_PRICE = {
    "claude-haiku-4-5": (1.0, 5.0),
    "claude-sonnet-4-6": (3.0, 15.0),
    "claude-opus-4-7": (15.0, 75.0),
    "gpt-4.1-mini": (0.15, 0.6),
    "gpt-4o-mini": (0.15, 0.6),
    "gpt-4.1": (5.0, 15.0),
    "gemini-2.5-flash": (0.075, 0.3),
    "gemini-2.5-flash-lite": (0.04, 0.15),
    "gemini-2.5-pro": (1.25, 5.0),
    "qwen2.5": (0.0, 0.0),
    "llama3": (0.0, 0.0),
    "ollama": (0.0, 0.0),
}


def estimate_cost(model: str, tokens_in: int, tokens_out: int) -> float:
    """Estimate the USD cost of a call from its token counts.

    The price entry is chosen by substring match against *model* (so provider
    prefixes such as ``"gemini/..."`` work). When several keys match, the
    longest one wins, so ``"gemini-2.5-flash-lite"`` is not priced as
    ``"gemini-2.5-flash"``. Models with no matching key are treated as free.
    """
    name = (model or "").lower()
    key = max((k for k in _PRICE if k in name), key=len, default="ollama")
    pin, pout = _PRICE[key]
    return (tokens_in * pin + tokens_out * pout) / 1_000_000
59
+
60
+
61
class LLMClient:
    """Wraps litellm.completion with fallback chain + token accounting.

    ``models`` is tried in order on every call; the first model that returns
    a usable response wins. Token usage and an estimated cost are accumulated
    on the instance in ``tokens`` and ``cost_usd``.
    """

    def __init__(self, model: str | list[str], api_keys: dict[str, str] | None = None):
        """Create a client for one model or an ordered fallback list of models.

        Raises:
            ValueError: if *model* is an empty list.
        """
        self.models = [model] if isinstance(model, str) else list(model)
        if not self.models:
            raise ValueError("LLMClient requires at least one model")
        _apply_keys(api_keys)
        self.tokens = TokenUsage()
        self.cost_usd = 0.0

    @retry(
        retry=retry_if_exception_type(Exception),
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=8),
        reraise=True,
    )
    def _call_one(self, model: str, messages: list[dict], **kwargs) -> Any:
        """Call a single model, retrying up to 3 attempts with exponential backoff.

        Returns whatever ``litellm.completion`` returns; the last exception is
        re-raised once the attempts are exhausted.
        """
        # Imported lazily so that importing this module does not import litellm.
        import litellm

        # Avoid noisy "Provider List" logs.
        litellm.suppress_debug_info = True
        return litellm.completion(model=model, messages=messages, **kwargs)

    def chat(
        self,
        messages: list[dict],
        *,
        temperature: float = 0.0,
        max_tokens: int | None = None,
        response_format: dict | None = None,
        timeout: float | None = 60,
        **kwargs,
    ) -> str:
        """Send *messages* to the first model in the chain that succeeds.

        Returns:
            The text content of the first choice (``""`` if it was empty).

        Raises:
            LLMError: if every model failed; the last underlying exception is
                attached as ``__cause__``.
        """
        last_err: Exception | None = None
        for model in self.models:
            try:
                resp = self._call_one(
                    model,
                    messages,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    response_format=response_format,
                    timeout=timeout,
                    **kwargs,
                )
                txt, usage = _extract(resp)
                self.tokens = self.tokens.add(usage)
                self.cost_usd += estimate_cost(model, usage.input, usage.output)
                return txt
            except Exception as e:
                # Best-effort fallback: note the failure and try the next model.
                log.warning("LLM call to %s failed: %s", model, e)
                last_err = e
        # Chain the cause so the original traceback is not lost.
        raise LLMError(f"All LLM models failed. Last error: {last_err}") from last_err

    def chat_json(self, messages: list[dict], **kwargs) -> Any:
        """Force JSON output and parse. Falls back to first JSON-looking block.

        If the whole chain fails with ``response_format`` set, the call is
        repeated once without it before giving up.
        """
        kwargs.setdefault("response_format", {"type": "json_object"})
        kwargs.setdefault("temperature", 0.0)
        try:
            txt = self.chat(messages, **kwargs)
        except LLMError:
            kwargs.pop("response_format", None)
            txt = self.chat(messages, **kwargs)
        return _parse_json_loose(txt)

    def vision(
        self,
        prompt: str,
        image_b64: str,
        *,
        mime: str = "image/png",
        **kwargs,
    ) -> str:
        """Send *prompt* plus one base64-encoded image (as a data URL) through ``chat``."""
        msg = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime};base64,{image_b64}"},
                    },
                ],
            }
        ]
        return self.chat(msg, **kwargs)
148
+
149
+
150
def _extract(resp) -> tuple[str, TokenUsage]:
    """Pull the reply text and token usage out of a completion response.

    The text is ``resp.choices[0].message.content`` (``""`` when empty).
    Usage may be an object with ``prompt_tokens`` / ``completion_tokens``
    attributes or a mapping with those keys; missing or ``None`` counts as 0.

    Raises:
        LLMError: if the response has no readable first-choice message.
    """
    try:
        msg = resp.choices[0].message
        text = msg.content or ""
    except Exception as e:
        raise LLMError(f"Could not extract message from LLM response: {e}") from e

    usage = getattr(resp, "usage", None)

    def _count(field_name: str) -> int:
        # Read attribute-style first, then mapping-style; never assume the
        # usage object has a ``.get`` method.
        if usage is None:
            return 0
        value = getattr(usage, field_name, None)
        if value is None and hasattr(usage, "get"):
            value = usage.get(field_name)
        return int(value or 0)

    return text, TokenUsage(
        input=_count("prompt_tokens"), output=_count("completion_tokens")
    )
164
+
165
+
166
# Kept for backward compatibility; _parse_json_loose no longer relies on this
# greedy pattern (it spans from the first opening to the LAST closing bracket).
_JSON_BLOCK = re.compile(r"\{.*\}|\[.*\]", re.DOTALL)
# Candidate start positions of an embedded JSON object or array.
_JSON_START = re.compile(r"[{\[]")


def _parse_json_loose(text: str) -> Any:
    """Parse JSON from LLM output, tolerating code fences and surrounding prose.

    Tries the whole (fence-stripped) text first; failing that, returns the
    first JSON object or array that decodes successfully at any ``{`` / ``[``.

    Raises:
        LLMError: if no JSON value can be decoded from *text*.
    """
    text = text.strip()
    # Strip markdown fences.
    if text.startswith("```"):
        text = re.sub(r"^```[a-zA-Z]*\n?", "", text)
        text = re.sub(r"\n?```$", "", text)
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        pass

    # raw_decode stops at the end of the first complete value, so trailing
    # prose or a second JSON value does not break the parse.
    decoder = json.JSONDecoder()
    for start in _JSON_START.finditer(text):
        try:
            value, _end = decoder.raw_decode(text, start.start())
        except (ValueError, RecursionError):
            continue
        return value
    raise LLMError(f"Could not parse JSON from LLM output: {text[:200]!r}") from None
vouch/adapters/__init__.py ADDED
@@ -0,0 +1,68 @@
1
+ """Per-site search executors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base import AdapterContext, SiteAdapter
6
+ from .http import HTTPAdapter
7
+
8
+ __all__ = ["AdapterContext", "HTTPAdapter", "SiteAdapter", "build_adapter"]
9
+
10
+
11
def build_adapter(
    site, config, *, llm=None, selector_cache=None, pool=None, stealth_pool=None
) -> SiteAdapter:
    """Pick an adapter for a site based on its declared behavior + capabilities.

    Order of preference:
      0. Third-party plugin registered for this domain via ``vouch.adapters``
         entry points (e.g. ``vouch-adapter-arxiv`` published on PyPI).
      1. ``behavior="external"`` → HTTPAdapter (commercial-bypass plug point).
      2. Has ``search_url_template`` and not stealth → HTTPAdapter (10x faster, no Chromium).
      3. ``behavior="stealth"`` → patchright BrowserAdapter.
      4. Default → playwright BrowserAdapter, falling back to HTTP if not installed.

    The engine passes its shared ``pool`` (and optionally ``stealth_pool``) so
    every BrowserAdapter the engine constructs reuses a single Chromium
    instance instead of launching one per call.
    """
    # Local import: this module has no top-level ``logging`` import.
    import logging

    # Tier 0: check for a third-party plugin registered for this host.
    try:
        from ..plugins import find_adapter_factory

        factory = find_adapter_factory(site.url)
        if factory is not None:
            try:
                return factory(
                    site=site,
                    config=config,
                    llm=llm,
                    selector_cache=selector_cache,
                    pool=pool,
                    stealth_pool=stealth_pool,
                )
            except TypeError:
                # Older plugins may not accept all kwargs — be permissive.
                return factory(config=config, llm=llm, selector_cache=selector_cache)
    except Exception:
        # Plugin path is best-effort; never block on it — but leave a trace
        # so a broken plugin can be diagnosed.
        logging.getLogger("vouch.adapters").debug(
            "Adapter plugin lookup failed; using built-in adapters", exc_info=True
        )

    behavior = site.behavior
    if behavior == "external":
        return HTTPAdapter(config=config, llm=llm, selector_cache=selector_cache)
    if site.search_url_template and behavior != "stealth":
        return HTTPAdapter(config=config, llm=llm, selector_cache=selector_cache)
    if behavior == "stealth":
        try:
            from .stealth import StealthBrowserAdapter

            return StealthBrowserAdapter(
                config=config, llm=llm, selector_cache=selector_cache, pool=stealth_pool
            )
        except ImportError:
            pass  # stealth backend not installed; fall through to the default browser
    try:
        from .browser import BrowserAdapter

        return BrowserAdapter(config=config, llm=llm, selector_cache=selector_cache, pool=pool)
    except ImportError:
        # No browser backend available: fall back to HTTP, keeping the same
        # llm / selector cache the other HTTPAdapter branches receive.
        return HTTPAdapter(config=config, llm=llm, selector_cache=selector_cache)
vouch/adapters/base.py ADDED
@@ -0,0 +1,28 @@
1
+ """SiteAdapter Protocol — the contract for per-site search execution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Protocol, runtime_checkable
7
+
8
+ from ..catalog import Site
9
+ from ..models import Chunk
10
+
11
+
12
@dataclass
class AdapterContext:
    """Per-call input handed to ``SiteAdapter.search``.

    ``site``, ``query`` and ``depth`` are required; the remaining fields have
    defaults.
    """

    # Required fields.
    site: Site
    query: str
    depth: int
    # Optional fields.
    max_results: int = 10
    timeout: float = 60.0
    # Free-form extras; a fresh dict is created for every instance.
    extra: dict = field(default_factory=dict)
20
+
21
+
22
@runtime_checkable
class SiteAdapter(Protocol):
    """Contract for any per-site search executor.

    Structural protocol: any object providing ``search`` and ``close``
    satisfies it, and ``isinstance`` checks work at runtime.
    """

    def search(self, ctx: AdapterContext) -> list[Chunk]:
        """Execute the search described by *ctx* and return the chunks found."""
        ...

    def close(self) -> None:
        """Release whatever the adapter holds open."""
        ...