bits-bie 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bie/discovery.py ADDED
@@ -0,0 +1,214 @@
1
+ """
2
+ Free, no-API-key, real-time web search discovery for BIE.
3
+
4
+ This module answers the question: *"given a query, what URLs on the
5
+ internet are relevant right now?"* — without requiring any paid API,
6
+ subscription, or API key.
7
+
8
+ It tries multiple lightweight, no-JS public search endpoints in order,
9
+ falling back automatically if one is blocked, rate-limited, or returns
10
+ no results:
11
+
12
+ 1. DuckDuckGo HTML (``https://html.duckduckgo.com/html/``)
13
+ 2. DuckDuckGo Lite (``https://lite.duckduckgo.com/lite/``)
14
+ 3. Bing HTML (``https://www.bing.com/search``)
15
+
16
+ This is the **discovery** step. BIE then crawls the discovered URLs with
17
+ Bitscrape and ranks the extracted content with its hybrid BM25+vector
18
+ index — giving a genuine "type a query, get a real-time answer from the
19
+ internet" experience with zero configuration and zero cost.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+ import re
26
+ from urllib.parse import parse_qs, unquote, urlparse
27
+
28
+ import httpx
29
+
30
+ logger = logging.getLogger("bie.discovery")
31
+
32
# Browser-like User-Agent header value sent with every discovery request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0 Safari/537.36"
)

# Matches DuckDuckGo's wrapped result links, which look like
# //duckduckgo.com/l/?uddg=<encoded-url>&... (scheme optional).
_DDG_REDIRECT_RE = re.compile(r"^(?:https?:)?//duckduckgo\.com/l/\?")

# Search-engine / tracking / redirector locations that must never be
# returned as a "real" search result.
_BLOCKED_HOST_FRAGMENTS = (
    "duckduckgo.com",
    "bing.com/search",
    "go.microsoft.com",
    "r.search.yahoo.com",
)
47
+
48
+
49
def discover_urls(query: str, max_results: int = 5, timeout: float = 15.0) -> list[str]:
    """Return up to ``max_results`` candidate URLs for ``query`` from the
    live web — no API key required.

    Tries DuckDuckGo (HTML, then Lite) and falls back to Bing HTML search
    if both fail or return nothing. The first backend that yields at least
    one usable URL wins; later backends are not queried.

    Args:
        query: The natural-language search query.
        max_results: Maximum number of URLs to return.
        timeout: HTTP request timeout in seconds (per attempt).

    Returns:
        A list of absolute, deduplicated URLs in result order. An empty
        list is returned immediately (without any network traffic) when
        ``query`` is blank or ``max_results`` is not positive. Otherwise an
        empty list means every backend failed or returned nothing —
        callers should treat that as "try again" rather than "no results
        exist".
    """
    # Nothing to search for, or nothing requested: don't spend three
    # network round-trips (and three warnings) on a guaranteed-empty answer.
    if max_results <= 0 or not query or not query.strip():
        return []

    headers = {
        "User-Agent": _USER_AGENT,
        "Accept-Language": "en-US,en;q=0.9",
    }

    # Backends in priority order: (log name, fetch function).
    backends = (
        ("ddg_html", _fetch_ddg_html),
        ("ddg_lite", _fetch_ddg_lite),
        ("bing_html", _fetch_bing_html),
    )

    for name, fetch in backends:
        try:
            # A fresh client per backend so cookies/connection state from one
            # search engine never leak into the next attempt.
            with httpx.Client(timeout=timeout, headers=headers, follow_redirects=True) as client:
                html = fetch(client, query)
        except httpx.HTTPError as exc:
            logger.warning("Discovery backend %s failed: %s", name, exc)
            continue

        if not html:
            continue

        urls = _parse_result_urls(html, max_results)
        if urls:
            logger.debug("Discovery backend %s returned %d url(s)", name, len(urls))
            return urls

    logger.warning("All discovery backends failed or returned no results for query=%r", query)
    return []
95
+
96
+
97
def discover_urls_multi(
    queries: list[str], max_results_per_query: int = 5, max_total: int = 15, timeout: float = 15.0
) -> list[str]:
    """Run :func:`discover_urls` for several query variants and merge the
    results, ranked by how many variants surfaced each URL.

    This implements simple **query fan-out**: searching multiple phrasings
    of the same question (e.g. the original query plus a couple of
    rewordings) surfaces a broader, more relevant set of candidate pages
    than a single query alone — particularly for ambiguous or
    multi-faceted questions.

    Args:
        queries: Query variants to search, in priority order. The first
            is treated as the primary query. Blank variants and exact
            repeats (after stripping surrounding whitespace) are skipped
            so they neither cost a request nor cast an extra vote.
        max_results_per_query: How many URLs to fetch per query variant.
        max_total: Maximum number of URLs to return overall. A value of
            zero or less yields an empty list.
        timeout: Per-request timeout in seconds.

    Returns:
        Deduplicated URLs, ordered by (number of variants that returned
        them, then first-seen order). URLs found by multiple query
        variants are considered more likely relevant.
    """
    if max_total <= 0:
        # A negative slice bound would silently drop URLs from the *end*
        # of the ranking instead of returning nothing.
        return []

    votes: dict[str, int] = {}
    searched: set[str] = set()

    for query in queries:
        variant = query.strip()
        if not variant or variant in searched:
            continue
        searched.add(variant)
        for url in discover_urls(query, max_results=max_results_per_query, timeout=timeout):
            votes[url] = votes.get(url, 0) + 1

    # dicts preserve insertion order and sorted() is stable, so ties keep
    # first-seen order without a separate order counter.
    ranked = sorted(votes, key=lambda u: -votes[u])
    return ranked[:max_total]
135
+
136
+
137
def _fetch_ddg_html(client: httpx.Client, query: str) -> str:
    """POST ``query`` to DuckDuckGo's HTML endpoint and return the page body.

    ``raise_for_status()`` is called on the response, so an error status
    surfaces as an exception instead of being parsed as results.
    """
    endpoint = "https://html.duckduckgo.com/html/"
    response = client.post(endpoint, data={"q": query})
    response.raise_for_status()
    return response.text
141
+
142
+
143
def _fetch_ddg_lite(client: httpx.Client, query: str) -> str:
    """POST ``query`` to DuckDuckGo's Lite endpoint and return the page body.

    ``raise_for_status()`` is called on the response, so an error status
    surfaces as an exception instead of being parsed as results.
    """
    endpoint = "https://lite.duckduckgo.com/lite/"
    response = client.post(endpoint, data={"q": query})
    response.raise_for_status()
    return response.text
147
+
148
+
149
def _fetch_bing_html(client: httpx.Client, query: str) -> str:
    """GET Bing's HTML search page for ``query`` and return the page body.

    ``raise_for_status()`` is called on the response, so an error status
    surfaces as an exception instead of being parsed as results.
    """
    endpoint = "https://www.bing.com/search"
    search_params = {"q": query, "form": "QBLH"}
    response = client.get(endpoint, params=search_params)
    response.raise_for_status()
    return response.text
153
+
154
+
155
def _parse_result_urls(html: str, max_results: int) -> list[str]:
    """Extract organic result URLs from a search results page.

    Handles, in this order (the first pattern that matches anything wins):
    - DuckDuckGo HTML: ``<a class="result__a" href="...">``
    - DuckDuckGo Lite: ``<a class="result-link" href="...">``
    - Bing: ``<li class="b_algo">...<a href="...">``

    Args:
        html: Raw HTML of the search results page.
        max_results: Maximum number of URLs to return; zero or less
            yields an empty list.

    Returns:
        Up to ``max_results`` distinct ``http(s)`` URLs in page order, with
        DuckDuckGo redirect wrappers resolved and blocked hosts removed.
    """
    # The limit is only checked after an append below, so without this
    # guard a limit of 0 (or less) would still return one URL.
    if max_results <= 0:
        return []

    hrefs = (
        re.findall(r'class="result__a"[^>]*href="([^"]+)"', html)
        or re.findall(r'class="result-link"[^>]*href="([^"]+)"', html)
        or _extract_bing_hrefs(html)
    )

    urls: list[str] = []
    seen: set[str] = set()
    for href in hrefs:
        url = _resolve_redirect(href)
        # Require a real http(s) scheme; a bare "http" prefix check would
        # also accept things like "httpx://..." or relative "http-foo".
        if not url or not url.startswith(("http://", "https://")):
            continue
        if url in seen or _is_blocked_host(url):
            continue
        seen.add(url)
        urls.append(url)
        if len(urls) >= max_results:
            break

    return urls
185
+
186
+
187
def _extract_bing_hrefs(html: str) -> list[str]:
    """Collect the headline link of each Bing organic result.

    Every ``<li class="b_algo">...</li>`` block contributes at most one
    href: the first ``<a href="...">`` that directly follows the opening
    ``<h2>`` tag. Blocks without such a link are skipped.
    """
    result_blocks = (m.group(0) for m in re.finditer(r'<li class="b_algo".*?</li>', html, flags=re.S))
    headline_links = (
        re.search(r'<h2[^>]*>\s*<a[^>]*href="([^"]+)"', block) for block in result_blocks
    )
    return [link.group(1) for link in headline_links if link]
196
+
197
+
198
def _resolve_redirect(href: str) -> str | None:
    """Unwrap DuckDuckGo's ``//duckduckgo.com/l/?uddg=<url-encoded-target>``
    redirect links to get the real target URL. Other links pass through
    unchanged (apart from whitespace stripping and ``&amp;`` unescaping).

    Returns:
        The target URL, or ``None`` when a DuckDuckGo redirect link carries
        no (or an empty) ``uddg`` parameter.
    """
    href = href.strip().replace("&amp;", "&")

    if not _DDG_REDIRECT_RE.match(href):
        return href

    # Scheme-relative links ("//duckduckgo.com/...") need a scheme before
    # urlparse will split them correctly.
    absolute = href if href.startswith("http") else f"https:{href}"
    targets = parse_qs(urlparse(absolute).query).get("uddg")

    # parse_qs has already percent-decoded the value exactly once. Decoding
    # again would corrupt targets that legitimately contain escapes
    # (e.g. "%2520" must become "%20", not a literal space).
    return targets[0] if targets else None
211
+
212
+
213
def _is_blocked_host(url: str) -> bool:
    """Return True if ``url`` points at a blocked search/redirector location.

    Each entry of ``_BLOCKED_HOST_FRAGMENTS`` is either ``host`` or
    ``host/path-prefix``. A URL is blocked when its hostname equals that
    host (or is a subdomain of it) and, if a path prefix is given, its path
    starts with that prefix. Matching on the parsed hostname — rather than
    a substring of the whole URL — avoids discarding unrelated pages that
    merely mention a blocked domain in their path or query string, and
    avoids matching look-alike hosts such as ``duckduckgo.com.example``.

    URLs that cannot be parsed are treated as blocked, since they cannot be
    crawled anyway.
    """
    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        return True
    path = parsed.path or "/"

    for fragment in _BLOCKED_HOST_FRAGMENTS:
        blocked_host, _, blocked_path = fragment.partition("/")
        if host != blocked_host and not host.endswith("." + blocked_host):
            continue
        if not blocked_path or path.startswith("/" + blocked_path):
            return True
    return False
bie/engine.py ADDED
@@ -0,0 +1,151 @@
1
+ """
2
+ The ``BIE`` class — the main entry point of the BitSearch Intelligence
3
+ Engine Python API.
4
+
5
+ .. code-block:: python
6
+
7
+ import bie
8
+
9
+ engine = bie.BIE()
10
+ engine.crawl(["https://example.com/news"])
11
+ results = engine.search("what happened today")
12
+ for r in results:
13
+ print(r)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import time
20
+
21
+ from bie.chunker import chunk_document
22
+ from bie.config import BIESettings
23
+ from bie.crawler import Crawler
24
+ from bie.index import HybridIndex
25
+ from bie.models import Document, SearchResponse, SearchResult
26
+
27
+ logger = logging.getLogger("bie")
28
+
29
+
30
class BIE:
    """The BitSearch Intelligence Engine — crawl, index, and search the web.

    Args:
        settings: Optional :class:`bie.config.BIESettings`. If omitted,
            a default ``BIESettings()`` is constructed.

    Example::

        engine = bie.BIE()
        engine.crawl(["https://www.bbc.com/news"])
        for hit in engine.search("global markets"):
            print(hit.title, hit.url, hit.score)
    """

    def __init__(self, settings: BIESettings | None = None) -> None:
        self.settings = settings or BIESettings()
        self.index = HybridIndex(self.settings)
        self.crawler = Crawler(self.settings)

    # ------------------------------------------------------------------
    # Ingestion
    # ------------------------------------------------------------------

    def crawl(
        self,
        urls: list[str],
        allowed_domains: list[str] | None = None,
        instruction: str = "",
    ) -> int:
        """Crawl ``urls`` and add the extracted documents to the index.

        Args:
            urls: Seed URLs to crawl.
            allowed_domains: Passed through to the crawler to restrict
                which domains are followed.
            instruction: Optional short natural-language description of
                what to look for (e.g. "pricing and plans pages"),
                passed through to the crawler.

        Returns:
            The number of documents actually added to the index.
            Documents that produce no chunks are skipped and not counted.
        """
        documents = self.crawler.crawl(
            urls, allowed_domains=allowed_domains, instruction=instruction
        )
        # Count only what really reached the index; this also works when
        # the crawler hands back a one-shot iterable rather than a list.
        return sum(1 for doc in documents if self._index_document(doc))

    def add_document(self, doc: Document) -> None:
        """Add a single pre-fetched :class:`~bie.models.Document` to the index.

        Documents that yield no chunks are silently skipped.
        """
        self._index_document(doc)

    def _index_document(self, doc: Document) -> bool:
        """Chunk ``doc`` and index it; return True if anything was indexed."""
        chunks = chunk_document(
            doc, chunk_size=self.settings.chunk_size, overlap=self.settings.chunk_overlap
        )
        if not chunks:
            return False
        self.index.add_document(doc, chunks)
        return True

    def add_text(
        self,
        url: str,
        text: str,
        title: str = "",
        trust_score: float = 0.5,
        **metadata,
    ) -> None:
        """Add raw text directly (no crawling) — useful for indexing local
        documents, PDFs you've already extracted, API responses, etc.

        Args:
            url: Identifier/URL recorded for the document.
            text: The document body.
            title: Display title; falls back to ``url`` when empty.
            trust_score: Trust score stored on the document.
            **metadata: Extra key/value pairs stored as document metadata.
        """
        doc = Document(
            url=url,
            title=title or url,
            text=text,
            trust_score=trust_score,
            metadata=metadata,
        )
        self.add_document(doc)

    # ------------------------------------------------------------------
    # Retrieval
    # ------------------------------------------------------------------

    def search(self, query: str, top_k: int = 10) -> list[SearchResult]:
        """Run a hybrid (BM25 + vector) search over the indexed documents."""
        return self.index.search(query, top_k=top_k)

    def search_full(self, query: str, top_k: int = 10) -> SearchResponse:
        """Like :meth:`search`, but returns a full :class:`SearchResponse`
        with timing and index-size metadata (matches the ``/search`` API)."""
        start = time.perf_counter()
        results = self.search(query, top_k=top_k)
        took_ms = (time.perf_counter() - start) * 1000
        return SearchResponse(
            query=query,
            results=results,
            took_ms=round(took_ms, 2),
            total_indexed_documents=len(self.index),
        )

    # ------------------------------------------------------------------
    # Convenience
    # ------------------------------------------------------------------

    def search_web(self, query: str, urls: list[str], top_k: int = 10) -> list[SearchResult]:
        """One-shot: crawl ``urls``, then immediately search the freshly
        indexed content. Equivalent to ``engine.crawl(urls)`` followed by
        ``engine.search(query, top_k=top_k)``."""
        self.crawl(urls)
        return self.search(query, top_k=top_k)

    def __len__(self) -> int:
        """Return ``len()`` of the underlying index."""
        return len(self.index)

    def __repr__(self) -> str:  # pragma: no cover
        return (
            f"<BIE documents={len(self.index)} "
            f"vector_search={'on' if self.index.vector_enabled else 'off'}>"
        )
bie/extract.py ADDED
@@ -0,0 +1,218 @@
1
+ """
2
+ ``bie.extract()`` — retrieve clean, readable Markdown from a single URL.
3
+
4
+ This is BIE's "give me this page as clean text" primitive: fetch a URL,
5
+ strip navigation/ads/scripts/styling noise, and convert the main content
6
+ to Markdown — the format LLMs work with best.
7
+
8
+ For static HTML pages, this uses a direct HTTP fetch (fast, no browser).
9
+ For JavaScript-rendered pages (SPAs, sites that return a near-empty
10
+ ``<body>`` until JS runs), pass ``render_js=True`` to fall back to a
11
+ headless Playwright browser — requires the optional ``bits-bie[render]``
12
+ extra (``pip install "bits-bie[render]"`` plus ``playwright install
13
+ chromium`` once).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import logging
20
+ import re
21
+ from dataclasses import dataclass, field
22
+ from typing import TYPE_CHECKING
23
+
24
+ import httpx
25
+ from markdownify import markdownify
26
+
27
+ from bie.security import scan_for_prompt_injection
28
+
29
+ if TYPE_CHECKING:
30
+ from bie.security import SecurityReport
31
+
32
+ logger = logging.getLogger("bie.extract")
33
+
34
# Browser-like User-Agent (with a BIE suffix) used for both the static
# fetch and the Playwright page.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0 Safari/537.36 BIE/0.5"
)

# Elements removed before Markdown conversion: their content is page
# chrome or machinery, never useful to an LLM.
_STRIP_TAGS = (
    "script", "style", "noscript",
    "nav", "footer", "header",
    "form", "iframe", "svg",
    "button", "aside",
)

# Minimum number of visible-text characters (after noise tags are
# stripped) below which a page is assumed to need JavaScript to render.
_JS_REQUIRED_TEXT_THRESHOLD = 80
58
+
59
+
60
class ExtractError(RuntimeError):
    """Signals that a page could not be fetched, rendered, or extracted."""
62
+
63
+
64
@dataclass
class ExtractResult:
    """Value object returned by :func:`bie.extract.extract`."""

    # The URL that was requested.
    url: str
    # Contents of the page's <title>, whitespace-normalised.
    title: str
    # Main content converted to Markdown.
    markdown: str
    # Main content as whitespace-normalised plain text.
    text: str
    # Number of whitespace-separated words in ``text``.
    word_count: int
    # True when the HTML came from the headless-browser path.
    rendered_with_js: bool = False
    # Prompt-injection scan report; stays None when scanning is disabled.
    security: "SecurityReport | None" = None

    def __str__(self) -> str:  # pragma: no cover - convenience only
        js_marker = " [JS-rendered]" if self.rendered_with_js else ""
        return f"<ExtractResult {self.url!r} title={self.title!r} words={self.word_count}{js_marker}>"
79
+
80
+
81
def extract(
    url: str,
    render_js: bool = False,
    timeout: float = 20.0,
    scan_security: bool = True,
) -> ExtractResult:
    """Fetch ``url`` and return its content as clean Markdown.

    Args:
        url: The page to fetch.
        render_js: If True, render the page with a headless browser
            (requires ``pip install "bits-bie[render]"`` and
            ``playwright install chromium``). If False (default), BIE
            still auto-detects JS-only pages and raises a helpful
            :class:`ExtractError` suggesting ``render_js=True`` rather
            than silently returning near-empty content.
        timeout: Request timeout in seconds.
        scan_security: If True (default), scan the extracted text for
            prompt-injection patterns and attach the resulting report to
            ``result.security``. The extracted content itself is returned
            unchanged.

    Returns:
        An :class:`ExtractResult` with ``markdown``, plain ``text``,
        ``title``, and ``word_count``.

    Raises:
        ExtractError: if the page can't be fetched, or appears to require
            JavaScript and ``render_js=False``.
    """
    if render_js:
        html = _fetch_with_playwright(url, timeout)
    else:
        html = _fetch_static(url, timeout)

    # Parse and convert exactly once; the JS-only heuristic below reuses the
    # extracted text instead of re-parsing the whole document.
    title, markdown, text = _to_markdown(html)

    # Same rule as _looks_js_only(): suspiciously little visible text on a
    # static fetch means the page is most likely a client-rendered shell.
    if not render_js and len(text.strip()) < _JS_REQUIRED_TEXT_THRESHOLD:
        raise ExtractError(
            f"{url} appears to require JavaScript to render its content "
            f"(static fetch returned very little text). Retry with "
            f"extract(url, render_js=True) — requires "
            f'\'pip install "bits-bie[render]"\' and '
            f"'playwright install chromium' once."
        )

    result = ExtractResult(
        url=url,
        title=title,
        markdown=markdown,
        text=text,
        word_count=len(text.split()),
        rendered_with_js=bool(render_js),
    )

    if scan_security:
        result.security = scan_for_prompt_injection(text)

    return result
142
+
143
+
144
def _fetch_static(url: str, timeout: float) -> str:
    """Download ``url`` with a plain HTTP GET (redirects followed) and
    return the response body as text.

    Raises:
        ExtractError: wrapping any ``httpx.HTTPError`` raised by the
            request or by ``raise_for_status()``.
    """
    request_headers = {"User-Agent": _USER_AGENT}
    try:
        with httpx.Client(
            timeout=timeout, headers=request_headers, follow_redirects=True
        ) as session:
            response = session.get(url)
            response.raise_for_status()
            return response.text
    except httpx.HTTPError as exc:
        raise ExtractError(f"Failed to fetch {url}: {exc}") from exc
153
+
154
+
155
def _fetch_with_playwright(url: str, timeout: float) -> str:
    """Render ``url`` in headless Chromium via Playwright and return the
    resulting page HTML.

    Works both from plain synchronous code and from a thread that already
    runs an asyncio event loop (e.g. Jupyter or an async web handler). In
    the latter case the rendering runs on a short-lived worker thread,
    because ``asyncio.run()`` refuses to start inside a running loop.

    Args:
        url: The page to render.
        timeout: Navigation timeout in seconds.

    Raises:
        ExtractError: if Playwright is not installed, or if launching the
            browser / navigating / reading the page fails for any reason.
    """
    try:
        from playwright.async_api import async_playwright
    except ImportError as exc:
        raise ExtractError(
            "render_js=True requires the 'playwright' package. Install with: "
            'pip install "bits-bie[render]" && playwright install chromium'
        ) from exc

    async def _render() -> str:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch()
            try:
                page = await browser.new_page(user_agent=_USER_AGENT)
                # Playwright expects milliseconds.
                await page.goto(url, timeout=timeout * 1000, wait_until="networkidle")
                return await page.content()
            finally:
                await browser.close()

    def _render_blocking() -> str:
        return asyncio.run(_render())

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_already_running = False
    else:
        loop_already_running = True

    try:
        if not loop_already_running:
            return _render_blocking()

        # asyncio.run() cannot be nested inside a running loop, so give the
        # render its own thread (and therefore its own fresh event loop).
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(_render_blocking).result()
    except Exception as exc:
        raise ExtractError(f"Failed to render {url} with Playwright: {exc}") from exc
178
+
179
+
180
def _looks_js_only(html: str) -> bool:
    """Guess whether ``html`` is an empty client-rendered shell.

    The page is run through :func:`_to_markdown`; if the plain text that
    survives noise-tag stripping is shorter than
    ``_JS_REQUIRED_TEXT_THRESHOLD`` characters, the page is assumed to
    render its content with JavaScript (e.g. ``<div id="root"></div>``).
    """
    visible_text = _to_markdown(html)[2]
    visible_length = len(visible_text.strip())
    return visible_length < _JS_REQUIRED_TEXT_THRESHOLD
186
+
187
+
188
def _to_markdown(html: str) -> tuple[str, str, str]:
    """Turn raw HTML into ``(title, markdown, plain_text)``.

    The ``<title>`` is read first, then every element listed in
    ``_STRIP_TAGS`` is removed from the tree. What remains of ``<body>``
    (or of the whole document when there is no body) is converted to
    ATX-style Markdown with link markup stripped, and also flattened to
    whitespace-normalised plain text.
    """
    from selectolax.parser import HTMLParser

    tree = HTMLParser(html)

    # Read the title before stripping.
    title_node = tree.css_first("title")
    if title_node:
        title = _clean_text(title_node.text())
    else:
        title = ""

    for noise_tag in _STRIP_TAGS:
        for noise_node in tree.css(noise_tag):
            noise_node.decompose()

    # Fall back to the whole document when there is no <body>.
    content_root = tree.css_first("body") or tree

    raw_markdown = markdownify(content_root.html or "", heading_style="ATX", strip=["a"])
    markdown = _collapse_blank_lines(raw_markdown.strip())

    text = _clean_text(content_root.text(separator=" ", deep=True))

    return title, markdown, text
211
+
212
+
213
def _collapse_blank_lines(markdown: str) -> str:
    """Squeeze every run of three or more newlines down to exactly two,
    so the Markdown never has more than one consecutive blank line."""
    collapsed, _replacements = re.subn(r"\n{3,}", "\n\n", markdown)
    return collapsed
215
+
216
+
217
def _clean_text(text: str) -> str:
    """Normalise whitespace: collapse every whitespace run to one space and
    trim both ends. Falsy input (``None`` or ``""``) yields ``""``."""
    if not text:
        return ""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()