bits-bie 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bie/__init__.py +85 -0
- bie/chunker.py +83 -0
- bie/cli.py +275 -0
- bie/config.py +57 -0
- bie/crawler.py +115 -0
- bie/discovery.py +214 -0
- bie/engine.py +151 -0
- bie/extract.py +218 -0
- bie/index.py +225 -0
- bie/integrations/__init__.py +7 -0
- bie/integrations/langchain.py +142 -0
- bie/mcp/__init__.py +3 -0
- bie/mcp/server.py +193 -0
- bie/models.py +76 -0
- bie/query_expansion.py +99 -0
- bie/quicksearch.py +194 -0
- bie/security.py +124 -0
- bie/server.py +248 -0
- bie/sitecrawl.py +93 -0
- bie/sitemap.py +174 -0
- bie/spiders/__init__.py +3 -0
- bie/spiders/generic.py +178 -0
- bits_bie-1.2.0.dist-info/METADATA +447 -0
- bits_bie-1.2.0.dist-info/RECORD +27 -0
- bits_bie-1.2.0.dist-info/WHEEL +4 -0
- bits_bie-1.2.0.dist-info/entry_points.txt +2 -0
- bits_bie-1.2.0.dist-info/licenses/LICENSE +21 -0
bie/discovery.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Free, no-API-key, real-time web search discovery for BIE.
|
|
3
|
+
|
|
4
|
+
This module answers the question: *"given a query, what URLs on the
|
|
5
|
+
internet are relevant right now?"* — without requiring any paid API,
|
|
6
|
+
subscription, or API key.
|
|
7
|
+
|
|
8
|
+
It tries multiple lightweight, no-JS public search endpoints in order,
|
|
9
|
+
falling back automatically if one is blocked, rate-limited, or returns
|
|
10
|
+
no results:
|
|
11
|
+
|
|
12
|
+
1. DuckDuckGo HTML (``https://html.duckduckgo.com/html/``)
|
|
13
|
+
2. DuckDuckGo Lite (``https://lite.duckduckgo.com/lite/``)
|
|
14
|
+
3. Bing HTML (``https://www.bing.com/search``)
|
|
15
|
+
|
|
16
|
+
This is the **discovery** step. BIE then crawls the discovered URLs with
|
|
17
|
+
Bitscrape and ranks the extracted content with its hybrid BM25+vector
|
|
18
|
+
index — giving a genuine "type a query, get a real-time answer from the
|
|
19
|
+
internet" experience with zero configuration and zero cost.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import re
|
|
26
|
+
from urllib.parse import parse_qs, unquote, urlparse
|
|
27
|
+
|
|
28
|
+
import httpx
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger("bie.discovery")
|
|
31
|
+
|
|
32
|
+
_USER_AGENT = (
|
|
33
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
34
|
+
"(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# DuckDuckGo wraps result links as //duckduckgo.com/l/?uddg=<encoded-url>&...
|
|
38
|
+
_DDG_REDIRECT_RE = re.compile(r"^(?:https?:)?//duckduckgo\.com/l/\?")
|
|
39
|
+
|
|
40
|
+
# Tracking/redirector domains we never want to treat as a "real" result
|
|
41
|
+
_BLOCKED_HOST_FRAGMENTS = (
|
|
42
|
+
"duckduckgo.com",
|
|
43
|
+
"bing.com/search",
|
|
44
|
+
"go.microsoft.com",
|
|
45
|
+
"r.search.yahoo.com",
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def discover_urls(query: str, max_results: int = 5, timeout: float = 15.0) -> list[str]:
    """Return up to ``max_results`` candidate URLs for ``query`` from the
    live web — no API key required.

    Backends are tried in order — DuckDuckGo HTML, DuckDuckGo Lite, then
    Bing HTML — and the first one that yields at least one usable URL wins.
    A backend that raises ``httpx.HTTPError``, returns an empty body, or
    yields no parsable results is skipped.

    Args:
        query: The natural-language search query. A blank query returns an
            empty list without any network traffic.
        max_results: Maximum number of URLs to return. Values ``<= 0``
            return an empty list without any network traffic.
        timeout: HTTP request timeout in seconds (per attempt).

    Returns:
        A list of absolute, deduplicated URLs in result order. For a
        non-blank query and a positive ``max_results``, an empty list means
        every backend failed or returned nothing — callers should treat
        this as "try again" rather than "no results exist".
    """
    # Nothing useful can come back for these inputs, so skip the round-trips.
    if max_results <= 0 or not query.strip():
        return []

    headers = {
        "User-Agent": _USER_AGENT,
        "Accept-Language": "en-US,en;q=0.9",
    }

    backends = (
        ("ddg_html", _fetch_ddg_html),
        ("ddg_lite", _fetch_ddg_lite),
        ("bing_html", _fetch_bing_html),
    )

    # One client serves every backend; ``timeout`` still applies per request.
    with httpx.Client(timeout=timeout, headers=headers, follow_redirects=True) as client:
        for name, fetch in backends:
            try:
                html = fetch(client, query)
            except httpx.HTTPError as exc:
                logger.warning("Discovery backend %s failed: %s", name, exc)
                continue

            if not html:
                continue

            urls = _parse_result_urls(html, max_results)
            if urls:
                logger.debug("Discovery backend %s returned %d url(s)", name, len(urls))
                return urls

    logger.warning("All discovery backends failed or returned no results for query=%r", query)
    return []
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def discover_urls_multi(
    queries: list[str], max_results_per_query: int = 5, max_total: int = 15, timeout: float = 15.0
) -> list[str]:
    """Run :func:`discover_urls` for several query variants and merge the
    results, ranked by how many variants surfaced each URL.

    This implements simple **query fan-out**: searching multiple phrasings
    of the same question (e.g. the original query plus a couple of
    rewordings) surfaces a broader set of candidate pages than a single
    query alone.

    Args:
        queries: Query variants to search, in priority order. The first
            is treated as the primary query.
        max_results_per_query: How many URLs to fetch per query variant.
        max_total: Maximum number of URLs to return overall. Values
            ``<= 0`` return an empty list without any network traffic.
        timeout: Per-request timeout in seconds.

    Returns:
        Deduplicated URLs, ordered by (number of variants that returned
        them, then first-seen order). URLs found by multiple query
        variants are considered more likely relevant.
    """
    # A non-positive cap can never yield results. Bail out before searching:
    # a negative value would otherwise act as a "drop the last N" slice bound.
    if max_total <= 0:
        return []

    # Dict insertion order doubles as first-seen order.
    votes: dict[str, int] = {}
    for variant in queries:
        for url in discover_urls(variant, max_results=max_results_per_query, timeout=timeout):
            votes[url] = votes.get(url, 0) + 1

    # sorted() is stable, so URLs with equal votes keep first-seen order.
    ranked = sorted(votes, key=lambda u: -votes[u])
    return ranked[:max_total]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _fetch_ddg_html(client: httpx.Client, query: str) -> str:
    """POST ``query`` to DuckDuckGo's HTML endpoint and return the response body.

    Transport errors and non-success statuses (via ``raise_for_status()``)
    propagate to the caller.
    """
    endpoint = "https://html.duckduckgo.com/html/"
    form = {"q": query}
    response = client.post(endpoint, data=form)
    response.raise_for_status()
    return response.text
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _fetch_ddg_lite(client: httpx.Client, query: str) -> str:
    """POST ``query`` to DuckDuckGo's Lite endpoint and return the response body.

    Transport errors and non-success statuses (via ``raise_for_status()``)
    propagate to the caller.
    """
    endpoint = "https://lite.duckduckgo.com/lite/"
    form = {"q": query}
    response = client.post(endpoint, data=form)
    response.raise_for_status()
    return response.text
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _fetch_bing_html(client: httpx.Client, query: str) -> str:
    """GET Bing's search page for ``query`` and return the response body.

    Transport errors and non-success statuses (via ``raise_for_status()``)
    propagate to the caller.
    """
    endpoint = "https://www.bing.com/search"
    query_params = {"q": query, "form": "QBLH"}
    response = client.get(endpoint, params=query_params)
    response.raise_for_status()
    return response.text
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _parse_result_urls(html: str, max_results: int) -> list[str]:
|
|
156
|
+
"""Extract organic result URLs from a search results page.
|
|
157
|
+
|
|
158
|
+
Handles:
|
|
159
|
+
- DuckDuckGo HTML: ``<a class="result__a" href="...">``
|
|
160
|
+
- DuckDuckGo Lite: ``<a class="result-link" href="...">``
|
|
161
|
+
- Bing: ``<li class="b_algo">...<a href="...">``
|
|
162
|
+
"""
|
|
163
|
+
hrefs = re.findall(r'class="result__a"[^>]*href="([^"]+)"', html)
|
|
164
|
+
if not hrefs:
|
|
165
|
+
hrefs = re.findall(r'class="result-link"[^>]*href="([^"]+)"', html)
|
|
166
|
+
if not hrefs:
|
|
167
|
+
hrefs = _extract_bing_hrefs(html)
|
|
168
|
+
|
|
169
|
+
urls: list[str] = []
|
|
170
|
+
seen: set[str] = set()
|
|
171
|
+
for href in hrefs:
|
|
172
|
+
url = _resolve_redirect(href)
|
|
173
|
+
if not url or not url.startswith("http"):
|
|
174
|
+
continue
|
|
175
|
+
if _is_blocked_host(url):
|
|
176
|
+
continue
|
|
177
|
+
if url in seen:
|
|
178
|
+
continue
|
|
179
|
+
seen.add(url)
|
|
180
|
+
urls.append(url)
|
|
181
|
+
if len(urls) >= max_results:
|
|
182
|
+
break
|
|
183
|
+
|
|
184
|
+
return urls
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _extract_bing_hrefs(html: str) -> list[str]:
|
|
188
|
+
"""Extract result links from Bing's organic result blocks
|
|
189
|
+
(``<li class="b_algo">``)."""
|
|
190
|
+
hrefs: list[str] = []
|
|
191
|
+
for block in re.findall(r'<li class="b_algo".*?</li>', html, flags=re.S):
|
|
192
|
+
m = re.search(r'<h2[^>]*>\s*<a[^>]*href="([^"]+)"', block)
|
|
193
|
+
if m:
|
|
194
|
+
hrefs.append(m.group(1))
|
|
195
|
+
return hrefs
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _resolve_redirect(href: str) -> str | None:
    """Unwrap DuckDuckGo's ``//duckduckgo.com/l/?uddg=<url-encoded-target>``
    redirect links to get the real target URL.

    Args:
        href: An ``href`` attribute value as it appears in the HTML source,
            i.e. possibly still containing ``&amp;`` entities.

    Returns:
        The redirect target for DuckDuckGo redirect links, ``None`` if such
        a link carries no ``uddg`` parameter, and the (entity-decoded)
        ``href`` itself for every other link.
    """
    # Attribute values are scraped with a regex, so "&" still arrives as the
    # HTML entity; decode it before the query string is parsed.
    href = href.strip().replace("&amp;", "&")

    if not _DDG_REDIRECT_RE.match(href):
        return href

    parsed = urlparse(href if href.startswith("http") else f"https:{href}")
    # parse_qs() already percent-decodes values. Decoding a second time would
    # corrupt targets that legitimately contain "%xx" sequences.
    targets = parse_qs(parsed.query).get("uddg")
    return targets[0] if targets else None
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _is_blocked_host(url: str) -> bool:
    """Return True if ``url`` points at a blocked search/redirector location.

    Each entry of ``_BLOCKED_HOST_FRAGMENTS`` is ``domain`` or
    ``domain/path-prefix``. A URL is blocked when its host equals the domain
    or is a subdomain of it and, if a path prefix is given, its path starts
    with that prefix. Matching on the parsed host (rather than a substring
    of the whole URL) keeps unrelated pages that merely mention a blocked
    domain in their path or query string.
    """
    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Unparseable URL: fall back to the plain substring test.
        return any(fragment in url for fragment in _BLOCKED_HOST_FRAGMENTS)

    for fragment in _BLOCKED_HOST_FRAGMENTS:
        domain, _, path_prefix = fragment.partition("/")
        if host != domain and not host.endswith("." + domain):
            continue
        if not path_prefix or parsed.path.startswith("/" + path_prefix):
            return True
    return False
|
bie/engine.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""
|
|
2
|
+
The ``BIE`` class — the main entry point of the BitSearch Intelligence
|
|
3
|
+
Engine Python API.
|
|
4
|
+
|
|
5
|
+
.. code-block:: python
|
|
6
|
+
|
|
7
|
+
import bie
|
|
8
|
+
|
|
9
|
+
engine = bie.BIE()
|
|
10
|
+
engine.crawl(["https://example.com/news"])
|
|
11
|
+
results = engine.search("what happened today")
|
|
12
|
+
for r in results:
|
|
13
|
+
print(r)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import time
|
|
20
|
+
|
|
21
|
+
from bie.chunker import chunk_document
|
|
22
|
+
from bie.config import BIESettings
|
|
23
|
+
from bie.crawler import Crawler
|
|
24
|
+
from bie.index import HybridIndex
|
|
25
|
+
from bie.models import Document, SearchResponse, SearchResult
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger("bie")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class BIE:
    """The BitSearch Intelligence Engine — crawl, index, and search the web.

    Args:
        settings: Optional :class:`bie.config.BIESettings`. If omitted,
            settings are loaded from environment variables / ``.env``.

    Example::

        engine = bie.BIE()
        engine.crawl(["https://www.bbc.com/news"])
        for hit in engine.search("global markets"):
            print(hit.title, hit.url, hit.score)
    """

    def __init__(self, settings: BIESettings | None = None) -> None:
        self.settings = settings or BIESettings()
        self.index = HybridIndex(self.settings)
        self.crawler = Crawler(self.settings)

    # ------------------------------------------------------------------
    # Ingestion
    # ------------------------------------------------------------------

    def crawl(
        self,
        urls: list[str],
        allowed_domains: list[str] | None = None,
        instruction: str = "",
    ) -> int:
        """Crawl ``urls`` (and linked pages, bounded by settings) and add
        the extracted documents to the index.

        Args:
            urls: Seed URLs to crawl.
            allowed_domains: Restrict link-following to these domains
                (defaults to the domains of ``urls``).
            instruction: Optional short natural-language description of
                what to look for (e.g. "pricing and plans pages"). When
                set, outgoing links are ranked by keyword overlap with the
                instruction and only the most relevant are followed. This
                is a keyword-relevance heuristic, not full semantic
                understanding — see :mod:`bie.spiders.generic` for
                details.

        Returns:
            The number of documents actually added to the index. Crawled
            documents that produce no chunks are skipped and not counted.
        """
        documents = self.crawler.crawl(
            urls, allowed_domains=allowed_domains, instruction=instruction
        )
        added = 0
        for doc in documents:
            # ``is not False`` (rather than truthiness) so that an overriding
            # ``add_document`` that still returns None counts as "added".
            if self.add_document(doc) is not False:
                added += 1
        return added

    def add_document(self, doc: Document) -> bool:
        """Add a single pre-fetched :class:`~bie.models.Document` to the index.

        Returns:
            ``True`` if the document was indexed, ``False`` if chunking
            produced no chunks and the document was therefore skipped.
        """
        chunks = chunk_document(
            doc, chunk_size=self.settings.chunk_size, overlap=self.settings.chunk_overlap
        )
        if not chunks:
            return False
        self.index.add_document(doc, chunks)
        return True

    def add_text(
        self,
        url: str,
        text: str,
        title: str = "",
        trust_score: float = 0.5,
        **metadata,
    ) -> None:
        """Add raw text directly (no crawling) — useful for indexing local
        documents, PDFs you've already extracted, API responses, etc.

        Args:
            url: Identifier/URL stored on the document; also used as the
                title when ``title`` is empty.
            text: The text to index.
            title: Optional human-readable title.
            trust_score: Value stored as the document's ``trust_score``.
            **metadata: Extra key/value pairs stored as the document's
                ``metadata``.
        """
        doc = Document(
            url=url,
            title=title or url,
            text=text,
            trust_score=trust_score,
            metadata=metadata,
        )
        self.add_document(doc)

    # ------------------------------------------------------------------
    # Retrieval
    # ------------------------------------------------------------------

    def search(self, query: str, top_k: int = 10) -> list[SearchResult]:
        """Run a hybrid (BM25 + vector) search over the indexed documents."""
        return self.index.search(query, top_k=top_k)

    def search_full(self, query: str, top_k: int = 10) -> SearchResponse:
        """Like :meth:`search`, but returns a full :class:`SearchResponse`
        with timing and index-size metadata (matches the ``/search`` API)."""
        start = time.perf_counter()
        results = self.search(query, top_k=top_k)
        took_ms = (time.perf_counter() - start) * 1000
        return SearchResponse(
            query=query,
            results=results,
            took_ms=round(took_ms, 2),
            total_indexed_documents=len(self.index),
        )

    # ------------------------------------------------------------------
    # Convenience
    # ------------------------------------------------------------------

    def search_web(self, query: str, urls: list[str], top_k: int = 10) -> list[SearchResult]:
        """One-shot: crawl ``urls``, then immediately search the freshly
        indexed content. Equivalent to ``engine.crawl(urls)`` followed by
        ``engine.search(query)``."""
        self.crawl(urls)
        return self.search(query, top_k=top_k)

    def __len__(self) -> int:
        return len(self.index)

    def __repr__(self) -> str:  # pragma: no cover
        return (
            f"<BIE documents={len(self.index)} "
            f"vector_search={'on' if self.index.vector_enabled else 'off'}>"
        )
|
bie/extract.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""
|
|
2
|
+
``bie.extract()`` — retrieve clean, readable Markdown from a single URL.
|
|
3
|
+
|
|
4
|
+
This is BIE's "give me this page as clean text" primitive: fetch a URL,
|
|
5
|
+
strip navigation/ads/scripts/styling noise, and convert the main content
|
|
6
|
+
to Markdown — the format LLMs work with best.
|
|
7
|
+
|
|
8
|
+
For static HTML pages, this uses a direct HTTP fetch (fast, no browser).
|
|
9
|
+
For JavaScript-rendered pages (SPAs, sites that return a near-empty
|
|
10
|
+
``<body>`` until JS runs), pass ``render_js=True`` to fall back to a
|
|
11
|
+
headless Playwright browser — requires the optional ``bits-bie[render]``
|
|
12
|
+
extra (``pip install "bits-bie[render]"`` plus ``playwright install
|
|
13
|
+
chromium`` once).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from typing import TYPE_CHECKING
|
|
23
|
+
|
|
24
|
+
import httpx
|
|
25
|
+
from markdownify import markdownify
|
|
26
|
+
|
|
27
|
+
from bie.security import scan_for_prompt_injection
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from bie.security import SecurityReport
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger("bie.extract")
|
|
33
|
+
|
|
34
|
+
_USER_AGENT = (
|
|
35
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
36
|
+
"(KHTML, like Gecko) Chrome/124.0 Safari/537.36 BIE/0.5"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Tags whose content is never useful for an LLM and should be stripped
|
|
40
|
+
# before markdown conversion.
|
|
41
|
+
_STRIP_TAGS = (
|
|
42
|
+
"script",
|
|
43
|
+
"style",
|
|
44
|
+
"noscript",
|
|
45
|
+
"nav",
|
|
46
|
+
"footer",
|
|
47
|
+
"header",
|
|
48
|
+
"form",
|
|
49
|
+
"iframe",
|
|
50
|
+
"svg",
|
|
51
|
+
"button",
|
|
52
|
+
"aside",
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# A page is considered "JS-required" if, after stripping noise tags, the
|
|
56
|
+
# remaining visible text is suspiciously short.
|
|
57
|
+
_JS_REQUIRED_TEXT_THRESHOLD = 80
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ExtractError(RuntimeError):
    """Signals that a page could not be fetched or extracted."""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class ExtractResult:
    """Value object returned by :func:`bie.extract.extract`."""

    # URL the caller asked for.
    url: str
    # Cleaned ``<title>`` text ("" when the page has none).
    title: str
    # Main content converted to Markdown.
    markdown: str
    # Whitespace-normalised plain text of the same content.
    text: str
    # Number of whitespace-separated tokens in ``text``.
    word_count: int
    # True when the HTML came from the headless-browser path.
    rendered_with_js: bool = False
    # Prompt-injection scan report; None when scanning was disabled.
    security: "SecurityReport | None" = None

    def __str__(self) -> str:  # pragma: no cover - convenience only
        flag = ""
        if self.rendered_with_js:
            flag = " [JS-rendered]"
        return f"<ExtractResult {self.url!r} title={self.title!r} words={self.word_count}{flag}>"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def extract(
    url: str,
    render_js: bool = False,
    timeout: float = 20.0,
    scan_security: bool = True,
) -> ExtractResult:
    """Fetch ``url`` and return its content as clean Markdown.

    Args:
        url: The page to fetch.
        render_js: If True, render the page with a headless browser
            (requires ``pip install "bits-bie[render]"`` and
            ``playwright install chromium``). If False (default), BIE
            still auto-detects JS-only pages and raises a helpful
            :class:`ExtractError` suggesting ``render_js=True`` rather
            than silently returning near-empty content.
        timeout: Request timeout in seconds.
        scan_security: If True (default), scan the extracted text for
            prompt-injection patterns and attach a
            :class:`bie.security.SecurityReport` to ``result.security``.
            This does **not** remove or alter content, only flags it.

    Returns:
        An :class:`ExtractResult` with ``markdown``, plain ``text``,
        ``title``, and ``word_count``.

    Raises:
        ExtractError: if the page can't be fetched, or appears to require
            JavaScript and ``render_js=False``.
    """
    if render_js:
        html = _fetch_with_playwright(url, timeout)
    else:
        html = _fetch_static(url, timeout)

    # Convert exactly once. The JS-only heuristic below reuses this text
    # instead of parsing and converting the page a second time.
    title, markdown, text = _to_markdown(html)

    # Same criterion as ``_looks_js_only``; only applied to static fetches.
    if not render_js and len(text.strip()) < _JS_REQUIRED_TEXT_THRESHOLD:
        raise ExtractError(
            f"{url} appears to require JavaScript to render its content "
            f"(static fetch returned very little text). Retry with "
            f"extract(url, render_js=True) — requires "
            f'\'pip install "bits-bie[render]"\' and '
            f"'playwright install chromium' once."
        )

    result = ExtractResult(
        url=url,
        title=title,
        markdown=markdown,
        text=text,
        word_count=len(text.split()),
        rendered_with_js=bool(render_js),
    )

    if scan_security:
        result.security = scan_for_prompt_injection(text)

    return result
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _fetch_static(url: str, timeout: float) -> str:
    """GET ``url`` without a browser and return the response body as text.

    Redirects are followed. Any ``httpx.HTTPError`` (transport failure or
    non-success status) is re-raised as :class:`ExtractError`.
    """
    request_headers = {"User-Agent": _USER_AGENT}
    try:
        with httpx.Client(
            timeout=timeout, headers=request_headers, follow_redirects=True
        ) as http:
            response = http.get(url)
            response.raise_for_status()
            body = response.text
    except httpx.HTTPError as exc:
        raise ExtractError(f"Failed to fetch {url}: {exc}") from exc
    return body
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _fetch_with_playwright(url: str, timeout: float) -> str:
    """Render ``url`` in headless Chromium via Playwright and return the
    resulting page HTML.

    Args:
        url: The page to load.
        timeout: Navigation timeout in seconds.

    Raises:
        ExtractError: if Playwright is not installed, or if launching the
            browser / loading the page fails for any reason.
    """
    try:
        from playwright.async_api import async_playwright
    except ImportError as exc:
        raise ExtractError(
            "render_js=True requires the 'playwright' package. Install with: "
            'pip install "bits-bie[render]" && playwright install chromium'
        ) from exc

    async def _run() -> str:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch()
            try:
                page = await browser.new_page(user_agent=_USER_AGENT)
                # Playwright expects milliseconds.
                await page.goto(url, timeout=timeout * 1000, wait_until="networkidle")
                return await page.content()
            finally:
                await browser.close()

    def _run_blocking() -> str:
        return asyncio.run(_run())

    # asyncio.run() cannot be used while an event loop is already running in
    # the current thread (e.g. when called from an async request handler).
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        inside_running_loop = False
    else:
        inside_running_loop = True

    try:
        if not inside_running_loop:
            return _run_blocking()

        # Run the render on a helper thread with its own event loop and
        # block until it finishes, keeping this function synchronous.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(_run_blocking).result()
    except Exception as exc:
        raise ExtractError(f"Failed to render {url} with Playwright: {exc}") from exc
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _looks_js_only(html: str) -> bool:
    """Heuristic for client-side-rendered shells.

    Converts the page with ``_to_markdown`` and reports True when the
    remaining visible text is shorter than ``_JS_REQUIRED_TEXT_THRESHOLD``
    characters — typical of SPA shells such as ``<div id="root"></div>``.
    """
    visible_text = _to_markdown(html)[2]
    visible_chars = len(visible_text.strip())
    return visible_chars < _JS_REQUIRED_TEXT_THRESHOLD
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _to_markdown(html: str) -> tuple[str, str, str]:
    """Strip noise tags from ``html`` and return ``(title, markdown, plain_text)``.

    The title is read before stripping. Every tag named in ``_STRIP_TAGS``
    is then removed from the tree, and the ``<body>`` (or the whole
    document when there is no body) is converted to Markdown with link
    markup dropped, and to whitespace-normalised plain text.
    """
    from selectolax.parser import HTMLParser

    tree = HTMLParser(html)

    title_node = tree.css_first("title")
    title = ""
    if title_node:
        title = _clean_text(title_node.text())

    # Tags are removed one kind at a time, in ``_STRIP_TAGS`` order.
    for tag_name in _STRIP_TAGS:
        for unwanted in tree.css(tag_name):
            unwanted.decompose()

    root = tree.css_first("body") or tree

    converted = markdownify(root.html or "", heading_style="ATX", strip=["a"])
    markdown = _collapse_blank_lines(converted.strip())

    text = _clean_text(root.text(separator=" ", deep=True))

    return title, markdown, text
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _collapse_blank_lines(markdown: str) -> str:
|
|
214
|
+
return re.sub(r"\n{3,}", "\n\n", markdown)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _clean_text(text: str) -> str:
|
|
218
|
+
return re.sub(r"\s+", " ", text or "").strip()
|