bits-bie 1.2.2__tar.gz → 1.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bits_bie-1.2.2 → bits_bie-1.2.5}/.github/workflows/ci.yml +1 -1
- {bits_bie-1.2.2 → bits_bie-1.2.5}/PKG-INFO +99 -41
- {bits_bie-1.2.2 → bits_bie-1.2.5}/README.md +98 -40
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/__init__.py +14 -3
- bits_bie-1.2.5/bie/_asyncutil.py +84 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/cli.py +31 -6
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/config.py +19 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/crawler.py +32 -24
- bits_bie-1.2.5/bie/discovery.py +642 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/engine.py +25 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/extract.py +2 -2
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/integrations/langchain.py +6 -9
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/mcp/server.py +27 -4
- bits_bie-1.2.5/bie/models.py +151 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/quicksearch.py +122 -22
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/server.py +17 -7
- {bits_bie-1.2.2 → bits_bie-1.2.5}/pyproject.toml +1 -1
- bits_bie-1.2.5/tests/test_asyncutil.py +110 -0
- bits_bie-1.2.5/tests/test_crawler.py +134 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_discovery.py +104 -3
- bits_bie-1.2.5/tests/test_discovery_errors.py +322 -0
- bits_bie-1.2.5/tests/test_websearch_response.py +238 -0
- bits_bie-1.2.2/bie/discovery.py +0 -467
- bits_bie-1.2.2/bie/models.py +0 -76
- {bits_bie-1.2.2 → bits_bie-1.2.5}/.github/workflows/publish.yml +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/.gitignore +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/LICENSE +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/_async_utils.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/chunker.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/index.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/integrations/__init__.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/mcp/__init__.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/query_expansion.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/security.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/sitecrawl.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/sitemap.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/spiders/__init__.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/spiders/generic.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/docs/API.md +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/basic_search.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/extract_page.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/map_and_crawl.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/reusable_index.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/web_search.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_async_utils.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_chunker.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_crawler_notebook_safe.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_crawler_request_patch.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_discovery_error_handling.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_engine.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_extract.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_index.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_query_expansion.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_quicksearch.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_security.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_sitecrawl.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_sitemap.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_spider_relevance.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bits-bie
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.5
|
|
4
4
|
Summary: BitSearch Intelligence Engine — real-time, citation-backed web search & extraction for AI apps. Built on Bitscrape.
|
|
5
5
|
Project-URL: Homepage, https://github.com/Sudharsansm/BIE
|
|
6
6
|
Project-URL: Repository, https://github.com/Sudharsansm/BIE
|
|
@@ -66,7 +66,7 @@ API keys, no subscriptions, no third-party search services.**
|
|
|
66
66
|
|
|
67
67
|
BIE gives any LLM, RAG pipeline, or AI agent five core primitives —
|
|
68
68
|
**search, extract, map, crawl, and a hybrid index** — all running locally
|
|
69
|
-
on top of [**
|
|
69
|
+
on top of [**BitS**](https://pypi.org/project/bitscrape/), our
|
|
70
70
|
async crawling framework. Use it as a Python library, REST API, CLI, or
|
|
71
71
|
[MCP](https://modelcontextprotocol.io) server.
|
|
72
72
|
|
|
@@ -112,13 +112,46 @@ state-of-the-art ranking, a commercial search API may still be the right
|
|
|
112
112
|
choice for that piece. BIE is for teams that want a capable, free,
|
|
113
113
|
self-hosted starting point — and full control over the code.
|
|
114
114
|
|
|
115
|
+
### How this compares to ChatGPT Search / Tavily
|
|
116
|
+
|
|
117
|
+
`bie.websearch_response()` is shaped like those tools' "web search" tool
|
|
118
|
+
responses on purpose: ranked, cited `results` with snippets, an
|
|
119
|
+
`answer` field, and `.to_context()` for dropping straight into a prompt.
|
|
120
|
+
Two things are genuinely different, and worth being precise about:
|
|
121
|
+
|
|
122
|
+
- **`answer` is extractive, not generated.** ChatGPT Search and Tavily's
|
|
123
|
+
`include_answer` run an LLM server-side to *write* a summary answer.
|
|
124
|
+
BIE doesn't run an LLM — `answer` is the single best-matching passage
|
|
125
|
+
found (verbatim from a live page). It's a strong starting point for
|
|
126
|
+
*your* LLM/agent to read and synthesize from, not a finished answer on
|
|
127
|
+
its own.
|
|
128
|
+
- **Discovery is "best-effort free", not a dedicated index.** ChatGPT
|
|
129
|
+
Search/Tavily run their own crawl infrastructure and indexes. BIE's
|
|
130
|
+
default discovery scrapes DuckDuckGo/Bing's public result pages, which
|
|
131
|
+
can be rate-limited or return a CAPTCHA instead — `degraded`/`diagnostics` tell
|
|
132
|
+
you when this happens for a given query, so your agent can react (retry,
|
|
133
|
+
fall back to general knowledge, etc.) instead of silently getting a
|
|
134
|
+
bad answer.
|
|
135
|
+
|
|
136
|
+
**SearXNG closes most of that second gap.** Self-hosting
|
|
137
|
+
[SearXNG](https://github.com/searxng/searxng) and adding it as a
|
|
138
|
+
discovery backend (`BIE_DISCOVERY_BACKENDS=searxng,...` +
|
|
139
|
+
`BIE_SEARXNG_URL=...`) gives BIE a stable JSON API that itself aggregates
|
|
140
|
+
Google/Bing/Brave/etc. server-side — far less prone to the "200 OK but
|
|
141
|
+
0 results" failure mode of scraping DDG/Bing HTML directly. It's the
|
|
142
|
+
single highest-leverage change for making `websearch()`'s *discovery*
|
|
143
|
+
step behave consistently. It doesn't change the `answer` field's
|
|
144
|
+
extractive (vs. LLM-generated) nature — that's a property of BIE not
|
|
145
|
+
running an LLM, independent of which discovery backend is used.
|
|
146
|
+
|
|
115
147
|
---
|
|
116
148
|
|
|
117
149
|
## Core primitives
|
|
118
150
|
|
|
119
151
|
| Function | What it does |
|
|
120
152
|
|---|---|
|
|
121
|
-
| `bie.websearch(query)` | Search the live internet — no URLs needed. Free discovery (DuckDuckGo + Bing fallback) with query fan-out, crawled and ranked by BIE's hybrid index. |
|
|
153
|
+
| `bie.websearch(query)` | Search the live internet — no URLs needed. Free discovery (DuckDuckGo + Bing fallback, optional SearXNG) with query fan-out, crawled and ranked by BIE's hybrid index. |
|
|
154
|
+
| `bie.websearch_response(query)` | Like `websearch`, but returns the full Tavily/ChatGPT-Search-shaped response: ranked `results`, an extractive `answer`, `degraded`/`diagnostics`, and `.to_context()` for an LLM-prompt-ready citation block. |
|
|
122
155
|
| `bie.extract(url)` | Fetch a URL and return clean Markdown, with nav/ads/scripts stripped. Optional JS rendering via Playwright. |
|
|
123
156
|
| `bie.map_site(url)` | Discover a site's sitemap(s) and the URLs they list, before crawling. |
|
|
124
157
|
| `bie.crawl_site(urls, instruction=...)` | Crawl a site, prioritizing links by keyword-relevance to your instruction. Returns an index + ranked results. |
|
|
@@ -146,17 +179,10 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
|
|
|
146
179
|
pip install "bits-bie[mcp]" # Model Context Protocol server
|
|
147
180
|
pip install "bits-bie[render]" # JS rendering for extract() via Playwright
|
|
148
181
|
pip install "bits-bie[langchain]" # LangChain tool adapters
|
|
149
|
-
pip install "bits-bie[notebook]" # smoother Jupyter/Colab
|
|
182
|
+
pip install "bits-bie[notebook]" # smoother async behaviour in Jupyter/Colab
|
|
150
183
|
pip install "bits-bie[all]" # everything
|
|
151
184
|
```
|
|
152
185
|
|
|
153
|
-
> **Using BIE in Jupyter / Google Colab?** All sync entry points
|
|
154
|
-
> (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
|
|
155
|
-
> work inside notebooks out of the box — BIE detects the notebook's
|
|
156
|
-
> already-running event loop and handles it automatically. Installing
|
|
157
|
-
> `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
|
|
158
|
-
> efficient, but is not required.
|
|
159
|
-
|
|
160
186
|
> BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
|
|
161
187
|
> proprietary async crawling & extraction framework, which is installed
|
|
162
188
|
> automatically.
|
|
@@ -176,6 +202,20 @@ for r in results:
|
|
|
176
202
|
print(r.snippet)
|
|
177
203
|
```
|
|
178
204
|
|
|
205
|
+
For the full, Tavily/ChatGPT-Search-shaped response — extractive
|
|
206
|
+
`answer`, timing, and `degraded`/`diagnostics` for when live discovery
|
|
207
|
+
doesn't fully succeed:
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
response = bie.websearch_response("who won the latest F1 race")
|
|
211
|
+
|
|
212
|
+
print(response.answer) # best-matching passage (not LLM-written)
|
|
213
|
+
print(response.to_context()) # numbered sources block, ready for a prompt
|
|
214
|
+
|
|
215
|
+
if response.degraded:
|
|
216
|
+
print("live data degraded:", response.diagnostics)
|
|
217
|
+
```
|
|
218
|
+
|
|
179
219
|
`websearch` pipeline:
|
|
180
220
|
|
|
181
221
|
1. **Discovery** — free, public, no-key search endpoints (DuckDuckGo,
|
|
@@ -407,44 +447,62 @@ engine = BIE(BIESettings(
|
|
|
407
447
|
| `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
|
|
408
448
|
| `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
|
|
409
449
|
| `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
|
|
450
|
+
| `discovery_backends` | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Ordered, comma-separated discovery backends for `websearch()`. Add `searxng` for a self-hosted instance. |
|
|
451
|
+
| `searxng_url` | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted SearXNG instance, used by the `searxng` discovery backend |
|
|
410
452
|
| `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
|
|
411
|
-
| — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
|
|
412
|
-
| — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
|
|
413
453
|
|
|
414
|
-
|
|
454
|
+
---
|
|
415
455
|
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
456
|
+
## Troubleshooting
|
|
457
|
+
|
|
458
|
+
**`TypeError: '<' not supported between instances of 'Request' and 'Request'`**
|
|
459
|
+
during a crawl — this was a Bitscrape scheduler bug (its priority queue
|
|
460
|
+
compared `Request` objects directly when two requests shared the same
|
|
461
|
+
priority). BIE patches `bitscrape.Request` to be orderable at import
|
|
462
|
+
time, so this no longer occurs. If you still see it, you're likely on an
|
|
463
|
+
older `bits-bie` version — upgrade.
|
|
464
|
+
|
|
465
|
+
**`RuntimeError: asyncio.run() cannot be called from a running event
|
|
466
|
+
loop`** — Jupyter/Colab/IPython already run an event loop, which used to
|
|
467
|
+
break `engine.crawl(urls)` / `bie.websearch(...)`. Both now detect a
|
|
468
|
+
running loop automatically and either use
|
|
469
|
+
[`nest_asyncio`](https://pypi.org/project/nest_asyncio/) (install via
|
|
470
|
+
`pip install "bits-bie[notebook]"`) or fall back to running the crawl on
|
|
471
|
+
a background thread — no code changes needed. If you're already inside
|
|
472
|
+
an `async def`, you can also call `await engine.acrawl(urls)` directly.
|
|
473
|
+
|
|
474
|
+
**`bie.websearch(...)` returns `[]` / all discovery backends fail** —
|
|
475
|
+
discovery scrapes DuckDuckGo/Bing's public HTML result pages, which can
|
|
476
|
+
be blocked or rate-limited. Call
|
|
477
|
+
`bie.discovery.get_last_discovery_diagnostics()` right after to see why:
|
|
421
478
|
|
|
422
|
-
|
|
423
|
-
|
|
479
|
+
```python
|
|
480
|
+
import bie
|
|
481
|
+
from bie.discovery import get_last_discovery_diagnostics
|
|
424
482
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
Colab) rather than a locked-down sandbox.
|
|
430
|
-
- **"reachable but no results"** — connections succeeded but responses
|
|
431
|
-
were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
|
|
432
|
-
This means the IP is likely being rate-limited; try again later, reduce
|
|
433
|
-
request frequency, or switch to a self-hosted backend (below).
|
|
483
|
+
results = bie.websearch("...")
|
|
484
|
+
if not results:
|
|
485
|
+
print(get_last_discovery_diagnostics().summary())
|
|
486
|
+
```
|
|
434
487
|
|
|
435
|
-
|
|
436
|
-
[SearXNG](https://docs.searxng.org/) instance and point BIE at it:
|
|
488
|
+
This distinguishes three cases:
|
|
437
489
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
490
|
+
- **Network blocked** — every backend failed at the connection level
|
|
491
|
+
(or an egress proxy returned `x-deny-reason: host_not_allowed`). This
|
|
492
|
+
environment can't reach these hosts at all — check its outbound
|
|
493
|
+
network/proxy/firewall config. Common in sandboxed code-execution
|
|
494
|
+
environments; Colab and most servers have unrestricted outbound access.
|
|
495
|
+
- **Blocked / rate-limited** — backends responded with `403`/`429`/etc.,
|
|
496
|
+
typically from bot-detection on a shared IP. Retry later, reduce
|
|
497
|
+
request volume, or configure a `searxng` backend (below).
|
|
498
|
+
- **Empty response** — got `200 OK` but no parseable results (often a
|
|
499
|
+
CAPTCHA/consent page).
|
|
442
500
|
|
|
443
|
-
|
|
444
|
-
SearXNG
|
|
501
|
+
For the most reliable no-API-key discovery, self-host
|
|
502
|
+
[SearXNG](https://github.com/searxng/searxng) and add it as a backend:
|
|
445
503
|
|
|
446
504
|
```bash
|
|
447
|
-
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
|
|
505
|
+
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html
|
|
448
506
|
export BIE_SEARXNG_URL=http://localhost:8080
|
|
449
507
|
```
|
|
450
508
|
|
|
@@ -483,10 +541,10 @@ for Elasticsearch/Milvus-backed implementations behind the same
|
|
|
483
541
|
|
|
484
542
|
---
|
|
485
543
|
|
|
486
|
-
## Built on
|
|
544
|
+
## Built on BitS
|
|
487
545
|
|
|
488
546
|
BIE's crawling and extraction layer is powered by
|
|
489
|
-
[**
|
|
547
|
+
[**BitS**](https://github.com/Sudharsansm/Bitscrape)
|
|
490
548
|
(`pip install bitscrape`), our async, robots.txt-aware web scraping
|
|
491
549
|
framework — giving BIE high-performance, polite crawling out of the box.
|
|
492
550
|
|
|
@@ -10,7 +10,7 @@ API keys, no subscriptions, no third-party search services.**
|
|
|
10
10
|
|
|
11
11
|
BIE gives any LLM, RAG pipeline, or AI agent five core primitives —
|
|
12
12
|
**search, extract, map, crawl, and a hybrid index** — all running locally
|
|
13
|
-
on top of [**
|
|
13
|
+
on top of [**BitS**](https://pypi.org/project/bitscrape/), our
|
|
14
14
|
async crawling framework. Use it as a Python library, REST API, CLI, or
|
|
15
15
|
[MCP](https://modelcontextprotocol.io) server.
|
|
16
16
|
|
|
@@ -56,13 +56,46 @@ state-of-the-art ranking, a commercial search API may still be the right
|
|
|
56
56
|
choice for that piece. BIE is for teams that want a capable, free,
|
|
57
57
|
self-hosted starting point — and full control over the code.
|
|
58
58
|
|
|
59
|
+
### How this compares to ChatGPT Search / Tavily
|
|
60
|
+
|
|
61
|
+
`bie.websearch_response()` is shaped like those tools' "web search" tool
|
|
62
|
+
responses on purpose: ranked, cited `results` with snippets, an
|
|
63
|
+
`answer` field, and `.to_context()` for dropping straight into a prompt.
|
|
64
|
+
Two things are genuinely different, and worth being precise about:
|
|
65
|
+
|
|
66
|
+
- **`answer` is extractive, not generated.** ChatGPT Search and Tavily's
|
|
67
|
+
`include_answer` run an LLM server-side to *write* a summary answer.
|
|
68
|
+
BIE doesn't run an LLM — `answer` is the single best-matching passage
|
|
69
|
+
found (verbatim from a live page). It's a strong starting point for
|
|
70
|
+
*your* LLM/agent to read and synthesize from, not a finished answer on
|
|
71
|
+
its own.
|
|
72
|
+
- **Discovery is "best-effort free", not a dedicated index.** ChatGPT
|
|
73
|
+
Search/Tavily run their own crawl infrastructure and indexes. BIE's
|
|
74
|
+
default discovery scrapes DuckDuckGo/Bing's public result pages, which
|
|
75
|
+
can be rate-limited or return a CAPTCHA instead — `degraded`/`diagnostics` tell
|
|
76
|
+
you when this happens for a given query, so your agent can react (retry,
|
|
77
|
+
fall back to general knowledge, etc.) instead of silently getting a
|
|
78
|
+
bad answer.
|
|
79
|
+
|
|
80
|
+
**SearXNG closes most of that second gap.** Self-hosting
|
|
81
|
+
[SearXNG](https://github.com/searxng/searxng) and adding it as a
|
|
82
|
+
discovery backend (`BIE_DISCOVERY_BACKENDS=searxng,...` +
|
|
83
|
+
`BIE_SEARXNG_URL=...`) gives BIE a stable JSON API that itself aggregates
|
|
84
|
+
Google/Bing/Brave/etc. server-side — far less prone to the "200 OK but
|
|
85
|
+
0 results" failure mode of scraping DDG/Bing HTML directly. It's the
|
|
86
|
+
single highest-leverage change for making `websearch()`'s *discovery*
|
|
87
|
+
step behave consistently. It doesn't change the `answer` field's
|
|
88
|
+
extractive (vs. LLM-generated) nature — that's a property of BIE not
|
|
89
|
+
running an LLM, independent of which discovery backend is used.
|
|
90
|
+
|
|
59
91
|
---
|
|
60
92
|
|
|
61
93
|
## Core primitives
|
|
62
94
|
|
|
63
95
|
| Function | What it does |
|
|
64
96
|
|---|---|
|
|
65
|
-
| `bie.websearch(query)` | Search the live internet — no URLs needed. Free discovery (DuckDuckGo + Bing fallback) with query fan-out, crawled and ranked by BIE's hybrid index. |
|
|
97
|
+
| `bie.websearch(query)` | Search the live internet — no URLs needed. Free discovery (DuckDuckGo + Bing fallback, optional SearXNG) with query fan-out, crawled and ranked by BIE's hybrid index. |
|
|
98
|
+
| `bie.websearch_response(query)` | Like `websearch`, but returns the full Tavily/ChatGPT-Search-shaped response: ranked `results`, an extractive `answer`, `degraded`/`diagnostics`, and `.to_context()` for an LLM-prompt-ready citation block. |
|
|
66
99
|
| `bie.extract(url)` | Fetch a URL and return clean Markdown, with nav/ads/scripts stripped. Optional JS rendering via Playwright. |
|
|
67
100
|
| `bie.map_site(url)` | Discover a site's sitemap(s) and the URLs they list, before crawling. |
|
|
68
101
|
| `bie.crawl_site(urls, instruction=...)` | Crawl a site, prioritizing links by keyword-relevance to your instruction. Returns an index + ranked results. |
|
|
@@ -90,17 +123,10 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
|
|
|
90
123
|
pip install "bits-bie[mcp]" # Model Context Protocol server
|
|
91
124
|
pip install "bits-bie[render]" # JS rendering for extract() via Playwright
|
|
92
125
|
pip install "bits-bie[langchain]" # LangChain tool adapters
|
|
93
|
-
pip install "bits-bie[notebook]" # smoother Jupyter/Colab
|
|
126
|
+
pip install "bits-bie[notebook]" # smoother async behaviour in Jupyter/Colab
|
|
94
127
|
pip install "bits-bie[all]" # everything
|
|
95
128
|
```
|
|
96
129
|
|
|
97
|
-
> **Using BIE in Jupyter / Google Colab?** All sync entry points
|
|
98
|
-
> (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
|
|
99
|
-
> work inside notebooks out of the box — BIE detects the notebook's
|
|
100
|
-
> already-running event loop and handles it automatically. Installing
|
|
101
|
-
> `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
|
|
102
|
-
> efficient, but is not required.
|
|
103
|
-
|
|
104
130
|
> BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
|
|
105
131
|
> proprietary async crawling & extraction framework, which is installed
|
|
106
132
|
> automatically.
|
|
@@ -120,6 +146,20 @@ for r in results:
|
|
|
120
146
|
print(r.snippet)
|
|
121
147
|
```
|
|
122
148
|
|
|
149
|
+
For the full, Tavily/ChatGPT-Search-shaped response — extractive
|
|
150
|
+
`answer`, timing, and `degraded`/`diagnostics` for when live discovery
|
|
151
|
+
doesn't fully succeed:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
response = bie.websearch_response("who won the latest F1 race")
|
|
155
|
+
|
|
156
|
+
print(response.answer) # best-matching passage (not LLM-written)
|
|
157
|
+
print(response.to_context()) # numbered sources block, ready for a prompt
|
|
158
|
+
|
|
159
|
+
if response.degraded:
|
|
160
|
+
print("live data degraded:", response.diagnostics)
|
|
161
|
+
```
|
|
162
|
+
|
|
123
163
|
`websearch` pipeline:
|
|
124
164
|
|
|
125
165
|
1. **Discovery** — free, public, no-key search endpoints (DuckDuckGo,
|
|
@@ -351,44 +391,62 @@ engine = BIE(BIESettings(
|
|
|
351
391
|
| `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
|
|
352
392
|
| `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
|
|
353
393
|
| `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
|
|
394
|
+
| `discovery_backends` | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Ordered, comma-separated discovery backends for `websearch()`. Add `searxng` for a self-hosted instance. |
|
|
395
|
+
| `searxng_url` | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted SearXNG instance, used by the `searxng` discovery backend |
|
|
354
396
|
| `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
|
|
355
|
-
| — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
|
|
356
|
-
| — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
|
|
357
397
|
|
|
358
|
-
|
|
398
|
+
---
|
|
359
399
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
400
|
+
## Troubleshooting
|
|
401
|
+
|
|
402
|
+
**`TypeError: '<' not supported between instances of 'Request' and 'Request'`**
|
|
403
|
+
during a crawl — this was a Bitscrape scheduler bug (its priority queue
|
|
404
|
+
compared `Request` objects directly when two requests shared the same
|
|
405
|
+
priority). BIE patches `bitscrape.Request` to be orderable at import
|
|
406
|
+
time, so this no longer occurs. If you still see it, you're likely on an
|
|
407
|
+
older `bits-bie` version — upgrade.
|
|
408
|
+
|
|
409
|
+
**`RuntimeError: asyncio.run() cannot be called from a running event
|
|
410
|
+
loop`** — Jupyter/Colab/IPython already run an event loop, which used to
|
|
411
|
+
break `engine.crawl(urls)` / `bie.websearch(...)`. Both now detect a
|
|
412
|
+
running loop automatically and either use
|
|
413
|
+
[`nest_asyncio`](https://pypi.org/project/nest_asyncio/) (install via
|
|
414
|
+
`pip install "bits-bie[notebook]"`) or fall back to running the crawl on
|
|
415
|
+
a background thread — no code changes needed. If you're already inside
|
|
416
|
+
an `async def`, you can also call `await engine.acrawl(urls)` directly.
|
|
417
|
+
|
|
418
|
+
**`bie.websearch(...)` returns `[]` / all discovery backends fail** —
|
|
419
|
+
discovery scrapes DuckDuckGo/Bing's public HTML result pages, which can
|
|
420
|
+
be blocked or rate-limited. Call
|
|
421
|
+
`bie.discovery.get_last_discovery_diagnostics()` right after to see why:
|
|
365
422
|
|
|
366
|
-
|
|
367
|
-
|
|
423
|
+
```python
|
|
424
|
+
import bie
|
|
425
|
+
from bie.discovery import get_last_discovery_diagnostics
|
|
368
426
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
Colab) rather than a locked-down sandbox.
|
|
374
|
-
- **"reachable but no results"** — connections succeeded but responses
|
|
375
|
-
were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
|
|
376
|
-
This means the IP is likely being rate-limited; try again later, reduce
|
|
377
|
-
request frequency, or switch to a self-hosted backend (below).
|
|
427
|
+
results = bie.websearch("...")
|
|
428
|
+
if not results:
|
|
429
|
+
print(get_last_discovery_diagnostics().summary())
|
|
430
|
+
```
|
|
378
431
|
|
|
379
|
-
|
|
380
|
-
[SearXNG](https://docs.searxng.org/) instance and point BIE at it:
|
|
432
|
+
This distinguishes three cases:
|
|
381
433
|
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
434
|
+
- **Network blocked** — every backend failed at the connection level
|
|
435
|
+
(or an egress proxy returned `x-deny-reason: host_not_allowed`). This
|
|
436
|
+
environment can't reach these hosts at all — check its outbound
|
|
437
|
+
network/proxy/firewall config. Common in sandboxed code-execution
|
|
438
|
+
environments; Colab and most servers have unrestricted outbound access.
|
|
439
|
+
- **Blocked / rate-limited** — backends responded with `403`/`429`/etc.,
|
|
440
|
+
typically from bot-detection on a shared IP. Retry later, reduce
|
|
441
|
+
request volume, or configure a `searxng` backend (below).
|
|
442
|
+
- **Empty response** — got `200 OK` but no parseable results (often a
|
|
443
|
+
CAPTCHA/consent page).
|
|
386
444
|
|
|
387
|
-
|
|
388
|
-
SearXNG
|
|
445
|
+
For the most reliable no-API-key discovery, self-host
|
|
446
|
+
[SearXNG](https://github.com/searxng/searxng) and add it as a backend:
|
|
389
447
|
|
|
390
448
|
```bash
|
|
391
|
-
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
|
|
449
|
+
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html
|
|
392
450
|
export BIE_SEARXNG_URL=http://localhost:8080
|
|
393
451
|
```
|
|
394
452
|
|
|
@@ -427,10 +485,10 @@ for Elasticsearch/Milvus-backed implementations behind the same
|
|
|
427
485
|
|
|
428
486
|
---
|
|
429
487
|
|
|
430
|
-
## Built on
|
|
488
|
+
## Built on BitS
|
|
431
489
|
|
|
432
490
|
BIE's crawling and extraction layer is powered by
|
|
433
|
-
[**
|
|
491
|
+
[**BitS**](https://github.com/Sudharsansm/Bitscrape)
|
|
434
492
|
(`pip install bitscrape`), our async, robots.txt-aware web scraping
|
|
435
493
|
framework — giving BIE high-performance, polite crawling out of the box.
|
|
436
494
|
|
|
@@ -10,6 +10,10 @@ Core primitives
|
|
|
10
10
|
----------------
|
|
11
11
|
|
|
12
12
|
- :func:`websearch` — search the live internet for a query (no URLs needed)
|
|
13
|
+
- :func:`websearch_response` — like ``websearch``, but returns a full
|
|
14
|
+
:class:`SearchResponse` (extractive ``answer``, ``took_ms``,
|
|
15
|
+
``degraded``/``diagnostics``, and ``.to_context()`` for LLM prompts) —
|
|
16
|
+
the Tavily/ChatGPT-Search-style "web search tool" shape
|
|
13
17
|
- :func:`search` — crawl + rank specific URLs against a query
|
|
14
18
|
- :func:`extract` — get clean Markdown from a single URL
|
|
15
19
|
- :func:`map_site` — discover a site's sitemap before crawling
|
|
@@ -29,6 +33,11 @@ Quick start
|
|
|
29
33
|
print(r.title, r.url)
|
|
30
34
|
print(r.snippet)
|
|
31
35
|
|
|
36
|
+
# Or get the full response — extractive answer + LLM-ready context
|
|
37
|
+
response = bie.websearch_response("who won the latest F1 race")
|
|
38
|
+
print(response.answer) # best-matching passage (not LLM-written)
|
|
39
|
+
print(response.to_context()) # numbered sources block, ready for a prompt
|
|
40
|
+
|
|
32
41
|
# Get clean markdown from a specific page
|
|
33
42
|
page = bie.extract("https://example.com/article")
|
|
34
43
|
print(page.markdown)
|
|
@@ -59,8 +68,8 @@ from importlib import metadata as _metadata
|
|
|
59
68
|
from bie.config import BIESettings
|
|
60
69
|
from bie.engine import BIE
|
|
61
70
|
from bie.extract import ExtractError, ExtractResult, extract
|
|
62
|
-
from bie.models import Document, SearchResult
|
|
63
|
-
from bie.quicksearch import search, websearch
|
|
71
|
+
from bie.models import Document, SearchResponse, SearchResult
|
|
72
|
+
from bie.quicksearch import search, websearch, websearch_response
|
|
64
73
|
from bie.security import SecurityFinding, SecurityReport, scan_for_prompt_injection
|
|
65
74
|
from bie.sitecrawl import crawl_site
|
|
66
75
|
from bie.sitemap import SiteMap, map_site
|
|
@@ -70,15 +79,17 @@ try:
|
|
|
70
79
|
__version__ = _metadata.version("bits-bie")
|
|
71
80
|
except _metadata.PackageNotFoundError:
|
|
72
81
|
# Editable/source checkout without installed metadata.
|
|
73
|
-
__version__ = "1.2.
|
|
82
|
+
__version__ = "1.2.5"
|
|
74
83
|
|
|
75
84
|
__all__ = [
|
|
76
85
|
"BIE",
|
|
77
86
|
"BIESettings",
|
|
78
87
|
"Document",
|
|
79
88
|
"SearchResult",
|
|
89
|
+
"SearchResponse",
|
|
80
90
|
"search",
|
|
81
91
|
"websearch",
|
|
92
|
+
"websearch_response",
|
|
82
93
|
"extract",
|
|
83
94
|
"ExtractResult",
|
|
84
95
|
"ExtractError",
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Internal helper for calling async BIE internals from synchronous code.
|
|
3
|
+
|
|
4
|
+
Plain scripts have no running event loop, so ``asyncio.run()`` works fine.
|
|
5
|
+
Jupyter/Colab/IPython kernels, however, *already* run an event loop, and
|
|
6
|
+
``asyncio.run()`` raises::
|
|
7
|
+
|
|
8
|
+
RuntimeError: asyncio.run() cannot be called from a running event loop
|
|
9
|
+
|
|
10
|
+
:func:`run_sync` detects this and transparently falls back to:
|
|
11
|
+
|
|
12
|
+
1. ``nest_asyncio`` (if installed) — patches the running loop so it can be
|
|
13
|
+
re-entered, then runs the coroutine on it directly.
|
|
14
|
+
2. A dedicated background thread with its own fresh event loop — works
|
|
15
|
+
everywhere, with zero extra dependencies, at the cost of a thread
|
|
16
|
+
spin-up per call.
|
|
17
|
+
|
|
18
|
+
This means the same sync call (e.g. ``engine.crawl(urls)``) works
|
|
19
|
+
unchanged in plain scripts, notebooks, and servers.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import asyncio
|
|
25
|
+
import threading
|
|
26
|
+
from typing import Any, Coroutine, TypeVar
|
|
27
|
+
|
|
28
|
+
T = TypeVar("T")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def run_sync(coro: Coroutine[Any, Any, T]) -> T:
    """Run ``coro`` to completion and return its result, regardless of
    whether the calling thread already has an asyncio event loop running.

    Strategy, in order:

    1. No loop running in this thread: plain ``asyncio.run(coro)``.
    2. A loop is running and ``nest_asyncio`` is importable: patch that
       loop so it can be re-entered and run the coroutine on it.
    3. Otherwise (``nest_asyncio`` missing, or the running loop is of a
       type ``nest_asyncio`` cannot patch): run the coroutine on a fresh
       loop in a background thread via ``_run_in_new_thread``.

    Args:
        coro: An awaitable coroutine object (not yet awaited/started).

    Returns:
        The coroutine's return value.

    Raises:
        Whatever exception the coroutine itself raises.
    """
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread — the normal case for scripts,
        # CLI commands, and server request handlers.
        return asyncio.run(coro)

    # A loop is already running in this thread (e.g. Jupyter/Colab/IPython,
    # or an async framework that called into sync BIE code).
    try:
        import nest_asyncio  # type: ignore[import-not-found]
    except ImportError:
        return _run_in_new_thread(coro)

    try:
        # Patch the loop we actually found running rather than asking the
        # policy for "the" event loop a second time.
        nest_asyncio.apply(running_loop)
    except ValueError:
        # nest_asyncio refuses to patch loops that are not pure-Python
        # asyncio loops (e.g. uvloop). Having the optional package installed
        # must not make things worse than not having it, so use the
        # dependency-free thread fallback instead of propagating the error.
        return _run_in_new_thread(coro)

    return running_loop.run_until_complete(coro)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _run_in_new_thread(coro: Coroutine[Any, Any, T]) -> T:
|
|
64
|
+
"""Run ``coro`` to completion on a fresh event loop in a new thread.
|
|
65
|
+
|
|
66
|
+
Used as the dependency-free fallback when a loop is already running in
|
|
67
|
+
the calling thread and ``nest_asyncio`` isn't installed.
|
|
68
|
+
"""
|
|
69
|
+
result: dict[str, Any] = {}
|
|
70
|
+
error: dict[str, BaseException] = {}
|
|
71
|
+
|
|
72
|
+
def _runner() -> None:
|
|
73
|
+
try:
|
|
74
|
+
result["value"] = asyncio.run(coro)
|
|
75
|
+
except BaseException as exc: # noqa: BLE001 - re-raised on the caller's thread
|
|
76
|
+
error["value"] = exc
|
|
77
|
+
|
|
78
|
+
thread = threading.Thread(target=_runner, name="bie-async-runner", daemon=True)
|
|
79
|
+
thread.start()
|
|
80
|
+
thread.join()
|
|
81
|
+
|
|
82
|
+
if "value" in error:
|
|
83
|
+
raise error["value"]
|
|
84
|
+
return result["value"] # type: ignore[return-value]
|
|
@@ -70,6 +70,12 @@ def search(query: str, urls: tuple[str, ...], top_k: int, max_pages: int, no_emb
|
|
|
70
70
|
@click.option("--no-deep", is_flag=True, help="Skip crawling; return raw discovery order without snippets")
|
|
71
71
|
@click.option("--no-embeddings", is_flag=True, help="Disable semantic/vector re-ranking (BM25 only)")
|
|
72
72
|
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
|
|
73
|
+
@click.option(
|
|
74
|
+
"--context",
|
|
75
|
+
"as_context",
|
|
76
|
+
is_flag=True,
|
|
77
|
+
help="Output a numbered, citation-ready text block for an LLM prompt (response.to_context())",
|
|
78
|
+
)
|
|
73
79
|
def search_live(
|
|
74
80
|
query: str,
|
|
75
81
|
top_k: int,
|
|
@@ -77,16 +83,18 @@ def search_live(
|
|
|
77
83
|
no_deep: bool,
|
|
78
84
|
no_embeddings: bool,
|
|
79
85
|
as_json: bool,
|
|
86
|
+
as_context: bool,
|
|
80
87
|
) -> None:
|
|
81
88
|
"""Search the live internet for QUERY — no seed URLs, no API key, no subscription.
|
|
82
89
|
|
|
83
90
|
Discovers relevant URLs via free public search endpoints (DuckDuckGo,
|
|
84
|
-
|
|
91
|
+
Bing, and optionally a self-hosted SearXNG instance — see
|
|
92
|
+
BIE_DISCOVERY_BACKENDS), crawls them with Bitscrape, and ranks the
|
|
85
93
|
extracted content against QUERY using BIE's hybrid BM25+vector index.
|
|
86
94
|
"""
|
|
87
95
|
import bie
|
|
88
96
|
|
|
89
|
-
|
|
97
|
+
response = bie.websearch_response(
|
|
90
98
|
query,
|
|
91
99
|
top_k=top_k,
|
|
92
100
|
discovery_results=discovery_results,
|
|
@@ -95,23 +103,40 @@ def search_live(
|
|
|
95
103
|
)
|
|
96
104
|
|
|
97
105
|
if as_json:
|
|
98
|
-
click.echo(
|
|
106
|
+
click.echo(response.model_dump_json(indent=2))
|
|
107
|
+
return
|
|
108
|
+
|
|
109
|
+
if as_context:
|
|
110
|
+
click.echo(response.to_context())
|
|
99
111
|
return
|
|
100
112
|
|
|
101
|
-
if not results:
|
|
113
|
+
if not response.results:
|
|
102
114
|
click.echo(
|
|
103
115
|
"No results found. The free search backends may be temporarily "
|
|
104
|
-
"rate-limiting — try again in a moment
|
|
116
|
+
"rate-limiting — try again in a moment.\n"
|
|
105
117
|
)
|
|
118
|
+
if response.diagnostics:
|
|
119
|
+
click.echo(f"Diagnosis: {response.diagnostics}")
|
|
106
120
|
return
|
|
107
121
|
|
|
108
|
-
|
|
122
|
+
if response.answer:
|
|
123
|
+
click.echo(f"Answer: {response.answer}\n")
|
|
124
|
+
|
|
125
|
+
if response.degraded:
|
|
126
|
+
click.echo("⚠ Live discovery/crawling was degraded for this query.")
|
|
127
|
+
if response.diagnostics:
|
|
128
|
+
click.echo(f" {response.diagnostics}")
|
|
129
|
+
click.echo()
|
|
130
|
+
|
|
131
|
+
for i, r in enumerate(response.results, 1):
|
|
109
132
|
click.echo(f"\n{i}. {r.title}")
|
|
110
133
|
click.echo(f" {r.url}")
|
|
111
134
|
click.echo(f" score={r.score:.4f}")
|
|
112
135
|
if r.snippet:
|
|
113
136
|
click.echo(f" {r.snippet}")
|
|
114
137
|
|
|
138
|
+
click.echo(f"\n({response.took_ms:.0f}ms)")
|
|
139
|
+
|
|
115
140
|
|
|
116
141
|
@cli.command()
|
|
117
142
|
@click.argument("urls", nargs=-1, required=True)
|
|
@@ -39,6 +39,25 @@ class BIESettings(BaseSettings):
|
|
|
39
39
|
index_dir: str = Field(".bie_index", description="Directory for persisted index")
|
|
40
40
|
persist: bool = Field(False, description="Persist index to disk between runs")
|
|
41
41
|
|
|
42
|
+
# --- Discovery (no-API-key web search) ----------------------------------
|
|
43
|
+
discovery_backends: str = Field(
|
|
44
|
+
"ddg_html,ddg_lite,bing_html",
|
|
45
|
+
description="Comma-separated, ordered list of discovery backends to "
|
|
46
|
+
"try for bie.websearch()/discover_urls(). Built-in backends: "
|
|
47
|
+
"'ddg_html', 'ddg_lite', 'bing_html', 'searxng'. The 'searxng' "
|
|
48
|
+
"backend requires `searxng_url` to also be set. Unknown names are "
|
|
49
|
+
"skipped with a warning. Override with the BIE_DISCOVERY_BACKENDS "
|
|
50
|
+
"env var, e.g. BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html",
|
|
51
|
+
)
|
|
52
|
+
searxng_url: str | None = Field(
|
|
53
|
+
default=None,
|
|
54
|
+
description="Base URL of a self-hosted SearXNG instance (e.g. "
|
|
55
|
+
"'http://localhost:8080'), used by the 'searxng' discovery backend. "
|
|
56
|
+
"Self-hosting SearXNG is the most reliable no-API-key discovery "
|
|
57
|
+
"option since it isn't subject to the rate limits / layout changes "
|
|
58
|
+
"that affect scraping DDG/Bing HTML directly.",
|
|
59
|
+
)
|
|
60
|
+
|
|
42
61
|
# --- Server --------------------------------------------------------------
|
|
43
62
|
host: str = "0.0.0.0"
|
|
44
63
|
port: int = 8000
|