bits-bie 1.2.2__tar.gz → 1.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {bits_bie-1.2.2 → bits_bie-1.2.5}/.github/workflows/ci.yml +1 -1
  2. {bits_bie-1.2.2 → bits_bie-1.2.5}/PKG-INFO +99 -41
  3. {bits_bie-1.2.2 → bits_bie-1.2.5}/README.md +98 -40
  4. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/__init__.py +14 -3
  5. bits_bie-1.2.5/bie/_asyncutil.py +84 -0
  6. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/cli.py +31 -6
  7. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/config.py +19 -0
  8. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/crawler.py +32 -24
  9. bits_bie-1.2.5/bie/discovery.py +642 -0
  10. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/engine.py +25 -0
  11. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/extract.py +2 -2
  12. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/integrations/langchain.py +6 -9
  13. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/mcp/server.py +27 -4
  14. bits_bie-1.2.5/bie/models.py +151 -0
  15. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/quicksearch.py +122 -22
  16. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/server.py +17 -7
  17. {bits_bie-1.2.2 → bits_bie-1.2.5}/pyproject.toml +1 -1
  18. bits_bie-1.2.5/tests/test_asyncutil.py +110 -0
  19. bits_bie-1.2.5/tests/test_crawler.py +134 -0
  20. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_discovery.py +104 -3
  21. bits_bie-1.2.5/tests/test_discovery_errors.py +322 -0
  22. bits_bie-1.2.5/tests/test_websearch_response.py +238 -0
  23. bits_bie-1.2.2/bie/discovery.py +0 -467
  24. bits_bie-1.2.2/bie/models.py +0 -76
  25. {bits_bie-1.2.2 → bits_bie-1.2.5}/.github/workflows/publish.yml +0 -0
  26. {bits_bie-1.2.2 → bits_bie-1.2.5}/.gitignore +0 -0
  27. {bits_bie-1.2.2 → bits_bie-1.2.5}/LICENSE +0 -0
  28. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/_async_utils.py +0 -0
  29. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/chunker.py +0 -0
  30. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/index.py +0 -0
  31. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/integrations/__init__.py +0 -0
  32. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/mcp/__init__.py +0 -0
  33. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/query_expansion.py +0 -0
  34. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/security.py +0 -0
  35. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/sitecrawl.py +0 -0
  36. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/sitemap.py +0 -0
  37. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/spiders/__init__.py +0 -0
  38. {bits_bie-1.2.2 → bits_bie-1.2.5}/bie/spiders/generic.py +0 -0
  39. {bits_bie-1.2.2 → bits_bie-1.2.5}/docs/API.md +0 -0
  40. {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/basic_search.py +0 -0
  41. {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/extract_page.py +0 -0
  42. {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/map_and_crawl.py +0 -0
  43. {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/reusable_index.py +0 -0
  44. {bits_bie-1.2.2 → bits_bie-1.2.5}/examples/web_search.py +0 -0
  45. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_async_utils.py +0 -0
  46. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_chunker.py +0 -0
  47. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_crawler_notebook_safe.py +0 -0
  48. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_crawler_request_patch.py +0 -0
  49. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_discovery_error_handling.py +0 -0
  50. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_engine.py +0 -0
  51. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_extract.py +0 -0
  52. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_index.py +0 -0
  53. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_query_expansion.py +0 -0
  54. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_quicksearch.py +0 -0
  55. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_security.py +0 -0
  56. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_sitecrawl.py +0 -0
  57. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_sitemap.py +0 -0
  58. {bits_bie-1.2.2 → bits_bie-1.2.5}/tests/test_spider_relevance.py +0 -0
@@ -24,4 +24,4 @@ jobs:
24
24
  - name: Run tests
25
25
  run: pytest -v
26
26
  - name: Lint
27
- run: ruff check bie tests
27
+ run: ruff check bie tests
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bits-bie
3
- Version: 1.2.2
3
+ Version: 1.2.5
4
4
  Summary: BitSearch Intelligence Engine — real-time, citation-backed web search & extraction for AI apps. Built on Bitscrape.
5
5
  Project-URL: Homepage, https://github.com/Sudharsansm/BIE
6
6
  Project-URL: Repository, https://github.com/Sudharsansm/BIE
@@ -66,7 +66,7 @@ API keys, no subscriptions, no third-party search services.**
66
66
 
67
67
  BIE gives any LLM, RAG pipeline, or AI agent five core primitives —
68
68
  **search, extract, map, crawl, and a hybrid index** — all running locally
69
- on top of [**Bitscrape**](https://pypi.org/project/bitscrape/), our
69
+ on top of [**BitS**](https://pypi.org/project/bitscrape/), our
70
70
  async crawling framework. Use it as a Python library, REST API, CLI, or
71
71
  [MCP](https://modelcontextprotocol.io) server.
72
72
 
@@ -112,13 +112,46 @@ state-of-the-art ranking, a commercial search API may still be the right
112
112
  choice for that piece. BIE is for teams that want a capable, free,
113
113
  self-hosted starting point — and full control over the code.
114
114
 
115
+ ### How this compares to ChatGPT Search / Tavily
116
+
117
+ `bie.websearch_response()` is shaped like those tools' "web search" tool
118
+ responses on purpose: ranked, cited `results` with snippets, an
119
+ `answer` field, and `.to_context()` for dropping straight into a prompt.
120
+ Two things are genuinely different, and worth being precise about:
121
+
122
+ - **`answer` is extractive, not generated.** ChatGPT Search and Tavily's
123
+ `include_answer` run an LLM server-side to *write* a summary answer.
124
+ BIE doesn't run an LLM — `answer` is the single best-matching passage
125
+ found (verbatim from a live page). It's a strong starting point for
126
+ *your* LLM/agent to read and synthesize from, not a finished answer on
127
+ its own.
128
+ - **Discovery is "best-effort free", not a dedicated index.** ChatGPT
129
+ Search/Tavily run their own crawl infrastructure and indexes. BIE's
130
+ default discovery scrapes DuckDuckGo/Bing's public result pages, which
131
+ can be rate-limited or served a CAPTCHA — `degraded`/`diagnostics` tell
132
+ you when this happens for a given query, so your agent can react (retry,
133
+ fall back to general knowledge, etc.) instead of silently getting a
134
+ bad answer.
135
+
136
+ **SearXNG closes most of that second gap.** Self-hosting
137
+ [SearXNG](https://github.com/searxng/searxng) and adding it as a
138
+ discovery backend (`BIE_DISCOVERY_BACKENDS=searxng,...` +
139
+ `BIE_SEARXNG_URL=...`) gives BIE a stable JSON API that itself aggregates
140
+ Google/Bing/Brave/etc. server-side — far less prone to the "200 OK but
141
+ 0 results" failure mode of scraping DDG/Bing HTML directly. It's the
142
+ single highest-leverage change for making `websearch()`'s *discovery*
143
+ step behave consistently. It doesn't change the `answer` field's
144
+ extractive (vs. LLM-generated) nature — that's a property of BIE not
145
+ running an LLM, independent of which discovery backend is used.
146
+
115
147
  ---
116
148
 
117
149
  ## Core primitives
118
150
 
119
151
  | Function | What it does |
120
152
  |---|---|
121
- | `bie.websearch(query)` | Search the live internet — no URLs needed. Free discovery (DuckDuckGo + Bing fallback) with query fan-out, crawled and ranked by BIE's hybrid index. |
153
+ | `bie.websearch(query)` | Search the live internet — no URLs needed. Free discovery (DuckDuckGo + Bing fallback, optional SearXNG) with query fan-out, crawled and ranked by BIE's hybrid index. |
154
+ | `bie.websearch_response(query)` | Like `websearch`, but returns the full Tavily/ChatGPT-Search-shaped response: ranked `results`, an extractive `answer`, `degraded`/`diagnostics`, and `.to_context()` for an LLM-prompt-ready citation block. |
122
155
  | `bie.extract(url)` | Fetch a URL and return clean Markdown, with nav/ads/scripts stripped. Optional JS rendering via Playwright. |
123
156
  | `bie.map_site(url)` | Discover a site's sitemap(s) and the URLs they list, before crawling. |
124
157
  | `bie.crawl_site(urls, instruction=...)` | Crawl a site, prioritizing links by keyword-relevance to your instruction. Returns an index + ranked results. |
@@ -146,17 +179,10 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
146
179
  pip install "bits-bie[mcp]" # Model Context Protocol server
147
180
  pip install "bits-bie[render]" # JS rendering for extract() via Playwright
148
181
  pip install "bits-bie[langchain]" # LangChain tool adapters
149
- pip install "bits-bie[notebook]" # smoother Jupyter/Colab support (nest_asyncio)
182
+ pip install "bits-bie[notebook]" # smoother async behaviour in Jupyter/Colab
150
183
  pip install "bits-bie[all]" # everything
151
184
  ```
152
185
 
153
- > **Using BIE in Jupyter / Google Colab?** All sync entry points
154
- > (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
155
- > work inside notebooks out of the box — BIE detects the notebook's
156
- > already-running event loop and handles it automatically. Installing
157
- > `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
158
- > efficient, but is not required.
159
-
160
186
  > BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
161
187
  > proprietary async crawling & extraction framework, which is installed
162
188
  > automatically.
@@ -176,6 +202,20 @@ for r in results:
176
202
  print(r.snippet)
177
203
  ```
178
204
 
205
+ For the full, Tavily/ChatGPT-Search-shaped response — extractive
206
+ `answer`, timing, and `degraded`/`diagnostics` for when live discovery
207
+ doesn't fully succeed:
208
+
209
+ ```python
210
+ response = bie.websearch_response("who won the latest F1 race")
211
+
212
+ print(response.answer) # best-matching passage (not LLM-written)
213
+ print(response.to_context()) # numbered sources block, ready for a prompt
214
+
215
+ if response.degraded:
216
+ print("live data degraded:", response.diagnostics)
217
+ ```
218
+
179
219
  `websearch` pipeline:
180
220
 
181
221
  1. **Discovery** — free, public, no-key search endpoints (DuckDuckGo,
@@ -407,44 +447,62 @@ engine = BIE(BIESettings(
407
447
  | `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
408
448
  | `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
409
449
  | `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
450
+ | `discovery_backends` | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Ordered, comma-separated discovery backends for `websearch()`. Add `searxng` for a self-hosted instance. |
451
+ | `searxng_url` | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted SearXNG instance, used by the `searxng` discovery backend |
410
452
  | `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
411
- | — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
412
- | — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
413
453
 
414
- ### Discovery backends & troubleshooting empty `websearch()` results
454
+ ---
415
455
 
416
- `websearch()` discovers candidate URLs by scraping public search-engine
417
- result pages (DuckDuckGo HTML, DuckDuckGo Lite, Bing HTML, in that order
418
- by default). This is inherently fragile — these are not official APIs,
419
- and shared/cloud IPs (CI runners, some notebook hosts, restrictive
420
- sandboxes) can be rate-limited or blocked entirely.
456
+ ## Troubleshooting
457
+
458
+ **`TypeError: '<' not supported between instances of 'Request' and 'Request'`**
459
+ during a crawl — this was a Bitscrape scheduler bug (its priority queue
460
+ compared `Request` objects directly when two requests shared the same
461
+ priority). BIE patches `bitscrape.Request` to be orderable at import
462
+ time, so this no longer occurs. If you still see it, you're likely on an
463
+ older `bits-bie` version — upgrade.
464
+
465
+ **`RuntimeError: asyncio.run() cannot be called from a running event
466
+ loop`** — Jupyter/Colab/IPython already run an event loop, which used to
467
+ break `engine.crawl(urls)` / `bie.websearch(...)`. Both now detect a
468
+ running loop automatically and either use
469
+ [`nest_asyncio`](https://pypi.org/project/nest_asyncio/) (install via
470
+ `pip install "bits-bie[notebook]"`) or fall back to running the crawl on
471
+ a background thread — no code changes needed. If you're already inside
472
+ an `async def`, you can also call `await engine.acrawl(urls)` directly.
473
+
474
+ **`bie.websearch(...)` returns `[]` / all discovery backends fail** —
475
+ discovery scrapes DuckDuckGo/Bing's public HTML result pages, which can
476
+ be blocked or rate-limited. Call
477
+ `bie.discovery.get_last_discovery_diagnostics()` right after to see why:
421
478
 
422
- If `websearch()` returns `[]`, BIE logs a `WARNING` that distinguishes
423
- two failure categories:
479
+ ```python
480
+ import bie
481
+ from bie.discovery import get_last_discovery_diagnostics
424
482
 
425
- - **"network blocked"** — every backend failed at the connection level
426
- (timeouts, connection refused, or a sandbox/proxy denial). This means
427
- the environment itself can't reach these hosts — re-run in an
428
- environment with normal internet access (a local machine, server, or
429
- Colab) rather than a locked-down sandbox.
430
- - **"reachable but no results"** — connections succeeded but responses
431
- were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
432
- This means the IP is likely being rate-limited; try again later, reduce
433
- request frequency, or switch to a self-hosted backend (below).
483
+ results = bie.websearch("...")
484
+ if not results:
485
+ print(get_last_discovery_diagnostics().summary())
486
+ ```
434
487
 
435
- For a durable fix to rate-limiting, run a self-hosted
436
- [SearXNG](https://docs.searxng.org/) instance and point BIE at it:
488
+ This distinguishes three cases:
437
489
 
438
- ```bash
439
- export BIE_DISCOVERY_BACKENDS=searxng
440
- export BIE_SEARXNG_URL=http://localhost:8080
441
- ```
490
+ - **Network blocked** — every backend failed at the connection level
491
+ (or an egress proxy returned `x-deny-reason: host_not_allowed`). This
492
+ environment can't reach these hosts at all — check its outbound
493
+ network/proxy/firewall config. Common in sandboxed code-execution
494
+ environments; Colab and most servers have unrestricted outbound access.
495
+ - **Blocked / rate-limited** — backends responded with `403`/`429`/etc.,
496
+ typically from bot-detection on a shared IP. Retry later, reduce
497
+ request volume, or configure a `searxng` backend (below).
498
+ - **Empty response** — got `200 OK` but no parseable results (often a
499
+ CAPTCHA/consent page).
442
500
 
443
- You can also combine backends and reorder them, e.g. to prefer your
444
- SearXNG instance but fall back to DuckDuckGo:
501
+ For the most reliable no-API-key discovery, self-host
502
+ [SearXNG](https://github.com/searxng/searxng) and add it as a backend:
445
503
 
446
504
  ```bash
447
- export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
505
+ export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html
448
506
  export BIE_SEARXNG_URL=http://localhost:8080
449
507
  ```
450
508
 
@@ -483,10 +541,10 @@ for Elasticsearch/Milvus-backed implementations behind the same
483
541
 
484
542
  ---
485
543
 
486
- ## Built on Bitscrape
544
+ ## Built on BitS
487
545
 
488
546
  BIE's crawling and extraction layer is powered by
489
- [**Bitscrape**](https://github.com/Sudharsansm/Bitscrape)
547
+ [**BitS**](https://github.com/Sudharsansm/Bitscrape)
490
548
  (`pip install bitscrape`), our async, robots.txt-aware web scraping
491
549
  framework — giving BIE high-performance, polite crawling out of the box.
492
550
 
@@ -10,7 +10,7 @@ API keys, no subscriptions, no third-party search services.**
10
10
 
11
11
  BIE gives any LLM, RAG pipeline, or AI agent five core primitives —
12
12
  **search, extract, map, crawl, and a hybrid index** — all running locally
13
- on top of [**Bitscrape**](https://pypi.org/project/bitscrape/), our
13
+ on top of [**BitS**](https://pypi.org/project/bitscrape/), our
14
14
  async crawling framework. Use it as a Python library, REST API, CLI, or
15
15
  [MCP](https://modelcontextprotocol.io) server.
16
16
 
@@ -56,13 +56,46 @@ state-of-the-art ranking, a commercial search API may still be the right
56
56
  choice for that piece. BIE is for teams that want a capable, free,
57
57
  self-hosted starting point — and full control over the code.
58
58
 
59
+ ### How this compares to ChatGPT Search / Tavily
60
+
61
+ `bie.websearch_response()` is shaped like those tools' "web search" tool
62
+ responses on purpose: ranked, cited `results` with snippets, an
63
+ `answer` field, and `.to_context()` for dropping straight into a prompt.
64
+ Two things are genuinely different, and worth being precise about:
65
+
66
+ - **`answer` is extractive, not generated.** ChatGPT Search and Tavily's
67
+ `include_answer` run an LLM server-side to *write* a summary answer.
68
+ BIE doesn't run an LLM — `answer` is the single best-matching passage
69
+ found (verbatim from a live page). It's a strong starting point for
70
+ *your* LLM/agent to read and synthesize from, not a finished answer on
71
+ its own.
72
+ - **Discovery is "best-effort free", not a dedicated index.** ChatGPT
73
+ Search/Tavily run their own crawl infrastructure and indexes. BIE's
74
+ default discovery scrapes DuckDuckGo/Bing's public result pages, which
75
+ can be rate-limited or served a CAPTCHA — `degraded`/`diagnostics` tell
76
+ you when this happens for a given query, so your agent can react (retry,
77
+ fall back to general knowledge, etc.) instead of silently getting a
78
+ bad answer.
79
+
80
+ **SearXNG closes most of that second gap.** Self-hosting
81
+ [SearXNG](https://github.com/searxng/searxng) and adding it as a
82
+ discovery backend (`BIE_DISCOVERY_BACKENDS=searxng,...` +
83
+ `BIE_SEARXNG_URL=...`) gives BIE a stable JSON API that itself aggregates
84
+ Google/Bing/Brave/etc. server-side — far less prone to the "200 OK but
85
+ 0 results" failure mode of scraping DDG/Bing HTML directly. It's the
86
+ single highest-leverage change for making `websearch()`'s *discovery*
87
+ step behave consistently. It doesn't change the `answer` field's
88
+ extractive (vs. LLM-generated) nature — that's a property of BIE not
89
+ running an LLM, independent of which discovery backend is used.
90
+
59
91
  ---
60
92
 
61
93
  ## Core primitives
62
94
 
63
95
  | Function | What it does |
64
96
  |---|---|
65
- | `bie.websearch(query)` | Search the live internet — no URLs needed. Free discovery (DuckDuckGo + Bing fallback) with query fan-out, crawled and ranked by BIE's hybrid index. |
97
+ | `bie.websearch(query)` | Search the live internet — no URLs needed. Free discovery (DuckDuckGo + Bing fallback, optional SearXNG) with query fan-out, crawled and ranked by BIE's hybrid index. |
98
+ | `bie.websearch_response(query)` | Like `websearch`, but returns the full Tavily/ChatGPT-Search-shaped response: ranked `results`, an extractive `answer`, `degraded`/`diagnostics`, and `.to_context()` for an LLM-prompt-ready citation block. |
66
99
  | `bie.extract(url)` | Fetch a URL and return clean Markdown, with nav/ads/scripts stripped. Optional JS rendering via Playwright. |
67
100
  | `bie.map_site(url)` | Discover a site's sitemap(s) and the URLs they list, before crawling. |
68
101
  | `bie.crawl_site(urls, instruction=...)` | Crawl a site, prioritizing links by keyword-relevance to your instruction. Returns an index + ranked results. |
@@ -90,17 +123,10 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
90
123
  pip install "bits-bie[mcp]" # Model Context Protocol server
91
124
  pip install "bits-bie[render]" # JS rendering for extract() via Playwright
92
125
  pip install "bits-bie[langchain]" # LangChain tool adapters
93
- pip install "bits-bie[notebook]" # smoother Jupyter/Colab support (nest_asyncio)
126
+ pip install "bits-bie[notebook]" # smoother async behaviour in Jupyter/Colab
94
127
  pip install "bits-bie[all]" # everything
95
128
  ```
96
129
 
97
- > **Using BIE in Jupyter / Google Colab?** All sync entry points
98
- > (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
99
- > work inside notebooks out of the box — BIE detects the notebook's
100
- > already-running event loop and handles it automatically. Installing
101
- > `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
102
- > efficient, but is not required.
103
-
104
130
  > BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
105
131
  > proprietary async crawling & extraction framework, which is installed
106
132
  > automatically.
@@ -120,6 +146,20 @@ for r in results:
120
146
  print(r.snippet)
121
147
  ```
122
148
 
149
+ For the full, Tavily/ChatGPT-Search-shaped response — extractive
150
+ `answer`, timing, and `degraded`/`diagnostics` for when live discovery
151
+ doesn't fully succeed:
152
+
153
+ ```python
154
+ response = bie.websearch_response("who won the latest F1 race")
155
+
156
+ print(response.answer) # best-matching passage (not LLM-written)
157
+ print(response.to_context()) # numbered sources block, ready for a prompt
158
+
159
+ if response.degraded:
160
+ print("live data degraded:", response.diagnostics)
161
+ ```
162
+
123
163
  `websearch` pipeline:
124
164
 
125
165
  1. **Discovery** — free, public, no-key search endpoints (DuckDuckGo,
@@ -351,44 +391,62 @@ engine = BIE(BIESettings(
351
391
  | `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
352
392
  | `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
353
393
  | `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
394
+ | `discovery_backends` | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Ordered, comma-separated discovery backends for `websearch()`. Add `searxng` for a self-hosted instance. |
395
+ | `searxng_url` | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted SearXNG instance, used by the `searxng` discovery backend |
354
396
  | `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
355
- | — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
356
- | — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
357
397
 
358
- ### Discovery backends & troubleshooting empty `websearch()` results
398
+ ---
359
399
 
360
- `websearch()` discovers candidate URLs by scraping public search-engine
361
- result pages (DuckDuckGo HTML, DuckDuckGo Lite, Bing HTML, in that order
362
- by default). This is inherently fragile — these are not official APIs,
363
- and shared/cloud IPs (CI runners, some notebook hosts, restrictive
364
- sandboxes) can be rate-limited or blocked entirely.
400
+ ## Troubleshooting
401
+
402
+ **`TypeError: '<' not supported between instances of 'Request' and 'Request'`**
403
+ during a crawl — this was a Bitscrape scheduler bug (its priority queue
404
+ compared `Request` objects directly when two requests shared the same
405
+ priority). BIE patches `bitscrape.Request` to be orderable at import
406
+ time, so this no longer occurs. If you still see it, you're likely on an
407
+ older `bits-bie` version — upgrade.
408
+
409
+ **`RuntimeError: asyncio.run() cannot be called from a running event
410
+ loop`** — Jupyter/Colab/IPython already run an event loop, which used to
411
+ break `engine.crawl(urls)` / `bie.websearch(...)`. Both now detect a
412
+ running loop automatically and either use
413
+ [`nest_asyncio`](https://pypi.org/project/nest_asyncio/) (install via
414
+ `pip install "bits-bie[notebook]"`) or fall back to running the crawl on
415
+ a background thread — no code changes needed. If you're already inside
416
+ an `async def`, you can also call `await engine.acrawl(urls)` directly.
417
+
418
+ **`bie.websearch(...)` returns `[]` / all discovery backends fail** —
419
+ discovery scrapes DuckDuckGo/Bing's public HTML result pages, which can
420
+ be blocked or rate-limited. Call
421
+ `bie.discovery.get_last_discovery_diagnostics()` right after to see why:
365
422
 
366
- If `websearch()` returns `[]`, BIE logs a `WARNING` that distinguishes
367
- two failure categories:
423
+ ```python
424
+ import bie
425
+ from bie.discovery import get_last_discovery_diagnostics
368
426
 
369
- - **"network blocked"** — every backend failed at the connection level
370
- (timeouts, connection refused, or a sandbox/proxy denial). This means
371
- the environment itself can't reach these hosts — re-run in an
372
- environment with normal internet access (a local machine, server, or
373
- Colab) rather than a locked-down sandbox.
374
- - **"reachable but no results"** — connections succeeded but responses
375
- were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
376
- This means the IP is likely being rate-limited; try again later, reduce
377
- request frequency, or switch to a self-hosted backend (below).
427
+ results = bie.websearch("...")
428
+ if not results:
429
+ print(get_last_discovery_diagnostics().summary())
430
+ ```
378
431
 
379
- For a durable fix to rate-limiting, run a self-hosted
380
- [SearXNG](https://docs.searxng.org/) instance and point BIE at it:
432
+ This distinguishes three cases:
381
433
 
382
- ```bash
383
- export BIE_DISCOVERY_BACKENDS=searxng
384
- export BIE_SEARXNG_URL=http://localhost:8080
385
- ```
434
+ - **Network blocked** — every backend failed at the connection level
435
+ (or an egress proxy returned `x-deny-reason: host_not_allowed`). This
436
+ environment can't reach these hosts at all — check its outbound
437
+ network/proxy/firewall config. Common in sandboxed code-execution
438
+ environments; Colab and most servers have unrestricted outbound access.
439
+ - **Blocked / rate-limited** — backends responded with `403`/`429`/etc.,
440
+ typically from bot-detection on a shared IP. Retry later, reduce
441
+ request volume, or configure a `searxng` backend (below).
442
+ - **Empty response** — got `200 OK` but no parseable results (often a
443
+ CAPTCHA/consent page).
386
444
 
387
- You can also combine backends and reorder them, e.g. to prefer your
388
- SearXNG instance but fall back to DuckDuckGo:
445
+ For the most reliable no-API-key discovery, self-host
446
+ [SearXNG](https://github.com/searxng/searxng) and add it as a backend:
389
447
 
390
448
  ```bash
391
- export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
449
+ export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html
392
450
  export BIE_SEARXNG_URL=http://localhost:8080
393
451
  ```
394
452
 
@@ -427,10 +485,10 @@ for Elasticsearch/Milvus-backed implementations behind the same
427
485
 
428
486
  ---
429
487
 
430
- ## Built on Bitscrape
488
+ ## Built on BitS
431
489
 
432
490
  BIE's crawling and extraction layer is powered by
433
- [**Bitscrape**](https://github.com/Sudharsansm/Bitscrape)
491
+ [**BitS**](https://github.com/Sudharsansm/Bitscrape)
434
492
  (`pip install bitscrape`), our async, robots.txt-aware web scraping
435
493
  framework — giving BIE high-performance, polite crawling out of the box.
436
494
 
@@ -10,6 +10,10 @@ Core primitives
10
10
  ----------------
11
11
 
12
12
  - :func:`websearch` — search the live internet for a query (no URLs needed)
13
+ - :func:`websearch_response` — like ``websearch``, but returns a full
14
+ :class:`SearchResponse` (extractive ``answer``, ``took_ms``,
15
+ ``degraded``/``diagnostics``, and ``.to_context()`` for LLM prompts) —
16
+ the Tavily/ChatGPT-Search-style "web search tool" shape
13
17
  - :func:`search` — crawl + rank specific URLs against a query
14
18
  - :func:`extract` — get clean Markdown from a single URL
15
19
  - :func:`map_site` — discover a site's sitemap before crawling
@@ -29,6 +33,11 @@ Quick start
29
33
  print(r.title, r.url)
30
34
  print(r.snippet)
31
35
 
36
+ # Or get the full response — extractive answer + LLM-ready context
37
+ response = bie.websearch_response("who won the latest F1 race")
38
+ print(response.answer) # best-matching passage (not LLM-written)
39
+ print(response.to_context()) # numbered sources block, ready for a prompt
40
+
32
41
  # Get clean markdown from a specific page
33
42
  page = bie.extract("https://example.com/article")
34
43
  print(page.markdown)
@@ -59,8 +68,8 @@ from importlib import metadata as _metadata
59
68
  from bie.config import BIESettings
60
69
  from bie.engine import BIE
61
70
  from bie.extract import ExtractError, ExtractResult, extract
62
- from bie.models import Document, SearchResult
63
- from bie.quicksearch import search, websearch
71
+ from bie.models import Document, SearchResponse, SearchResult
72
+ from bie.quicksearch import search, websearch, websearch_response
64
73
  from bie.security import SecurityFinding, SecurityReport, scan_for_prompt_injection
65
74
  from bie.sitecrawl import crawl_site
66
75
  from bie.sitemap import SiteMap, map_site
@@ -70,15 +79,17 @@ try:
70
79
  __version__ = _metadata.version("bits-bie")
71
80
  except _metadata.PackageNotFoundError:
72
81
  # Editable/source checkout without installed metadata.
73
- __version__ = "1.2.2"
82
+ __version__ = "1.2.5"
74
83
 
75
84
  __all__ = [
76
85
  "BIE",
77
86
  "BIESettings",
78
87
  "Document",
79
88
  "SearchResult",
89
+ "SearchResponse",
80
90
  "search",
81
91
  "websearch",
92
+ "websearch_response",
82
93
  "extract",
83
94
  "ExtractResult",
84
95
  "ExtractError",
@@ -0,0 +1,84 @@
1
+ """
2
+ Internal helper for calling async BIE internals from synchronous code.
3
+
4
+ Plain scripts have no running event loop, so ``asyncio.run()`` works fine.
5
+ Jupyter/Colab/IPython kernels, however, *already* run an event loop, and
6
+ ``asyncio.run()`` raises::
7
+
8
+ RuntimeError: asyncio.run() cannot be called from a running event loop
9
+
10
+ :func:`run_sync` detects this and transparently falls back to:
11
+
12
+ 1. ``nest_asyncio`` (if installed) — patches the running loop so it can be
13
+ re-entered, then runs the coroutine on it directly.
14
+ 2. A dedicated background thread with its own fresh event loop — works
15
+ everywhere, with zero extra dependencies, at the cost of a thread
16
+ spin-up per call.
17
+
18
+ This means the same sync call (e.g. ``engine.crawl(urls)``) works
19
+ unchanged in plain scripts, notebooks, and servers.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import asyncio
25
+ import threading
26
+ from typing import Any, Coroutine, TypeVar
27
+
28
+ T = TypeVar("T")
29
+
30
+
31
+ def run_sync(coro: Coroutine[Any, Any, T]) -> T:
32
+ """Run ``coro`` to completion and return its result, regardless of
33
+ whether the calling thread already has an asyncio event loop running.
34
+
35
+ Args:
36
+ coro: An awaitable coroutine object (not yet awaited/started).
37
+
38
+ Returns:
39
+ The coroutine's return value.
40
+
41
+ Raises:
42
+ Whatever exception the coroutine itself raises.
43
+ """
44
+ try:
45
+ asyncio.get_running_loop()
46
+ except RuntimeError:
47
+ # No loop running in this thread — the normal case for scripts,
48
+ # CLI commands, and server request handlers.
49
+ return asyncio.run(coro)
50
+
51
+ # A loop is already running in this thread (e.g. Jupyter/Colab/IPython,
52
+ # or an async framework that called into sync BIE code).
53
+ try:
54
+ import nest_asyncio # type: ignore[import-not-found]
55
+ except ImportError:
56
+ return _run_in_new_thread(coro)
57
+
58
+ nest_asyncio.apply()
59
+ loop = asyncio.get_event_loop()
60
+ return loop.run_until_complete(coro)
61
+
62
+
63
+ def _run_in_new_thread(coro: Coroutine[Any, Any, T]) -> T:
64
+ """Run ``coro`` to completion on a fresh event loop in a new thread.
65
+
66
+ Used as the dependency-free fallback when a loop is already running in
67
+ the calling thread and ``nest_asyncio`` isn't installed.
68
+ """
69
+ result: dict[str, Any] = {}
70
+ error: dict[str, BaseException] = {}
71
+
72
+ def _runner() -> None:
73
+ try:
74
+ result["value"] = asyncio.run(coro)
75
+ except BaseException as exc: # noqa: BLE001 - re-raised on the caller's thread
76
+ error["value"] = exc
77
+
78
+ thread = threading.Thread(target=_runner, name="bie-async-runner", daemon=True)
79
+ thread.start()
80
+ thread.join()
81
+
82
+ if "value" in error:
83
+ raise error["value"]
84
+ return result["value"] # type: ignore[return-value]
@@ -70,6 +70,12 @@ def search(query: str, urls: tuple[str, ...], top_k: int, max_pages: int, no_emb
70
70
  @click.option("--no-deep", is_flag=True, help="Skip crawling; return raw discovery order without snippets")
71
71
  @click.option("--no-embeddings", is_flag=True, help="Disable semantic/vector re-ranking (BM25 only)")
72
72
  @click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
73
+ @click.option(
74
+ "--context",
75
+ "as_context",
76
+ is_flag=True,
77
+ help="Output a numbered, citation-ready text block for an LLM prompt (response.to_context())",
78
+ )
73
79
  def search_live(
74
80
  query: str,
75
81
  top_k: int,
@@ -77,16 +83,18 @@ def search_live(
77
83
  no_deep: bool,
78
84
  no_embeddings: bool,
79
85
  as_json: bool,
86
+ as_context: bool,
80
87
  ) -> None:
81
88
  """Search the live internet for QUERY — no seed URLs, no API key, no subscription.
82
89
 
83
90
  Discovers relevant URLs via free public search endpoints (DuckDuckGo,
84
- with a Bing fallback), crawls them with Bitscrape, and ranks the
91
+ Bing, and optionally a self-hosted SearXNG instance — see
92
+ BIE_DISCOVERY_BACKENDS), crawls them with Bitscrape, and ranks the
85
93
  extracted content against QUERY using BIE's hybrid BM25+vector index.
86
94
  """
87
95
  import bie
88
96
 
89
- results = bie.websearch(
97
+ response = bie.websearch_response(
90
98
  query,
91
99
  top_k=top_k,
92
100
  discovery_results=discovery_results,
@@ -95,23 +103,40 @@ def search_live(
95
103
  )
96
104
 
97
105
  if as_json:
98
- click.echo(json.dumps([r.model_dump() for r in results], indent=2))
106
+ click.echo(response.model_dump_json(indent=2))
107
+ return
108
+
109
+ if as_context:
110
+ click.echo(response.to_context())
99
111
  return
100
112
 
101
- if not results:
113
+ if not response.results:
102
114
  click.echo(
103
115
  "No results found. The free search backends may be temporarily "
104
- "rate-limiting — try again in a moment."
116
+ "rate-limiting — try again in a moment.\n"
105
117
  )
118
+ if response.diagnostics:
119
+ click.echo(f"Diagnosis: {response.diagnostics}")
106
120
  return
107
121
 
108
- for i, r in enumerate(results, 1):
122
+ if response.answer:
123
+ click.echo(f"Answer: {response.answer}\n")
124
+
125
+ if response.degraded:
126
+ click.echo("⚠ Live discovery/crawling was degraded for this query.")
127
+ if response.diagnostics:
128
+ click.echo(f" {response.diagnostics}")
129
+ click.echo()
130
+
131
+ for i, r in enumerate(response.results, 1):
109
132
  click.echo(f"\n{i}. {r.title}")
110
133
  click.echo(f" {r.url}")
111
134
  click.echo(f" score={r.score:.4f}")
112
135
  if r.snippet:
113
136
  click.echo(f" {r.snippet}")
114
137
 
138
+ click.echo(f"\n({response.took_ms:.0f}ms)")
139
+
115
140
 
116
141
  @cli.command()
117
142
  @click.argument("urls", nargs=-1, required=True)
@@ -39,6 +39,25 @@ class BIESettings(BaseSettings):
39
39
  index_dir: str = Field(".bie_index", description="Directory for persisted index")
40
40
  persist: bool = Field(False, description="Persist index to disk between runs")
41
41
 
42
+ # --- Discovery (no-API-key web search) ----------------------------------
43
+ discovery_backends: str = Field(
44
+ "ddg_html,ddg_lite,bing_html",
45
+ description="Comma-separated, ordered list of discovery backends to "
46
+ "try for bie.websearch()/discover_urls(). Built-in backends: "
47
+ "'ddg_html', 'ddg_lite', 'bing_html', 'searxng'. The 'searxng' "
48
+ "backend requires `searxng_url` to also be set. Unknown names are "
49
+ "skipped with a warning. Override with the BIE_DISCOVERY_BACKENDS "
50
+ "env var, e.g. BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html",
51
+ )
52
+ searxng_url: str | None = Field(
53
+ default=None,
54
+ description="Base URL of a self-hosted SearXNG instance (e.g. "
55
+ "'http://localhost:8080'), used by the 'searxng' discovery backend. "
56
+ "Self-hosting SearXNG is the most reliable no-API-key discovery "
57
+ "option since it isn't subject to the rate limits / layout changes "
58
+ "that affect scraping DDG/Bing HTML directly.",
59
+ )
60
+
42
61
  # --- Server --------------------------------------------------------------
43
62
  host: str = "0.0.0.0"
44
63
  port: int = 8000