bits-bie 1.2.2__tar.gz → 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bits_bie-1.2.2 → bits_bie-1.2.4}/.github/workflows/ci.yml +1 -1
- {bits_bie-1.2.2 → bits_bie-1.2.4}/PKG-INFO +50 -39
- {bits_bie-1.2.2 → bits_bie-1.2.4}/README.md +49 -38
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/__init__.py +1 -1
- bits_bie-1.2.4/bie/_asyncutil.py +84 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/config.py +19 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/crawler.py +32 -24
- bits_bie-1.2.4/bie/discovery.py +567 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/engine.py +25 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/extract.py +2 -2
- {bits_bie-1.2.2 → bits_bie-1.2.4}/pyproject.toml +1 -1
- bits_bie-1.2.4/tests/test_asyncutil.py +110 -0
- bits_bie-1.2.4/tests/test_crawler.py +134 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_discovery.py +2 -2
- bits_bie-1.2.4/tests/test_discovery_errors.py +301 -0
- bits_bie-1.2.2/bie/discovery.py +0 -467
- {bits_bie-1.2.2 → bits_bie-1.2.4}/.github/workflows/publish.yml +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/.gitignore +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/LICENSE +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/_async_utils.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/chunker.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/cli.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/index.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/integrations/__init__.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/integrations/langchain.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/mcp/__init__.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/mcp/server.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/models.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/query_expansion.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/quicksearch.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/security.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/server.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/sitecrawl.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/sitemap.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/spiders/__init__.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/spiders/generic.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/docs/API.md +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/basic_search.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/extract_page.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/map_and_crawl.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/reusable_index.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/web_search.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_async_utils.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_chunker.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_crawler_notebook_safe.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_crawler_request_patch.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_discovery_error_handling.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_engine.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_extract.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_index.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_query_expansion.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_quicksearch.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_security.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_sitecrawl.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_sitemap.py +0 -0
- {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_spider_relevance.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bits-bie
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.4
|
|
4
4
|
Summary: BitSearch Intelligence Engine — real-time, citation-backed web search & extraction for AI apps. Built on Bitscrape.
|
|
5
5
|
Project-URL: Homepage, https://github.com/Sudharsansm/BIE
|
|
6
6
|
Project-URL: Repository, https://github.com/Sudharsansm/BIE
|
|
@@ -66,7 +66,7 @@ API keys, no subscriptions, no third-party search services.**
|
|
|
66
66
|
|
|
67
67
|
BIE gives any LLM, RAG pipeline, or AI agent five core primitives —
|
|
68
68
|
**search, extract, map, crawl, and a hybrid index** — all running locally
|
|
69
|
-
on top of [**
|
|
69
|
+
on top of [**BitS**](https://pypi.org/project/bitscrape/), our
|
|
70
70
|
async crawling framework. Use it as a Python library, REST API, CLI, or
|
|
71
71
|
[MCP](https://modelcontextprotocol.io) server.
|
|
72
72
|
|
|
@@ -146,17 +146,10 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
|
|
|
146
146
|
pip install "bits-bie[mcp]" # Model Context Protocol server
|
|
147
147
|
pip install "bits-bie[render]" # JS rendering for extract() via Playwright
|
|
148
148
|
pip install "bits-bie[langchain]" # LangChain tool adapters
|
|
149
|
-
pip install "bits-bie[notebook]" # smoother Jupyter/Colab
|
|
149
|
+
pip install "bits-bie[notebook]" # smoother async behaviour in Jupyter/Colab
|
|
150
150
|
pip install "bits-bie[all]" # everything
|
|
151
151
|
```
|
|
152
152
|
|
|
153
|
-
> **Using BIE in Jupyter / Google Colab?** All sync entry points
|
|
154
|
-
> (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
|
|
155
|
-
> work inside notebooks out of the box — BIE detects the notebook's
|
|
156
|
-
> already-running event loop and handles it automatically. Installing
|
|
157
|
-
> `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
|
|
158
|
-
> efficient, but is not required.
|
|
159
|
-
|
|
160
153
|
> BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
|
|
161
154
|
> proprietary async crawling & extraction framework, which is installed
|
|
162
155
|
> automatically.
|
|
@@ -407,44 +400,62 @@ engine = BIE(BIESettings(
|
|
|
407
400
|
| `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
|
|
408
401
|
| `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
|
|
409
402
|
| `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
|
|
403
|
+
| `discovery_backends` | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Ordered, comma-separated discovery backends for `websearch()`. Add `searxng` for a self-hosted instance. |
|
|
404
|
+
| `searxng_url` | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted SearXNG instance, used by the `searxng` discovery backend |
|
|
410
405
|
| `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
|
|
411
|
-
| — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
|
|
412
|
-
| — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
|
|
413
406
|
|
|
414
|
-
|
|
407
|
+
---
|
|
415
408
|
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
409
|
+
## Troubleshooting
|
|
410
|
+
|
|
411
|
+
**`TypeError: '<' not supported between instances of 'Request' and 'Request'`**
|
|
412
|
+
during a crawl — this was a Bitscrape scheduler bug (its priority queue
|
|
413
|
+
compared `Request` objects directly when two requests shared the same
|
|
414
|
+
priority). BIE patches `bitscrape.Request` to be orderable at import
|
|
415
|
+
time, so this no longer occurs. If you still see it, you're likely on an
|
|
416
|
+
older `bits-bie` version — upgrade.
|
|
417
|
+
|
|
418
|
+
**`RuntimeError: asyncio.run() cannot be called from a running event
|
|
419
|
+
loop`** — Jupyter/Colab/IPython already run an event loop, which used to
|
|
420
|
+
break `engine.crawl(urls)` / `bie.websearch(...)`. Both now detect a
|
|
421
|
+
running loop automatically and either use
|
|
422
|
+
[`nest_asyncio`](https://pypi.org/project/nest_asyncio/) (install via
|
|
423
|
+
`pip install "bits-bie[notebook]"`) or fall back to running the crawl on
|
|
424
|
+
a background thread — no code changes needed. If you're already inside
|
|
425
|
+
an `async def`, you can also call `await engine.acrawl(urls)` directly.
|
|
426
|
+
|
|
427
|
+
**`bie.websearch(...)` returns `[]` / all discovery backends fail** —
|
|
428
|
+
discovery scrapes DuckDuckGo/Bing's public HTML result pages, which can
|
|
429
|
+
be blocked or rate-limited. Call
|
|
430
|
+
`bie.discovery.get_last_discovery_diagnostics()` right after to see why:
|
|
421
431
|
|
|
422
|
-
|
|
423
|
-
|
|
432
|
+
```python
|
|
433
|
+
import bie
|
|
434
|
+
from bie.discovery import get_last_discovery_diagnostics
|
|
424
435
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
Colab) rather than a locked-down sandbox.
|
|
430
|
-
- **"reachable but no results"** — connections succeeded but responses
|
|
431
|
-
were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
|
|
432
|
-
This means the IP is likely being rate-limited; try again later, reduce
|
|
433
|
-
request frequency, or switch to a self-hosted backend (below).
|
|
436
|
+
results = bie.websearch("...")
|
|
437
|
+
if not results:
|
|
438
|
+
print(get_last_discovery_diagnostics().summary())
|
|
439
|
+
```
|
|
434
440
|
|
|
435
|
-
|
|
436
|
-
[SearXNG](https://docs.searxng.org/) instance and point BIE at it:
|
|
441
|
+
This distinguishes three cases:
|
|
437
442
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
443
|
+
- **Network blocked** — every backend failed at the connection level
|
|
444
|
+
(or an egress proxy returned `x-deny-reason: host_not_allowed`). This
|
|
445
|
+
environment can't reach these hosts at all — check its outbound
|
|
446
|
+
network/proxy/firewall config. Common in sandboxed code-execution
|
|
447
|
+
environments; Colab and most servers have unrestricted outbound access.
|
|
448
|
+
- **Blocked / rate-limited** — backends responded with `403`/`429`/etc.,
|
|
449
|
+
typically from bot-detection on a shared IP. Retry later, reduce
|
|
450
|
+
request volume, or configure a `searxng` backend (below).
|
|
451
|
+
- **Empty response** — got `200 OK` but no parseable results (often a
|
|
452
|
+
CAPTCHA/consent page).
|
|
442
453
|
|
|
443
|
-
|
|
444
|
-
SearXNG
|
|
454
|
+
For the most reliable no-API-key discovery, self-host
|
|
455
|
+
[SearXNG](https://github.com/searxng/searxng) and add it as a backend:
|
|
445
456
|
|
|
446
457
|
```bash
|
|
447
|
-
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
|
|
458
|
+
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html
|
|
448
459
|
export BIE_SEARXNG_URL=http://localhost:8080
|
|
449
460
|
```
|
|
450
461
|
|
|
@@ -486,7 +497,7 @@ for Elasticsearch/Milvus-backed implementations behind the same
|
|
|
486
497
|
## Built on Bitscrape
|
|
487
498
|
|
|
488
499
|
BIE's crawling and extraction layer is powered by
|
|
489
|
-
[**
|
|
500
|
+
[**BitS**](https://github.com/Sudharsansm/Bitscrape)
|
|
490
501
|
(`pip install bitscrape`), our async, robots.txt-aware web scraping
|
|
491
502
|
framework — giving BIE high-performance, polite crawling out of the box.
|
|
492
503
|
|
|
@@ -10,7 +10,7 @@ API keys, no subscriptions, no third-party search services.**
|
|
|
10
10
|
|
|
11
11
|
BIE gives any LLM, RAG pipeline, or AI agent five core primitives —
|
|
12
12
|
**search, extract, map, crawl, and a hybrid index** — all running locally
|
|
13
|
-
on top of [**
|
|
13
|
+
on top of [**BitS**](https://pypi.org/project/bitscrape/), our
|
|
14
14
|
async crawling framework. Use it as a Python library, REST API, CLI, or
|
|
15
15
|
[MCP](https://modelcontextprotocol.io) server.
|
|
16
16
|
|
|
@@ -90,17 +90,10 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
|
|
|
90
90
|
pip install "bits-bie[mcp]" # Model Context Protocol server
|
|
91
91
|
pip install "bits-bie[render]" # JS rendering for extract() via Playwright
|
|
92
92
|
pip install "bits-bie[langchain]" # LangChain tool adapters
|
|
93
|
-
pip install "bits-bie[notebook]" # smoother Jupyter/Colab
|
|
93
|
+
pip install "bits-bie[notebook]" # smoother async behaviour in Jupyter/Colab
|
|
94
94
|
pip install "bits-bie[all]" # everything
|
|
95
95
|
```
|
|
96
96
|
|
|
97
|
-
> **Using BIE in Jupyter / Google Colab?** All sync entry points
|
|
98
|
-
> (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
|
|
99
|
-
> work inside notebooks out of the box — BIE detects the notebook's
|
|
100
|
-
> already-running event loop and handles it automatically. Installing
|
|
101
|
-
> `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
|
|
102
|
-
> efficient, but is not required.
|
|
103
|
-
|
|
104
97
|
> BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
|
|
105
98
|
> proprietary async crawling & extraction framework, which is installed
|
|
106
99
|
> automatically.
|
|
@@ -351,44 +344,62 @@ engine = BIE(BIESettings(
|
|
|
351
344
|
| `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
|
|
352
345
|
| `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
|
|
353
346
|
| `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
|
|
347
|
+
| `discovery_backends` | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Ordered, comma-separated discovery backends for `websearch()`. Add `searxng` for a self-hosted instance. |
|
|
348
|
+
| `searxng_url` | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted SearXNG instance, used by the `searxng` discovery backend |
|
|
354
349
|
| `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
|
|
355
|
-
| — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
|
|
356
|
-
| — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
|
|
357
350
|
|
|
358
|
-
|
|
351
|
+
---
|
|
359
352
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
353
|
+
## Troubleshooting
|
|
354
|
+
|
|
355
|
+
**`TypeError: '<' not supported between instances of 'Request' and 'Request'`**
|
|
356
|
+
during a crawl — this was a Bitscrape scheduler bug (its priority queue
|
|
357
|
+
compared `Request` objects directly when two requests shared the same
|
|
358
|
+
priority). BIE patches `bitscrape.Request` to be orderable at import
|
|
359
|
+
time, so this no longer occurs. If you still see it, you're likely on an
|
|
360
|
+
older `bits-bie` version — upgrade.
|
|
361
|
+
|
|
362
|
+
**`RuntimeError: asyncio.run() cannot be called from a running event
|
|
363
|
+
loop`** — Jupyter/Colab/IPython already run an event loop, which used to
|
|
364
|
+
break `engine.crawl(urls)` / `bie.websearch(...)`. Both now detect a
|
|
365
|
+
running loop automatically and either use
|
|
366
|
+
[`nest_asyncio`](https://pypi.org/project/nest_asyncio/) (install via
|
|
367
|
+
`pip install "bits-bie[notebook]"`) or fall back to running the crawl on
|
|
368
|
+
a background thread — no code changes needed. If you're already inside
|
|
369
|
+
an `async def`, you can also call `await engine.acrawl(urls)` directly.
|
|
370
|
+
|
|
371
|
+
**`bie.websearch(...)` returns `[]` / all discovery backends fail** —
|
|
372
|
+
discovery scrapes DuckDuckGo/Bing's public HTML result pages, which can
|
|
373
|
+
be blocked or rate-limited. Call
|
|
374
|
+
`bie.discovery.get_last_discovery_diagnostics()` right after to see why:
|
|
365
375
|
|
|
366
|
-
|
|
367
|
-
|
|
376
|
+
```python
|
|
377
|
+
import bie
|
|
378
|
+
from bie.discovery import get_last_discovery_diagnostics
|
|
368
379
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
Colab) rather than a locked-down sandbox.
|
|
374
|
-
- **"reachable but no results"** — connections succeeded but responses
|
|
375
|
-
were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
|
|
376
|
-
This means the IP is likely being rate-limited; try again later, reduce
|
|
377
|
-
request frequency, or switch to a self-hosted backend (below).
|
|
380
|
+
results = bie.websearch("...")
|
|
381
|
+
if not results:
|
|
382
|
+
print(get_last_discovery_diagnostics().summary())
|
|
383
|
+
```
|
|
378
384
|
|
|
379
|
-
|
|
380
|
-
[SearXNG](https://docs.searxng.org/) instance and point BIE at it:
|
|
385
|
+
This distinguishes three cases:
|
|
381
386
|
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
387
|
+
- **Network blocked** — every backend failed at the connection level
|
|
388
|
+
(or an egress proxy returned `x-deny-reason: host_not_allowed`). This
|
|
389
|
+
environment can't reach these hosts at all — check its outbound
|
|
390
|
+
network/proxy/firewall config. Common in sandboxed code-execution
|
|
391
|
+
environments; Colab and most servers have unrestricted outbound access.
|
|
392
|
+
- **Blocked / rate-limited** — backends responded with `403`/`429`/etc.,
|
|
393
|
+
typically from bot-detection on a shared IP. Retry later, reduce
|
|
394
|
+
request volume, or configure a `searxng` backend (below).
|
|
395
|
+
- **Empty response** — got `200 OK` but no parseable results (often a
|
|
396
|
+
CAPTCHA/consent page).
|
|
386
397
|
|
|
387
|
-
|
|
388
|
-
SearXNG
|
|
398
|
+
For the most reliable no-API-key discovery, self-host
|
|
399
|
+
[SearXNG](https://github.com/searxng/searxng) and add it as a backend:
|
|
389
400
|
|
|
390
401
|
```bash
|
|
391
|
-
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
|
|
402
|
+
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html
|
|
392
403
|
export BIE_SEARXNG_URL=http://localhost:8080
|
|
393
404
|
```
|
|
394
405
|
|
|
@@ -430,7 +441,7 @@ for Elasticsearch/Milvus-backed implementations behind the same
|
|
|
430
441
|
## Built on Bitscrape
|
|
431
442
|
|
|
432
443
|
BIE's crawling and extraction layer is powered by
|
|
433
|
-
[**
|
|
444
|
+
[**BitS**](https://github.com/Sudharsansm/Bitscrape)
|
|
434
445
|
(`pip install bitscrape`), our async, robots.txt-aware web scraping
|
|
435
446
|
framework — giving BIE high-performance, polite crawling out of the box.
|
|
436
447
|
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Internal helper for calling async BIE internals from synchronous code.
|
|
3
|
+
|
|
4
|
+
Plain scripts have no running event loop, so ``asyncio.run()`` works fine.
|
|
5
|
+
Jupyter/Colab/IPython kernels, however, *already* run an event loop, and
|
|
6
|
+
``asyncio.run()`` raises::
|
|
7
|
+
|
|
8
|
+
RuntimeError: asyncio.run() cannot be called from a running event loop
|
|
9
|
+
|
|
10
|
+
:func:`run_sync` detects this and transparently falls back to:
|
|
11
|
+
|
|
12
|
+
1. ``nest_asyncio`` (if installed) — patches the running loop so it can be
|
|
13
|
+
re-entered, then runs the coroutine on it directly.
|
|
14
|
+
2. A dedicated background thread with its own fresh event loop — works
|
|
15
|
+
everywhere, with zero extra dependencies, at the cost of a thread
|
|
16
|
+
spin-up per call.
|
|
17
|
+
|
|
18
|
+
This means the same sync call (e.g. ``engine.crawl(urls)``) works
|
|
19
|
+
unchanged in plain scripts, notebooks, and servers.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import asyncio
|
|
25
|
+
import threading
|
|
26
|
+
from typing import Any, Coroutine, TypeVar
|
|
27
|
+
|
|
28
|
+
T = TypeVar("T")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def run_sync(coro: Coroutine[Any, Any, T]) -> T:
|
|
32
|
+
"""Run ``coro`` to completion and return its result, regardless of
|
|
33
|
+
whether the calling thread already has an asyncio event loop running.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
coro: An awaitable coroutine object (not yet awaited/started).
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
The coroutine's return value.
|
|
40
|
+
|
|
41
|
+
Raises:
|
|
42
|
+
Whatever exception the coroutine itself raises.
|
|
43
|
+
"""
|
|
44
|
+
try:
|
|
45
|
+
asyncio.get_running_loop()
|
|
46
|
+
except RuntimeError:
|
|
47
|
+
# No loop running in this thread — the normal case for scripts,
|
|
48
|
+
# CLI commands, and server request handlers.
|
|
49
|
+
return asyncio.run(coro)
|
|
50
|
+
|
|
51
|
+
# A loop is already running in this thread (e.g. Jupyter/Colab/IPython,
|
|
52
|
+
# or an async framework that called into sync BIE code).
|
|
53
|
+
try:
|
|
54
|
+
import nest_asyncio # type: ignore[import-not-found]
|
|
55
|
+
except ImportError:
|
|
56
|
+
return _run_in_new_thread(coro)
|
|
57
|
+
|
|
58
|
+
nest_asyncio.apply()
|
|
59
|
+
loop = asyncio.get_event_loop()
|
|
60
|
+
return loop.run_until_complete(coro)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _run_in_new_thread(coro: Coroutine[Any, Any, T]) -> T:
|
|
64
|
+
"""Run ``coro`` to completion on a fresh event loop in a new thread.
|
|
65
|
+
|
|
66
|
+
Used as the dependency-free fallback when a loop is already running in
|
|
67
|
+
the calling thread and ``nest_asyncio`` isn't installed.
|
|
68
|
+
"""
|
|
69
|
+
result: dict[str, Any] = {}
|
|
70
|
+
error: dict[str, BaseException] = {}
|
|
71
|
+
|
|
72
|
+
def _runner() -> None:
|
|
73
|
+
try:
|
|
74
|
+
result["value"] = asyncio.run(coro)
|
|
75
|
+
except BaseException as exc: # noqa: BLE001 - re-raised on the caller's thread
|
|
76
|
+
error["value"] = exc
|
|
77
|
+
|
|
78
|
+
thread = threading.Thread(target=_runner, name="bie-async-runner", daemon=True)
|
|
79
|
+
thread.start()
|
|
80
|
+
thread.join()
|
|
81
|
+
|
|
82
|
+
if "value" in error:
|
|
83
|
+
raise error["value"]
|
|
84
|
+
return result["value"] # type: ignore[return-value]
|
|
@@ -39,6 +39,25 @@ class BIESettings(BaseSettings):
|
|
|
39
39
|
index_dir: str = Field(".bie_index", description="Directory for persisted index")
|
|
40
40
|
persist: bool = Field(False, description="Persist index to disk between runs")
|
|
41
41
|
|
|
42
|
+
# --- Discovery (no-API-key web search) ----------------------------------
|
|
43
|
+
discovery_backends: str = Field(
|
|
44
|
+
"ddg_html,ddg_lite,bing_html",
|
|
45
|
+
description="Comma-separated, ordered list of discovery backends to "
|
|
46
|
+
"try for bie.websearch()/discover_urls(). Built-in backends: "
|
|
47
|
+
"'ddg_html', 'ddg_lite', 'bing_html', 'searxng'. The 'searxng' "
|
|
48
|
+
"backend requires `searxng_url` to also be set. Unknown names are "
|
|
49
|
+
"skipped with a warning. Override with the BIE_DISCOVERY_BACKENDS "
|
|
50
|
+
"env var, e.g. BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html",
|
|
51
|
+
)
|
|
52
|
+
searxng_url: str | None = Field(
|
|
53
|
+
default=None,
|
|
54
|
+
description="Base URL of a self-hosted SearXNG instance (e.g. "
|
|
55
|
+
"'http://localhost:8080'), used by the 'searxng' discovery backend. "
|
|
56
|
+
"Self-hosting SearXNG is the most reliable no-API-key discovery "
|
|
57
|
+
"option since it isn't subject to the rate limits / layout changes "
|
|
58
|
+
"that affect scraping DDG/Bing HTML directly.",
|
|
59
|
+
)
|
|
60
|
+
|
|
42
61
|
# --- Server --------------------------------------------------------------
|
|
43
62
|
host: str = "0.0.0.0"
|
|
44
63
|
port: int = 8000
|
|
@@ -16,7 +16,7 @@ from urllib.parse import urlparse
|
|
|
16
16
|
import bitscrape
|
|
17
17
|
from bitscrape.pipeline.pipelines import BasePipeline
|
|
18
18
|
|
|
19
|
-
from bie.
|
|
19
|
+
from bie._asyncutil import run_sync
|
|
20
20
|
from bie.config import BIESettings
|
|
21
21
|
from bie.models import Document
|
|
22
22
|
from bie.spiders.generic import BIESpider
|
|
@@ -25,33 +25,39 @@ logger = logging.getLogger("bie.crawler")
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def _patch_request_ordering() -> None:
|
|
28
|
-
"""
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
same priority, ``heapq`` falls back to comparing the ``Request``
|
|
34
|
-
objects directly with ``<`` — but ``Request`` (a pydantic
|
|
35
|
-
``BaseModel``) doesn't define ``__lt__``, so this raises::
|
|
28
|
+
"""Work around a Bitscrape bug where its scheduler's
|
|
29
|
+
``asyncio.PriorityQueue[tuple[int, Request]]`` compares ``Request``
|
|
30
|
+
objects directly whenever two requests share the same priority
|
|
31
|
+
(the common case -- most requests are ``RequestPriority.NORMAL``),
|
|
32
|
+
raising::
|
|
36
33
|
|
|
37
34
|
TypeError: '<' not supported between instances of 'Request' and 'Request'
|
|
38
35
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
36
|
+
``Request`` is a pydantic model with no ``__lt__``/etc., so tuple
|
|
37
|
+
comparison falls through to comparing the ``Request`` instances
|
|
38
|
+
themselves once priorities tie.
|
|
39
|
+
|
|
40
|
+
This patches ``bitscrape.Request`` (a pydantic ``BaseModel``) with an
|
|
41
|
+
identity-based ordering at import time, so equal-priority ties are
|
|
42
|
+
broken consistently (by object identity) instead of crashing. This does not change
|
|
43
|
+
crawl semantics -- priority still determines order; only the
|
|
44
|
+
previously-crashing tie-break becomes well-defined.
|
|
45
|
+
|
|
46
|
+
The patch is idempotent and a no-op if a future Bitscrape release
|
|
47
|
+
already defines ``__lt__`` on ``Request``.
|
|
43
48
|
"""
|
|
44
|
-
request_cls = bitscrape
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
# Already defines real ordering (future Bitscrape fix) — no-op.
|
|
49
|
+
request_cls = getattr(bitscrape, "Request", None)
|
|
50
|
+
if request_cls is None:
|
|
51
|
+
logger.debug("bitscrape.Request not found -- skipping ordering patch")
|
|
48
52
|
return
|
|
53
|
+
if "__lt__" in request_cls.__dict__:
|
|
54
|
+
return # already orderable (newer bitscrape version fixed it upstream)
|
|
49
55
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
request_cls.
|
|
54
|
-
logger.debug("Patched bitscrape.Request
|
|
56
|
+
request_cls.__lt__ = lambda self, other: id(self) < id(other)
|
|
57
|
+
request_cls.__le__ = lambda self, other: id(self) <= id(other)
|
|
58
|
+
request_cls.__gt__ = lambda self, other: id(self) > id(other)
|
|
59
|
+
request_cls.__ge__ = lambda self, other: id(self) >= id(other)
|
|
60
|
+
logger.debug("Patched bitscrape.Request with identity-based ordering")
|
|
55
61
|
|
|
56
62
|
|
|
57
63
|
_patch_request_ordering()
|
|
@@ -80,8 +86,10 @@ class Crawler:
|
|
|
80
86
|
"""Synchronous convenience wrapper around :meth:`acrawl`.
|
|
81
87
|
|
|
82
88
|
Safe to call from plain scripts, CLI commands, server request
|
|
83
|
-
handlers, *and* Jupyter/Colab notebooks (which already run
|
|
84
|
-
event loop
|
|
89
|
+
handlers, *and* Jupyter/Colab/IPython notebooks (which already run
|
|
90
|
+
an event loop, where a plain ``asyncio.run()`` would raise
|
|
91
|
+
``RuntimeError: asyncio.run() cannot be called from a running
|
|
92
|
+
event loop``). See :func:`bie._asyncutil.run_sync`.
|
|
85
93
|
"""
|
|
86
94
|
return run_sync(self.acrawl(urls, allowed_domains, instruction))
|
|
87
95
|
|