bits-bie 1.2.2__tar.gz → 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {bits_bie-1.2.2 → bits_bie-1.2.4}/.github/workflows/ci.yml +1 -1
  2. {bits_bie-1.2.2 → bits_bie-1.2.4}/PKG-INFO +50 -39
  3. {bits_bie-1.2.2 → bits_bie-1.2.4}/README.md +49 -38
  4. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/__init__.py +1 -1
  5. bits_bie-1.2.4/bie/_asyncutil.py +84 -0
  6. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/config.py +19 -0
  7. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/crawler.py +32 -24
  8. bits_bie-1.2.4/bie/discovery.py +567 -0
  9. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/engine.py +25 -0
  10. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/extract.py +2 -2
  11. {bits_bie-1.2.2 → bits_bie-1.2.4}/pyproject.toml +1 -1
  12. bits_bie-1.2.4/tests/test_asyncutil.py +110 -0
  13. bits_bie-1.2.4/tests/test_crawler.py +134 -0
  14. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_discovery.py +2 -2
  15. bits_bie-1.2.4/tests/test_discovery_errors.py +301 -0
  16. bits_bie-1.2.2/bie/discovery.py +0 -467
  17. {bits_bie-1.2.2 → bits_bie-1.2.4}/.github/workflows/publish.yml +0 -0
  18. {bits_bie-1.2.2 → bits_bie-1.2.4}/.gitignore +0 -0
  19. {bits_bie-1.2.2 → bits_bie-1.2.4}/LICENSE +0 -0
  20. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/_async_utils.py +0 -0
  21. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/chunker.py +0 -0
  22. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/cli.py +0 -0
  23. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/index.py +0 -0
  24. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/integrations/__init__.py +0 -0
  25. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/integrations/langchain.py +0 -0
  26. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/mcp/__init__.py +0 -0
  27. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/mcp/server.py +0 -0
  28. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/models.py +0 -0
  29. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/query_expansion.py +0 -0
  30. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/quicksearch.py +0 -0
  31. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/security.py +0 -0
  32. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/server.py +0 -0
  33. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/sitecrawl.py +0 -0
  34. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/sitemap.py +0 -0
  35. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/spiders/__init__.py +0 -0
  36. {bits_bie-1.2.2 → bits_bie-1.2.4}/bie/spiders/generic.py +0 -0
  37. {bits_bie-1.2.2 → bits_bie-1.2.4}/docs/API.md +0 -0
  38. {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/basic_search.py +0 -0
  39. {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/extract_page.py +0 -0
  40. {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/map_and_crawl.py +0 -0
  41. {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/reusable_index.py +0 -0
  42. {bits_bie-1.2.2 → bits_bie-1.2.4}/examples/web_search.py +0 -0
  43. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_async_utils.py +0 -0
  44. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_chunker.py +0 -0
  45. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_crawler_notebook_safe.py +0 -0
  46. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_crawler_request_patch.py +0 -0
  47. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_discovery_error_handling.py +0 -0
  48. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_engine.py +0 -0
  49. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_extract.py +0 -0
  50. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_index.py +0 -0
  51. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_query_expansion.py +0 -0
  52. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_quicksearch.py +0 -0
  53. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_security.py +0 -0
  54. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_sitecrawl.py +0 -0
  55. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_sitemap.py +0 -0
  56. {bits_bie-1.2.2 → bits_bie-1.2.4}/tests/test_spider_relevance.py +0 -0
@@ -24,4 +24,4 @@ jobs:
24
24
  - name: Run tests
25
25
  run: pytest -v
26
26
  - name: Lint
27
- run: ruff check bie tests
27
+ run: ruff check bie tests
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bits-bie
3
- Version: 1.2.2
3
+ Version: 1.2.4
4
4
  Summary: BitSearch Intelligence Engine — real-time, citation-backed web search & extraction for AI apps. Built on Bitscrape.
5
5
  Project-URL: Homepage, https://github.com/Sudharsansm/BIE
6
6
  Project-URL: Repository, https://github.com/Sudharsansm/BIE
@@ -66,7 +66,7 @@ API keys, no subscriptions, no third-party search services.**
66
66
 
67
67
  BIE gives any LLM, RAG pipeline, or AI agent five core primitives —
68
68
  **search, extract, map, crawl, and a hybrid index** — all running locally
69
- on top of [**Bitscrape**](https://pypi.org/project/bitscrape/), our
69
+ on top of [**BitS**](https://pypi.org/project/bitscrape/), our
70
70
  async crawling framework. Use it as a Python library, REST API, CLI, or
71
71
  [MCP](https://modelcontextprotocol.io) server.
72
72
 
@@ -146,17 +146,10 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
146
146
  pip install "bits-bie[mcp]" # Model Context Protocol server
147
147
  pip install "bits-bie[render]" # JS rendering for extract() via Playwright
148
148
  pip install "bits-bie[langchain]" # LangChain tool adapters
149
- pip install "bits-bie[notebook]" # smoother Jupyter/Colab support (nest_asyncio)
149
+ pip install "bits-bie[notebook]" # smoother async behaviour in Jupyter/Colab
150
150
  pip install "bits-bie[all]" # everything
151
151
  ```
152
152
 
153
- > **Using BIE in Jupyter / Google Colab?** All sync entry points
154
- > (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
155
- > work inside notebooks out of the box — BIE detects the notebook's
156
- > already-running event loop and handles it automatically. Installing
157
- > `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
158
- > efficient, but is not required.
159
-
160
153
  > BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
161
154
  > proprietary async crawling & extraction framework, which is installed
162
155
  > automatically.
@@ -407,44 +400,62 @@ engine = BIE(BIESettings(
407
400
  | `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
408
401
  | `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
409
402
  | `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
403
+ | `discovery_backends` | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Ordered, comma-separated discovery backends for `websearch()`. Add `searxng` for a self-hosted instance. |
404
+ | `searxng_url` | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted SearXNG instance, used by the `searxng` discovery backend |
410
405
  | `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
411
- | — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
412
- | — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
413
406
 
414
- ### Discovery backends & troubleshooting empty `websearch()` results
407
+ ---
415
408
 
416
- `websearch()` discovers candidate URLs by scraping public search-engine
417
- result pages (DuckDuckGo HTML, DuckDuckGo Lite, Bing HTML, in that order
418
- by default). This is inherently fragile — these are not official APIs,
419
- and shared/cloud IPs (CI runners, some notebook hosts, restrictive
420
- sandboxes) can be rate-limited or blocked entirely.
409
+ ## Troubleshooting
410
+
411
+ **`TypeError: '<' not supported between instances of 'Request' and 'Request'`**
412
+ during a crawl — this was a Bitscrape scheduler bug (its priority queue
413
+ compared `Request` objects directly when two requests shared the same
414
+ priority). BIE patches `bitscrape.Request` to be orderable at import
415
+ time, so this no longer occurs. If you still see it, you're likely on an
416
+ older `bits-bie` version — upgrade.
417
+
418
+ **`RuntimeError: asyncio.run() cannot be called from a running event
419
+ loop`** — Jupyter/Colab/IPython already run an event loop, which used to
420
+ break `engine.crawl(urls)` / `bie.websearch(...)`. Both now detect a
421
+ running loop automatically and either use
422
+ [`nest_asyncio`](https://pypi.org/project/nest_asyncio/) (install via
423
+ `pip install "bits-bie[notebook]"`) or fall back to running the crawl on
424
+ a background thread — no code changes needed. If you're already inside
425
+ an `async def`, you can also call `await engine.acrawl(urls)` directly.
426
+
427
+ **`bie.websearch(...)` returns `[]` / all discovery backends fail** —
428
+ discovery scrapes DuckDuckGo/Bing's public HTML result pages, which can
429
+ be blocked or rate-limited. Call
430
+ `bie.discovery.get_last_discovery_diagnostics()` right after to see why:
421
431
 
422
- If `websearch()` returns `[]`, BIE logs a `WARNING` that distinguishes
423
- two failure categories:
432
+ ```python
433
+ import bie
434
+ from bie.discovery import get_last_discovery_diagnostics
424
435
 
425
- - **"network blocked"** — every backend failed at the connection level
426
- (timeouts, connection refused, or a sandbox/proxy denial). This means
427
- the environment itself can't reach these hosts — re-run in an
428
- environment with normal internet access (a local machine, server, or
429
- Colab) rather than a locked-down sandbox.
430
- - **"reachable but no results"** — connections succeeded but responses
431
- were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
432
- This means the IP is likely being rate-limited; try again later, reduce
433
- request frequency, or switch to a self-hosted backend (below).
436
+ results = bie.websearch("...")
437
+ if not results:
438
+ print(get_last_discovery_diagnostics().summary())
439
+ ```
434
440
 
435
- For a durable fix to rate-limiting, run a self-hosted
436
- [SearXNG](https://docs.searxng.org/) instance and point BIE at it:
441
+ This distinguishes three cases:
437
442
 
438
- ```bash
439
- export BIE_DISCOVERY_BACKENDS=searxng
440
- export BIE_SEARXNG_URL=http://localhost:8080
441
- ```
443
+ - **Network blocked** — every backend failed at the connection level
444
+ (or an egress proxy returned `x-deny-reason: host_not_allowed`). This
445
+ environment can't reach these hosts at all — check its outbound
446
+ network/proxy/firewall config. Common in sandboxed code-execution
447
+ environments; Colab and most servers have unrestricted outbound access.
448
+ - **Blocked / rate-limited** — backends responded with `403`/`429`/etc.,
449
+ typically from bot-detection on a shared IP. Retry later, reduce
450
+ request volume, or configure a `searxng` backend (below).
451
+ - **Empty response** — got `200 OK` but no parseable results (often a
452
+ CAPTCHA/consent page).
442
453
 
443
- You can also combine backends and reorder them, e.g. to prefer your
444
- SearXNG instance but fall back to DuckDuckGo:
454
+ For the most reliable no-API-key discovery, self-host
455
+ [SearXNG](https://github.com/searxng/searxng) and add it as a backend:
445
456
 
446
457
  ```bash
447
- export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
458
+ export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html
448
459
  export BIE_SEARXNG_URL=http://localhost:8080
449
460
  ```
450
461
 
@@ -486,7 +497,7 @@ for Elasticsearch/Milvus-backed implementations behind the same
486
497
  ## Built on Bitscrape
487
498
 
488
499
  BIE's crawling and extraction layer is powered by
489
- [**Bitscrape**](https://github.com/Sudharsansm/Bitscrape)
500
+ [**BitS**](https://github.com/Sudharsansm/Bitscrape)
490
501
  (`pip install bitscrape`), our async, robots.txt-aware web scraping
491
502
  framework — giving BIE high-performance, polite crawling out of the box.
492
503
 
@@ -10,7 +10,7 @@ API keys, no subscriptions, no third-party search services.**
10
10
 
11
11
  BIE gives any LLM, RAG pipeline, or AI agent five core primitives —
12
12
  **search, extract, map, crawl, and a hybrid index** — all running locally
13
- on top of [**Bitscrape**](https://pypi.org/project/bitscrape/), our
13
+ on top of [**BitS**](https://pypi.org/project/bitscrape/), our
14
14
  async crawling framework. Use it as a Python library, REST API, CLI, or
15
15
  [MCP](https://modelcontextprotocol.io) server.
16
16
 
@@ -90,17 +90,10 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
90
90
  pip install "bits-bie[mcp]" # Model Context Protocol server
91
91
  pip install "bits-bie[render]" # JS rendering for extract() via Playwright
92
92
  pip install "bits-bie[langchain]" # LangChain tool adapters
93
- pip install "bits-bie[notebook]" # smoother Jupyter/Colab support (nest_asyncio)
93
+ pip install "bits-bie[notebook]" # smoother async behaviour in Jupyter/Colab
94
94
  pip install "bits-bie[all]" # everything
95
95
  ```
96
96
 
97
- > **Using BIE in Jupyter / Google Colab?** All sync entry points
98
- > (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
99
- > work inside notebooks out of the box — BIE detects the notebook's
100
- > already-running event loop and handles it automatically. Installing
101
- > `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
102
- > efficient, but is not required.
103
-
104
97
  > BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
105
98
  > proprietary async crawling & extraction framework, which is installed
106
99
  > automatically.
@@ -351,44 +344,62 @@ engine = BIE(BIESettings(
351
344
  | `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
352
345
  | `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
353
346
  | `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
347
+ | `discovery_backends` | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Ordered, comma-separated discovery backends for `websearch()`. Add `searxng` for a self-hosted instance. |
348
+ | `searxng_url` | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted SearXNG instance, used by the `searxng` discovery backend |
354
349
  | `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
355
- | — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
356
- | — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
357
350
 
358
- ### Discovery backends & troubleshooting empty `websearch()` results
351
+ ---
359
352
 
360
- `websearch()` discovers candidate URLs by scraping public search-engine
361
- result pages (DuckDuckGo HTML, DuckDuckGo Lite, Bing HTML, in that order
362
- by default). This is inherently fragile — these are not official APIs,
363
- and shared/cloud IPs (CI runners, some notebook hosts, restrictive
364
- sandboxes) can be rate-limited or blocked entirely.
353
+ ## Troubleshooting
354
+
355
+ **`TypeError: '<' not supported between instances of 'Request' and 'Request'`**
356
+ during a crawl — this was a Bitscrape scheduler bug (its priority queue
357
+ compared `Request` objects directly when two requests shared the same
358
+ priority). BIE patches `bitscrape.Request` to be orderable at import
359
+ time, so this no longer occurs. If you still see it, you're likely on an
360
+ older `bits-bie` version — upgrade.
361
+
362
+ **`RuntimeError: asyncio.run() cannot be called from a running event
363
+ loop`** — Jupyter/Colab/IPython already run an event loop, which used to
364
+ break `engine.crawl(urls)` / `bie.websearch(...)`. Both now detect a
365
+ running loop automatically and either use
366
+ [`nest_asyncio`](https://pypi.org/project/nest_asyncio/) (install via
367
+ `pip install "bits-bie[notebook]"`) or fall back to running the crawl on
368
+ a background thread — no code changes needed. If you're already inside
369
+ an `async def`, you can also call `await engine.acrawl(urls)` directly.
370
+
371
+ **`bie.websearch(...)` returns `[]` / all discovery backends fail** —
372
+ discovery scrapes DuckDuckGo/Bing's public HTML result pages, which can
373
+ be blocked or rate-limited. Call
374
+ `bie.discovery.get_last_discovery_diagnostics()` right after to see why:
365
375
 
366
- If `websearch()` returns `[]`, BIE logs a `WARNING` that distinguishes
367
- two failure categories:
376
+ ```python
377
+ import bie
378
+ from bie.discovery import get_last_discovery_diagnostics
368
379
 
369
- - **"network blocked"** — every backend failed at the connection level
370
- (timeouts, connection refused, or a sandbox/proxy denial). This means
371
- the environment itself can't reach these hosts — re-run in an
372
- environment with normal internet access (a local machine, server, or
373
- Colab) rather than a locked-down sandbox.
374
- - **"reachable but no results"** — connections succeeded but responses
375
- were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
376
- This means the IP is likely being rate-limited; try again later, reduce
377
- request frequency, or switch to a self-hosted backend (below).
380
+ results = bie.websearch("...")
381
+ if not results:
382
+ print(get_last_discovery_diagnostics().summary())
383
+ ```
378
384
 
379
- For a durable fix to rate-limiting, run a self-hosted
380
- [SearXNG](https://docs.searxng.org/) instance and point BIE at it:
385
+ This distinguishes three cases:
381
386
 
382
- ```bash
383
- export BIE_DISCOVERY_BACKENDS=searxng
384
- export BIE_SEARXNG_URL=http://localhost:8080
385
- ```
387
+ - **Network blocked** — every backend failed at the connection level
388
+ (or an egress proxy returned `x-deny-reason: host_not_allowed`). This
389
+ environment can't reach these hosts at all — check its outbound
390
+ network/proxy/firewall config. Common in sandboxed code-execution
391
+ environments; Colab and most servers have unrestricted outbound access.
392
+ - **Blocked / rate-limited** — backends responded with `403`/`429`/etc.,
393
+ typically from bot-detection on a shared IP. Retry later, reduce
394
+ request volume, or configure a `searxng` backend (below).
395
+ - **Empty response** — got `200 OK` but no parseable results (often a
396
+ CAPTCHA/consent page).
386
397
 
387
- You can also combine backends and reorder them, e.g. to prefer your
388
- SearXNG instance but fall back to DuckDuckGo:
398
+ For the most reliable no-API-key discovery, self-host
399
+ [SearXNG](https://github.com/searxng/searxng) and add it as a backend:
389
400
 
390
401
  ```bash
391
- export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
402
+ export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html
392
403
  export BIE_SEARXNG_URL=http://localhost:8080
393
404
  ```
394
405
 
@@ -430,7 +441,7 @@ for Elasticsearch/Milvus-backed implementations behind the same
430
441
  ## Built on Bitscrape
431
442
 
432
443
  BIE's crawling and extraction layer is powered by
433
- [**Bitscrape**](https://github.com/Sudharsansm/Bitscrape)
444
+ [**BitS**](https://github.com/Sudharsansm/Bitscrape)
434
445
  (`pip install bitscrape`), our async, robots.txt-aware web scraping
435
446
  framework — giving BIE high-performance, polite crawling out of the box.
436
447
 
@@ -70,7 +70,7 @@ try:
70
70
  __version__ = _metadata.version("bits-bie")
71
71
  except _metadata.PackageNotFoundError:
72
72
  # Editable/source checkout without installed metadata.
73
- __version__ = "1.2.2"
73
+ __version__ = "1.2.4"
74
74
 
75
75
  __all__ = [
76
76
  "BIE",
@@ -0,0 +1,84 @@
1
+ """
2
+ Internal helper for calling async BIE internals from synchronous code.
3
+
4
+ Plain scripts have no running event loop, so ``asyncio.run()`` works fine.
5
+ Jupyter/Colab/IPython kernels, however, *already* run an event loop, and
6
+ ``asyncio.run()`` raises::
7
+
8
+ RuntimeError: asyncio.run() cannot be called from a running event loop
9
+
10
+ :func:`run_sync` detects this and transparently falls back to:
11
+
12
+ 1. ``nest_asyncio`` (if installed) — patches the running loop so it can be
13
+ re-entered, then runs the coroutine on it directly.
14
+ 2. A dedicated background thread with its own fresh event loop — works
15
+ everywhere, with zero extra dependencies, at the cost of a thread
16
+ spin-up per call.
17
+
18
+ This means the same sync call (e.g. ``engine.crawl(urls)``) works
19
+ unchanged in plain scripts, notebooks, and servers.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import asyncio
25
+ import threading
26
+ from typing import Any, Coroutine, TypeVar
27
+
28
+ T = TypeVar("T")
29
+
30
+
31
+ def run_sync(coro: Coroutine[Any, Any, T]) -> T:
32
+ """Run ``coro`` to completion and return its result, regardless of
33
+ whether a thread already has an asyncio event loop running.
34
+
35
+ Args:
36
+ coro: An awaitable coroutine object (not yet awaited/started).
37
+
38
+ Returns:
39
+ The coroutine's return value.
40
+
41
+ Raises:
42
+ Whatever exception the coroutine itself raises.
43
+ """
44
+ try:
45
+ asyncio.get_running_loop()
46
+ except RuntimeError:
47
+ # No loop running in this thread — the normal case for scripts,
48
+ # CLI commands, and server request handlers.
49
+ return asyncio.run(coro)
50
+
51
+ # A loop is already running in this thread (e.g. Jupyter/Colab/IPython,
52
+ # or an async framework that called into sync BIE code).
53
+ try:
54
+ import nest_asyncio # type: ignore[import-not-found]
55
+ except ImportError:
56
+ return _run_in_new_thread(coro)
57
+
58
+ nest_asyncio.apply()
59
+ loop = asyncio.get_event_loop()
60
+ return loop.run_until_complete(coro)
61
+
62
+
63
+ def _run_in_new_thread(coro: Coroutine[Any, Any, T]) -> T:
64
+ """Run ``coro`` to completion on a fresh event loop in a new thread.
65
+
66
+ Used as the dependency-free fallback when a loop is already running in
67
+ the calling thread and ``nest_asyncio`` isn't installed.
68
+ """
69
+ result: dict[str, Any] = {}
70
+ error: dict[str, BaseException] = {}
71
+
72
+ def _runner() -> None:
73
+ try:
74
+ result["value"] = asyncio.run(coro)
75
+ except BaseException as exc: # noqa: BLE001 - re-raised on the caller's thread
76
+ error["value"] = exc
77
+
78
+ thread = threading.Thread(target=_runner, name="bie-async-runner", daemon=True)
79
+ thread.start()
80
+ thread.join()
81
+
82
+ if "value" in error:
83
+ raise error["value"]
84
+ return result["value"] # type: ignore[return-value]
@@ -39,6 +39,25 @@ class BIESettings(BaseSettings):
39
39
  index_dir: str = Field(".bie_index", description="Directory for persisted index")
40
40
  persist: bool = Field(False, description="Persist index to disk between runs")
41
41
 
42
+ # --- Discovery (no-API-key web search) ----------------------------------
43
+ discovery_backends: str = Field(
44
+ "ddg_html,ddg_lite,bing_html",
45
+ description="Comma-separated, ordered list of discovery backends to "
46
+ "try for bie.websearch()/discover_urls(). Built-in backends: "
47
+ "'ddg_html', 'ddg_lite', 'bing_html', 'searxng'. The 'searxng' "
48
+ "backend requires `searxng_url` to also be set. Unknown names are "
49
+ "skipped with a warning. Override with the BIE_DISCOVERY_BACKENDS "
50
+ "env var, e.g. BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite,bing_html",
51
+ )
52
+ searxng_url: str | None = Field(
53
+ default=None,
54
+ description="Base URL of a self-hosted SearXNG instance (e.g. "
55
+ "'http://localhost:8080'), used by the 'searxng' discovery backend. "
56
+ "Self-hosting SearXNG is the most reliable no-API-key discovery "
57
+ "option since it isn't subject to the rate limits / layout changes "
58
+ "that affect scraping DDG/Bing HTML directly.",
59
+ )
60
+
42
61
  # --- Server --------------------------------------------------------------
43
62
  host: str = "0.0.0.0"
44
63
  port: int = 8000
@@ -16,7 +16,7 @@ from urllib.parse import urlparse
16
16
  import bitscrape
17
17
  from bitscrape.pipeline.pipelines import BasePipeline
18
18
 
19
- from bie._async_utils import run_sync
19
+ from bie._asyncutil import run_sync
20
20
  from bie.config import BIESettings
21
21
  from bie.models import Document
22
22
  from bie.spiders.generic import BIESpider
@@ -25,33 +25,39 @@ logger = logging.getLogger("bie.crawler")
25
25
 
26
26
 
27
27
  def _patch_request_ordering() -> None:
28
- """Make ``bitscrape.Request`` orderable for its priority-queue
29
- tie-breaks.
30
-
31
- Bitscrape's scheduler stores requests in an ``asyncio.PriorityQueue``
32
- as ``(priority.value, request)`` tuples. When two requests share the
33
- same priority, ``heapq`` falls back to comparing the ``Request``
34
- objects directly with ``<`` — but ``Request`` (a pydantic
35
- ``BaseModel``) doesn't define ``__lt__``, so this raises::
28
+ """Work around a Bitscrape bug where its scheduler's
29
+ ``asyncio.PriorityQueue[tuple[int, Request]]`` compares ``Request``
30
+ objects directly whenever two requests share the same priority
31
+ (the common case -- most requests are ``RequestPriority.NORMAL``),
32
+ raising::
36
33
 
37
34
  TypeError: '<' not supported between instances of 'Request' and 'Request'
38
35
 
39
- This patches in an arbitrary-but-stable ``__lt__`` (by ``id()``) so
40
- same-priority requests can be ordered without error. The patch is a
41
- no-op if a future Bitscrape version already defines ``__lt__`` on
42
- ``Request``.
36
+ ``Request`` is a pydantic model with no ``__lt__``/etc., so tuple
37
+ comparison falls through to comparing the ``Request`` instances
38
+ themselves once priorities tie.
39
+
40
+ This patches ``bitscrape.Request`` (a pydantic ``BaseModel``) with an
41
+ identity-based ordering at import time, so equal-priority ties are
42
+ broken deterministically instead of crashing. This does not change
43
+ crawl semantics -- priority still determines order; only the
44
+ previously-crashing tie-break becomes well-defined.
45
+
46
+ The patch is idempotent and a no-op if a future Bitscrape release
47
+ already defines ``__lt__`` on ``Request``.
43
48
  """
44
- request_cls = bitscrape.Request
45
- current = getattr(request_cls, "__lt__", None)
46
- if current is not None and current is not object.__lt__:
47
- # Already defines real ordering (future Bitscrape fix) — no-op.
49
+ request_cls = getattr(bitscrape, "Request", None)
50
+ if request_cls is None:
51
+ logger.debug("bitscrape.Request not found -- skipping ordering patch")
48
52
  return
53
+ if "__lt__" in request_cls.__dict__:
54
+ return # already orderable (newer bitscrape version fixed it upstream)
49
55
 
50
- def _lt(self: Any, other: Any) -> bool:
51
- return id(self) < id(other)
52
-
53
- request_cls.__lt__ = _lt
54
- logger.debug("Patched bitscrape.Request.__lt__ for priority-queue tie-breaks")
56
+ request_cls.__lt__ = lambda self, other: id(self) < id(other)
57
+ request_cls.__le__ = lambda self, other: id(self) <= id(other)
58
+ request_cls.__gt__ = lambda self, other: id(self) > id(other)
59
+ request_cls.__ge__ = lambda self, other: id(self) >= id(other)
60
+ logger.debug("Patched bitscrape.Request with identity-based ordering")
55
61
 
56
62
 
57
63
  _patch_request_ordering()
@@ -80,8 +86,10 @@ class Crawler:
80
86
  """Synchronous convenience wrapper around :meth:`acrawl`.
81
87
 
82
88
  Safe to call from plain scripts, CLI commands, server request
83
- handlers, *and* Jupyter/Colab notebooks (which already run an
84
- event loop) see :func:`bie._async_utils.run_sync`.
89
+ handlers, *and* Jupyter/Colab/IPython notebooks (which already run
90
+ an event loop, where a plain ``asyncio.run()`` would raise
91
+ ``RuntimeError: asyncio.run() cannot be called from a running
92
+ event loop``). See :func:`bie._asyncutil.run_sync`.
85
93
  """
86
94
  return run_sync(self.acrawl(urls, allowed_domains, instruction))
87
95