bits-bie 1.2.1__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bits_bie-1.2.1 → bits_bie-1.2.2}/PKG-INFO +54 -4
- {bits_bie-1.2.1 → bits_bie-1.2.2}/README.md +49 -2
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/__init__.py +8 -1
- bits_bie-1.2.2/bie/_async_utils.py +93 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/crawler.py +41 -3
- bits_bie-1.2.2/bie/discovery.py +467 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/extract.py +2 -2
- {bits_bie-1.2.1 → bits_bie-1.2.2}/pyproject.toml +6 -3
- bits_bie-1.2.2/tests/test_async_utils.py +118 -0
- bits_bie-1.2.2/tests/test_crawler_notebook_safe.py +79 -0
- bits_bie-1.2.2/tests/test_crawler_request_patch.py +67 -0
- bits_bie-1.2.2/tests/test_discovery_error_handling.py +201 -0
- bits_bie-1.2.1/bie/discovery.py +0 -214
- {bits_bie-1.2.1 → bits_bie-1.2.2}/.github/workflows/ci.yml +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/.github/workflows/publish.yml +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/.gitignore +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/LICENSE +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/chunker.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/cli.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/config.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/engine.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/index.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/integrations/__init__.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/integrations/langchain.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/mcp/__init__.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/mcp/server.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/models.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/query_expansion.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/quicksearch.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/security.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/server.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/sitecrawl.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/sitemap.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/spiders/__init__.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/bie/spiders/generic.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/docs/API.md +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/examples/basic_search.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/examples/extract_page.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/examples/map_and_crawl.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/examples/reusable_index.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/examples/web_search.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_chunker.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_discovery.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_engine.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_extract.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_index.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_query_expansion.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_quicksearch.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_security.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_sitecrawl.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_sitemap.py +0 -0
- {bits_bie-1.2.1 → bits_bie-1.2.2}/tests/test_spider_relevance.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bits-bie
|
|
3
|
-
Version: 1.2.1
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 1.2.2
|
|
4
|
+
Summary: BitSearch Intelligence Engine — real-time, citation-backed web search & extraction for AI apps. Built on Bitscrape.
|
|
5
5
|
Project-URL: Homepage, https://github.com/Sudharsansm/BIE
|
|
6
6
|
Project-URL: Repository, https://github.com/Sudharsansm/BIE
|
|
7
7
|
Project-URL: Issues, https://github.com/Sudharsansm/BIE/issues
|
|
@@ -31,6 +31,7 @@ Provides-Extra: all
|
|
|
31
31
|
Requires-Dist: fastapi>=0.110; extra == 'all'
|
|
32
32
|
Requires-Dist: langchain-core>=0.2; extra == 'all'
|
|
33
33
|
Requires-Dist: mcp>=1.0; extra == 'all'
|
|
34
|
+
Requires-Dist: nest-asyncio>=1.5; extra == 'all'
|
|
34
35
|
Requires-Dist: playwright>=1.40; extra == 'all'
|
|
35
36
|
Requires-Dist: sentence-transformers>=2.2; extra == 'all'
|
|
36
37
|
Requires-Dist: uvicorn[standard]>=0.27; extra == 'all'
|
|
@@ -44,6 +45,8 @@ Provides-Extra: langchain
|
|
|
44
45
|
Requires-Dist: langchain-core>=0.2; extra == 'langchain'
|
|
45
46
|
Provides-Extra: mcp
|
|
46
47
|
Requires-Dist: mcp>=1.0; extra == 'mcp'
|
|
48
|
+
Provides-Extra: notebook
|
|
49
|
+
Requires-Dist: nest-asyncio>=1.5; extra == 'notebook'
|
|
47
50
|
Provides-Extra: render
|
|
48
51
|
Requires-Dist: playwright>=1.40; extra == 'render'
|
|
49
52
|
Provides-Extra: server
|
|
@@ -143,9 +146,17 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
|
|
|
143
146
|
pip install "bits-bie[mcp]" # Model Context Protocol server
|
|
144
147
|
pip install "bits-bie[render]" # JS rendering for extract() via Playwright
|
|
145
148
|
pip install "bits-bie[langchain]" # LangChain tool adapters
|
|
149
|
+
pip install "bits-bie[notebook]" # smoother Jupyter/Colab support (nest_asyncio)
|
|
146
150
|
pip install "bits-bie[all]" # everything
|
|
147
151
|
```
|
|
148
152
|
|
|
153
|
+
> **Using BIE in Jupyter / Google Colab?** All sync entry points
|
|
154
|
+
> (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
|
|
155
|
+
> work inside notebooks out of the box — BIE detects the notebook's
|
|
156
|
+
> already-running event loop and handles it automatically. Installing
|
|
157
|
+
> `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
|
|
158
|
+
> efficient, but is not required.
|
|
159
|
+
|
|
149
160
|
> BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
|
|
150
161
|
> proprietary async crawling & extraction framework, which is installed
|
|
151
162
|
> automatically.
|
|
@@ -397,6 +408,45 @@ engine = BIE(BIESettings(
|
|
|
397
408
|
| `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
|
|
398
409
|
| `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
|
|
399
410
|
| `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
|
|
411
|
+
| — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
|
|
412
|
+
| — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
|
|
413
|
+
|
|
414
|
+
### Discovery backends & troubleshooting empty `websearch()` results
|
|
415
|
+
|
|
416
|
+
`websearch()` discovers candidate URLs by scraping public search-engine
|
|
417
|
+
result pages (DuckDuckGo HTML, DuckDuckGo Lite, Bing HTML, in that order
|
|
418
|
+
by default). This is inherently fragile — these are not official APIs,
|
|
419
|
+
and shared/cloud IPs (CI runners, some notebook hosts, restrictive
|
|
420
|
+
sandboxes) can be rate-limited or blocked entirely.
|
|
421
|
+
|
|
422
|
+
If `websearch()` returns `[]`, BIE logs a `WARNING` that distinguishes
|
|
423
|
+
two failure categories:
|
|
424
|
+
|
|
425
|
+
- **"network blocked"** — every backend failed at the connection level
|
|
426
|
+
(timeouts, connection refused, or a sandbox/proxy denial). This means
|
|
427
|
+
the environment itself can't reach these hosts — re-run in an
|
|
428
|
+
environment with normal internet access (a local machine, server, or
|
|
429
|
+
Colab) rather than a locked-down sandbox.
|
|
430
|
+
- **"reachable but no results"** — connections succeeded but responses
|
|
431
|
+
were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
|
|
432
|
+
This means the IP is likely being rate-limited; try again later, reduce
|
|
433
|
+
request frequency, or switch to a self-hosted backend (below).
|
|
434
|
+
|
|
435
|
+
For a durable fix to rate-limiting, run a self-hosted
|
|
436
|
+
[SearXNG](https://docs.searxng.org/) instance and point BIE at it:
|
|
437
|
+
|
|
438
|
+
```bash
|
|
439
|
+
export BIE_DISCOVERY_BACKENDS=searxng
|
|
440
|
+
export BIE_SEARXNG_URL=http://localhost:8080
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
You can also combine backends and reorder them, e.g. to prefer your
|
|
444
|
+
SearXNG instance but fall back to DuckDuckGo:
|
|
445
|
+
|
|
446
|
+
```bash
|
|
447
|
+
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
|
|
448
|
+
export BIE_SEARXNG_URL=http://localhost:8080
|
|
449
|
+
```
|
|
400
450
|
|
|
401
451
|
---
|
|
402
452
|
|
|
@@ -433,10 +483,10 @@ for Elasticsearch/Milvus-backed implementations behind the same
|
|
|
433
483
|
|
|
434
484
|
---
|
|
435
485
|
|
|
436
|
-
## Built on
|
|
486
|
+
## Built on Bitscrape
|
|
437
487
|
|
|
438
488
|
BIE's crawling and extraction layer is powered by
|
|
439
|
-
[**
|
|
489
|
+
[**Bitscrape**](https://github.com/Sudharsansm/Bitscrape)
|
|
440
490
|
(`pip install bitscrape`), our async, robots.txt-aware web scraping
|
|
441
491
|
framework — giving BIE high-performance, polite crawling out of the box.
|
|
442
492
|
|
|
@@ -90,9 +90,17 @@ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
|
|
|
90
90
|
pip install "bits-bie[mcp]" # Model Context Protocol server
|
|
91
91
|
pip install "bits-bie[render]" # JS rendering for extract() via Playwright
|
|
92
92
|
pip install "bits-bie[langchain]" # LangChain tool adapters
|
|
93
|
+
pip install "bits-bie[notebook]" # smoother Jupyter/Colab support (nest_asyncio)
|
|
93
94
|
pip install "bits-bie[all]" # everything
|
|
94
95
|
```
|
|
95
96
|
|
|
97
|
+
> **Using BIE in Jupyter / Google Colab?** All sync entry points
|
|
98
|
+
> (`engine.crawl(...)`, `bie.websearch(...)`, `bie.extract(..., render_js=True)`)
|
|
99
|
+
> work inside notebooks out of the box — BIE detects the notebook's
|
|
100
|
+
> already-running event loop and handles it automatically. Installing
|
|
101
|
+
> `bits-bie[notebook]` (adds `nest_asyncio`) makes this slightly more
|
|
102
|
+
> efficient, but is not required.
|
|
103
|
+
|
|
96
104
|
> BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
|
|
97
105
|
> proprietary async crawling & extraction framework, which is installed
|
|
98
106
|
> automatically.
|
|
@@ -344,6 +352,45 @@ engine = BIE(BIESettings(
|
|
|
344
352
|
| `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
|
|
345
353
|
| `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
|
|
346
354
|
| `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
|
|
355
|
+
| — | `BIE_DISCOVERY_BACKENDS` | `ddg_html,ddg_lite,bing_html` | Comma-separated list and order of `websearch()` discovery backends. Known names: `ddg_html`, `ddg_lite`, `bing_html`, `searxng`. |
|
|
356
|
+
| — | `BIE_SEARXNG_URL` | `None` | Base URL of a self-hosted [SearXNG](https://docs.searxng.org/) instance, used when `searxng` is included in `BIE_DISCOVERY_BACKENDS`. |
|
|
357
|
+
|
|
358
|
+
### Discovery backends & troubleshooting empty `websearch()` results
|
|
359
|
+
|
|
360
|
+
`websearch()` discovers candidate URLs by scraping public search-engine
|
|
361
|
+
result pages (DuckDuckGo HTML, DuckDuckGo Lite, Bing HTML, in that order
|
|
362
|
+
by default). This is inherently fragile — these are not official APIs,
|
|
363
|
+
and shared/cloud IPs (CI runners, some notebook hosts, restrictive
|
|
364
|
+
sandboxes) can be rate-limited or blocked entirely.
|
|
365
|
+
|
|
366
|
+
If `websearch()` returns `[]`, BIE logs a `WARNING` that distinguishes
|
|
367
|
+
two failure categories:
|
|
368
|
+
|
|
369
|
+
- **"network blocked"** — every backend failed at the connection level
|
|
370
|
+
(timeouts, connection refused, or a sandbox/proxy denial). This means
|
|
371
|
+
the environment itself can't reach these hosts — re-run in an
|
|
372
|
+
environment with normal internet access (a local machine, server, or
|
|
373
|
+
Colab) rather than a locked-down sandbox.
|
|
374
|
+
- **"reachable but no results"** — connections succeeded but responses
|
|
375
|
+
were empty, a CAPTCHA/consent page, or rate-limited (HTTP 403/429).
|
|
376
|
+
This means the IP is likely being rate-limited; try again later, reduce
|
|
377
|
+
request frequency, or switch to a self-hosted backend (below).
|
|
378
|
+
|
|
379
|
+
For a durable fix to rate-limiting, run a self-hosted
|
|
380
|
+
[SearXNG](https://docs.searxng.org/) instance and point BIE at it:
|
|
381
|
+
|
|
382
|
+
```bash
|
|
383
|
+
export BIE_DISCOVERY_BACKENDS=searxng
|
|
384
|
+
export BIE_SEARXNG_URL=http://localhost:8080
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
You can also combine backends and reorder them, e.g. to prefer your
|
|
388
|
+
SearXNG instance but fall back to DuckDuckGo:
|
|
389
|
+
|
|
390
|
+
```bash
|
|
391
|
+
export BIE_DISCOVERY_BACKENDS=searxng,ddg_html,ddg_lite
|
|
392
|
+
export BIE_SEARXNG_URL=http://localhost:8080
|
|
393
|
+
```
|
|
347
394
|
|
|
348
395
|
---
|
|
349
396
|
|
|
@@ -380,10 +427,10 @@ for Elasticsearch/Milvus-backed implementations behind the same
|
|
|
380
427
|
|
|
381
428
|
---
|
|
382
429
|
|
|
383
|
-
## Built on
|
|
430
|
+
## Built on Bitscrape
|
|
384
431
|
|
|
385
432
|
BIE's crawling and extraction layer is powered by
|
|
386
|
-
[**
|
|
433
|
+
[**Bitscrape**](https://github.com/Sudharsansm/Bitscrape)
|
|
387
434
|
(`pip install bitscrape`), our async, robots.txt-aware web scraping
|
|
388
435
|
framework — giving BIE high-performance, polite crawling out of the box.
|
|
389
436
|
|
|
@@ -54,6 +54,8 @@ Run as an MCP tool (for Claude Desktop, Claude Code, etc.)::
|
|
|
54
54
|
|
|
55
55
|
from __future__ import annotations
|
|
56
56
|
|
|
57
|
+
from importlib import metadata as _metadata
|
|
58
|
+
|
|
57
59
|
from bie.config import BIESettings
|
|
58
60
|
from bie.engine import BIE
|
|
59
61
|
from bie.extract import ExtractError, ExtractResult, extract
|
|
@@ -63,7 +65,12 @@ from bie.security import SecurityFinding, SecurityReport, scan_for_prompt_inject
|
|
|
63
65
|
from bie.sitecrawl import crawl_site
|
|
64
66
|
from bie.sitemap import SiteMap, map_site
|
|
65
67
|
|
|
66
|
-
|
|
68
|
+
try:
|
|
69
|
+
# Reflects the version actually installed (matches PyPI/pyproject.toml).
|
|
70
|
+
__version__ = _metadata.version("bits-bie")
|
|
71
|
+
except _metadata.PackageNotFoundError:
|
|
72
|
+
# Editable/source checkout without installed metadata.
|
|
73
|
+
__version__ = "1.2.2"
|
|
67
74
|
|
|
68
75
|
__all__ = [
|
|
69
76
|
"BIE",
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Internal helpers for running async code from synchronous entry points,
|
|
3
|
+
safely whether or not the caller is already inside an event loop.
|
|
4
|
+
|
|
5
|
+
This module exists because BIE's public sync API (``Crawler.crawl``,
|
|
6
|
+
``BIE.crawl``, etc.) wraps async crawl logic with ``asyncio.run()`` —
|
|
7
|
+
which works fine in plain scripts, CLI commands, and server request
|
|
8
|
+
handlers, but **raises** ``RuntimeError: asyncio.run() cannot be called
|
|
9
|
+
from a running event loop`` when called from Jupyter/Colab notebooks
|
|
10
|
+
(which run their own persistent event loop).
|
|
11
|
+
|
|
12
|
+
:func:`run_sync` detects this and falls back automatically:
|
|
13
|
+
|
|
14
|
+
1. **No running loop** (plain script/CLI/server) — use ``asyncio.run()``
|
|
15
|
+
directly. This is the common case and has zero overhead.
|
|
16
|
+
2. **Running loop + nest_asyncio installed** — patch the running loop
|
|
17
|
+
with `nest_asyncio <https://pypi.org/project/nest_asyncio/>`_ so
|
|
18
|
+
``asyncio.run()`` can be called from within it. Cheap, same-thread.
|
|
19
|
+
3. **Running loop, no nest_asyncio** — run the coroutine to completion in
|
|
20
|
+
a fresh event loop on a separate worker thread, and block until it
|
|
21
|
+
finishes. Always works, no extra dependencies required, slightly more
|
|
22
|
+
overhead (one thread per call).
|
|
23
|
+
|
|
24
|
+
Callers (``Crawler.crawl``, ``BIE.crawl``, etc.) don't need to know which
|
|
25
|
+
path was taken — :func:`run_sync` always returns the coroutine's result
|
|
26
|
+
or raises its exception, as if it were called from a script with no
|
|
27
|
+
running loop.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import asyncio
|
|
33
|
+
import concurrent.futures
|
|
34
|
+
import logging
|
|
35
|
+
from typing import Coroutine, TypeVar
|
|
36
|
+
|
|
37
|
+
_T = TypeVar("_T")
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger("bie.async_utils")
|
|
40
|
+
|
|
41
|
+
_nest_asyncio_applied = False
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def run_sync(coro: Coroutine[None, None, _T]) -> _T:
|
|
45
|
+
"""Run ``coro`` to completion and return its result, working correctly
|
|
46
|
+
whether or not the calling thread already has a running event loop.
|
|
47
|
+
|
|
48
|
+
See module docstring for the fallback strategy.
|
|
49
|
+
"""
|
|
50
|
+
try:
|
|
51
|
+
asyncio.get_running_loop()
|
|
52
|
+
except RuntimeError:
|
|
53
|
+
# No running loop in this thread — the normal case for scripts,
|
|
54
|
+
# CLI commands, and most server request handlers.
|
|
55
|
+
return asyncio.run(coro)
|
|
56
|
+
|
|
57
|
+
# We're inside a running event loop (e.g. a Jupyter/Colab cell).
|
|
58
|
+
# First choice: nest_asyncio, if available — patches the loop so
|
|
59
|
+
# asyncio.run() works from within it. Cheapest option, same thread.
|
|
60
|
+
if _try_apply_nest_asyncio():
|
|
61
|
+
return asyncio.run(coro)
|
|
62
|
+
|
|
63
|
+
# Fallback: run the coroutine in a brand-new event loop on a separate
|
|
64
|
+
# thread, and block the calling (notebook) thread until it's done.
|
|
65
|
+
# This always works and requires no extra dependencies.
|
|
66
|
+
logger.debug(
|
|
67
|
+
"Running coroutine in a separate thread (already inside an event "
|
|
68
|
+
"loop and nest_asyncio is not installed). Install nest_asyncio for "
|
|
69
|
+
"lower overhead: pip install nest_asyncio"
|
|
70
|
+
)
|
|
71
|
+
return _run_in_new_thread(coro)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _try_apply_nest_asyncio() -> bool:
|
|
75
|
+
global _nest_asyncio_applied
|
|
76
|
+
if _nest_asyncio_applied:
|
|
77
|
+
return True
|
|
78
|
+
try:
|
|
79
|
+
import nest_asyncio
|
|
80
|
+
except ImportError:
|
|
81
|
+
return False
|
|
82
|
+
nest_asyncio.apply()
|
|
83
|
+
_nest_asyncio_applied = True
|
|
84
|
+
return True
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _run_in_new_thread(coro: Coroutine[None, None, _T]) -> _T:
|
|
88
|
+
def _runner() -> _T:
|
|
89
|
+
return asyncio.run(coro)
|
|
90
|
+
|
|
91
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
|
92
|
+
future = pool.submit(_runner)
|
|
93
|
+
return future.result()
|
|
@@ -9,7 +9,6 @@ objects, ready for chunking + indexing.
|
|
|
9
9
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
|
-
import asyncio
|
|
13
12
|
import logging
|
|
14
13
|
from typing import Any
|
|
15
14
|
from urllib.parse import urlparse
|
|
@@ -17,6 +16,7 @@ from urllib.parse import urlparse
|
|
|
17
16
|
import bitscrape
|
|
18
17
|
from bitscrape.pipeline.pipelines import BasePipeline
|
|
19
18
|
|
|
19
|
+
from bie._async_utils import run_sync
|
|
20
20
|
from bie.config import BIESettings
|
|
21
21
|
from bie.models import Document
|
|
22
22
|
from bie.spiders.generic import BIESpider
|
|
@@ -24,6 +24,39 @@ from bie.spiders.generic import BIESpider
|
|
|
24
24
|
logger = logging.getLogger("bie.crawler")
|
|
25
25
|
|
|
26
26
|
|
|
27
|
+
def _patch_request_ordering() -> None:
|
|
28
|
+
"""Make ``bitscrape.Request`` orderable for its priority-queue
|
|
29
|
+
tie-breaks.
|
|
30
|
+
|
|
31
|
+
Bitscrape's scheduler stores requests in an ``asyncio.PriorityQueue``
|
|
32
|
+
as ``(priority.value, request)`` tuples. When two requests share the
|
|
33
|
+
same priority, ``heapq`` falls back to comparing the ``Request``
|
|
34
|
+
objects directly with ``<`` — but ``Request`` (a pydantic
|
|
35
|
+
``BaseModel``) doesn't define ``__lt__``, so this raises::
|
|
36
|
+
|
|
37
|
+
TypeError: '<' not supported between instances of 'Request' and 'Request'
|
|
38
|
+
|
|
39
|
+
This patches in an arbitrary-but-stable ``__lt__`` (by ``id()``) so
|
|
40
|
+
same-priority requests can be ordered without error. The patch is a
|
|
41
|
+
no-op if a future Bitscrape version already defines ``__lt__`` on
|
|
42
|
+
``Request``.
|
|
43
|
+
"""
|
|
44
|
+
request_cls = bitscrape.Request
|
|
45
|
+
current = getattr(request_cls, "__lt__", None)
|
|
46
|
+
if current is not None and current is not object.__lt__:
|
|
47
|
+
# Already defines real ordering (future Bitscrape fix) — no-op.
|
|
48
|
+
return
|
|
49
|
+
|
|
50
|
+
def _lt(self: Any, other: Any) -> bool:
|
|
51
|
+
return id(self) < id(other)
|
|
52
|
+
|
|
53
|
+
request_cls.__lt__ = _lt
|
|
54
|
+
logger.debug("Patched bitscrape.Request.__lt__ for priority-queue tie-breaks")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
_patch_request_ordering()
|
|
58
|
+
|
|
59
|
+
|
|
27
60
|
class _CollectorPipeline(BasePipeline):
|
|
28
61
|
"""Collects every scraped item into an in-memory list."""
|
|
29
62
|
|
|
@@ -44,8 +77,13 @@ class Crawler:
|
|
|
44
77
|
def crawl(
|
|
45
78
|
self, urls: list[str], allowed_domains: list[str] | None = None, instruction: str = ""
|
|
46
79
|
) -> list[Document]:
|
|
47
|
-
"""Synchronous convenience wrapper around :meth:`acrawl`.
|
|
48
|
-
|
|
80
|
+
"""Synchronous convenience wrapper around :meth:`acrawl`.
|
|
81
|
+
|
|
82
|
+
Safe to call from plain scripts, CLI commands, server request
|
|
83
|
+
handlers, *and* Jupyter/Colab notebooks (which already run an
|
|
84
|
+
event loop) — see :func:`bie._async_utils.run_sync`.
|
|
85
|
+
"""
|
|
86
|
+
return run_sync(self.acrawl(urls, allowed_domains, instruction))
|
|
49
87
|
|
|
50
88
|
async def acrawl(
|
|
51
89
|
self,
|