webglean 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webglean-0.1.0/.gitignore +14 -0
- webglean-0.1.0/LICENSE +21 -0
- webglean-0.1.0/PKG-INFO +65 -0
- webglean-0.1.0/README.md +44 -0
- webglean-0.1.0/examples/basic_usage.py +20 -0
- webglean-0.1.0/pyproject.toml +38 -0
- webglean-0.1.0/src/webglean/__init__.py +31 -0
- webglean-0.1.0/src/webglean/_http.py +40 -0
- webglean-0.1.0/src/webglean/client.py +213 -0
- webglean-0.1.0/src/webglean/errors.py +10 -0
- webglean-0.1.0/src/webglean/py.typed +0 -0
- webglean-0.1.0/src/webglean/types.py +92 -0
- webglean-0.1.0/tests/test_client.py +151 -0
webglean-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 WebGlean
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
webglean-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: webglean
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Official Python client for the WebGlean API
|
|
5
|
+
Project-URL: Homepage, https://webglean.dev/docs/sdks
|
|
6
|
+
Project-URL: Documentation, https://webglean.dev/docs/sdks
|
|
7
|
+
Project-URL: Repository, https://github.com/qubomax/webglean
|
|
8
|
+
Project-URL: Issues, https://github.com/qubomax/webglean/issues
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai-agents,crawler,llm,markdown,rag,scraping,web-scraping,webglean
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Requires-Dist: httpx>=0.27
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
19
|
+
Requires-Dist: respx>=0.21; extra == 'dev'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# webglean
|
|
23
|
+
|
|
24
|
+
Official Python client for the [WebGlean](https://webglean.dev) API.
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install webglean
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from webglean import WebGlean
|
|
32
|
+
|
|
33
|
+
client = WebGlean(api_key="wg_your_key") # or set WEBGLEAN_API_KEY
|
|
34
|
+
|
|
35
|
+
result = client.scrape("https://example.com")
|
|
36
|
+
print(result["markdown"])
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
`WebGlean` also works as a context manager (`with WebGlean(...) as client:`), which closes the underlying HTTP connection pool on exit.
|
|
40
|
+
|
|
41
|
+
## Methods
|
|
42
|
+
|
|
43
|
+
| Method | Endpoint |
|
|
44
|
+
|---|---|
|
|
45
|
+
| `scrape(url, ...)` | `POST /v1/scrape` |
|
|
46
|
+
| `crawl(url, ...)` / `get_crawl(id)` / `crawl_and_wait(id, ...)` | `POST /v1/crawl`, `GET /v1/crawl/:id` |
|
|
47
|
+
| `extract(url, schema=, prompt=)` | `POST /v1/extract` |
|
|
48
|
+
| `map(url, ...)` | `POST /v1/map` |
|
|
49
|
+
| `search(query, ...)` | `POST /v1/search` |
|
|
50
|
+
| `create_monitor(url, ...)` / `list_monitors()` / `get_monitor(id)` / `delete_monitor(id)` | `POST/GET/DELETE /v1/monitor` |
|
|
51
|
+
| `batch_scrape(urls=, items=, ...)` / `get_batch(id)` / `batch_scrape_and_wait(id, ...)` | `POST /v1/batch/scrape`, `GET /v1/batch/scrape/:id` |
|
|
52
|
+
|
|
53
|
+
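`crawl_and_wait` and `batch_scrape_and_wait` poll every 2s by default (`poll_interval` and `timeout` are both in seconds) until the job finishes, raising `WebGleanError` (status `0`) on timeout. `crawl_and_wait` also returns, rather than raises, when the crawl ends with status `failed`, so check `status` on the result.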
|
|
54
|
+
|
|
55
|
+
## Errors
|
|
56
|
+
|
|
57
|
+
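Any response with an HTTP status of 400 or above, or whose body reports `"success": false`, raises `WebGleanError`, with `.status` set to the HTTP status code and the message taken from the API's error body. A response that is not valid JSON raises `WebGleanError` as well.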
|
|
58
|
+
|
|
59
|
+
## Development
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
python3 -m venv .venv && source .venv/bin/activate
|
|
63
|
+
pip install -e ".[dev]"
|
|
64
|
+
pytest
|
|
65
|
+
```
|
webglean-0.1.0/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# webglean
|
|
2
|
+
|
|
3
|
+
Official Python client for the [WebGlean](https://webglean.dev) API.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install webglean
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from webglean import WebGlean
|
|
11
|
+
|
|
12
|
+
client = WebGlean(api_key="wg_your_key") # or set WEBGLEAN_API_KEY
|
|
13
|
+
|
|
14
|
+
result = client.scrape("https://example.com")
|
|
15
|
+
print(result["markdown"])
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
`WebGlean` also works as a context manager (`with WebGlean(...) as client:`), which closes the underlying HTTP connection pool on exit.
|
|
19
|
+
|
|
20
|
+
## Methods
|
|
21
|
+
|
|
22
|
+
| Method | Endpoint |
|
|
23
|
+
|---|---|
|
|
24
|
+
| `scrape(url, ...)` | `POST /v1/scrape` |
|
|
25
|
+
| `crawl(url, ...)` / `get_crawl(id)` / `crawl_and_wait(id, ...)` | `POST /v1/crawl`, `GET /v1/crawl/:id` |
|
|
26
|
+
| `extract(url, schema=, prompt=)` | `POST /v1/extract` |
|
|
27
|
+
| `map(url, ...)` | `POST /v1/map` |
|
|
28
|
+
| `search(query, ...)` | `POST /v1/search` |
|
|
29
|
+
| `create_monitor(url, ...)` / `list_monitors()` / `get_monitor(id)` / `delete_monitor(id)` | `POST/GET/DELETE /v1/monitor` |
|
|
30
|
+
| `batch_scrape(urls=, items=, ...)` / `get_batch(id)` / `batch_scrape_and_wait(id, ...)` | `POST /v1/batch/scrape`, `GET /v1/batch/scrape/:id` |
|
|
31
|
+
|
|
32
|
+
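`crawl_and_wait` and `batch_scrape_and_wait` poll every 2s by default (`poll_interval` and `timeout` are both in seconds) until the job finishes, raising `WebGleanError` (status `0`) on timeout. `crawl_and_wait` also returns, rather than raises, when the crawl ends with status `failed`, so check `status` on the result.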
|
|
33
|
+
|
|
34
|
+
## Errors
|
|
35
|
+
|
|
36
|
+
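Any response with an HTTP status of 400 or above, or whose body reports `"success": false`, raises `WebGleanError`, with `.status` set to the HTTP status code and the message taken from the API's error body. A response that is not valid JSON raises `WebGleanError` as well.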
|
|
37
|
+
|
|
38
|
+
## Development
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
python3 -m venv .venv && source .venv/bin/activate
|
|
42
|
+
pip install -e ".[dev]"
|
|
43
|
+
pytest
|
|
44
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from webglean import WebGlean
|
|
4
|
+
|
|
5
|
+
# Use the client as a context manager so the underlying HTTP connection pool
# is closed even when one of the calls below raises.
with WebGlean(
    api_key=os.environ.get("WEBGLEAN_API_KEY", "wg_your_key"),
    base_url=os.environ.get("WEBGLEAN_BASE_URL", "https://api.webglean.dev"),
) as client:
    # Scrape a single page and print its markdown.
    scraped = client.scrape("https://example.com")
    print(scraped["markdown"])

    # Start a crawl job, then block until it reaches a terminal status.
    crawl_id = client.crawl("https://example.com", max_pages=5)
    crawl = client.crawl_and_wait(crawl_id)
    print(f"Crawled {crawl['pagesCrawled']} pages")

    # Discover the URLs of a site.
    result = client.map("https://example.com")
    print(f"Found {result['total']} URLs")
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "webglean"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Official Python client for the WebGlean API"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
keywords = ["webglean", "scraping", "web-scraping", "crawler", "markdown", "llm", "rag", "ai-agents"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
17
|
+
]
|
|
18
|
+
dependencies = [
|
|
19
|
+
"httpx>=0.27",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://webglean.dev/docs/sdks"
|
|
24
|
+
Documentation = "https://webglean.dev/docs/sdks"
|
|
25
|
+
Repository = "https://github.com/qubomax/webglean"
|
|
26
|
+
Issues = "https://github.com/qubomax/webglean/issues"
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
dev = [
|
|
30
|
+
"pytest>=8.0",
|
|
31
|
+
"respx>=0.21",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[tool.hatch.build.targets.wheel]
|
|
35
|
+
packages = ["src/webglean"]
|
|
36
|
+
|
|
37
|
+
[tool.pytest.ini_options]
|
|
38
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from .client import WebGlean
|
|
2
|
+
from .errors import WebGleanError
|
|
3
|
+
from .types import (
|
|
4
|
+
BatchResultItem,
|
|
5
|
+
BatchStatus,
|
|
6
|
+
CrawlPage,
|
|
7
|
+
CrawlStatus,
|
|
8
|
+
MapResult,
|
|
9
|
+
Monitor,
|
|
10
|
+
MonitorChange,
|
|
11
|
+
MonitorDetail,
|
|
12
|
+
ScrapeMetadata,
|
|
13
|
+
ScrapeResult,
|
|
14
|
+
SearchResultItem,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"WebGlean",
|
|
19
|
+
"WebGleanError",
|
|
20
|
+
"ScrapeResult",
|
|
21
|
+
"ScrapeMetadata",
|
|
22
|
+
"CrawlStatus",
|
|
23
|
+
"CrawlPage",
|
|
24
|
+
"MapResult",
|
|
25
|
+
"SearchResultItem",
|
|
26
|
+
"Monitor",
|
|
27
|
+
"MonitorDetail",
|
|
28
|
+
"MonitorChange",
|
|
29
|
+
"BatchStatus",
|
|
30
|
+
"BatchResultItem",
|
|
31
|
+
]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from .errors import WebGleanError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class HttpClient:
    """Thin synchronous wrapper around ``httpx.Client`` for the WebGlean API.

    Every request carries a bearer ``Authorization`` header, and failed or
    malformed responses are converted into ``WebGleanError``.
    """

    def __init__(self, base_url: str, api_key: str, timeout: float) -> None:
        """Create the underlying ``httpx.Client``.

        Args:
            base_url: Root URL that request paths are resolved against.
            api_key: Key sent as ``Authorization: Bearer <api_key>``.
            timeout: Timeout handed to ``httpx.Client``.
        """
        self._client = httpx.Client(
            base_url=base_url,
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=timeout,
        )

    def request(
        self,
        method: str,
        path: str,
        json_body: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Send one request and return the decoded JSON object.

        Args:
            method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
            path: Request path, resolved against the client's base URL.
            json_body: Optional payload sent as the JSON request body.

        Returns:
            The response body as a dict.

        Raises:
            WebGleanError: If the body is not valid JSON, is not a JSON
                object, the HTTP status is 400 or above, or the body has
                ``"success": false``. ``status`` is the HTTP status code.
        """
        res = self._client.request(method, path, json=json_body)
        status = res.status_code

        try:
            body = res.json()
        except ValueError as err:
            raise WebGleanError(f"Invalid JSON response (HTTP {status})", status) from err

        # Valid JSON is not necessarily an object (it may be a list, string or
        # null); without this guard the ``.get`` calls below would fail with
        # AttributeError instead of the documented WebGleanError.
        if not isinstance(body, dict):
            raise WebGleanError(
                f"Unexpected JSON response (HTTP {status}): expected a JSON object",
                status,
            )

        if status >= 400 or body.get("success") is False:
            raise WebGleanError(
                body.get("error") or f"Request failed with HTTP {status}",
                status,
            )

        return body

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from ._http import HttpClient
|
|
8
|
+
from .errors import WebGleanError
|
|
9
|
+
from .types import (
|
|
10
|
+
BatchStatus,
|
|
11
|
+
CrawlStatus,
|
|
12
|
+
MapResult,
|
|
13
|
+
Monitor,
|
|
14
|
+
MonitorDetail,
|
|
15
|
+
ScrapeResult,
|
|
16
|
+
SearchResultItem,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
DEFAULT_BASE_URL = "https://api.webglean.dev"


class WebGlean:
    """Synchronous client for the WebGlean HTTP API.

    Can be used as a context manager; leaving the ``with`` block calls
    :meth:`close`.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 120.0,
    ) -> None:
        """Build a client.

        Args:
            api_key: API key; falls back to the ``WEBGLEAN_API_KEY``
                environment variable when omitted or empty.
            base_url: Root URL of the API.
            timeout: Timeout passed to the HTTP layer.

        Raises:
            ValueError: If no key is given and ``WEBGLEAN_API_KEY`` is unset
                or empty.
        """
        resolved_key = api_key or os.environ.get("WEBGLEAN_API_KEY")
        if not resolved_key:
            raise ValueError(
                "WebGlean: an api_key is required (pass it directly or set WEBGLEAN_API_KEY)"
            )
        self._http = HttpClient(base_url, resolved_key, timeout)

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._http.close()

    def __enter__(self) -> "WebGlean":
        return self

    def __exit__(self, *exc_info: Any) -> None:
        self.close()

    # ── Scrape ──────────────────────────────────────────────────────────

    def scrape(
        self,
        url: str,
        format: str = "markdown",
        only_main_content: bool = True,
    ) -> ScrapeResult:
        """Scrape one URL via ``POST /v1/scrape``.

        The single ``format`` is sent as a one-element ``formats`` list.

        Returns:
            The ``data`` member of the response.
        """
        body = self._http.request(
            "POST",
            "/v1/scrape",
            {"url": url, "formats": [format], "onlyMainContent": only_main_content},
        )
        return body["data"]

    # ── Crawl ───────────────────────────────────────────────────────────

    def crawl(
        self,
        url: str,
        max_depth: int = 2,
        max_pages: int = 10,
        include_paths: Optional[List[str]] = None,
        exclude_paths: Optional[List[str]] = None,
    ) -> str:
        """Start a crawl via ``POST /v1/crawl`` and return the job id.

        ``include_paths`` / ``exclude_paths`` are sent as empty lists when
        omitted.
        """
        body = self._http.request(
            "POST",
            "/v1/crawl",
            {
                "url": url,
                "maxDepth": max_depth,
                "maxPages": max_pages,
                "includePaths": include_paths or [],
                "excludePaths": exclude_paths or [],
            },
        )
        return body["id"]

    def get_crawl(self, crawl_id: str) -> CrawlStatus:
        """Fetch the current state of a crawl via ``GET /v1/crawl/<id>``."""
        body = self._http.request("GET", f"/v1/crawl/{crawl_id}")
        return body["data"]

    def crawl_and_wait(
        self, crawl_id: str, poll_interval: float = 2.0, timeout: float = 600.0
    ) -> CrawlStatus:
        """Poll a crawl until its status is ``"done"`` or ``"failed"``.

        A ``"failed"`` crawl is returned, not raised; inspect ``status`` on
        the result.

        Args:
            crawl_id: Id returned by :meth:`crawl`.
            poll_interval: Seconds to wait between polls.
            timeout: Maximum seconds to wait overall.

        Raises:
            WebGleanError: With status ``0`` if the timeout elapses first.
        """
        return self._poll_until(
            lambda: self.get_crawl(crawl_id),
            lambda status: status["status"] in ("done", "failed"),
            poll_interval,
            timeout,
        )

    # ── Extract ─────────────────────────────────────────────────────────

    def extract(
        self,
        url: str,
        schema: Optional[Dict[str, Any]] = None,
        prompt: Optional[str] = None,
    ) -> Any:
        """Run ``POST /v1/extract`` and return the ``data`` member.

        Both ``schema`` and ``prompt`` keys are always sent; an omitted one is
        sent as ``None``.

        Raises:
            ValueError: If neither ``schema`` nor ``prompt`` is provided.
        """
        if not schema and not prompt:
            raise ValueError("WebGlean.extract: provide at least one of schema, prompt")
        body = self._http.request(
            "POST", "/v1/extract", {"url": url, "schema": schema, "prompt": prompt}
        )
        return body["data"]

    # ── Map ─────────────────────────────────────────────────────────────

    def map(self, url: str, max_urls: int = 100, search: Optional[str] = None) -> MapResult:
        """Run ``POST /v1/map`` and return its ``links`` and ``total``.

        The response carries ``links`` and ``total`` at the top level rather
        than under ``data``.
        """
        body = self._http.request(
            "POST", "/v1/map", {"url": url, "maxUrls": max_urls, "search": search}
        )
        return {"links": body["links"], "total": body["total"]}

    # ── Search ──────────────────────────────────────────────────────────

    def search(
        self,
        query: str,
        num_results: int = 5,
        country: Optional[str] = None,
        lang: Optional[str] = None,
    ) -> List[SearchResultItem]:
        """Run ``POST /v1/search`` and return the ``data`` list.

        A result item can have ``markdown=None`` and ``error`` set for a page
        that individually failed to scrape even though the overall call
        succeeded; such items are returned as-is, not raised.
        """
        body = self._http.request(
            "POST",
            "/v1/search",
            {"query": query, "numResults": num_results, "country": country, "lang": lang},
        )
        return body["data"]

    # ── Monitor ─────────────────────────────────────────────────────────

    def create_monitor(
        self, url: str, interval: str = "daily", webhook_url: Optional[str] = None
    ) -> Monitor:
        """Create a monitor via ``POST /v1/monitor`` and return its ``data``."""
        body = self._http.request(
            "POST",
            "/v1/monitor",
            {"url": url, "interval": interval, "webhookUrl": webhook_url},
        )
        return body["data"]

    def list_monitors(self) -> List[Monitor]:
        """Return the ``data`` list from ``GET /v1/monitor``."""
        body = self._http.request("GET", "/v1/monitor")
        return body["data"]

    def get_monitor(self, monitor_id: str) -> MonitorDetail:
        """Return the ``data`` member from ``GET /v1/monitor/<id>``."""
        body = self._http.request("GET", f"/v1/monitor/{monitor_id}")
        return body["data"]

    def delete_monitor(self, monitor_id: str) -> None:
        """Issue ``DELETE /v1/monitor/<id>``; the response body is discarded."""
        self._http.request("DELETE", f"/v1/monitor/{monitor_id}")

    # ── Batch scrape ────────────────────────────────────────────────────

    def batch_scrape(
        self,
        urls: Optional[List[str]] = None,
        items: Optional[List[Dict[str, Any]]] = None,
        format: str = "markdown",
        only_main_content: bool = True,
    ) -> str:
        """Start a batch scrape via ``POST /v1/batch/scrape``; return the job id.

        When both ``urls`` and ``items`` are given, only ``items`` is sent.

        Raises:
            ValueError: If neither ``urls`` nor ``items`` is provided.
        """
        if not urls and not items:
            raise ValueError("WebGlean.batch_scrape: provide either urls or items")
        # NOTE(review): this sends a singular "format" string while scrape()
        # sends a "formats" list; confirm against the API that this is intended.
        payload: Dict[str, Any] = {"format": format, "onlyMainContent": only_main_content}
        if items:
            payload["items"] = items
        else:
            payload["urls"] = urls
        body = self._http.request("POST", "/v1/batch/scrape", payload)
        return body["id"]

    def get_batch(self, batch_id: str) -> BatchStatus:
        """Fetch a batch job via ``GET /v1/batch/scrape/<id>``.

        The status fields are read from the top level of the response.
        """
        body = self._http.request("GET", f"/v1/batch/scrape/{batch_id}")
        return {
            "status": body["status"],
            "total": body["total"],
            "completed": body["completed"],
            "failed": body["failed"],
            "creditsUsed": body["creditsUsed"],
            "results": body["results"],
        }

    def batch_scrape_and_wait(
        self, batch_id: str, poll_interval: float = 2.0, timeout: float = 1200.0
    ) -> BatchStatus:
        """Poll a batch job until its status is ``"done"``.

        Args:
            batch_id: Id returned by :meth:`batch_scrape`.
            poll_interval: Seconds to wait between polls.
            timeout: Maximum seconds to wait overall.

        Raises:
            WebGleanError: With status ``0`` if the timeout elapses first.
        """
        return self._poll_until(
            lambda: self.get_batch(batch_id),
            lambda status: status["status"] == "done",
            poll_interval,
            timeout,
        )

    # ── Internal ────────────────────────────────────────────────────────

    def _poll_until(self, fetch_status, is_done, poll_interval: float, timeout: float):
        """Call ``fetch_status`` repeatedly until ``is_done(status)`` is true.

        The status is always fetched at least once, even with ``timeout=0``.

        Raises:
            WebGleanError: With status ``0`` once ``timeout`` seconds have
                elapsed without ``is_done`` returning true.
        """
        deadline = time.monotonic() + timeout
        while True:
            status = fetch_status()
            if is_done(status):
                return status
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise WebGleanError(f"Timed out after {timeout}s waiting for job to finish", 0)
            # Never sleep past the deadline: a full poll_interval here could
            # overshoot the requested timeout by almost one whole interval.
            time.sleep(min(poll_interval, remaining))
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
class WebGleanError(Exception):
    """Error raised by the WebGlean client.

    Raised for failed or unparseable API responses, in which case ``status``
    is the HTTP status code, and for client-side polling timeouts, in which
    case ``status`` is ``0``.
    """

    def __init__(self, message: str, status: int) -> None:
        """Store the message as the exception text and ``status`` as an attribute."""
        super().__init__(message)
        self.status = status

    def __reduce__(self):
        # The default BaseException.__reduce__ rebuilds the object from
        # ``self.args`` alone, i.e. WebGleanError(message), which fails with
        # TypeError because ``status`` is required. Supplying both arguments
        # keeps copy.copy / copy.deepcopy / pickle working.
        message = self.args[0] if self.args else ""
        return (type(self), (message, self.status), self.__dict__)

    def __repr__(self) -> str:
        return f"WebGleanError(status={self.status!r}, message={str(self)!r})"
|
|
File without changes
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional, TypedDict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ScrapeMetadata(TypedDict, total=False):
    """Metadata dict carried by scraped pages; every key is optional."""

    title: Optional[str]
    description: Optional[str]
    url: Optional[str]
    statusCode: Optional[int]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ScrapeResult(TypedDict):
    """Content of one scraped page in its available renderings, plus metadata."""

    markdown: str
    html: str
    text: str
    metadata: ScrapeMetadata
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CrawlPage(TypedDict):
    """One page collected by a crawl: its URL, content renderings and metadata."""

    url: str
    markdown: str
    html: str
    text: str
    metadata: ScrapeMetadata
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class CrawlStatus(TypedDict):
    """State of a crawl job, including the pages collected so far."""

    id: str
    url: str
    # One of "pending" | "processing" | "done" | "failed".
    status: str
    pagesCrawled: int
    maxPages: int
    creditsUsed: int
    createdAt: str
    completedAt: Optional[str]
    pages: List[CrawlPage]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class MapResult(TypedDict):
    """Result of a map call: the discovered links and their total count."""

    links: List[str]
    total: int
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SearchResultItem(TypedDict):
    """One search hit, with the scraped markdown of the page when available."""

    url: str
    title: str
    snippet: str
    # None if this specific page failed to scrape — check `error` in that case.
    markdown: Optional[str]
    error: Optional[str]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Monitor(TypedDict):
    """A monitor on a URL, with its check interval and current status."""

    id: str
    url: str
    # One of "hourly" | "daily" | "weekly".
    interval: str
    # One of "active" | "paused" | "cancelled".
    status: str
    createdAt: str
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class MonitorChange(TypedDict):
    """One change recorded by a monitor: when it was detected and its snapshot."""

    id: str
    detectedAt: str
    snapshot: str
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class MonitorDetail(Monitor):
    """A ``Monitor`` extended with the list of changes recorded for it."""

    changes: List[MonitorChange]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class BatchResultItem(TypedDict, total=False):
    """Per-URL entry of a batch scrape; which keys are present depends on status."""

    # Only present if you supplied an id via `items`.
    id: str
    url: str
    # One of "pending" | "processing" | "done" | "failed".
    status: str
    # Only present when status is "done".
    data: ScrapeResult
    # Only present when status is "failed".
    error: str
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class BatchStatus(TypedDict):
    """State of a batch scrape job: progress counters plus per-URL results."""

    # One of "pending" | "processing" | "done".
    status: str
    total: int
    completed: int
    failed: int
    creditsUsed: int
    results: List[BatchResultItem]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Loose alias for one batch-scrape input entry; the trailing comment gives the
# expected keys. Presumably the element type of the `items` argument accepted
# by the client's batch_scrape — verify against callers.
BatchScrapeItem = Dict[str, Any] # {"id": Optional[str], "url": str}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import httpx
|
|
2
|
+
import pytest
|
|
3
|
+
import respx
|
|
4
|
+
|
|
5
|
+
from webglean import WebGlean, WebGleanError
|
|
6
|
+
|
|
7
|
+
BASE = "https://api.webglean.dev"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_requires_api_key(monkeypatch):
    """Constructing a client with no key argument and no env var raises ValueError."""
    # Make sure a key exported in the developer's shell cannot leak into the test.
    monkeypatch.delenv("WEBGLEAN_API_KEY", raising=False)

    expected_error = pytest.raises(ValueError, match="api_key is required")
    with expected_error:
        WebGlean()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_falls_back_to_env_var(monkeypatch):
    """With no explicit key, the client picks up WEBGLEAN_API_KEY from the environment."""
    monkeypatch.setenv("WEBGLEAN_API_KEY", "wg_from_env")

    # Constructing must not raise; close the client so its connection pool
    # does not outlive the test.
    WebGlean().close()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@respx.mock
def test_scrape_sends_singular_format_and_unwraps_data():
    """scrape() sends the single ``format`` as a ``formats`` list and returns ``data``."""
    import json as jsonlib

    page = {"markdown": "# Hi", "html": "<h1>Hi</h1>", "text": "Hi", "metadata": {"title": "Hi"}}
    route = respx.post(f"{BASE}/v1/scrape").mock(
        return_value=httpx.Response(200, json={"success": True, "data": page})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        result = client.scrape("https://example.com", format="markdown")

    assert result == page
    sent = route.calls.last.request
    assert sent.headers["Authorization"] == "Bearer wg_test"
    assert jsonlib.loads(sent.content) == {
        "url": "https://example.com",
        "formats": ["markdown"],
        "onlyMainContent": True,
    }
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@respx.mock
def test_raises_webglean_error_on_non_2xx():
    """A 401 response surfaces as WebGleanError carrying the status and API message."""
    respx.post(f"{BASE}/v1/scrape").mock(
        return_value=httpx.Response(401, json={"success": False, "error": "Invalid or missing API key"})
    )

    # The context manager closes the client even though the call raises.
    with WebGlean(api_key="wg_bad") as client:
        with pytest.raises(WebGleanError) as exc_info:
            client.scrape("https://example.com")

    assert exc_info.value.status == 401
    assert str(exc_info.value) == "Invalid or missing API key"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@respx.mock
def test_crawl_returns_just_the_id():
    """crawl() returns the top-level ``id`` of the response."""
    respx.post(f"{BASE}/v1/crawl").mock(
        return_value=httpx.Response(202, json={"success": True, "id": "crawl_1"})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        crawl_id = client.crawl("https://example.com")

    assert crawl_id == "crawl_1"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@respx.mock
def test_map_reads_links_total_from_top_level():
    """map() reads ``links`` and ``total`` from the top level of the response."""
    respx.post(f"{BASE}/v1/map").mock(
        return_value=httpx.Response(200, json={"success": True, "links": ["https://example.com/a"], "total": 1})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        mapped = client.map("https://example.com")

    assert mapped == {"links": ["https://example.com/a"], "total": 1}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@respx.mock
def test_search_returns_per_item_null_error_as_is():
    """A per-item scrape failure inside a successful search is returned, not raised."""
    good_item = {"url": "https://good.com", "title": "Good", "snippet": "...", "markdown": "# Good", "error": None}
    bad_item = {
        "url": "https://bad.com",
        "title": "Bad",
        "snippet": "...",
        "markdown": None,
        "error": "Page failed to load",
    }
    respx.post(f"{BASE}/v1/search").mock(
        return_value=httpx.Response(200, json={"success": True, "data": [good_item, bad_item]})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        results = client.search("test")

    assert results[1] == {
        "url": "https://bad.com",
        "title": "Bad",
        "snippet": "...",
        "markdown": None,
        "error": "Page failed to load",
    }
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_extract_raises_locally_without_schema_or_prompt():
    """extract() rejects a call with neither schema nor prompt before any request is sent."""
    # The context manager closes the client even though the call raises.
    with WebGlean(api_key="wg_test") as client:
        with pytest.raises(ValueError, match="at least one of schema, prompt"):
            client.extract("https://example.com")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def test_batch_scrape_raises_locally_without_urls_or_items():
    """batch_scrape() rejects a call with neither urls nor items before any request is sent."""
    # The context manager closes the client even though the call raises.
    with WebGlean(api_key="wg_test") as client:
        with pytest.raises(ValueError, match="provide either urls or items"):
            client.batch_scrape()
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@respx.mock
def test_delete_monitor_returns_none():
    """delete_monitor() discards the response body and returns None."""
    respx.delete(f"{BASE}/v1/monitor/mon_1").mock(
        return_value=httpx.Response(200, json={"success": True})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        outcome = client.delete_monitor("mon_1")

    assert outcome is None
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@respx.mock
def test_crawl_and_wait_polls_until_terminal():
    """crawl_and_wait() keeps polling until the crawl reports a terminal status."""
    respx.get(f"{BASE}/v1/crawl/crawl_1").mock(
        side_effect=[
            httpx.Response(200, json={"success": True, "data": {"id": "crawl_1", "status": "processing"}}),
            httpx.Response(200, json={"success": True, "data": {"id": "crawl_1", "status": "done"}}),
        ]
    )

    # poll_interval=0 keeps the test from sleeping between the two polls;
    # the context manager closes the client's connection pool afterwards.
    with WebGlean(api_key="wg_test") as client:
        result = client.crawl_and_wait("crawl_1", poll_interval=0)

    assert result == {"id": "crawl_1", "status": "done"}
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@respx.mock
def test_crawl_and_wait_raises_on_timeout():
    """crawl_and_wait() raises WebGleanError with status 0 once the timeout elapses."""
    respx.get(f"{BASE}/v1/crawl/crawl_1").mock(
        return_value=httpx.Response(200, json={"success": True, "data": {"id": "crawl_1", "status": "processing"}})
    )

    # timeout=0 means the deadline has already passed after the first poll;
    # the context manager closes the client even though the call raises.
    with WebGlean(api_key="wg_test") as client:
        with pytest.raises(WebGleanError) as exc_info:
            client.crawl_and_wait("crawl_1", poll_interval=0, timeout=0)

    assert exc_info.value.status == 0
|