webglean 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ node_modules
2
+ .next
3
+ dist
4
+ .env
5
+ .env.local
6
+ drizzle/
7
+ .DS_Store
8
+ playwright-report/
9
+ test-results/
10
+ *.tsbuildinfo
11
+ .venv/
12
+ __pycache__/
13
+ *.egg-info/
14
+ .pytest_cache/
webglean-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 WebGlean
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,65 @@
1
+ Metadata-Version: 2.4
2
+ Name: webglean
3
+ Version: 0.1.0
4
+ Summary: Official Python client for the WebGlean API
5
+ Project-URL: Homepage, https://webglean.dev/docs/sdks
6
+ Project-URL: Documentation, https://webglean.dev/docs/sdks
7
+ Project-URL: Repository, https://github.com/qubomax/webglean
8
+ Project-URL: Issues, https://github.com/qubomax/webglean/issues
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai-agents,crawler,llm,markdown,rag,scraping,web-scraping,webglean
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Requires-Python: >=3.9
16
+ Requires-Dist: httpx>=0.27
17
+ Provides-Extra: dev
18
+ Requires-Dist: pytest>=8.0; extra == 'dev'
19
+ Requires-Dist: respx>=0.21; extra == 'dev'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # webglean
23
+
24
+ Official Python client for the [WebGlean](https://webglean.dev) API.
25
+
26
+ ```bash
27
+ pip install webglean
28
+ ```
29
+
30
+ ```python
31
+ from webglean import WebGlean
32
+
33
+ client = WebGlean(api_key="wg_your_key") # or set WEBGLEAN_API_KEY
34
+
35
+ result = client.scrape("https://example.com")
36
+ print(result["markdown"])
37
+ ```
38
+
39
+ `WebGlean` also works as a context manager (`with WebGlean(...) as client:`), which closes the underlying HTTP connection pool on exit.
40
+
41
+ ## Methods
42
+
43
+ | Method | Endpoint |
44
+ |---|---|
45
+ | `scrape(url, ...)` | `POST /v1/scrape` |
46
+ | `crawl(url, ...)` / `get_crawl(id)` / `crawl_and_wait(id, ...)` | `POST /v1/crawl`, `GET /v1/crawl/:id` |
47
+ | `extract(url, schema=, prompt=)` | `POST /v1/extract` |
48
+ | `map(url, ...)` | `POST /v1/map` |
49
+ | `search(query, ...)` | `POST /v1/search` |
50
+ | `create_monitor(url, ...)` / `list_monitors()` / `get_monitor(id)` / `delete_monitor(id)` | `POST/GET/DELETE /v1/monitor` |
51
+ | `batch_scrape(urls=, items=, ...)` / `get_batch(id)` / `batch_scrape_and_wait(id, ...)` | `POST /v1/batch/scrape`, `GET /v1/batch/scrape/:id` |
52
+
53
+ `crawl_and_wait` and `batch_scrape_and_wait` poll every 2s by default (`poll_interval`, `timeout` in seconds) until the job finishes, raising `WebGleanError` (status `0`) on timeout.
54
+
55
+ ## Errors
56
+
57
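+ Any response with an HTTP status of 400 or above, or with `success: false` in its body, raises `WebGleanError`, with `.status` set to the HTTP status code and the message taken from the API's error body. A response whose body is not valid JSON raises `WebGleanError` as well.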
58
+
59
+ ## Development
60
+
61
+ ```bash
62
+ python3 -m venv .venv && source .venv/bin/activate
63
+ pip install -e ".[dev]"
64
+ pytest
65
+ ```
@@ -0,0 +1,44 @@
1
+ # webglean
2
+
3
+ Official Python client for the [WebGlean](https://webglean.dev) API.
4
+
5
+ ```bash
6
+ pip install webglean
7
+ ```
8
+
9
+ ```python
10
+ from webglean import WebGlean
11
+
12
+ client = WebGlean(api_key="wg_your_key") # or set WEBGLEAN_API_KEY
13
+
14
+ result = client.scrape("https://example.com")
15
+ print(result["markdown"])
16
+ ```
17
+
18
+ `WebGlean` also works as a context manager (`with WebGlean(...) as client:`), which closes the underlying HTTP connection pool on exit.
19
+
20
+ ## Methods
21
+
22
+ | Method | Endpoint |
23
+ |---|---|
24
+ | `scrape(url, ...)` | `POST /v1/scrape` |
25
+ | `crawl(url, ...)` / `get_crawl(id)` / `crawl_and_wait(id, ...)` | `POST /v1/crawl`, `GET /v1/crawl/:id` |
26
+ | `extract(url, schema=, prompt=)` | `POST /v1/extract` |
27
+ | `map(url, ...)` | `POST /v1/map` |
28
+ | `search(query, ...)` | `POST /v1/search` |
29
+ | `create_monitor(url, ...)` / `list_monitors()` / `get_monitor(id)` / `delete_monitor(id)` | `POST/GET/DELETE /v1/monitor` |
30
+ | `batch_scrape(urls=, items=, ...)` / `get_batch(id)` / `batch_scrape_and_wait(id, ...)` | `POST /v1/batch/scrape`, `GET /v1/batch/scrape/:id` |
31
+
32
+ `crawl_and_wait` and `batch_scrape_and_wait` poll every 2s by default (`poll_interval`, `timeout` in seconds) until the job finishes, raising `WebGleanError` (status `0`) on timeout.
33
+
34
+ ## Errors
35
+
36
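+ Any response with an HTTP status of 400 or above, or with `success: false` in its body, raises `WebGleanError`, with `.status` set to the HTTP status code and the message taken from the API's error body. A response whose body is not valid JSON raises `WebGleanError` as well.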
37
+
38
+ ## Development
39
+
40
+ ```bash
41
+ python3 -m venv .venv && source .venv/bin/activate
42
+ pip install -e ".[dev]"
43
+ pytest
44
+ ```
@@ -0,0 +1,20 @@
1
import os

from webglean import WebGlean

# Use the client as a context manager so its HTTP connection pool is closed
# even when one of the calls below raises.
with WebGlean(
    api_key=os.environ.get("WEBGLEAN_API_KEY", "wg_your_key"),
    base_url=os.environ.get("WEBGLEAN_BASE_URL", "https://api.webglean.dev"),
) as client:
    # Single-page scrape: the result carries the page as markdown.
    scraped = client.scrape("https://example.com")
    print(scraped["markdown"])

    # crawl() only returns the job id; crawl_and_wait() polls until the job ends.
    crawl_id = client.crawl("https://example.com", max_pages=5)
    crawl = client.crawl_and_wait(crawl_id)
    print(f"Crawled {crawl['pagesCrawled']} pages")

    # URL discovery for the site.
    result = client.map("https://example.com")
    print(f"Found {result['total']} URLs")
@@ -0,0 +1,38 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "webglean"
7
+ version = "0.1.0"
8
+ description = "Official Python client for the WebGlean API"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ keywords = ["webglean", "scraping", "web-scraping", "crawler", "markdown", "llm", "rag", "ai-agents"]
13
+ classifiers = [
14
+ "License :: OSI Approved :: MIT License",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3 :: Only",
17
+ ]
18
+ dependencies = [
19
+ "httpx>=0.27",
20
+ ]
21
+
22
+ [project.urls]
23
+ Homepage = "https://webglean.dev/docs/sdks"
24
+ Documentation = "https://webglean.dev/docs/sdks"
25
+ Repository = "https://github.com/qubomax/webglean"
26
+ Issues = "https://github.com/qubomax/webglean/issues"
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ "pytest>=8.0",
31
+ "respx>=0.21",
32
+ ]
33
+
34
+ [tool.hatch.build.targets.wheel]
35
+ packages = ["src/webglean"]
36
+
37
+ [tool.pytest.ini_options]
38
+ testpaths = ["tests"]
@@ -0,0 +1,31 @@
1
+ from .client import WebGlean
2
+ from .errors import WebGleanError
3
+ from .types import (
4
+ BatchResultItem,
5
+ BatchStatus,
6
+ CrawlPage,
7
+ CrawlStatus,
8
+ MapResult,
9
+ Monitor,
10
+ MonitorChange,
11
+ MonitorDetail,
12
+ ScrapeMetadata,
13
+ ScrapeResult,
14
+ SearchResultItem,
15
+ )
16
+
17
+ __all__ = [
18
+ "WebGlean",
19
+ "WebGleanError",
20
+ "ScrapeResult",
21
+ "ScrapeMetadata",
22
+ "CrawlStatus",
23
+ "CrawlPage",
24
+ "MapResult",
25
+ "SearchResultItem",
26
+ "Monitor",
27
+ "MonitorDetail",
28
+ "MonitorChange",
29
+ "BatchStatus",
30
+ "BatchResultItem",
31
+ ]
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ import httpx
6
+
7
+ from .errors import WebGleanError
8
+
9
+
10
class HttpClient:
    """Small wrapper around ``httpx.Client`` used by the public client.

    It attaches the bearer token to every request, decodes the JSON body and
    turns error responses into ``WebGleanError``.
    """

    def __init__(self, base_url: str, api_key: str, timeout: float) -> None:
        """Create the underlying ``httpx.Client``.

        Args:
            base_url: Base URL that request paths are resolved against.
            api_key: Key sent as ``Authorization: Bearer <api_key>``.
            timeout: Timeout handed to ``httpx.Client``.
        """
        self._client = httpx.Client(
            base_url=base_url,
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=timeout,
        )

    def request(
        self,
        method: str,
        path: str,
        json_body: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Send one request and return the decoded JSON object.

        Args:
            method: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
            path: Request path, resolved against the client's base URL.
            json_body: Optional payload sent as the JSON request body.

        Returns:
            The decoded response body.

        Raises:
            WebGleanError: If the body is not valid JSON, is not a JSON
                object, the HTTP status is 400 or above, or the body carries
                ``"success": false``.
        """
        res = self._client.request(method, path, json=json_body)
        status = res.status_code

        try:
            body = res.json()
        except ValueError as exc:
            # Keep the decode failure attached as the cause for debugging.
            raise WebGleanError(f"Invalid JSON response (HTTP {status})", status) from exc

        # A JSON array/string/number has no ``.get``; report it as an API
        # error instead of crashing with AttributeError below.
        if not isinstance(body, dict):
            if status >= 400:
                raise WebGleanError(f"Request failed with HTTP {status}", status)
            raise WebGleanError(f"Unexpected JSON response (HTTP {status})", status)

        if status >= 400 or body.get("success") is False:
            raise WebGleanError(
                body.get("error") or f"Request failed with HTTP {status}",
                status,
            )

        return body

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()
@@ -0,0 +1,213 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from ._http import HttpClient
8
+ from .errors import WebGleanError
9
+ from .types import (
10
+ BatchStatus,
11
+ CrawlStatus,
12
+ MapResult,
13
+ Monitor,
14
+ MonitorDetail,
15
+ ScrapeResult,
16
+ SearchResultItem,
17
+ )
18
+
19
DEFAULT_BASE_URL = "https://api.webglean.dev"


class WebGlean:
    """Synchronous client for the WebGlean API.

    Every public method sends one request through the internal ``HttpClient``
    and returns the relevant part of the decoded JSON body. The instance can
    be used as a context manager; leaving the ``with`` block calls ``close()``.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 120.0,
    ) -> None:
        """Create a client.

        Args:
            api_key: API key; when empty or ``None`` the ``WEBGLEAN_API_KEY``
                environment variable is used instead.
            base_url: Base URL of the API.
            timeout: Timeout passed to the HTTP layer.

        Raises:
            ValueError: If no key is passed and the environment variable is
                unset or empty.
        """
        resolved_key = api_key or os.environ.get("WEBGLEAN_API_KEY")
        if not resolved_key:
            raise ValueError(
                "WebGlean: an api_key is required (pass it directly or set WEBGLEAN_API_KEY)"
            )
        self._http = HttpClient(base_url, resolved_key, timeout)

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._http.close()

    def __enter__(self) -> "WebGlean":
        return self

    def __exit__(self, *exc_info: Any) -> None:
        self.close()

    # ── Scrape ──────────────────────────────────────────────────────────

    def scrape(
        self,
        url: str,
        format: str = "markdown",
        only_main_content: bool = True,
    ) -> ScrapeResult:
        """POST ``/v1/scrape`` for one URL and return the response's ``data``.

        The single ``format`` is sent as a one-element ``formats`` list.
        """
        body = self._http.request(
            "POST",
            "/v1/scrape",
            {"url": url, "formats": [format], "onlyMainContent": only_main_content},
        )
        return body["data"]

    # ── Crawl ───────────────────────────────────────────────────────────

    def crawl(
        self,
        url: str,
        max_depth: int = 2,
        max_pages: int = 10,
        include_paths: Optional[List[str]] = None,
        exclude_paths: Optional[List[str]] = None,
    ) -> str:
        """POST ``/v1/crawl`` and return the top-level ``id`` of the response.

        ``include_paths`` / ``exclude_paths`` are sent as empty lists when
        omitted.
        """
        body = self._http.request(
            "POST",
            "/v1/crawl",
            {
                "url": url,
                "maxDepth": max_depth,
                "maxPages": max_pages,
                "includePaths": include_paths or [],
                "excludePaths": exclude_paths or [],
            },
        )
        return body["id"]

    def get_crawl(self, crawl_id: str) -> CrawlStatus:
        """GET ``/v1/crawl/<crawl_id>`` and return the response's ``data``."""
        body = self._http.request("GET", f"/v1/crawl/{crawl_id}")
        return body["data"]

    def crawl_and_wait(
        self, crawl_id: str, poll_interval: float = 2.0, timeout: float = 600.0
    ) -> CrawlStatus:
        """Poll ``get_crawl`` until the status is ``"done"`` or ``"failed"``.

        A ``"failed"`` crawl is returned, not raised.

        Raises:
            WebGleanError: With status ``0`` if ``timeout`` seconds elapse first.
        """
        return self._poll_until(
            lambda: self.get_crawl(crawl_id),
            lambda status: status["status"] in ("done", "failed"),
            poll_interval,
            timeout,
        )

    # ── Extract ─────────────────────────────────────────────────────────

    def extract(
        self,
        url: str,
        schema: Optional[Dict[str, Any]] = None,
        prompt: Optional[str] = None,
    ) -> Any:
        """POST ``/v1/extract`` and return the response's ``data``.

        Raises:
            ValueError: If both ``schema`` and ``prompt`` are empty; checked
                locally before any request is sent.
        """
        if not schema and not prompt:
            raise ValueError("WebGlean.extract: provide at least one of schema, prompt")
        body = self._http.request(
            "POST", "/v1/extract", {"url": url, "schema": schema, "prompt": prompt}
        )
        return body["data"]

    # ── Map ─────────────────────────────────────────────────────────────

    def map(self, url: str, max_urls: int = 100, search: Optional[str] = None) -> MapResult:
        """POST ``/v1/map`` and return its ``links`` and ``total``.

        These fields are read from the top level of the response body, not
        from a ``data`` wrapper.
        """
        body = self._http.request(
            "POST", "/v1/map", {"url": url, "maxUrls": max_urls, "search": search}
        )
        return {"links": body["links"], "total": body["total"]}

    # ── Search ──────────────────────────────────────────────────────────

    def search(
        self,
        query: str,
        num_results: int = 5,
        country: Optional[str] = None,
        lang: Optional[str] = None,
    ) -> List[SearchResultItem]:
        """POST ``/v1/search`` and return the response's ``data`` list.

        A result item can have markdown=None + error set for a page that
        individually failed to scrape, even though the overall call
        succeeded — that's returned as-is, not raised.
        """
        body = self._http.request(
            "POST",
            "/v1/search",
            {"query": query, "numResults": num_results, "country": country, "lang": lang},
        )
        return body["data"]

    # ── Monitor ─────────────────────────────────────────────────────────

    def create_monitor(
        self, url: str, interval: str = "daily", webhook_url: Optional[str] = None
    ) -> Monitor:
        """POST ``/v1/monitor`` and return the response's ``data``."""
        body = self._http.request(
            "POST",
            "/v1/monitor",
            {"url": url, "interval": interval, "webhookUrl": webhook_url},
        )
        return body["data"]

    def list_monitors(self) -> List[Monitor]:
        """GET ``/v1/monitor`` and return the response's ``data``."""
        body = self._http.request("GET", "/v1/monitor")
        return body["data"]

    def get_monitor(self, monitor_id: str) -> MonitorDetail:
        """GET ``/v1/monitor/<monitor_id>`` and return the response's ``data``."""
        body = self._http.request("GET", f"/v1/monitor/{monitor_id}")
        return body["data"]

    def delete_monitor(self, monitor_id: str) -> None:
        """DELETE ``/v1/monitor/<monitor_id>``; the response body is discarded."""
        self._http.request("DELETE", f"/v1/monitor/{monitor_id}")

    # ── Batch scrape ────────────────────────────────────────────────────

    def batch_scrape(
        self,
        urls: Optional[List[str]] = None,
        items: Optional[List[Dict[str, Any]]] = None,
        format: str = "markdown",
        only_main_content: bool = True,
    ) -> str:
        """POST ``/v1/batch/scrape`` and return the top-level ``id``.

        When both ``urls`` and ``items`` are given, only ``items`` is sent.

        Raises:
            ValueError: If both ``urls`` and ``items`` are empty; checked
                locally before any request is sent.
        """
        if not urls and not items:
            raise ValueError("WebGlean.batch_scrape: provide either urls or items")
        payload: Dict[str, Any] = {"format": format, "onlyMainContent": only_main_content}
        if items:
            payload["items"] = items
        else:
            payload["urls"] = urls
        body = self._http.request("POST", "/v1/batch/scrape", payload)
        return body["id"]

    def get_batch(self, batch_id: str) -> BatchStatus:
        """GET ``/v1/batch/scrape/<batch_id>`` and return its status fields.

        The fields are read from the top level of the response body.
        """
        body = self._http.request("GET", f"/v1/batch/scrape/{batch_id}")
        return {
            "status": body["status"],
            "total": body["total"],
            "completed": body["completed"],
            "failed": body["failed"],
            "creditsUsed": body["creditsUsed"],
            "results": body["results"],
        }

    def batch_scrape_and_wait(
        self, batch_id: str, poll_interval: float = 2.0, timeout: float = 1200.0
    ) -> BatchStatus:
        """Poll ``get_batch`` until the status is ``"done"``.

        Raises:
            WebGleanError: With status ``0`` if ``timeout`` seconds elapse first.
        """
        return self._poll_until(
            lambda: self.get_batch(batch_id),
            lambda status: status["status"] == "done",
            poll_interval,
            timeout,
        )

    # ── Internal ────────────────────────────────────────────────────────

    def _poll_until(
        self, fetch_status: Any, is_done: Any, poll_interval: float, timeout: float
    ) -> Any:
        """Call ``fetch_status()`` repeatedly until ``is_done(status)`` is true.

        Args:
            fetch_status: Zero-argument callable returning the current status.
            is_done: One-argument predicate applied to each fetched status.
            poll_interval: Seconds to wait between two fetches.
            timeout: Overall budget in seconds, measured on the monotonic clock.

        Returns:
            The first status accepted by ``is_done``.

        Raises:
            WebGleanError: With status ``0`` once the deadline has passed and
                the latest status is still not accepted.
        """
        deadline = time.monotonic() + timeout
        while True:
            status = fetch_status()
            if is_done(status):
                return status
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise WebGleanError(f"Timed out after {timeout}s waiting for job to finish", 0)
            # Never sleep past the deadline: a long poll_interval must not
            # stretch the wait beyond the requested timeout.
            time.sleep(min(poll_interval, remaining))
@@ -0,0 +1,10 @@
1
class WebGleanError(Exception):
    """Error reported by the WebGlean client.

    Raised when the API answers with an error response, and for client-side
    polling timeouts (status 0).

    Attributes:
        status: HTTP status code of the failing response, or ``0`` for a
            client-side timeout.
    """

    def __init__(self, message: str, status: int) -> None:
        super().__init__(message)
        self.status = status

    def __reduce__(self) -> tuple:
        # Exception's default reduction replays only ``self.args`` (the
        # message), which cannot satisfy the required ``status`` parameter and
        # makes copy/pickle fail. Replay both constructor arguments instead.
        return (type(self), (str(self), self.status), vars(self).copy())

    def __repr__(self) -> str:
        return f"{type(self).__name__}(status={self.status!r}, message={str(self)!r})"
File without changes
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, TypedDict
4
+
5
+
6
class ScrapeMetadata(TypedDict, total=False):
    """Metadata dict carried under ``metadata``; every key may be absent."""

    title: Optional[str]
    description: Optional[str]
    url: Optional[str]
    statusCode: Optional[int]
11
+
12
+
13
class ScrapeResult(TypedDict):
    """Shape returned by ``scrape``: the page in three renderings plus metadata."""

    markdown: str
    html: str
    text: str
    metadata: ScrapeMetadata
18
+
19
+
20
class CrawlPage(TypedDict):
    """One entry of ``CrawlStatus.pages``: a page's URL, content and metadata."""

    url: str
    markdown: str
    html: str
    text: str
    metadata: ScrapeMetadata
26
+
27
+
28
class CrawlStatus(TypedDict):
    """Shape returned by ``get_crawl`` / ``crawl_and_wait`` for a crawl job."""

    id: str
    url: str
    # One of "pending" | "processing" | "done" | "failed".
    status: str
    pagesCrawled: int
    maxPages: int
    creditsUsed: int
    createdAt: str
    completedAt: Optional[str]
    pages: List[CrawlPage]
38
+
39
+
40
class MapResult(TypedDict):
    """Shape returned by ``map``: the discovered links and their count."""

    links: List[str]
    total: int
43
+
44
+
45
class SearchResultItem(TypedDict):
    """One element of the list returned by ``search``."""

    url: str
    title: str
    snippet: str
    # None if this specific page failed to scrape — check `error` in that case.
    markdown: Optional[str]
    error: Optional[str]
52
+
53
+
54
class Monitor(TypedDict):
    """Shape returned by ``create_monitor`` and listed by ``list_monitors``."""

    id: str
    url: str
    # One of "hourly" | "daily" | "weekly".
    interval: str
    # One of "active" | "paused" | "cancelled".
    status: str
    createdAt: str
60
+
61
+
62
class MonitorChange(TypedDict):
    """One entry of ``MonitorDetail.changes``."""

    id: str
    detectedAt: str
    snapshot: str
66
+
67
+
68
class MonitorDetail(Monitor):
    """Shape returned by ``get_monitor``: a ``Monitor`` plus its ``changes`` list."""

    changes: List[MonitorChange]
70
+
71
+
72
class BatchResultItem(TypedDict, total=False):
    """One entry of ``BatchStatus.results``; which keys exist depends on status."""

    # Only present if you supplied an id via `items`.
    id: str
    url: str
    # One of "pending" | "processing" | "done" | "failed".
    status: str
    # Only present when status is "done".
    data: ScrapeResult
    # Only present when status is "failed".
    error: str
81
+
82
+
83
class BatchStatus(TypedDict):
    """Shape returned by ``get_batch`` / ``batch_scrape_and_wait``."""

    # One of "pending" | "processing" | "done".
    status: str
    total: int
    completed: int
    failed: int
    creditsUsed: int
    results: List[BatchResultItem]
90
+
91
+
92
+ BatchScrapeItem = Dict[str, Any] # {"id": Optional[str], "url": str}
@@ -0,0 +1,151 @@
1
+ import httpx
2
+ import pytest
3
+ import respx
4
+
5
+ from webglean import WebGlean, WebGleanError
6
+
7
+ BASE = "https://api.webglean.dev"
8
+
9
+
10
def test_requires_api_key(monkeypatch):
    """With no key argument and no env var, the constructor must refuse to build."""
    monkeypatch.delenv("WEBGLEAN_API_KEY", raising=False)
    expect_failure = pytest.raises(ValueError, match="api_key is required")
    with expect_failure:
        WebGlean()
14
+
15
+
16
def test_falls_back_to_env_var(monkeypatch):
    """The constructor accepts a key supplied only through WEBGLEAN_API_KEY."""
    monkeypatch.setenv("WEBGLEAN_API_KEY", "wg_from_env")
    # Construction should not raise; close the client so its connection pool
    # is not left open after the test.
    WebGlean().close()
19
+
20
+
21
@respx.mock
def test_scrape_sends_singular_format_and_unwraps_data():
    """scrape() sends its single format as a ``formats`` list and returns ``data``."""
    import json

    page = {"markdown": "# Hi", "html": "<h1>Hi</h1>", "text": "Hi", "metadata": {"title": "Hi"}}
    route = respx.post(f"{BASE}/v1/scrape").mock(
        return_value=httpx.Response(200, json={"success": True, "data": page})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        result = client.scrape("https://example.com", format="markdown")

    assert result == page
    sent = route.calls.last.request
    assert sent.headers["Authorization"] == "Bearer wg_test"
    assert json.loads(sent.content) == {
        "url": "https://example.com",
        "formats": ["markdown"],
        "onlyMainContent": True,
    }
45
+
46
+
47
@respx.mock
def test_raises_webglean_error_on_non_2xx():
    """A 401 with an error body surfaces as WebGleanError carrying status and message."""
    respx.post(f"{BASE}/v1/scrape").mock(
        return_value=httpx.Response(401, json={"success": False, "error": "Invalid or missing API key"})
    )

    # The context manager closes the client even though the call raises.
    with WebGlean(api_key="wg_bad") as client:
        with pytest.raises(WebGleanError) as exc_info:
            client.scrape("https://example.com")

    assert exc_info.value.status == 401
    assert str(exc_info.value) == "Invalid or missing API key"
59
+
60
+
61
@respx.mock
def test_crawl_returns_just_the_id():
    """crawl() returns the top-level ``id`` of a 202 response."""
    respx.post(f"{BASE}/v1/crawl").mock(
        return_value=httpx.Response(202, json={"success": True, "id": "crawl_1"})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        assert client.crawl("https://example.com") == "crawl_1"
67
+
68
+
69
@respx.mock
def test_map_reads_links_total_from_top_level():
    """map() reads ``links`` and ``total`` from the top level of the body."""
    respx.post(f"{BASE}/v1/map").mock(
        return_value=httpx.Response(200, json={"success": True, "links": ["https://example.com/a"], "total": 1})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        assert client.map("https://example.com") == {"links": ["https://example.com/a"], "total": 1}
77
+
78
+
79
@respx.mock
def test_search_returns_per_item_null_error_as_is():
    """A per-item scrape failure is returned in the list, not raised."""
    good = {"url": "https://good.com", "title": "Good", "snippet": "...", "markdown": "# Good", "error": None}
    bad = {"url": "https://bad.com", "title": "Bad", "snippet": "...", "markdown": None, "error": "Page failed to load"}
    respx.post(f"{BASE}/v1/search").mock(
        return_value=httpx.Response(200, json={"success": True, "data": [good, bad]})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        results = client.search("test")

    assert results[1] == {
        "url": "https://bad.com",
        "title": "Bad",
        "snippet": "...",
        "markdown": None,
        "error": "Page failed to load",
    }
104
+
105
+
106
def test_extract_raises_locally_without_schema_or_prompt():
    """extract() rejects a call with neither schema nor prompt before any request."""
    # The context manager closes the client even though the call raises.
    with WebGlean(api_key="wg_test") as client:
        with pytest.raises(ValueError, match="at least one of schema, prompt"):
            client.extract("https://example.com")
110
+
111
+
112
def test_batch_scrape_raises_locally_without_urls_or_items():
    """batch_scrape() rejects a call with neither urls nor items before any request."""
    # The context manager closes the client even though the call raises.
    with WebGlean(api_key="wg_test") as client:
        with pytest.raises(ValueError, match="provide either urls or items"):
            client.batch_scrape()
116
+
117
+
118
@respx.mock
def test_delete_monitor_returns_none():
    """delete_monitor() discards the response body and returns None."""
    respx.delete(f"{BASE}/v1/monitor/mon_1").mock(
        return_value=httpx.Response(200, json={"success": True})
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        assert client.delete_monitor("mon_1") is None
124
+
125
+
126
@respx.mock
def test_crawl_and_wait_polls_until_terminal():
    """crawl_and_wait() keeps polling past "processing" and returns the "done" status."""
    respx.get(f"{BASE}/v1/crawl/crawl_1").mock(
        side_effect=[
            httpx.Response(200, json={"success": True, "data": {"id": "crawl_1", "status": "processing"}}),
            httpx.Response(200, json={"success": True, "data": {"id": "crawl_1", "status": "done"}}),
        ]
    )

    # The context manager closes the client's connection pool after the call.
    with WebGlean(api_key="wg_test") as client:
        result = client.crawl_and_wait("crawl_1", poll_interval=0)

    assert result == {"id": "crawl_1", "status": "done"}
139
+
140
+
141
@respx.mock
def test_crawl_and_wait_raises_on_timeout():
    """A crawl that never finishes raises WebGleanError with status 0."""
    respx.get(f"{BASE}/v1/crawl/crawl_1").mock(
        return_value=httpx.Response(200, json={"success": True, "data": {"id": "crawl_1", "status": "processing"}})
    )

    # The context manager closes the client even though the call raises.
    with WebGlean(api_key="wg_test") as client:
        with pytest.raises(WebGleanError) as exc_info:
            client.crawl_and_wait("crawl_1", poll_interval=0, timeout=0)

    assert exc_info.value.status == 0