webglean 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
webglean/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ from .client import WebGlean
2
+ from .errors import WebGleanError
3
+ from .types import (
4
+ BatchResultItem,
5
+ BatchStatus,
6
+ CrawlPage,
7
+ CrawlStatus,
8
+ MapResult,
9
+ Monitor,
10
+ MonitorChange,
11
+ MonitorDetail,
12
+ ScrapeMetadata,
13
+ ScrapeResult,
14
+ SearchResultItem,
15
+ )
16
+
17
# Names exported by ``from webglean import *``: the client, its error type,
# and the TypedDict shapes imported from ``.types`` above.
__all__ = [
    "WebGlean", "WebGleanError",
    "ScrapeResult", "ScrapeMetadata",
    "CrawlStatus", "CrawlPage",
    "MapResult",
    "SearchResultItem",
    "Monitor", "MonitorDetail", "MonitorChange",
    "BatchStatus", "BatchResultItem",
]
webglean/_http.py ADDED
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ import httpx
6
+
7
+ from .errors import WebGleanError
8
+
9
+
10
class HttpClient:
    """Thin wrapper around an ``httpx.Client`` used for all API calls.

    Every request carries an ``Authorization: Bearer <api_key>`` header and
    every response is expected to decode to a JSON object.
    """

    def __init__(self, base_url: str, api_key: str, timeout: float) -> None:
        """Create the underlying ``httpx.Client``.

        Args:
            base_url: Base URL handed to ``httpx.Client``.
            api_key: Key sent as a bearer token on every request.
            timeout: Timeout handed to ``httpx.Client``.
        """
        self._client = httpx.Client(
            base_url=base_url,
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=timeout,
        )

    def request(
        self,
        method: str,
        path: str,
        json_body: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Send one request and return the decoded JSON object.

        Args:
            method: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
            path: Request path, passed to the client together with ``method``.
            json_body: Optional payload sent as the JSON request body.

        Returns:
            The decoded response body.

        Raises:
            WebGleanError: If the body is not valid JSON, is not a JSON
                object, the HTTP status is 400 or above, or the body has
                ``"success": false``. ``status`` is the HTTP status code.
        """
        res = self._client.request(method, path, json=json_body)
        status = res.status_code

        try:
            body = res.json()
        except ValueError as err:
            # Keep the decode error as the cause so the bad payload is traceable.
            raise WebGleanError(f"Invalid JSON response (HTTP {status})", status) from err

        if not isinstance(body, dict):
            # Valid JSON that is not an object (list, string, null, ...) has
            # no "success"/"error" fields to inspect; report it as an API
            # error instead of failing with AttributeError on ``.get``.
            if status >= 400:
                raise WebGleanError(f"Request failed with HTTP {status}", status)
            raise WebGleanError(
                f"Unexpected non-object JSON response (HTTP {status})", status
            )

        if status >= 400 or body.get("success") is False:
            raise WebGleanError(
                body.get("error") or f"Request failed with HTTP {status}",
                status,
            )

        return body

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()
webglean/client.py ADDED
@@ -0,0 +1,213 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from ._http import HttpClient
8
+ from .errors import WebGleanError
9
+ from .types import (
10
+ BatchStatus,
11
+ CrawlStatus,
12
+ MapResult,
13
+ Monitor,
14
+ MonitorDetail,
15
+ ScrapeResult,
16
+ SearchResultItem,
17
+ )
18
+
19
DEFAULT_BASE_URL = "https://api.webglean.dev"


class WebGlean:
    """Client for the WebGlean HTTP API.

    Each public method sends one request through an internal ``HttpClient``
    and returns the relevant part of the decoded JSON body. The client can
    be used as a context manager; leaving the ``with`` block calls
    :meth:`close`.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 120.0,
    ) -> None:
        """Resolve the API key and create the HTTP layer.

        Args:
            api_key: API key. When omitted or empty, the ``WEBGLEAN_API_KEY``
                environment variable is used instead.
            base_url: Base URL of the API.
            timeout: Timeout forwarded to the internal ``HttpClient``.

        Raises:
            ValueError: If no key is passed and the environment variable is
                unset or empty.
        """
        resolved_key = api_key or os.environ.get("WEBGLEAN_API_KEY")
        if not resolved_key:
            raise ValueError(
                "WebGlean: an api_key is required (pass it directly or set WEBGLEAN_API_KEY)"
            )
        self._http = HttpClient(base_url, resolved_key, timeout)

    def close(self) -> None:
        """Close the internal HTTP client."""
        self._http.close()

    def __enter__(self) -> "WebGlean":
        return self

    def __exit__(self, *exc_info: Any) -> None:
        self.close()

    # ── Scrape ──────────────────────────────────────────────────────────

    def scrape(
        self,
        url: str,
        format: str = "markdown",
        only_main_content: bool = True,
    ) -> ScrapeResult:
        """POST ``/v1/scrape`` for ``url`` and return the response's ``data``.

        ``format`` is sent as a one-element ``formats`` list.
        """
        body = self._http.request(
            "POST",
            "/v1/scrape",
            {"url": url, "formats": [format], "onlyMainContent": only_main_content},
        )
        return body["data"]

    # ── Crawl ───────────────────────────────────────────────────────────

    def crawl(
        self,
        url: str,
        max_depth: int = 2,
        max_pages: int = 10,
        include_paths: Optional[List[str]] = None,
        exclude_paths: Optional[List[str]] = None,
    ) -> str:
        """POST ``/v1/crawl`` and return the ``id`` from the response.

        ``include_paths`` / ``exclude_paths`` are sent as empty lists when
        not given.
        """
        body = self._http.request(
            "POST",
            "/v1/crawl",
            {
                "url": url,
                "maxDepth": max_depth,
                "maxPages": max_pages,
                "includePaths": include_paths or [],
                "excludePaths": exclude_paths or [],
            },
        )
        return body["id"]

    def get_crawl(self, crawl_id: str) -> CrawlStatus:
        """GET ``/v1/crawl/{crawl_id}`` and return the response's ``data``."""
        body = self._http.request("GET", f"/v1/crawl/{crawl_id}")
        return body["data"]

    def crawl_and_wait(
        self, crawl_id: str, poll_interval: float = 2.0, timeout: float = 600.0
    ) -> CrawlStatus:
        """Poll :meth:`get_crawl` until its status is ``"done"`` or ``"failed"``.

        Args:
            crawl_id: Id of an already started crawl (as returned by
                :meth:`crawl`).
            poll_interval: Seconds to sleep between polls.
            timeout: Maximum number of seconds to wait.

        Returns:
            The last fetched status; a ``"failed"`` crawl is returned, not
            raised.

        Raises:
            WebGleanError: With status ``0`` if the timeout elapses first.
        """
        return self._poll_until(
            lambda: self.get_crawl(crawl_id),
            lambda status: status["status"] in ("done", "failed"),
            poll_interval,
            timeout,
        )

    # ── Extract ─────────────────────────────────────────────────────────

    def extract(
        self,
        url: str,
        schema: Optional[Dict[str, Any]] = None,
        prompt: Optional[str] = None,
    ) -> Any:
        """POST ``/v1/extract`` and return the response's ``data``.

        Raises:
            ValueError: If both ``schema`` and ``prompt`` are missing or empty.
        """
        if not schema and not prompt:
            raise ValueError("WebGlean.extract: provide at least one of schema, prompt")
        body = self._http.request(
            "POST", "/v1/extract", {"url": url, "schema": schema, "prompt": prompt}
        )
        return body["data"]

    # ── Map ─────────────────────────────────────────────────────────────

    def map(self, url: str, max_urls: int = 100, search: Optional[str] = None) -> MapResult:
        """Unlike every other endpoint, /v1/map's response is not wrapped in `data`."""
        body = self._http.request(
            "POST", "/v1/map", {"url": url, "maxUrls": max_urls, "search": search}
        )
        return {"links": body["links"], "total": body["total"]}

    # ── Search ──────────────────────────────────────────────────────────

    def search(
        self,
        query: str,
        num_results: int = 5,
        country: Optional[str] = None,
        lang: Optional[str] = None,
    ) -> List[SearchResultItem]:
        """A result item can have markdown=None + error set for a page that
        individually failed to scrape, even though the overall call
        succeeded — that's returned as-is, not raised."""
        body = self._http.request(
            "POST",
            "/v1/search",
            {"query": query, "numResults": num_results, "country": country, "lang": lang},
        )
        return body["data"]

    # ── Monitor ─────────────────────────────────────────────────────────

    def create_monitor(
        self, url: str, interval: str = "daily", webhook_url: Optional[str] = None
    ) -> Monitor:
        """POST ``/v1/monitor`` and return the response's ``data``."""
        body = self._http.request(
            "POST",
            "/v1/monitor",
            {"url": url, "interval": interval, "webhookUrl": webhook_url},
        )
        return body["data"]

    def list_monitors(self) -> List[Monitor]:
        """GET ``/v1/monitor`` and return the response's ``data``."""
        body = self._http.request("GET", "/v1/monitor")
        return body["data"]

    def get_monitor(self, monitor_id: str) -> MonitorDetail:
        """GET ``/v1/monitor/{monitor_id}`` and return the response's ``data``."""
        body = self._http.request("GET", f"/v1/monitor/{monitor_id}")
        return body["data"]

    def delete_monitor(self, monitor_id: str) -> None:
        """DELETE ``/v1/monitor/{monitor_id}``; the response body is ignored."""
        self._http.request("DELETE", f"/v1/monitor/{monitor_id}")

    # ── Batch scrape ────────────────────────────────────────────────────

    def batch_scrape(
        self,
        urls: Optional[List[str]] = None,
        items: Optional[List[Dict[str, Any]]] = None,
        format: str = "markdown",
        only_main_content: bool = True,
    ) -> str:
        """POST ``/v1/batch/scrape`` and return the ``id`` from the response.

        When both ``urls`` and ``items`` are given, only ``items`` is sent.

        Raises:
            ValueError: If both ``urls`` and ``items`` are missing or empty.
        """
        if not urls and not items:
            raise ValueError("WebGlean.batch_scrape: provide either urls or items")
        payload: Dict[str, Any] = {"format": format, "onlyMainContent": only_main_content}
        if items:
            payload["items"] = items
        else:
            payload["urls"] = urls
        body = self._http.request("POST", "/v1/batch/scrape", payload)
        return body["id"]

    def get_batch(self, batch_id: str) -> BatchStatus:
        """GET ``/v1/batch/scrape/{batch_id}``.

        The status fields are read from the top level of the response body
        (not from ``data``); any other keys in the body are dropped.
        """
        body = self._http.request("GET", f"/v1/batch/scrape/{batch_id}")
        return {
            "status": body["status"],
            "total": body["total"],
            "completed": body["completed"],
            "failed": body["failed"],
            "creditsUsed": body["creditsUsed"],
            "results": body["results"],
        }

    def batch_scrape_and_wait(
        self, batch_id: str, poll_interval: float = 2.0, timeout: float = 1200.0
    ) -> BatchStatus:
        """Poll :meth:`get_batch` until its status is ``"done"``.

        Args:
            batch_id: Id of an already started batch (as returned by
                :meth:`batch_scrape`).
            poll_interval: Seconds to sleep between polls.
            timeout: Maximum number of seconds to wait.

        Raises:
            WebGleanError: With status ``0`` if the timeout elapses first.
        """
        return self._poll_until(
            lambda: self.get_batch(batch_id),
            lambda status: status["status"] == "done",
            poll_interval,
            timeout,
        )

    # ── Internal ────────────────────────────────────────────────────────

    def _poll_until(self, fetch_status, is_done, poll_interval: float, timeout: float):
        """Call ``fetch_status`` repeatedly until ``is_done`` accepts its result.

        Args:
            fetch_status: Zero-argument callable returning the current status.
            is_done: Callable taking that status and returning a truthy
                value once polling should stop.
            poll_interval: Seconds to sleep between polls.
            timeout: Maximum number of seconds to keep polling.

        Returns:
            The first status for which ``is_done`` is truthy.

        Raises:
            WebGleanError: With status ``0`` once ``timeout`` seconds have
                elapsed without ``is_done`` succeeding.
        """
        deadline = time.monotonic() + timeout
        while True:
            status = fetch_status()
            if is_done(status):
                return status
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise WebGleanError(f"Timed out after {timeout}s waiting for job to finish", 0)
            # Never sleep past the deadline: a long poll_interval must not
            # stretch the wait beyond the requested timeout. One last poll
            # still happens right at the deadline.
            time.sleep(min(poll_interval, remaining))
webglean/errors.py ADDED
@@ -0,0 +1,10 @@
1
class WebGleanError(Exception):
    """Error raised by the WebGlean client.

    Raised when a response has an HTTP status of 400 or above, reports
    ``"success": false``, or cannot be decoded as JSON, and for client-side
    polling timeouts.

    Attributes:
        status: HTTP status code of the failed response, or ``0`` for a
            polling timeout.
    """

    def __init__(self, message: str, status: int) -> None:
        super().__init__(message)
        self.status = status

    def __reduce__(self) -> tuple:
        # Only ``message`` is stored in ``self.args``, so the default
        # exception pickling would call ``WebGleanError(message)`` and fail
        # on the missing ``status``. Rebuild with both arguments so that
        # pickle and copy round-trip the error.
        return (type(self), (str(self), self.status))

    def __repr__(self) -> str:
        return f"WebGleanError(status={self.status!r}, message={str(self)!r})"
webglean/py.typed ADDED
File without changes
webglean/types.py ADDED
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, TypedDict
4
+
5
+
6
class ScrapeMetadata(TypedDict, total=False):
    """Metadata attached to a scraped page.

    Declared with ``total=False``: any key may be absent, and a present
    key may still be ``None``.
    """

    title: Optional[str]
    description: Optional[str]
    url: Optional[str]
    statusCode: Optional[int]
11
+
12
+
13
class ScrapeResult(TypedDict):
    """Content of one scraped page: three text renderings plus metadata."""

    markdown: str
    html: str
    text: str
    metadata: ScrapeMetadata
18
+
19
+
20
class CrawlPage(TypedDict):
    """One page inside ``CrawlStatus.pages``: its URL, content and metadata."""

    url: str
    markdown: str
    html: str
    text: str
    metadata: ScrapeMetadata
26
+
27
+
28
class CrawlStatus(TypedDict):
    """State of a crawl job, including the pages collected so far."""

    id: str
    url: str
    # One of: "pending" | "processing" | "done" | "failed"
    status: str
    pagesCrawled: int
    maxPages: int
    creditsUsed: int
    createdAt: str
    completedAt: Optional[str]
    pages: List[CrawlPage]
38
+
39
+
40
class MapResult(TypedDict):
    """Result of a map call: the list of links and a total count."""

    links: List[str]
    total: int
43
+
44
+
45
class SearchResultItem(TypedDict):
    """One search hit together with its scraped content, if any."""

    url: str
    title: str
    snippet: str
    # None if this specific page failed to scrape — check `error` in that case.
    markdown: Optional[str]
    error: Optional[str]
52
+
53
+
54
class Monitor(TypedDict):
    """A monitor registered for a URL."""

    id: str
    url: str
    # One of: "hourly" | "daily" | "weekly"
    interval: str
    # One of: "active" | "paused" | "cancelled"
    status: str
    createdAt: str
60
+
61
+
62
class MonitorChange(TypedDict):
    """One entry in ``MonitorDetail.changes``."""

    id: str
    detectedAt: str
    snapshot: str
66
+
67
+
68
class MonitorDetail(Monitor):
    """A ``Monitor`` extended with its list of changes."""

    changes: List[MonitorChange]
70
+
71
+
72
class BatchResultItem(TypedDict, total=False):
    """One entry in ``BatchStatus.results``.

    Declared with ``total=False``: every key may be absent.
    """

    # Only present if you supplied an id via `items`.
    id: str
    url: str
    # One of: "pending" | "processing" | "done" | "failed"
    status: str
    # Only present when status is "done".
    data: ScrapeResult
    # Only present when status is "failed".
    error: str
81
+
82
+
83
class BatchStatus(TypedDict):
    """Progress counters and per-item results of a batch scrape job."""

    # One of: "pending" | "processing" | "done"
    status: str
    total: int
    completed: int
    failed: int
    creditsUsed: int
    results: List[BatchResultItem]
90
+
91
+
92
# Plain-dict alias with the shape {"id": Optional[str], "url": str};
# presumably one entry of the `items` list passed to batch_scrape — confirm.
BatchScrapeItem = Dict[str, Any]
@@ -0,0 +1,65 @@
1
+ Metadata-Version: 2.4
2
+ Name: webglean
3
+ Version: 0.1.0
4
+ Summary: Official Python client for the WebGlean API
5
+ Project-URL: Homepage, https://webglean.dev/docs/sdks
6
+ Project-URL: Documentation, https://webglean.dev/docs/sdks
7
+ Project-URL: Repository, https://github.com/qubomax/webglean
8
+ Project-URL: Issues, https://github.com/qubomax/webglean/issues
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai-agents,crawler,llm,markdown,rag,scraping,web-scraping,webglean
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Requires-Python: >=3.9
16
+ Requires-Dist: httpx>=0.27
17
+ Provides-Extra: dev
18
+ Requires-Dist: pytest>=8.0; extra == 'dev'
19
+ Requires-Dist: respx>=0.21; extra == 'dev'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # webglean
23
+
24
+ Official Python client for the [WebGlean](https://webglean.dev) API.
25
+
26
+ ```bash
27
+ pip install webglean
28
+ ```
29
+
30
+ ```python
31
+ from webglean import WebGlean
32
+
33
+ client = WebGlean(api_key="wg_your_key") # or set WEBGLEAN_API_KEY
34
+
35
+ result = client.scrape("https://example.com")
36
+ print(result["markdown"])
37
+ ```
38
+
39
+ `WebGlean` also works as a context manager (`with WebGlean(...) as client:`), which closes the underlying HTTP connection pool on exit.
40
+
41
+ ## Methods
42
+
43
+ | Method | Endpoint |
44
+ |---|---|
45
+ | `scrape(url, ...)` | `POST /v1/scrape` |
46
+ | `crawl(url, ...)` / `get_crawl(id)` / `crawl_and_wait(id, ...)` | `POST /v1/crawl`, `GET /v1/crawl/:id` |
47
+ | `extract(url, schema=, prompt=)` | `POST /v1/extract` |
48
+ | `map(url, ...)` | `POST /v1/map` |
49
+ | `search(query, ...)` | `POST /v1/search` |
50
+ | `create_monitor(url, ...)` / `list_monitors()` / `get_monitor(id)` / `delete_monitor(id)` | `POST/GET/DELETE /v1/monitor` |
51
+ | `batch_scrape(urls=, items=, ...)` / `get_batch(id)` / `batch_scrape_and_wait(id, ...)` | `POST /v1/batch/scrape`, `GET /v1/batch/scrape/:id` |
52
+
53
+ `crawl_and_wait` and `batch_scrape_and_wait` poll every 2s by default (`poll_interval`, `timeout` in seconds) until the job finishes, raising `WebGleanError` (status `0`) on timeout.
54
+
55
+ ## Errors
56
+
57
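+ Any response with an HTTP status of 400 or above, a body containing `"success": false`, or a body that is not valid JSON raises `WebGleanError`. Its `.status` is the HTTP status code, and the message is taken from the API's `error` field when one is present.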
58
+
59
+ ## Development
60
+
61
+ ```bash
62
+ python3 -m venv .venv && source .venv/bin/activate
63
+ pip install -e ".[dev]"
64
+ pytest
65
+ ```
@@ -0,0 +1,10 @@
1
+ webglean/__init__.py,sha256=xtU8AhX7lu5VCmMDmyvfuwnB_ropqoPum8m0vlFHbjU,552
2
+ webglean/_http.py,sha256=MBs-m24v_FWccKMIusWZsfgZdQg4Wt3bnnYxvNSEgFU,1101
3
+ webglean/client.py,sha256=jJgj6JOa4FzLHlD6dVXOWsIfo0IncTBeQhDXjntYidk,7927
4
+ webglean/errors.py,sha256=j7vtxFO8Ar0wgjbP_oMnwhXaP918loykDKQzhh0VRHo,384
5
+ webglean/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ webglean/types.py,sha256=hSl2VFw7ZQaLEuWHpq_OjJSESIgoGfHGARqmcHwqfF4,1937
7
+ webglean-0.1.0.dist-info/METADATA,sha256=rsZ8_1ArZ3pPJni9_DuISk0zDZy9K0vbkvjg7EGCI1o,2267
8
+ webglean-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ webglean-0.1.0.dist-info/licenses/LICENSE,sha256=CDaq1vSLNtc5piM2cqQaqQkopo6qAxURB9FKSewKJJg,1065
10
+ webglean-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 WebGlean
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.