PyPI - webglean - Versions diffs - 0.1.0__py3-none-any.whl - Mend

webglean 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

webglean/__init__.py +31 -0
webglean/_http.py +40 -0
webglean/client.py +213 -0
webglean/errors.py +10 -0
webglean/py.typed +0 -0
webglean/types.py +92 -0
webglean-0.1.0.dist-info/METADATA +65 -0
webglean-0.1.0.dist-info/RECORD +10 -0
webglean-0.1.0.dist-info/WHEEL +4 -0
webglean-0.1.0.dist-info/licenses/LICENSE +21 -0

webglean/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+from .client import WebGlean
+from .errors import WebGleanError
+from .types import (
+    BatchResultItem,
+    BatchStatus,
+    CrawlPage,
+    CrawlStatus,
+    MapResult,
+    Monitor,
+    MonitorChange,
+    MonitorDetail,
+    ScrapeMetadata,
+    ScrapeResult,
+    SearchResultItem,
+)
+# Names exported by `from webglean import *`: the client, its error type,
+# and the response TypedDicts imported above.
+__all__ = [
+    "WebGlean",
+    "WebGleanError",
+    "ScrapeResult",
+    "ScrapeMetadata",
+    "CrawlStatus",
+    "CrawlPage",
+    "MapResult",
+    "SearchResultItem",
+    "Monitor",
+    "MonitorDetail",
+    "MonitorChange",
+    "BatchStatus",
+    "BatchResultItem",
+]

webglean/_http.py ADDED Viewed

@@ -0,0 +1,40 @@
+from __future__ import annotations
+from typing import Any, Dict, Optional
+import httpx
+from .errors import WebGleanError
+class HttpClient:
+    """Thin synchronous JSON transport built on a single ``httpx.Client``.
+    Every request is resolved against ``base_url`` and carries an
+    ``Authorization: Bearer <api_key>`` header.
+    """
+    def __init__(self, base_url: str, api_key: str, timeout: float) -> None:
+        """Create the underlying ``httpx.Client``.
+        Args:
+            base_url: Root URL that request paths are resolved against.
+            api_key: Token sent as the bearer credential on every request.
+            timeout: Timeout handed to ``httpx.Client``.
+        """
+        self._client = httpx.Client(
+            base_url=base_url,
+            headers={"Authorization": f"Bearer {api_key}"},
+            timeout=timeout,
+        )
+    def request(
+        self,
+        method: str,
+        path: str,
+        json_body: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Send one request and return the decoded JSON object.
+        Args:
+            method: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
+            path: Request path, resolved against the client's base URL.
+            json_body: Optional payload sent as the JSON request body.
+        Returns:
+            The response body decoded as a JSON object.
+        Raises:
+            WebGleanError: If the body is not valid JSON, is not a JSON
+                object, the HTTP status is 400 or above, or the body
+                contains ``"success": false``.
+        """
+        res = self._client.request(method, path, json=json_body)
+        try:
+            body = res.json()
+        except ValueError as exc:
+            # Chain explicitly so the decode failure stays visible as the cause.
+            raise WebGleanError(
+                f"Invalid JSON response (HTTP {res.status_code})", res.status_code
+            ) from exc
+        if not isinstance(body, dict):
+            # Arrays, scalars and null have no .get(); report them as a client
+            # error instead of letting AttributeError escape to the caller.
+            if res.status_code >= 400:
+                raise WebGleanError(
+                    f"Request failed with HTTP {res.status_code}", res.status_code
+                )
+            raise WebGleanError(
+                f"Unexpected JSON response (HTTP {res.status_code})", res.status_code
+            )
+        if res.status_code >= 400 or body.get("success") is False:
+            raise WebGleanError(
+                body.get("error") or f"Request failed with HTTP {res.status_code}",
+                res.status_code,
+            )
+        return body
+    def close(self) -> None:
+        """Close the underlying ``httpx.Client``."""
+        self._client.close()

webglean/client.py ADDED Viewed

@@ -0,0 +1,213 @@
+from __future__ import annotations
+import os
+import time
+from typing import Any, Dict, List, Optional
+from ._http import HttpClient
+from .errors import WebGleanError
+from .types import (
+    BatchStatus,
+    CrawlStatus,
+    MapResult,
+    Monitor,
+    MonitorDetail,
+    ScrapeResult,
+    SearchResultItem,
+)
+# API root used when WebGlean is constructed without an explicit base_url.
+DEFAULT_BASE_URL = "https://api.webglean.dev"
+class WebGlean:
+    """Synchronous client for the WebGlean HTTP API.
+    Each public method issues one request through the internal HttpClient
+    and returns the decoded JSON payload as plain dicts/lists. Failed
+    requests surface as WebGleanError. Call ``close()`` or use the client
+    as a context manager to release the underlying HTTP client.
+    """
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        base_url: str = DEFAULT_BASE_URL,
+        timeout: float = 120.0,
+    ) -> None:
+        """Build a client.
+        Args:
+            api_key: API key; when falsy, the WEBGLEAN_API_KEY environment
+                variable is used instead.
+            base_url: API root that request paths are resolved against.
+            timeout: Per-request timeout passed to the HTTP layer.
+        Raises:
+            ValueError: If no key is given and WEBGLEAN_API_KEY is unset or empty.
+        """
+        resolved_key = api_key or os.environ.get("WEBGLEAN_API_KEY")
+        if not resolved_key:
+            raise ValueError(
+                "WebGlean: an api_key is required (pass it directly or set WEBGLEAN_API_KEY)"
+            )
+        self._http = HttpClient(base_url, resolved_key, timeout)
+    def close(self) -> None:
+        """Close the underlying HTTP client."""
+        self._http.close()
+    def __enter__(self) -> "WebGlean":
+        """Return the client itself so it can be bound by ``with ... as``."""
+        return self
+    def __exit__(self, *exc_info: Any) -> None:
+        """Close the client on leaving the ``with`` block; exceptions propagate."""
+        self.close()
+    # ── Scrape ──────────────────────────────────────────────────────────
+    def scrape(
+        self,
+        url: str,
+        format: str = "markdown",
+        only_main_content: bool = True,
+    ) -> ScrapeResult:
+        """Scrape one page via ``POST /v1/scrape``.
+        Args:
+            url: Page to scrape.
+            format: Requested output format; sent as the single entry of ``formats``.
+            only_main_content: Sent as ``onlyMainContent``.
+        Returns:
+            The ``data`` member of the response.
+        """
+        body = self._http.request(
+            "POST",
+            "/v1/scrape",
+            {"url": url, "formats": [format], "onlyMainContent": only_main_content},
+        )
+        return body["data"]
+    # ── Crawl ───────────────────────────────────────────────────────────
+    def crawl(
+        self,
+        url: str,
+        max_depth: int = 2,
+        max_pages: int = 10,
+        include_paths: Optional[List[str]] = None,
+        exclude_paths: Optional[List[str]] = None,
+    ) -> str:
+        """Start a crawl via ``POST /v1/crawl``.
+        Args:
+            url: Starting URL.
+            max_depth: Sent as ``maxDepth``.
+            max_pages: Sent as ``maxPages``.
+            include_paths: Sent as ``includePaths``; ``None`` becomes an empty list.
+            exclude_paths: Sent as ``excludePaths``; ``None`` becomes an empty list.
+        Returns:
+            The ``id`` member of the response, for use with ``get_crawl`` and
+            ``crawl_and_wait``.
+        """
+        body = self._http.request(
+            "POST",
+            "/v1/crawl",
+            {
+                "url": url,
+                "maxDepth": max_depth,
+                "maxPages": max_pages,
+                "includePaths": include_paths or [],
+                "excludePaths": exclude_paths or [],
+            },
+        )
+        return body["id"]
+    def get_crawl(self, crawl_id: str) -> CrawlStatus:
+        """Fetch a crawl's current state via ``GET /v1/crawl/{crawl_id}``."""
+        body = self._http.request("GET", f"/v1/crawl/{crawl_id}")
+        return body["data"]
+    def crawl_and_wait(
+        self, crawl_id: str, poll_interval: float = 2.0, timeout: float = 600.0
+    ) -> CrawlStatus:
+        """Poll an already-started crawl until it reports "done" or "failed".
+        This does not start a crawl; pass the id returned by ``crawl``. A
+        "failed" crawl is returned like a finished one, not raised.
+        Args:
+            crawl_id: Id of the crawl to wait for.
+            poll_interval: Seconds between status requests.
+            timeout: Overall time budget in seconds.
+        Raises:
+            WebGleanError: With status 0 if the crawl has not finished in time.
+        """
+        return self._poll_until(
+            lambda: self.get_crawl(crawl_id),
+            lambda status: status["status"] in ("done", "failed"),
+            poll_interval,
+            timeout,
+        )
+    # ── Extract ─────────────────────────────────────────────────────────
+    def extract(
+        self,
+        url: str,
+        schema: Optional[Dict[str, Any]] = None,
+        prompt: Optional[str] = None,
+    ) -> Any:
+        """Extract data from a page via ``POST /v1/extract``.
+        Args:
+            url: Page to extract from.
+            schema: Optional schema dict, forwarded as ``schema``.
+            prompt: Optional instruction text, forwarded as ``prompt``.
+        Returns:
+            The ``data`` member of the response.
+        Raises:
+            ValueError: If both ``schema`` and ``prompt`` are empty or ``None``.
+        """
+        if not schema and not prompt:
+            raise ValueError("WebGlean.extract: provide at least one of schema, prompt")
+        body = self._http.request(
+            "POST", "/v1/extract", {"url": url, "schema": schema, "prompt": prompt}
+        )
+        return body["data"]
+    # ── Map ─────────────────────────────────────────────────────────────
+    def map(self, url: str, max_urls: int = 100, search: Optional[str] = None) -> MapResult:
+        """List a site's links via ``POST /v1/map``.
+        Unlike every other endpoint, /v1/map's response is not wrapped in
+        `data`; ``links`` and ``total`` are read from the top level.
+        Args:
+            url: Site to map.
+            max_urls: Sent as ``maxUrls``.
+            search: Optional value sent as ``search``.
+        """
+        body = self._http.request(
+            "POST", "/v1/map", {"url": url, "maxUrls": max_urls, "search": search}
+        )
+        return {"links": body["links"], "total": body["total"]}
+    # ── Search ──────────────────────────────────────────────────────────
+    def search(
+        self,
+        query: str,
+        num_results: int = 5,
+        country: Optional[str] = None,
+        lang: Optional[str] = None,
+    ) -> List[SearchResultItem]:
+        """Run a search via ``POST /v1/search`` and return its result items.
+        A result item can have markdown=None + error set for a page that
+        individually failed to scrape, even though the overall call
+        succeeded — that's returned as-is, not raised.
+        Args:
+            query: Search query.
+            num_results: Sent as ``numResults``.
+            country: Optional value sent as ``country``.
+            lang: Optional value sent as ``lang``.
+        """
+        body = self._http.request(
+            "POST",
+            "/v1/search",
+            {"query": query, "numResults": num_results, "country": country, "lang": lang},
+        )
+        return body["data"]
+    # ── Monitor ─────────────────────────────────────────────────────────
+    def create_monitor(
+        self, url: str, interval: str = "daily", webhook_url: Optional[str] = None
+    ) -> Monitor:
+        """Create a monitor via ``POST /v1/monitor``.
+        Args:
+            url: Page to monitor.
+            interval: Sent as ``interval``.
+            webhook_url: Optional value sent as ``webhookUrl``.
+        Returns:
+            The ``data`` member of the response.
+        """
+        body = self._http.request(
+            "POST",
+            "/v1/monitor",
+            {"url": url, "interval": interval, "webhookUrl": webhook_url},
+        )
+        return body["data"]
+    def list_monitors(self) -> List[Monitor]:
+        """Return the ``data`` member of ``GET /v1/monitor``."""
+        body = self._http.request("GET", "/v1/monitor")
+        return body["data"]
+    def get_monitor(self, monitor_id: str) -> MonitorDetail:
+        """Return the ``data`` member of ``GET /v1/monitor/{monitor_id}``."""
+        body = self._http.request("GET", f"/v1/monitor/{monitor_id}")
+        return body["data"]
+    def delete_monitor(self, monitor_id: str) -> None:
+        """Issue ``DELETE /v1/monitor/{monitor_id}``; the response body is discarded."""
+        self._http.request("DELETE", f"/v1/monitor/{monitor_id}")
+    # ── Batch scrape ────────────────────────────────────────────────────
+    def batch_scrape(
+        self,
+        urls: Optional[List[str]] = None,
+        items: Optional[List[Dict[str, Any]]] = None,
+        format: str = "markdown",
+        only_main_content: bool = True,
+    ) -> str:
+        """Start a batch scrape via ``POST /v1/batch/scrape``.
+        Args:
+            urls: URLs to scrape; sent as ``urls`` when ``items`` is empty.
+            items: Per-URL dicts, sent as ``items``. When both arguments are
+                given, ``items`` wins and ``urls`` is not sent.
+            format: Sent as ``format``.
+            only_main_content: Sent as ``onlyMainContent``.
+        Returns:
+            The ``id`` member of the response, for use with ``get_batch`` and
+            ``batch_scrape_and_wait``.
+        Raises:
+            ValueError: If both ``urls`` and ``items`` are empty or ``None``.
+        """
+        if not urls and not items:
+            raise ValueError("WebGlean.batch_scrape: provide either urls or items")
+        payload: Dict[str, Any] = {"format": format, "onlyMainContent": only_main_content}
+        if items:
+            payload["items"] = items
+        else:
+            payload["urls"] = urls
+        body = self._http.request("POST", "/v1/batch/scrape", payload)
+        return body["id"]
+    def get_batch(self, batch_id: str) -> BatchStatus:
+        """Fetch a batch's state via ``GET /v1/batch/scrape/{batch_id}``.
+        The status fields are read from the top level of the response and
+        copied into a new dict; any other response keys are dropped.
+        """
+        body = self._http.request("GET", f"/v1/batch/scrape/{batch_id}")
+        return {
+            "status": body["status"],
+            "total": body["total"],
+            "completed": body["completed"],
+            "failed": body["failed"],
+            "creditsUsed": body["creditsUsed"],
+            "results": body["results"],
+        }
+    def batch_scrape_and_wait(
+        self, batch_id: str, poll_interval: float = 2.0, timeout: float = 1200.0
+    ) -> BatchStatus:
+        """Poll an already-started batch until its status is "done".
+        This does not start a batch; pass the id returned by ``batch_scrape``.
+        Args:
+            batch_id: Id of the batch to wait for.
+            poll_interval: Seconds between status requests.
+            timeout: Overall time budget in seconds.
+        Raises:
+            WebGleanError: With status 0 if the batch has not finished in time.
+        """
+        return self._poll_until(
+            lambda: self.get_batch(batch_id),
+            lambda status: status["status"] == "done",
+            poll_interval,
+            timeout,
+        )
+    # ── Internal ────────────────────────────────────────────────────────
+    def _poll_until(self, fetch_status, is_done, poll_interval: float, timeout: float):
+        """Call ``fetch_status`` repeatedly until ``is_done`` accepts its result.
+        Args:
+            fetch_status: Zero-argument callable returning the current status.
+            is_done: Predicate applied to each status; truthy ends the wait.
+            poll_interval: Seconds to sleep between attempts.
+            timeout: Overall time budget in seconds, measured with a monotonic clock.
+        Returns:
+            The first status accepted by ``is_done``.
+        Raises:
+            WebGleanError: With status 0 once the deadline has passed without
+                an accepted status.
+        """
+        deadline = time.monotonic() + timeout
+        while True:
+            status = fetch_status()
+            if is_done(status):
+                return status
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                raise WebGleanError(f"Timed out after {timeout}s waiting for job to finish", 0)
+            # Cap the sleep at the time left so the wait cannot overshoot the
+            # deadline by up to a whole poll_interval; one last check still
+            # runs right at the deadline.
+            time.sleep(min(poll_interval, remaining))

webglean/errors.py ADDED Viewed

@@ -0,0 +1,10 @@
+class WebGleanError(Exception):
+    """Error raised by the WebGlean client.
+    Raised when a response has an HTTP status of 400 or above, when the
+    response body reports ``success: false`` or is not valid JSON, and for
+    client-side polling timeouts (status 0).
+    Attributes:
+        status: HTTP status code of the failed response, or 0 for a
+            polling timeout.
+    """
+    def __init__(self, message: str, status: int) -> None:
+        """Store ``message`` as the exception text and keep ``status`` on the instance."""
+        super().__init__(message)
+        self.status = status
+    def __repr__(self) -> str:
+        """Return a debug representation that names the concrete class."""
+        # type(self).__name__ keeps the repr accurate for subclasses.
+        return f"{type(self).__name__}(status={self.status!r}, message={str(self)!r})"

webglean/py.typed ADDED Viewed

File without changes

webglean/types.py ADDED Viewed

@@ -0,0 +1,92 @@
+from __future__ import annotations
+from typing import Any, Dict, List, Optional, TypedDict
+class ScrapeMetadata(TypedDict, total=False):
+    """Page metadata carried in the ``metadata`` field of ScrapeResult and CrawlPage.
+    Declared with ``total=False``, so any key may be absent."""
+    title: Optional[str]
+    description: Optional[str]
+    url: Optional[str]
+    statusCode: Optional[int]
+class ScrapeResult(TypedDict):
+    """Return type of ``WebGlean.scrape`` (the ``data`` member of the response)."""
+    # NOTE(review): all three content keys are declared required, yet scrape()
+    # requests a single format — confirm the API always returns all of them.
+    markdown: str
+    html: str
+    text: str
+    metadata: ScrapeMetadata
+class CrawlPage(TypedDict):
+    """One entry of ``CrawlStatus.pages``: the ScrapeResult fields plus the page ``url``."""
+    url: str
+    markdown: str
+    html: str
+    text: str
+    metadata: ScrapeMetadata
+class CrawlStatus(TypedDict):
+    """Return type of ``WebGlean.get_crawl`` and ``WebGlean.crawl_and_wait``."""
+    id: str
+    url: str
+    status: str  # "pending" | "processing" | "done" | "failed"
+    pagesCrawled: int
+    maxPages: int
+    creditsUsed: int
+    createdAt: str
+    completedAt: Optional[str]
+    pages: List[CrawlPage]
+class MapResult(TypedDict):
+    """Return type of ``WebGlean.map``, built from the response's top-level keys."""
+    links: List[str]
+    total: int
+class SearchResultItem(TypedDict):
+    """One element of the list returned by ``WebGlean.search``."""
+    url: str
+    title: str
+    snippet: str
+    markdown: Optional[str]
+    """None if this specific page failed to scrape — check `error` in that case."""
+    error: Optional[str]
+class Monitor(TypedDict):
+    """Monitor record returned by ``WebGlean.create_monitor`` and listed by ``list_monitors``."""
+    id: str
+    url: str
+    interval: str  # "hourly" | "daily" | "weekly"
+    status: str  # "active" | "paused" | "cancelled"
+    createdAt: str
+    createdAt: str
+class MonitorChange(TypedDict):
+    id: str
+    detectedAt: str
+    snapshot: str
+class MonitorDetail(Monitor):
+    """Return type of ``WebGlean.get_monitor``: the Monitor fields plus ``changes``."""
+    changes: List[MonitorChange]
+class BatchResultItem(TypedDict, total=False):
+    """One entry of ``BatchStatus.results``.
+    Declared with ``total=False``, so any key may be absent."""
+    id: str
+    """Only present if you supplied an id via `items`."""
+    url: str
+    status: str  # "pending" | "processing" | "done" | "failed"
+    data: ScrapeResult
+    """Only present when status is "done"."""
+    error: str
+    """Only present when status is "failed"."""
+class BatchStatus(TypedDict):
+    """Return type of ``WebGlean.get_batch`` and ``WebGlean.batch_scrape_and_wait``."""
+    status: str  # "pending" | "processing" | "done"
+    total: int
+    completed: int
+    failed: int
+    creditsUsed: int
+    results: List[BatchResultItem]
+# Alias for one batch item dict; the trailing comment gives its intended keys.
+# NOTE(review): presumably the element type of batch_scrape(items=...), which
+# is annotated as List[Dict[str, Any]] rather than with this alias — confirm.
+BatchScrapeItem = Dict[str, Any]  # {"id": Optional[str], "url": str}

webglean-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,65 @@
+Metadata-Version: 2.4
+Name: webglean
+Version: 0.1.0
+Summary: Official Python client for the WebGlean API
+Project-URL: Homepage, https://webglean.dev/docs/sdks
+Project-URL: Documentation, https://webglean.dev/docs/sdks
+Project-URL: Repository, https://github.com/qubomax/webglean
+Project-URL: Issues, https://github.com/qubomax/webglean/issues
+License: MIT
+License-File: LICENSE
+Keywords: ai-agents,crawler,llm,markdown,rag,scraping,web-scraping,webglean
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.9
+Requires-Dist: httpx>=0.27
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: respx>=0.21; extra == 'dev'
+Description-Content-Type: text/markdown
+# webglean
+Official Python client for the [WebGlean](https://webglean.dev) API.
+```bash
+pip install webglean
+```
+```python
+from webglean import WebGlean
+client = WebGlean(api_key="wg_your_key")  # or set WEBGLEAN_API_KEY
+result = client.scrape("https://example.com")
+print(result["markdown"])
+```
+`WebGlean` also works as a context manager (`with WebGlean(...) as client:`), which closes the underlying HTTP connection pool on exit.
+## Methods
+| Method | Endpoint |
+|---|---|
+| `scrape(url, ...)` | `POST /v1/scrape` |
+| `crawl(url, ...)` / `get_crawl(id)` / `crawl_and_wait(id, ...)` | `POST /v1/crawl`, `GET /v1/crawl/:id` |
+| `extract(url, schema=, prompt=)` | `POST /v1/extract` |
+| `map(url, ...)` | `POST /v1/map` |
+| `search(query, ...)` | `POST /v1/search` |
+| `create_monitor(url, ...)` / `list_monitors()` / `get_monitor(id)` / `delete_monitor(id)` | `POST /v1/monitor`, `GET /v1/monitor`, `GET /v1/monitor/:id`, `DELETE /v1/monitor/:id` |
+| `batch_scrape(urls=, items=, ...)` / `get_batch(id)` / `batch_scrape_and_wait(id, ...)` | `POST /v1/batch/scrape`, `GET /v1/batch/scrape/:id` |
+`crawl_and_wait` and `batch_scrape_and_wait` poll every 2s by default (`poll_interval`, `timeout` in seconds) until the job finishes, raising `WebGleanError` (status `0`) on timeout.
+## Errors
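+Any response with an HTTP status of 400 or above, a body containing `"success": false`, or a body that is not valid JSON raises `WebGleanError`, with `.status` set to the HTTP status code and the message taken from the API's `error` field when present.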
+## Development
+```bash
+python3 -m venv .venv && source .venv/bin/activate
+pip install -e ".[dev]"
+pytest
+```

webglean-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+webglean/__init__.py,sha256=xtU8AhX7lu5VCmMDmyvfuwnB_ropqoPum8m0vlFHbjU,552
+webglean/_http.py,sha256=MBs-m24v_FWccKMIusWZsfgZdQg4Wt3bnnYxvNSEgFU,1101
+webglean/client.py,sha256=jJgj6JOa4FzLHlD6dVXOWsIfo0IncTBeQhDXjntYidk,7927
+webglean/errors.py,sha256=j7vtxFO8Ar0wgjbP_oMnwhXaP918loykDKQzhh0VRHo,384
+webglean/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+webglean/types.py,sha256=hSl2VFw7ZQaLEuWHpq_OjJSESIgoGfHGARqmcHwqfF4,1937
+webglean-0.1.0.dist-info/METADATA,sha256=rsZ8_1ArZ3pPJni9_DuISk0zDZy9K0vbkvjg7EGCI1o,2267
+webglean-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+webglean-0.1.0.dist-info/licenses/LICENSE,sha256=CDaq1vSLNtc5piM2cqQaqQkopo6qAxURB9FKSewKJJg,1065
+webglean-0.1.0.dist-info/RECORD,,

webglean-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

webglean-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 WebGlean
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.