PyPI - crawlora - Versions diffs - 1.5.0.dev1__py3-none-any.whl - Mend

crawlora 1.5.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

crawlora/__init__.py +24 -0
crawlora/_pagination.py +44 -0
crawlora/_transport_sync.py +114 -0
crawlora/async_client.py +321 -0
crawlora/client.py +671 -0
crawlora/client.pyi +20710 -0
crawlora/operations.py +6784 -0
crawlora/py.typed +1 -0
crawlora-1.5.0.dev1.dist-info/METADATA +213 -0
crawlora-1.5.0.dev1.dist-info/RECORD +13 -0
crawlora-1.5.0.dev1.dist-info/WHEEL +5 -0
crawlora-1.5.0.dev1.dist-info/licenses/LICENSE +21 -0
crawlora-1.5.0.dev1.dist-info/top_level.txt +1 -0

crawlora/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+from .async_client import AsyncCrawloraClient
+from .client import (
+    VERSION,
+    CrawloraClient,
+    CrawloraClientError,
+    CrawloraError,
+    CrawloraNetworkError,
+    CrawloraServerError,
+)
+from .operations import GROUPS, OPERATION_COUNT, OPERATIONS, OperationId
+# Public API of the package: the async and sync clients, the error classes,
+# the operation tables, and the version constant, all imported above from the
+# async_client, client, and operations submodules.
+__all__ = [
+    "AsyncCrawloraClient",
+    "CrawloraClient",
+    "CrawloraError",
+    "CrawloraClientError",
+    "CrawloraServerError",
+    "CrawloraNetworkError",
+    "GROUPS",
+    "OPERATIONS",
+    "OPERATION_COUNT",
+    "OperationId",
+    "VERSION",
+]

crawlora/_pagination.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""Shared pagination helpers used by the sync and async clients.
+This module deliberately has no `.pyi` stub so type checkers read its inline
+annotations directly (the `client.pyi` stub shadows `client.py`).
+"""
+from __future__ import annotations
+from typing import Any, Mapping
+# Query-parameter names recognized as pagination controls, listed in the order
+# detect_page_param() tries them (so "page" wins when an operation has both).
+PAGE_PARAM_NAMES = ("page", "offset")
+def detect_page_param(operation: Mapping[str, Any]) -> str | None:
+    names = {parameter["name"] for parameter in operation.get("queryParams", [])}
+    for candidate in PAGE_PARAM_NAMES:
+        if candidate in names:
+            return candidate
+    return None
+def page_is_empty(response: Any) -> bool:
+    data = response
+    if isinstance(response, Mapping) and "data" in response:
+        data = response["data"]
+    if data is None:
+        return True
+    if isinstance(data, (list, tuple, dict, str)):
+        return len(data) == 0
+    return not data
+def default_start(page_param: str) -> int:
+    return 0 if page_param == "offset" else 1
+def default_items(response: Any) -> list[Any]:
+    """Default item extractor: the response's ``data`` list (Crawlora envelope),
+    or the response itself when it is already a list."""
+    if isinstance(response, Mapping) and isinstance(response.get("data"), list):
+        return list(response["data"])
+    if isinstance(response, list):
+        return list(response)
+    return []

crawlora/_transport_sync.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Keep-alive HTTP transport for the synchronous client (standard library only).
+Maintains a small pool of reusable connections per ``(scheme, host, port)`` so
+the sync client avoids a fresh TCP + TLS handshake on every request. Each
+request checks out its own connection, so the transport is safe to use from
+multiple threads (e.g. under ``max_concurrency``). This module is stub-free so
+type checkers read its inline annotations directly.
+The transport returns a lightweight response object exposing ``status``,
+``headers`` (a dict), and ``body`` (bytes) — the only fields the client reads.
+"""
+from __future__ import annotations
+import http.client
+import threading
+from dataclasses import dataclass
+from typing import Any, Mapping
+from urllib.parse import urlsplit
+from urllib.request import Request
+def _title_case(name: str) -> str:
+    return "-".join(part.capitalize() for part in name.split("-"))
+@dataclass(frozen=True)
+class _PooledResponse:
+    """Immutable response record returned by the keep-alive transport."""
+    # HTTP status code of the response.
+    status: int
+    # Response headers as a name -> value mapping.
+    headers: Mapping[str, str]
+    # Complete response body.
+    body: bytes
+class KeepAliveTransport:
+    """Connection-pooling transport. Drop-in for the urlopen transport: callable
+    as ``transport(request, timeout) -> response``."""
+    def __init__(self, max_per_host: int = 8) -> None:
+        self._lock = threading.Lock()
+        self._pools: dict[tuple, list[http.client.HTTPConnection]] = {}
+        self._max_per_host = max_per_host
+    def __call__(self, request: Request, timeout: float) -> _PooledResponse:
+        parts = urlsplit(request.full_url)
+        key = (parts.scheme, parts.hostname, parts.port)
+        path = parts.path or "/"
+        if parts.query:
+            path = f"{path}?{parts.query}"
+        method = request.get_method()
+        # Send canonical HTTP title-case header names (matching the urlopen
+        # transport's behavior), so receivers see e.g. "X-Api-Key".
+        headers = {_title_case(name): value for name, value in request.header_items()}
+        body = request.data
+        last_exc: Exception | None = None
+        for attempt in range(2):
+            conn = self._checkout(key, parts, timeout)
+            try:
+                conn.request(method, path, body=body, headers=headers)
+                response = conn.getresponse()
+                data = response.read()
+                result = _PooledResponse(response.status, dict(response.getheaders()), data)
+            except (http.client.HTTPException, ConnectionError, OSError) as exc:
+                # Likely a stale pooled connection the server already closed;
+                # discard it and retry once on a fresh connection.
+                last_exc = exc
+                self._close(conn)
+                if attempt == 1:
+                    raise
+                continue
+            if response.will_close:
+                self._close(conn)
+            else:
+                self._checkin(key, conn)
+            return result
+        raise last_exc if last_exc else RuntimeError("keep-alive transport failed")
+    def close(self) -> None:
+        with self._lock:
+            pools = list(self._pools.values())
+            self._pools.clear()
+        for pool in pools:
+            for conn in pool:
+                self._close(conn)
+    def _checkout(self, key: tuple, parts: Any, timeout: float) -> http.client.HTTPConnection:
+        with self._lock:
+            pool = self._pools.get(key)
+            if pool:
+                conn = pool.pop()
+                conn.timeout = timeout
+                return conn
+        return self._new(parts, timeout)
+    def _checkin(self, key: tuple, conn: http.client.HTTPConnection) -> None:
+        with self._lock:
+            pool = self._pools.setdefault(key, [])
+            if len(pool) < self._max_per_host:
+                pool.append(conn)
+                return
+        self._close(conn)
+    @staticmethod
+    def _new(parts: Any, timeout: float) -> http.client.HTTPConnection:
+        if parts.scheme == "https":
+            return http.client.HTTPSConnection(parts.hostname, parts.port or 443, timeout=timeout)
+        return http.client.HTTPConnection(parts.hostname, parts.port or 80, timeout=timeout)
+    @staticmethod
+    def _close(conn: http.client.HTTPConnection) -> None:
+        try:
+            conn.close()
+        except Exception:
+            pass

crawlora/async_client.py ADDED Viewed

@@ -0,0 +1,321 @@
+"""Asyncio client for the Crawlora API.
+Two transports:
+* When ``httpx`` is installed (``pip install crawlora[async]``) the client uses
+  ``httpx.AsyncClient`` for true non-blocking I/O with connection pooling.
+* Otherwise it falls back to running the synchronous client in a worker thread
+  via :func:`asyncio.to_thread`, keeping the base package dependency-free.
+Both paths reuse the synchronous client's request building, validation, retry,
+``Retry-After`` handling, error classification, and observability options, so
+behavior stays aligned with :class:`CrawloraClient`.
+Example::
+    client = AsyncCrawloraClient(api_key="...")
+    result = await client.bing.search(q="coffee")
+    async for item in client.paginate_items("ebay-seller-feedback", {"seller": "acme"}):
+        ...
+    await client.aclose()
+"""
+from __future__ import annotations
+import asyncio
+import io
+from typing import Any, AsyncIterator, Callable, Mapping
+from ._pagination import default_items, default_start, detect_page_param, page_is_empty
+from .client import (
+    CrawloraClient,
+    CrawloraNetworkError,
+    ResponseType,
+    _allowed_params,
+    _api_error_class,
+    _auth_headers,
+    _build_request,
+    _ensure_request_id,
+    _header_value,
+    _merge_headers,
+    _parse_response,
+    _run_after_response,
+    _run_before_request,
+    _validate_response_type,
+)
+from .operations import GROUPS, OPERATIONS
+try:  # optional dependency: pip install crawlora[async]
+    import httpx
+except ImportError:  # pragma: no cover - exercised only without httpx
+    httpx = None  # type: ignore[assignment]
+class _AsyncRateLimiter:
+    """Async client-side throttle: caps concurrency and spaces requests."""
+    def __init__(self, rps: float | None, concurrency: int | None) -> None:
+        self._interval = (1.0 / rps) if rps and rps > 0 else 0.0
+        self._sem = asyncio.Semaphore(concurrency) if concurrency and concurrency > 0 else None
+        self._lock = asyncio.Lock()
+        self._next = 0.0
+    async def __aenter__(self) -> "_AsyncRateLimiter":
+        if self._sem is not None:
+            await self._sem.acquire()
+        if self._interval:
+            async with self._lock:
+                now = asyncio.get_running_loop().time()
+                wait = max(0.0, self._next - now)
+                self._next = max(now, self._next) + self._interval
+            if wait > 0:
+                await asyncio.sleep(wait)
+        return self
+    async def __aexit__(self, *_exc: Any) -> None:
+        if self._sem is not None:
+            self._sem.release()
+class AsyncCrawloraClient:
+    """Asyncio front end that wraps a synchronous :class:`CrawloraClient`.
+    The wrapped client is built from ``**kwargs`` and supplies the settings
+    read here (base URL, credentials, retries, hooks, rate limits). Requests go
+    through ``httpx.AsyncClient`` when ``httpx`` imported successfully and
+    through the synchronous client in a worker thread otherwise. One attribute
+    per entry in ``GROUPS`` exposes that group's operations as coroutines.
+    """
+    def __init__(self, **kwargs: Any) -> None:
+        self._client = CrawloraClient(**kwargs)
+        # None means httpx is unavailable; request() then uses asyncio.to_thread.
+        self._httpx = httpx.AsyncClient() if httpx is not None else None
+        c = self._client
+        # Build a limiter only when a rate limit or a concurrency cap is configured.
+        self._limiter = _AsyncRateLimiter(c.rate_limit, c.max_concurrency) if (c.rate_limit or c.max_concurrency) else None
+        for group_name, operations in GROUPS.items():
+            setattr(self, group_name, _AsyncOperationGroup(self, operations))
+    @property
+    def sync_client(self) -> CrawloraClient:
+        """The underlying synchronous client (holds the shared configuration)."""
+        return self._client
+    @property
+    def uses_httpx(self) -> bool:
+        """True when requests are sent with ``httpx`` rather than a worker thread."""
+        return self._httpx is not None
+    async def aclose(self) -> None:
+        """Close the ``httpx`` client, if one was created."""
+        if self._httpx is not None:
+            await self._httpx.aclose()
+    async def __aenter__(self) -> "AsyncCrawloraClient":
+        return self
+    async def __aexit__(self, *_exc: Any) -> None:
+        await self.aclose()
+    async def operation(
+        self,
+        operation_id: str,
+        params: Mapping[str, Any] | None = None,
+        *,
+        response_type: ResponseType = "auto",
+        timeout: float | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int | None = None,
+        retry_predicate: Callable[[int, BaseException | None], bool] | None = None,
+    ) -> Any:
+        """Alias of :meth:`request`; forwards every argument unchanged."""
+        return await self.request(operation_id, params, response_type=response_type, timeout=timeout, headers=headers, retries=retries, retry_predicate=retry_predicate)
+    async def request(
+        self,
+        operation_id: str,
+        params: Mapping[str, Any] | None = None,
+        *,
+        response_type: ResponseType = "auto",
+        timeout: float | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int | None = None,
+        retry_predicate: Callable[[int, BaseException | None], bool] | None = None,
+    ) -> Any:
+        """Run one operation, retrying retryable ``CrawloraError`` failures.
+        Without ``httpx`` the whole call is delegated to the synchronous
+        client's ``request`` in a worker thread. With ``httpx`` the operation
+        is looked up in ``OPERATIONS`` (``ValueError`` when unknown) and sent
+        through :meth:`_send` until it succeeds, the error is not retryable, or
+        the retry budget (``retries``, defaulting to the client's setting) is
+        used up.
+        """
+        if self._httpx is None:
+            return await asyncio.to_thread(
+                lambda: self._client.request(
+                    operation_id, params, response_type=response_type, timeout=timeout,
+                    headers=headers, retries=retries, retry_predicate=retry_predicate,
+                )
+            )
+        operation: Any = OPERATIONS.get(operation_id)
+        if operation is None:
+            raise ValueError(f"unknown Crawlora operation: {operation_id}")
+        response_type = _validate_response_type(response_type)
+        c = self._client
+        c._log({"event": "request", "operation": operation_id})
+        max_retries = c.retries if retries is None else max(0, int(retries))
+        import uuid
+        # One key per request() call, generated before the loop and passed to
+        # every attempt, so retries of the same POST/PATCH share it.
+        idempotency_key = uuid.uuid4().hex if c.idempotency_keys and operation["method"] in ("POST", "PATCH") else None
+        attempt = 0
+        while True:
+            try:
+                return await self._send(operation, dict(params or {}), response_type, timeout, headers, idempotency_key)
+            except Exception as exc:  # noqa: BLE001 - re-raised unless retryable
+                from .client import CrawloraError
+                # Only CrawloraError is ever retried. A caller-supplied
+                # retry_predicate replaces the client's own _is_retryable check.
+                retryable = retry_predicate(exc.status, exc) if (isinstance(exc, CrawloraError) and retry_predicate is not None) else (isinstance(exc, CrawloraError) and c._is_retryable(exc.status, exc))
+                if not isinstance(exc, CrawloraError) or attempt >= max_retries or not retryable:
+                    raise
+                attempt += 1
+                # The delay is computed by the sync client from the attempt
+                # number and the failed response's headers.
+                delay = c._compute_retry_delay(attempt, exc.headers)
+                c._log({"event": "retry", "operation": operation_id, "attempt": attempt, "status": exc.status, "delay": delay})
+                if c.on_retry is not None:
+                    c.on_retry(attempt, exc, delay)
+                if delay > 0:
+                    await asyncio.sleep(delay)
+    async def _send(
+        self,
+        operation: Mapping[str, Any],
+        params: dict[str, Any],
+        response_type: ResponseType,
+        timeout: float | None,
+        headers: Mapping[str, str] | None,
+        idempotency_key: str | None = None,
+    ) -> Any:
+        """Perform a single HTTP attempt with ``httpx`` and decode the result.
+        Returns the parsed body (or an ``io.BytesIO`` for a successful
+        ``"stream"`` request). Raises ``CrawloraNetworkError`` for ``httpx``
+        timeouts and transport errors, ``CrawloraError`` when the body is not
+        valid JSON, and the class chosen by ``_api_error_class`` for a
+        non-2xx status.
+        """
+        c = self._client
+        url, body, body_headers = _build_request(c.base_url, operation, params)
+        # NOTE(review): presumably later sources override earlier ones, so
+        # per-call headers win; confirm against _merge_headers in client.py.
+        request_headers = _merge_headers(
+            c.headers,
+            _auth_headers(operation.get("security", []), c.api_key, c.jwt_token),
+            {"User-Agent": c.user_agent} if c.user_agent else {},
+            body_headers,
+            headers or {},
+        )
+        req_id = _ensure_request_id(request_headers) if c.request_id else _header_value(request_headers, "x-request-id") or None
+        # A caller-provided Idempotency-Key header is never overwritten.
+        if idempotency_key and not _header_value(request_headers, "idempotency-key"):
+            request_headers["Idempotency-Key"] = idempotency_key
+        if c.before_request:
+            ctx = {"operation": operation.get("id"), "method": operation["method"], "url": url, "headers": request_headers}
+            _run_before_request(c.before_request, ctx)
+            # Read both back from ctx so a hook can replace the URL or headers.
+            url, request_headers = ctx["url"], ctx["headers"]
+        request_timeout = timeout if timeout is not None else c.timeout
+        try:
+            if self._limiter is not None:
+                # The limiter is held for the duration of the HTTP call.
+                async with self._limiter:
+                    response = await self._httpx.request(operation["method"], url, content=body, headers=request_headers, timeout=request_timeout)
+            else:
+                response = await self._httpx.request(operation["method"], url, content=body, headers=request_headers, timeout=request_timeout)
+        except httpx.TimeoutException as exc:
+            raise CrawloraNetworkError("Crawlora request timed out", request_id=req_id, cause=exc) from exc
+        except httpx.HTTPError as exc:
+            raise CrawloraNetworkError("Crawlora transport error", request_id=req_id, cause=exc) from exc
+        raw = bytes(response.content)
+        status = response.status_code
+        resp_headers = dict(response.headers)
+        is_error = status < 200 or status >= 300
+        # "stream" hands back the already-buffered body as a file-like object.
+        if response_type == "stream" and not is_error:
+            return io.BytesIO(raw)
+        # An error response to a "stream" request is parsed in "auto" mode so
+        # its code and message can be extracted below.
+        parse_mode = "auto" if response_type == "stream" else response_type
+        import json
+        raw_body = raw.decode(errors="replace")
+        try:
+            parsed = _parse_response(raw, _header_value(resp_headers, "content-type"), parse_mode)
+        except json.JSONDecodeError as exc:
+            from .client import CrawloraError
+            raise CrawloraError("Crawlora JSON parse error", status=status, raw_body=raw_body, headers=resp_headers, request_id=req_id, cause=exc) from exc
+        if is_error:
+            # Error details come from the "code" and "msg" fields when the
+            # parsed body is a dict; otherwise the message is "HTTP <status>".
+            code = parsed.get("code") if isinstance(parsed, dict) else None
+            raw_msg = parsed.get("msg") if isinstance(parsed, dict) else None
+            message = str(raw_msg) if raw_msg else f"HTTP {status}"
+            error_class = _api_error_class(status)
+            raise error_class(message, status=status, code=code, body=parsed, raw_body=raw_body, headers=resp_headers, request_id=req_id)
+        if c.after_response:
+            parsed = _run_after_response(c.after_response, operation.get("id"), status, resp_headers, parsed)
+        return parsed
+    async def paginate(
+        self,
+        operation_id: str,
+        params: Mapping[str, Any] | None = None,
+        *,
+        page_param: str | None = None,
+        cursor_param: str | None = None,
+        next_cursor: Callable[[Any], Any] | None = None,
+        start: Any = None,
+        step: int = 1,
+        max_pages: int | None = None,
+        response_type: ResponseType = "auto",
+        timeout: float | None = None,
+        headers: Mapping[str, str] | None = None,
+    ) -> AsyncIterator[Any]:
+        """Async iterator over pages. Mirrors :meth:`CrawloraClient.paginate`.
+        Cursor mode (``cursor_param`` together with ``next_cursor``) feeds the
+        value returned by ``next_cursor(response)`` into the next request and
+        stops when that value is falsy. Otherwise the page/offset parameter
+        (given or auto-detected) is advanced by ``step`` until a page is empty.
+        ``max_pages`` caps the number of pages in both modes. Because this is
+        an async generator, the ``ValueError`` checks run on first iteration.
+        """
+        operation: Any = OPERATIONS.get(operation_id)
+        if operation is None:
+            raise ValueError(f"unknown Crawlora operation: {operation_id}")
+        base_params = dict(params or {})
+        if cursor_param or next_cursor:
+            if not (cursor_param and next_cursor):
+                raise ValueError("cursor pagination requires both cursor_param and next_cursor")
+            if cursor_param not in {p["name"] for p in operation.get("queryParams", [])}:
+                raise ValueError(f"cursor_param {cursor_param!r} is not a query parameter of operation {operation_id}")
+            cursor = start
+            fetched = 0
+            while max_pages is None or fetched < max_pages:
+                page_params = dict(base_params)
+                # The first request omits the cursor unless ``start`` was given.
+                if cursor is not None:
+                    page_params[cursor_param] = cursor
+                response = await self.request(operation_id, page_params, response_type=response_type, timeout=timeout, headers=headers)
+                yield response
+                fetched += 1
+                cursor = next_cursor(response)
+                if not cursor:
+                    break
+            return
+        page_param = page_param or detect_page_param(operation)
+        if not page_param:
+            raise ValueError(f"operation {operation_id} has no page or offset query parameter to paginate")
+        page_value = default_start(page_param) if start is None else start
+        fetched = 0
+        while max_pages is None or fetched < max_pages:
+            page_params = {**base_params, page_param: page_value}
+            response = await self.request(operation_id, page_params, response_type=response_type, timeout=timeout, headers=headers)
+            # The empty page that ends the iteration is still yielded.
+            yield response
+            fetched += 1
+            if page_is_empty(response):
+                break
+            page_value += step
+    async def paginate_items(
+        self,
+        operation_id: str,
+        params: Mapping[str, Any] | None = None,
+        *,
+        items: Callable[[Any], Any] | None = None,
+        **kwargs: Any,
+    ) -> AsyncIterator[Any]:
+        """Async iterator over individual items across pages.
+        ``items`` extracts the items from each page and defaults to
+        ``default_items``; the remaining keyword arguments are passed to
+        :meth:`paginate`.
+        """
+        extract = items or default_items
+        async for page in self.paginate(operation_id, params, **kwargs):
+            for item in extract(page):
+                yield item
+class _AsyncOperationGroup:
+    def __init__(self, client: AsyncCrawloraClient, operations: Mapping[str, str]) -> None:
+        self._client = client
+        self._operations = operations
+    def __getattr__(self, name: str) -> Callable[..., Any]:
+        operation_id = self._operations.get(name)
+        if operation_id is None:
+            raise AttributeError(name)
+        allowed = _allowed_params(operation_id)
+        async def call(**params: Any) -> Any:
+            response_type = params.pop("_response_type", "auto")
+            timeout = params.pop("_timeout", None)
+            headers = params.pop("_headers", None)
+            unknown = set(params) - allowed
+            if unknown:
+                raise TypeError(f"unexpected parameter(s) for {operation_id}: {', '.join(sorted(unknown))}")
+            return await self._client.request(
+                operation_id, params, response_type=response_type, timeout=timeout, headers=headers
+            )
+        return call