crawlora 1.5.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
crawlora/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ from .async_client import AsyncCrawloraClient
2
+ from .client import (
3
+ VERSION,
4
+ CrawloraClient,
5
+ CrawloraClientError,
6
+ CrawloraError,
7
+ CrawloraNetworkError,
8
+ CrawloraServerError,
9
+ )
10
+ from .operations import GROUPS, OPERATION_COUNT, OPERATIONS, OperationId
11
+
12
+ __all__ = [
13
+ "AsyncCrawloraClient",
14
+ "CrawloraClient",
15
+ "CrawloraError",
16
+ "CrawloraClientError",
17
+ "CrawloraServerError",
18
+ "CrawloraNetworkError",
19
+ "GROUPS",
20
+ "OPERATIONS",
21
+ "OPERATION_COUNT",
22
+ "OperationId",
23
+ "VERSION",
24
+ ]
@@ -0,0 +1,44 @@
1
+ """Shared pagination helpers used by the sync and async clients.
2
+
3
+ This module deliberately has no `.pyi` stub so type checkers read its inline
4
+ annotations directly (the `client.pyi` stub shadows `client.py`).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Mapping
10
+
11
+ PAGE_PARAM_NAMES = ("page", "offset")
12
+
13
+
14
+ def detect_page_param(operation: Mapping[str, Any]) -> str | None:
15
+ names = {parameter["name"] for parameter in operation.get("queryParams", [])}
16
+ for candidate in PAGE_PARAM_NAMES:
17
+ if candidate in names:
18
+ return candidate
19
+ return None
20
+
21
+
22
def page_is_empty(response: Any) -> bool:
    """Report whether a page response carries no content.

    When ``response`` is a mapping that has a ``"data"`` key, that value is
    inspected; otherwise the response itself is. ``None`` is empty, the
    built-in sized containers and strings are empty at length zero, and any
    other value is judged by its truthiness.
    """
    if isinstance(response, Mapping) and "data" in response:
        payload = response["data"]
    else:
        payload = response

    if payload is None:
        return True
    if isinstance(payload, (list, tuple, dict, str)):
        return not len(payload)
    return not payload
31
+
32
+
33
def default_start(page_param: str) -> int:
    """Return the first counter value for ``page_param``: ``0`` for
    ``"offset"``, ``1`` for anything else."""
    if page_param == "offset":
        return 0
    return 1
35
+
36
+
37
def default_items(response: Any) -> list[Any]:
    """Default item extractor.

    Returns a shallow copy of the response's ``data`` list when the response
    is a mapping (the Crawlora envelope), a shallow copy of the response
    itself when it is already a list, and an empty list in every other case.
    """
    candidate = response.get("data") if isinstance(response, Mapping) else response
    return list(candidate) if isinstance(candidate, list) else []
@@ -0,0 +1,114 @@
1
+ """Keep-alive HTTP transport for the synchronous client (standard library only).
2
+
3
+ Maintains a small pool of reusable connections per ``(scheme, host, port)`` so
4
+ the sync client avoids a fresh TCP + TLS handshake on every request. Each
5
+ request checks out its own connection, so the transport is safe to use from
6
+ multiple threads (e.g. under ``max_concurrency``). This module is stub-free so
7
+ type checkers read its inline annotations directly.
8
+
9
+ The transport returns a lightweight response object exposing ``status``,
10
+ ``headers`` (a dict), and ``body`` (bytes) — the only fields the client reads.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import http.client
16
+ import threading
17
+ from dataclasses import dataclass
18
+ from typing import Any, Mapping
19
+ from urllib.parse import urlsplit
20
+ from urllib.request import Request
21
+
22
+
23
def _title_case(name: str) -> str:
    """Capitalize each hyphen-separated segment of a header name
    (``"x-api-key"`` becomes ``"X-Api-Key"``)."""
    segments = name.split("-")
    return "-".join(map(str.capitalize, segments))
25
+
26
+
27
@dataclass(frozen=True)
class _PooledResponse:
    """Immutable response value produced by ``KeepAliveTransport``."""

    status: int  # status taken from the connection's response object
    headers: Mapping[str, str]  # response headers, built via dict(getheaders())
    body: bytes  # payload returned by response.read()
32
+
33
+
34
class KeepAliveTransport:
    """Connection-pooling transport. Drop-in for the urlopen transport: callable
    as ``transport(request, timeout) -> response``.

    Idle connections are kept in per-``(scheme, host, port)`` lists holding at
    most ``max_per_host`` entries. The pool dictionary is only touched while
    holding ``self._lock``; a connection is removed from the pool for the
    duration of a request and returned afterwards.
    """

    def __init__(self, max_per_host: int = 8) -> None:
        """Create an empty pool that keeps up to ``max_per_host`` idle
        connections per key."""
        self._lock = threading.Lock()
        self._pools: dict[tuple, list[http.client.HTTPConnection]] = {}
        self._max_per_host = max_per_host

    def __call__(self, request: Request, timeout: float) -> "_PooledResponse":
        """Send ``request`` and return its fully read response.

        A failure on a connection that was reused from the pool is treated as
        a stale keep-alive socket: the connection is discarded and the request
        is sent once more on a brand-new connection. A failure on a connection
        that was already new is raised immediately, leaving any further
        retrying to the caller.

        Raises:
            http.client.HTTPException, OSError: when the exchange fails and no
                stale-connection retry applies, or the retry fails as well.
        """
        parts = urlsplit(request.full_url)
        key = (parts.scheme, parts.hostname, parts.port)
        path = parts.path or "/"
        if parts.query:
            path = f"{path}?{parts.query}"
        method = request.get_method()
        # Send canonical HTTP title-case header names (matching the urlopen
        # transport's behavior), so receivers see e.g. "X-Api-Key".
        headers = {_title_case(name): value for name, value in request.header_items()}
        body = request.data

        conn = self._checkout(key, parts, timeout)
        # A connection created by _new() has not connected yet (sock is None);
        # only a pooled one already owns a socket the server may have closed.
        reused = getattr(conn, "sock", None) is not None
        try:
            return self._exchange(key, conn, method, path, body, headers)
        except (http.client.HTTPException, OSError):
            if not reused:
                raise
        # Bypass the pool on the retry: its remaining idle connections may be
        # just as stale as the one that failed.
        return self._exchange(key, self._new(parts, timeout), method, path, body, headers)

    def _exchange(
        self,
        key: tuple,
        conn: http.client.HTTPConnection,
        method: str,
        path: str,
        body: Any,
        headers: Mapping[str, str],
    ) -> "_PooledResponse":
        """Run one request/response cycle on ``conn``.

        On success the connection is returned to the pool unless the response
        marks it as closing. On any failure the connection is closed before
        the exception propagates.
        """
        try:
            conn.request(method, path, body=body, headers=headers)
            response = conn.getresponse()
            data = response.read()
            result = _PooledResponse(response.status, dict(response.getheaders()), data)
        except BaseException:
            # The connection state is unknown after a failure; never reuse it.
            self._close(conn)
            raise
        if response.will_close:
            self._close(conn)
        else:
            self._checkin(key, conn)
        return result

    def close(self) -> None:
        """Close every idle pooled connection and empty the pool."""
        with self._lock:
            pools = list(self._pools.values())
            self._pools.clear()
        # Close outside the lock so slow socket teardown does not block others.
        for pool in pools:
            for conn in pool:
                self._close(conn)

    def _checkout(self, key: tuple, parts: Any, timeout: float) -> http.client.HTTPConnection:
        """Take an idle connection for ``key`` or create a new one.

        ``timeout`` is applied to a reused connection's live socket as well as
        to the connection object: ``http.client`` only reads
        ``HTTPConnection.timeout`` while connecting, so assigning the
        attribute alone would leave the previous request's timeout in force.
        """
        with self._lock:
            pool = self._pools.get(key)
            conn = pool.pop() if pool else None
        if conn is None:
            return self._new(parts, timeout)
        conn.timeout = timeout
        sock = getattr(conn, "sock", None)
        if sock is not None:
            try:
                sock.settimeout(timeout)
            except OSError:
                # The socket is unusable; replace the connection.
                self._close(conn)
                return self._new(parts, timeout)
        return conn

    def _checkin(self, key: tuple, conn: http.client.HTTPConnection) -> None:
        """Return ``conn`` to the pool, or close it when the pool is full."""
        with self._lock:
            pool = self._pools.setdefault(key, [])
            if len(pool) < self._max_per_host:
                pool.append(conn)
                return
        self._close(conn)

    @staticmethod
    def _new(parts: Any, timeout: float) -> http.client.HTTPConnection:
        """Build an unconnected connection for the split URL ``parts``
        (HTTPS for the ``https`` scheme, plain HTTP otherwise)."""
        if parts.scheme == "https":
            return http.client.HTTPSConnection(parts.hostname, parts.port or 443, timeout=timeout)
        return http.client.HTTPConnection(parts.hostname, parts.port or 80, timeout=timeout)

    @staticmethod
    def _close(conn: http.client.HTTPConnection) -> None:
        """Close ``conn``, ignoring errors."""
        try:
            conn.close()
        except Exception:
            # Deliberate best effort: the connection is being discarded, so a
            # failure while tearing it down must not mask the caller's result.
            pass
@@ -0,0 +1,321 @@
1
+ """Asyncio client for the Crawlora API.
2
+
3
+ Two transports:
4
+
5
+ * When ``httpx`` is installed (``pip install crawlora[async]``) the client uses
6
+ ``httpx.AsyncClient`` for true non-blocking I/O with connection pooling.
7
+ * Otherwise it falls back to running the synchronous client in a worker thread
8
+ via :func:`asyncio.to_thread`, keeping the base package dependency-free.
9
+
10
+ Both paths reuse the synchronous client's request building, validation, retry,
11
+ ``Retry-After`` handling, error classification, and observability options, so
12
+ behavior stays aligned with :class:`CrawloraClient`.
13
+
14
+ client = AsyncCrawloraClient(api_key="...")
15
+ result = await client.bing.search(q="coffee")
16
+ async for item in client.paginate_items("ebay-seller-feedback", {"seller": "acme"}):
17
+ ...
18
+ await client.aclose()
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import asyncio
24
+ import io
25
+ from typing import Any, AsyncIterator, Callable, Mapping
26
+
27
+ from ._pagination import default_items, default_start, detect_page_param, page_is_empty
28
+ from .client import (
29
+ CrawloraClient,
30
+ CrawloraNetworkError,
31
+ ResponseType,
32
+ _allowed_params,
33
+ _api_error_class,
34
+ _auth_headers,
35
+ _build_request,
36
+ _ensure_request_id,
37
+ _header_value,
38
+ _merge_headers,
39
+ _parse_response,
40
+ _run_after_response,
41
+ _run_before_request,
42
+ _validate_response_type,
43
+ )
44
+ from .operations import GROUPS, OPERATIONS
45
+
46
+ try: # optional dependency: pip install crawlora[async]
47
+ import httpx
48
+ except ImportError: # pragma: no cover - exercised only without httpx
49
+ httpx = None # type: ignore[assignment]
50
+
51
+
52
+ class _AsyncRateLimiter:
53
+ """Async client-side throttle: caps concurrency and spaces requests."""
54
+
55
+ def __init__(self, rps: float | None, concurrency: int | None) -> None:
56
+ self._interval = (1.0 / rps) if rps and rps > 0 else 0.0
57
+ self._sem = asyncio.Semaphore(concurrency) if concurrency and concurrency > 0 else None
58
+ self._lock = asyncio.Lock()
59
+ self._next = 0.0
60
+
61
+ async def __aenter__(self) -> "_AsyncRateLimiter":
62
+ if self._sem is not None:
63
+ await self._sem.acquire()
64
+ if self._interval:
65
+ async with self._lock:
66
+ now = asyncio.get_running_loop().time()
67
+ wait = max(0.0, self._next - now)
68
+ self._next = max(now, self._next) + self._interval
69
+ if wait > 0:
70
+ await asyncio.sleep(wait)
71
+ return self
72
+
73
+ async def __aexit__(self, *_exc: Any) -> None:
74
+ if self._sem is not None:
75
+ self._sem.release()
76
+
77
+
78
class AsyncCrawloraClient:
    """Asyncio client that shares its configuration with a wrapped
    :class:`CrawloraClient`.

    Requests go through ``httpx.AsyncClient`` when ``httpx`` could be
    imported; otherwise each request runs the synchronous client in a worker
    thread. One attribute per entry of ``GROUPS`` exposes that group's
    operations as awaitable callables.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Build the wrapped synchronous client from ``kwargs`` and set up the
        transport, the optional throttle and the operation groups."""
        sync = CrawloraClient(**kwargs)
        self._client = sync
        # ``httpx`` is None when the optional dependency is not installed.
        self._httpx = None if httpx is None else httpx.AsyncClient()
        if sync.rate_limit or sync.max_concurrency:
            self._limiter = _AsyncRateLimiter(sync.rate_limit, sync.max_concurrency)
        else:
            self._limiter = None
        for attr_name, group_operations in GROUPS.items():
            setattr(self, attr_name, _AsyncOperationGroup(self, group_operations))

    @property
    def sync_client(self) -> CrawloraClient:
        """The underlying synchronous client (holds the shared configuration)."""
        return self._client

    @property
    def uses_httpx(self) -> bool:
        """``True`` when requests are sent through ``httpx``."""
        return self._httpx is not None

    async def aclose(self) -> None:
        """Close the ``httpx`` client, if one was created."""
        if self._httpx is None:
            return
        await self._httpx.aclose()

    async def __aenter__(self) -> "AsyncCrawloraClient":
        return self

    async def __aexit__(self, *_exc: Any) -> None:
        await self.aclose()

    async def operation(
        self,
        operation_id: str,
        params: Mapping[str, Any] | None = None,
        *,
        response_type: ResponseType = "auto",
        timeout: float | None = None,
        headers: Mapping[str, str] | None = None,
        retries: int | None = None,
        retry_predicate: Callable[[int, BaseException | None], bool] | None = None,
    ) -> Any:
        """Alias of :meth:`request`; every argument is forwarded unchanged."""
        return await self.request(
            operation_id,
            params,
            response_type=response_type,
            timeout=timeout,
            headers=headers,
            retries=retries,
            retry_predicate=retry_predicate,
        )

    async def request(
        self,
        operation_id: str,
        params: Mapping[str, Any] | None = None,
        *,
        response_type: ResponseType = "auto",
        timeout: float | None = None,
        headers: Mapping[str, str] | None = None,
        retries: int | None = None,
        retry_predicate: Callable[[int, BaseException | None], bool] | None = None,
    ) -> Any:
        """Call one operation and return its parsed response.

        Without ``httpx`` the whole call is delegated to the synchronous
        client in a worker thread. With ``httpx`` the request is sent by
        :meth:`_send` and retried here: only ``CrawloraError`` failures are
        candidates, ``retry_predicate`` (when given) or the sync client's
        ``_is_retryable`` decides, and at most ``retries`` (default: the
        client's configured value) further attempts are made.

        Raises:
            ValueError: ``operation_id`` is not in ``OPERATIONS``.
        """
        if self._httpx is None:
            return await asyncio.to_thread(
                self._client.request,
                operation_id,
                params,
                response_type=response_type,
                timeout=timeout,
                headers=headers,
                retries=retries,
                retry_predicate=retry_predicate,
            )

        operation: Any = OPERATIONS.get(operation_id)
        if operation is None:
            raise ValueError(f"unknown Crawlora operation: {operation_id}")
        response_type = _validate_response_type(response_type)
        cfg = self._client
        cfg._log({"event": "request", "operation": operation_id})
        if retries is None:
            retry_budget = cfg.retries
        else:
            retry_budget = max(0, int(retries))
        import uuid

        # One key for the whole call, so every retry carries the same value.
        if cfg.idempotency_keys and operation["method"] in ("POST", "PATCH"):
            idempotency_key = uuid.uuid4().hex
        else:
            idempotency_key = None

        attempts_made = 0
        while True:
            try:
                return await self._send(operation, dict(params or {}), response_type, timeout, headers, idempotency_key)
            except Exception as failure:  # noqa: BLE001 - re-raised unless retryable
                from .client import CrawloraError

                if not isinstance(failure, CrawloraError):
                    raise
                if retry_predicate is not None:
                    should_retry = retry_predicate(failure.status, failure)
                else:
                    should_retry = cfg._is_retryable(failure.status, failure)
                if attempts_made >= retry_budget or not should_retry:
                    raise
                attempts_made += 1
                pause = cfg._compute_retry_delay(attempts_made, failure.headers)
                cfg._log(
                    {
                        "event": "retry",
                        "operation": operation_id,
                        "attempt": attempts_made,
                        "status": failure.status,
                        "delay": pause,
                    }
                )
                if cfg.on_retry is not None:
                    cfg.on_retry(attempts_made, failure, pause)
                if pause > 0:
                    await asyncio.sleep(pause)

    async def _send(
        self,
        operation: Mapping[str, Any],
        params: dict[str, Any],
        response_type: ResponseType,
        timeout: float | None,
        headers: Mapping[str, str] | None,
        idempotency_key: str | None = None,
    ) -> Any:
        """Perform a single ``httpx`` attempt for ``operation``.

        Builds the URL, body and headers with the synchronous client's
        helpers, runs the ``before_request`` hook, sends the request (inside
        the throttle when one is configured), then parses the response.

        Returns:
            An ``io.BytesIO`` of the raw body for a successful ``"stream"``
            request; otherwise the parsed body, after the ``after_response``
            hook when one is configured.

        Raises:
            CrawloraNetworkError: ``httpx`` reported a timeout or another
                transport-level error.
            CrawloraError: the body could not be decoded as JSON, or (as the
                class chosen by ``_api_error_class``) the status is outside
                the 2xx range.
        """
        cfg = self._client
        url, body, body_headers = _build_request(cfg.base_url, operation, params)
        outgoing = _merge_headers(
            cfg.headers,
            _auth_headers(operation.get("security", []), cfg.api_key, cfg.jwt_token),
            {"User-Agent": cfg.user_agent} if cfg.user_agent else {},
            body_headers,
            headers or {},
        )
        if cfg.request_id:
            req_id = _ensure_request_id(outgoing)
        else:
            req_id = _header_value(outgoing, "x-request-id") or None
        # A caller-supplied Idempotency-Key header wins over the generated one.
        if idempotency_key and not _header_value(outgoing, "idempotency-key"):
            outgoing["Idempotency-Key"] = idempotency_key
        if cfg.before_request:
            ctx = {
                "operation": operation.get("id"),
                "method": operation["method"],
                "url": url,
                "headers": outgoing,
            }
            _run_before_request(cfg.before_request, ctx)
            # Read both back from the context the hook was given.
            url = ctx["url"]
            outgoing = ctx["headers"]
        effective_timeout = cfg.timeout if timeout is None else timeout
        try:
            if self._limiter is None:
                response = await self._httpx.request(
                    operation["method"], url, content=body, headers=outgoing, timeout=effective_timeout
                )
            else:
                async with self._limiter:
                    response = await self._httpx.request(
                        operation["method"], url, content=body, headers=outgoing, timeout=effective_timeout
                    )
        except httpx.TimeoutException as exc:
            raise CrawloraNetworkError("Crawlora request timed out", request_id=req_id, cause=exc) from exc
        except httpx.HTTPError as exc:
            raise CrawloraNetworkError("Crawlora transport error", request_id=req_id, cause=exc) from exc

        payload = bytes(response.content)
        status = response.status_code
        resp_headers = dict(response.headers)
        failed = not (200 <= status < 300)
        if response_type == "stream":
            if not failed:
                return io.BytesIO(payload)
            # Error bodies of stream requests are parsed like "auto" ones.
            parse_mode = "auto"
        else:
            parse_mode = response_type
        import json

        raw_body = payload.decode(errors="replace")
        try:
            parsed = _parse_response(payload, _header_value(resp_headers, "content-type"), parse_mode)
        except json.JSONDecodeError as exc:
            from .client import CrawloraError

            raise CrawloraError(
                "Crawlora JSON parse error",
                status=status,
                raw_body=raw_body,
                headers=resp_headers,
                request_id=req_id,
                cause=exc,
            ) from exc
        if failed:
            if isinstance(parsed, dict):
                code = parsed.get("code")
                raw_msg = parsed.get("msg")
            else:
                code = None
                raw_msg = None
            message = str(raw_msg) if raw_msg else f"HTTP {status}"
            error_class = _api_error_class(status)
            raise error_class(
                message,
                status=status,
                code=code,
                body=parsed,
                raw_body=raw_body,
                headers=resp_headers,
                request_id=req_id,
            )
        if cfg.after_response:
            parsed = _run_after_response(cfg.after_response, operation.get("id"), status, resp_headers, parsed)
        return parsed

    async def paginate(
        self,
        operation_id: str,
        params: Mapping[str, Any] | None = None,
        *,
        page_param: str | None = None,
        cursor_param: str | None = None,
        next_cursor: Callable[[Any], Any] | None = None,
        start: Any = None,
        step: int = 1,
        max_pages: int | None = None,
        response_type: ResponseType = "auto",
        timeout: float | None = None,
        headers: Mapping[str, str] | None = None,
    ) -> AsyncIterator[Any]:
        """Async iterator over pages. Mirrors :meth:`CrawloraClient.paginate`.

        Cursor mode (both ``cursor_param`` and ``next_cursor`` given) feeds
        ``next_cursor(response)`` into the next request and stops on a falsy
        cursor. Counter mode advances ``page_param`` (auto-detected when
        omitted) by ``step`` and stops after yielding an empty page. Either
        mode stops once ``max_pages`` pages were yielded.

        Raises:
            ValueError: unknown operation, incomplete cursor arguments, a
                ``cursor_param`` the operation does not declare, or no
                page/offset parameter to paginate on.
        """
        operation: Any = OPERATIONS.get(operation_id)
        if operation is None:
            raise ValueError(f"unknown Crawlora operation: {operation_id}")
        base_params = dict(params or {})
        shared = {"response_type": response_type, "timeout": timeout, "headers": headers}

        if cursor_param or next_cursor:
            if not (cursor_param and next_cursor):
                raise ValueError("cursor pagination requires both cursor_param and next_cursor")
            declared = {p["name"] for p in operation.get("queryParams", [])}
            if cursor_param not in declared:
                raise ValueError(f"cursor_param {cursor_param!r} is not a query parameter of operation {operation_id}")
            cursor = start
            fetched = 0
            while True:
                if max_pages is not None and fetched >= max_pages:
                    break
                page_params = dict(base_params)
                # The first request omits the cursor unless ``start`` set one.
                if cursor is not None:
                    page_params[cursor_param] = cursor
                response = await self.request(operation_id, page_params, **shared)
                yield response
                fetched += 1
                cursor = next_cursor(response)
                if not cursor:
                    break
            return

        page_param = page_param or detect_page_param(operation)
        if not page_param:
            raise ValueError(f"operation {operation_id} has no page or offset query parameter to paginate")
        page_value = start if start is not None else default_start(page_param)
        fetched = 0
        while True:
            if max_pages is not None and fetched >= max_pages:
                break
            page_params = dict(base_params)
            page_params[page_param] = page_value
            response = await self.request(operation_id, page_params, **shared)
            yield response
            fetched += 1
            if page_is_empty(response):
                break
            page_value += step

    async def paginate_items(
        self,
        operation_id: str,
        params: Mapping[str, Any] | None = None,
        *,
        items: Callable[[Any], Any] | None = None,
        **kwargs: Any,
    ) -> AsyncIterator[Any]:
        """Async iterator over individual items across pages.

        ``items`` extracts the items of one page (default:
        ``default_items``); remaining keyword arguments go to
        :meth:`paginate`.
        """
        extract = default_items if not items else items
        async for page in self.paginate(operation_id, params, **kwargs):
            for entry in extract(page):
                yield entry
297
+
298
+
299
class _AsyncOperationGroup:
    """Attribute-style access to one group of operations.

    ``group.<name>`` resolves ``<name>`` through the group's name-to-id
    mapping and returns an awaitable callable that validates its keyword
    arguments and forwards them to the client's ``request``.
    """

    def __init__(self, client: "AsyncCrawloraClient", operations: Mapping[str, str]) -> None:
        self._client = client
        self._operations = operations

    def __getattr__(self, name: str) -> Callable[..., Any]:
        """Build the callable for operation ``name``.

        Raises:
            AttributeError: ``name`` is not an operation of this group.
        """
        # __getattr__ only runs after normal lookup failed. Reading the
        # mapping via ``self._operations`` would re-enter this method forever
        # on an instance whose __dict__ is still empty (copy/pickle probe such
        # instances for ``__setstate__``), so go through __dict__ directly.
        operations = self.__dict__.get("_operations")
        operation_id = operations.get(name) if operations is not None else None
        if operation_id is None:
            raise AttributeError(name)
        allowed = _allowed_params(operation_id)

        async def call(**params: Any) -> Any:
            # Underscore-prefixed keywords are per-call options, not API params.
            response_type = params.pop("_response_type", "auto")
            timeout = params.pop("_timeout", None)
            headers = params.pop("_headers", None)
            unknown = set(params) - allowed
            if unknown:
                raise TypeError(f"unexpected parameter(s) for {operation_id}: {', '.join(sorted(unknown))}")
            return await self._client.request(
                operation_id, params, response_type=response_type, timeout=timeout, headers=headers
            )

        return call