crawlora 1.5.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlora/__init__.py +24 -0
- crawlora/_pagination.py +44 -0
- crawlora/_transport_sync.py +114 -0
- crawlora/async_client.py +321 -0
- crawlora/client.py +671 -0
- crawlora/client.pyi +20710 -0
- crawlora/operations.py +6784 -0
- crawlora/py.typed +1 -0
- crawlora-1.5.0.dev1.dist-info/METADATA +213 -0
- crawlora-1.5.0.dev1.dist-info/RECORD +13 -0
- crawlora-1.5.0.dev1.dist-info/WHEEL +5 -0
- crawlora-1.5.0.dev1.dist-info/licenses/LICENSE +21 -0
- crawlora-1.5.0.dev1.dist-info/top_level.txt +1 -0
crawlora/client.py
ADDED
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import json
|
|
5
|
+
import mimetypes
|
|
6
|
+
import os
|
|
7
|
+
import random
|
|
8
|
+
import socket
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
import uuid
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Any, Callable, Iterable, Mapping, Literal
|
|
14
|
+
from urllib.error import HTTPError, URLError
|
|
15
|
+
from urllib.parse import urlencode, quote
|
|
16
|
+
from urllib.request import Request, urlopen
|
|
17
|
+
|
|
18
|
+
from ._pagination import default_items, default_start, detect_page_param, page_is_empty
|
|
19
|
+
from ._transport_sync import KeepAliveTransport
|
|
20
|
+
from .operations import GROUPS, OPERATIONS
|
|
21
|
+
|
|
22
|
+
DEFAULT_BASE_URL = "https://api.crawlora.net/api/v1"
|
|
23
|
+
VERSION = "1.5.0-sdk.1"
|
|
24
|
+
DEFAULT_USER_AGENT = f"crawlora-python-sdk/{VERSION}"
|
|
25
|
+
DEFAULT_MAX_RETRY_DELAY = 30.0
|
|
26
|
+
DEFAULT_RETRY_STATUSES = (408, 409, 425, 429)
|
|
27
|
+
ResponseType = Literal["auto", "json", "text", "stream"]
|
|
28
|
+
RetryPredicate = Callable[[int, "BaseException | None"], bool]
|
|
29
|
+
RetryHook = Callable[[int, "BaseException", float], None]
|
|
30
|
+
Logger = Callable[[Mapping[str, Any]], None]
|
|
31
|
+
# before_request receives a mutable context dict {operation, method, url, headers};
|
|
32
|
+
# mutating "headers"/"url" rewrites the outgoing request. after_response receives
|
|
33
|
+
# (operation_id, status, headers, body) and may return a replacement body.
|
|
34
|
+
BeforeRequest = Callable[[dict], None]
|
|
35
|
+
AfterResponse = Callable[[str, int, Mapping[str, str], Any], Any]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _as_hook_list(value: Any) -> list:
|
|
39
|
+
if value is None:
|
|
40
|
+
return []
|
|
41
|
+
if callable(value):
|
|
42
|
+
return [value]
|
|
43
|
+
return list(value)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _run_before_request(hooks: list, ctx: dict) -> None:
|
|
47
|
+
for hook in hooks:
|
|
48
|
+
hook(ctx)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _run_after_response(hooks: list, operation_id: str, status: int, headers: Mapping[str, str], body: Any) -> Any:
|
|
52
|
+
for hook in hooks:
|
|
53
|
+
result = hook(operation_id, status, headers, body)
|
|
54
|
+
if result is not None:
|
|
55
|
+
body = result
|
|
56
|
+
return body
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class CrawloraError(Exception):
    """Base exception for every failure reported by the Crawlora client.

    Attributes:
        status: HTTP status of the response, or ``0`` when none is given.
        code: Optional numeric error code.
        body: Parsed response body, when one is available.
        raw_body: Response body as text (empty string by default).
        headers: A private ``dict`` copy of the response headers.
        request_id: Request id associated with the call, if any.
    """

    def __init__(
        self,
        message: str,
        *,
        status: int = 0,
        code: int | None = None,
        body: Any = None,
        raw_body: str = "",
        headers: Mapping[str, str] | None = None,
        request_id: str | None = None,
        cause: BaseException | None = None,
    ):
        super().__init__(message)
        # Copy the headers so later mutation of the caller's mapping cannot
        # change what this exception reports.
        header_copy = dict(headers) if headers else {}
        self.status, self.code = status, code
        self.body, self.raw_body = body, raw_body
        self.headers = header_copy
        self.request_id = request_id
        self.__cause__ = cause
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class CrawloraClientError(CrawloraError):
    """Raised for 4xx API responses: the request was rejected by the API.

    ``_api_error_class`` selects this class for statuses from 400 up to, but
    not including, 500.
    """
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class CrawloraServerError(CrawloraError):
    """Raised for 5xx API responses: the API failed to handle a valid request.

    ``_api_error_class`` selects this class for any status of 500 or above.
    """
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class CrawloraNetworkError(CrawloraError):
    """Raised for transport failures and timeouts before a response arrived.

    The client raises it without a ``status`` argument, so ``status`` keeps the
    base-class default of ``0``.
    """
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _api_error_class(status: int) -> type[CrawloraError]:
    """Pick the exception class that matches an HTTP status.

    Returns:
        ``CrawloraServerError`` for 500 and above, ``CrawloraClientError`` for
        400-499, and the ``CrawloraError`` base class for anything lower.
    """
    if status >= 500:
        return CrawloraServerError
    if status >= 400:
        return CrawloraClientError
    return CrawloraError
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclass(frozen=True)
class _Response:
    """Immutable result handed back by a transport callable."""

    # HTTP status code of the response.
    status: int
    # Response headers as a name -> value mapping.
    headers: Mapping[str, str]
    # Raw response payload; decoding and parsing happen in the client.
    body: bytes
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class _RateLimiter:
|
|
110
|
+
"""Optional client-side throttle: caps concurrency and spaces requests to a
|
|
111
|
+
maximum rate (requests per second)."""
|
|
112
|
+
|
|
113
|
+
def __init__(self, rps: float | None, concurrency: int | None) -> None:
|
|
114
|
+
self._interval = (1.0 / rps) if rps and rps > 0 else 0.0
|
|
115
|
+
self._sem = threading.Semaphore(concurrency) if concurrency and concurrency > 0 else None
|
|
116
|
+
self._lock = threading.Lock()
|
|
117
|
+
self._next = 0.0
|
|
118
|
+
|
|
119
|
+
def __enter__(self) -> "_RateLimiter":
|
|
120
|
+
if self._sem is not None:
|
|
121
|
+
self._sem.acquire()
|
|
122
|
+
if self._interval:
|
|
123
|
+
with self._lock:
|
|
124
|
+
now = time.monotonic()
|
|
125
|
+
wait = max(0.0, self._next - now)
|
|
126
|
+
self._next = max(now, self._next) + self._interval
|
|
127
|
+
if wait > 0:
|
|
128
|
+
time.sleep(wait)
|
|
129
|
+
return self
|
|
130
|
+
|
|
131
|
+
def __exit__(self, *_exc: Any) -> None:
|
|
132
|
+
if self._sem is not None:
|
|
133
|
+
self._sem.release()
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class CrawloraClient:
    """Synchronous client for the Crawlora API.

    Call operations via grouped helpers (``client.bing.search(q="...")``) or
    dynamically (``client.request("bing-search", {"q": "..."})``). Supports
    configurable retries, an ``on_retry`` hook, opt-in ``request_id`` and
    ``idempotency_keys``, ``before_request``/``after_response`` middleware,
    client-side ``rate_limit``/``max_concurrency``, pagination
    (``paginate``/``paginate_items``), and ``response_type="stream"``. Uses a
    keep-alive connection pool by default; use it as a context manager (or call
    ``close()``) to release pooled connections. See ``AsyncCrawloraClient`` for
    an asyncio client.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        jwt_token: str | None = None,
        base_url: str | None = None,
        timeout: float = 30,
        retries: int = 0,
        retry_delay: float = 0.25,
        max_retry_delay: float = DEFAULT_MAX_RETRY_DELAY,
        retry_statuses: Iterable[int] | None = None,
        retry_predicate: RetryPredicate | None = None,
        on_retry: RetryHook | None = None,
        request_id: bool = False,
        idempotency_keys: bool = False,
        rate_limit: float | None = None,
        max_concurrency: int | None = None,
        logger: Logger | None = None,
        before_request: BeforeRequest | Iterable[BeforeRequest] | None = None,
        after_response: AfterResponse | Iterable[AfterResponse] | None = None,
        headers: Mapping[str, str] | None = None,
        user_agent: str | None = DEFAULT_USER_AGENT,
        transport: Callable[[Request, float], _Response] | None = None,
    ) -> None:
        """Configure the client.

        Args:
            api_key: API key; falls back to ``CRAWLORA_API_KEY``.
            jwt_token: JWT sent for operations that declare ``JWTAuth``.
            base_url: API root; falls back to ``CRAWLORA_BASE_URL`` and then
                ``DEFAULT_BASE_URL``. Trailing slashes are stripped.
            timeout: Default per-request timeout handed to the transport.
            retries: Maximum number of retries after the first attempt.
            retry_delay: Base delay for exponential backoff, in seconds.
            max_retry_delay: Upper bound for any single retry delay.
            retry_statuses: Statuses to retry instead of the defaults.
            retry_predicate: ``(status, exc) -> bool`` overriding status rules.
            on_retry: Called as ``(attempt, exc, delay)`` before each retry.
            request_id: Generate an ``x-request-id`` header when none is set.
            idempotency_keys: Attach an ``Idempotency-Key`` to POST/PATCH.
            rate_limit: Maximum requests per second (client side).
            max_concurrency: Maximum simultaneous in-flight requests.
            logger: Callable receiving structured event mappings.
            before_request: Hook or iterable of hooks run before each send.
            after_response: Hook or iterable of hooks run on successful bodies.
            headers: Default headers merged into every request.
            user_agent: ``User-Agent`` value; falsy disables the header.
            transport: Callable ``(Request, timeout) -> _Response``; defaults
                to a ``KeepAliveTransport``.
        """
        # Precedence: explicit argument > environment variable > default.
        self.api_key = api_key or os.environ.get("CRAWLORA_API_KEY", "")
        self.jwt_token = jwt_token or ""
        self.base_url = (base_url or os.environ.get("CRAWLORA_BASE_URL") or DEFAULT_BASE_URL).rstrip("/")
        self.timeout = timeout
        self.retries = max(0, int(retries))
        self.retry_delay = max(0.0, float(retry_delay))
        self.max_retry_delay = max(0.0, float(max_retry_delay))
        self.retry_statuses = frozenset(retry_statuses) if retry_statuses is not None else None
        self.retry_predicate = retry_predicate
        self.on_retry = on_retry
        self.request_id = request_id
        self.idempotency_keys = idempotency_keys
        self.rate_limit = rate_limit
        self.max_concurrency = max_concurrency
        self._rate_limiter = _RateLimiter(rate_limit, max_concurrency) if (rate_limit or max_concurrency) else None
        self.logger = logger
        self.before_request = _as_hook_list(before_request)
        self.after_response = _as_hook_list(after_response)
        self.headers = dict(headers or {})
        self.user_agent = user_agent or ""
        # Default to a keep-alive pool (connection reuse); an injected transport
        # (e.g. tests) is used as-is.
        self._transport = transport or KeepAliveTransport()

        # Expose every operation group as an attribute (client.<group>.<op>()).
        for group_name, operations in GROUPS.items():
            setattr(self, group_name, _OperationGroup(self, operations))

    def close(self) -> None:
        """Close pooled keep-alive connections, if any."""
        closer = getattr(self._transport, "close", None)
        if callable(closer):
            closer()

    def __enter__(self) -> "CrawloraClient":
        return self

    def __exit__(self, *_exc: Any) -> None:
        self.close()

    def _is_retryable(self, status: int, exc: BaseException | None) -> bool:
        """Decide whether a failure may be retried.

        The client-level ``retry_predicate`` wins when set; otherwise a custom
        ``retry_statuses`` set is consulted; otherwise the module defaults apply.
        """
        if self.retry_predicate is not None:
            return bool(self.retry_predicate(status, exc))
        if self.retry_statuses is not None:
            # Network failures (status 0) stay retryable unless a predicate decides.
            return status == 0 or status in self.retry_statuses
        return _should_retry(status)

    def _compute_retry_delay(self, attempt: int, headers: Mapping[str, str]) -> float:
        """Return how long to wait before retry number ``attempt`` (1-based).

        A usable ``Retry-After`` header takes priority. Otherwise the delay is
        ``retry_delay * 2 ** (attempt - 1)`` plus up to ``retry_delay / 2`` of
        random jitter. Either way the result never exceeds ``max_retry_delay``.
        """
        retry_after = _retry_after_delay(headers, self.max_retry_delay)
        if retry_after is not None:
            return retry_after
        if self.retry_delay <= 0:
            return 0.0
        delay = self.retry_delay * (2 ** max(0, attempt - 1))
        jitter = random.uniform(0, self.retry_delay / 2)
        # The backoff doubles per attempt, so without a cap a handful of retries
        # would sleep for minutes; apply the same bound used for Retry-After.
        return min(delay + jitter, self.max_retry_delay)

    def _log(self, event: Mapping[str, Any]) -> None:
        """Forward ``event`` to the configured logger, if there is one."""
        if self.logger is not None:
            self.logger(event)

    def operation(
        self,
        operation_id: str,
        params: Mapping[str, Any] | None = None,
        *,
        response_type: ResponseType = "auto",
        timeout: float | None = None,
        headers: Mapping[str, str] | None = None,
        retries: int | None = None,
        retry_predicate: RetryPredicate | None = None,
    ) -> Any:
        """Alias of :meth:`request`; forwards every argument unchanged."""
        return self.request(
            operation_id, params, response_type=response_type, timeout=timeout, headers=headers,
            retries=retries, retry_predicate=retry_predicate,
        )

    def request(
        self,
        operation_id: str,
        params: Mapping[str, Any] | None = None,
        *,
        response_type: ResponseType = "auto",
        timeout: float | None = None,
        headers: Mapping[str, str] | None = None,
        retries: int | None = None,
        retry_predicate: RetryPredicate | None = None,
    ) -> Any:
        """Execute an operation by id, retrying retryable failures.

        Args:
            operation_id: Key into ``OPERATIONS``.
            params: Operation parameters (path, query, form and body).
            response_type: One of ``auto``, ``json``, ``text``, ``stream``.
            timeout: Per-call timeout; ``None`` uses the client default.
            headers: Extra headers for this call (highest precedence).
            retries: Per-call retry budget; ``None`` uses the client default.
            retry_predicate: Per-call predicate overriding all client rules.

        Returns:
            The parsed response (or a ``BytesIO`` for ``stream``).

        Raises:
            ValueError: Unknown operation or invalid ``response_type``.
            CrawloraError: The final attempt failed.
        """
        operation = OPERATIONS.get(operation_id)
        if operation is None:
            raise ValueError(f"unknown Crawlora operation: {operation_id}")
        response_type = _validate_response_type(response_type)
        self._log({"event": "request", "operation": operation_id})
        max_retries = self.retries if retries is None else max(0, int(retries))
        # One key for the whole call so every retry carries the same value.
        idempotency_key = uuid.uuid4().hex if self.idempotency_keys and operation["method"] in ("POST", "PATCH") else None

        attempt = 0
        while True:
            try:
                return self._send(operation, dict(params or {}), response_type=response_type, timeout=timeout, headers=headers, idempotency_key=idempotency_key)
            except CrawloraError as exc:
                retryable = retry_predicate(exc.status, exc) if retry_predicate is not None else self._is_retryable(exc.status, exc)
                if attempt >= max_retries or not retryable:
                    raise
                attempt += 1
                delay = self._compute_retry_delay(attempt, exc.headers)
                self._log({"event": "retry", "operation": operation_id, "attempt": attempt, "status": exc.status, "delay": delay})
                if self.on_retry is not None:
                    self.on_retry(attempt, exc, delay)
                if delay > 0:
                    time.sleep(delay)

    def _send(
        self,
        operation: Mapping[str, Any],
        params: dict[str, Any],
        *,
        response_type: ResponseType,
        timeout: float | None,
        headers: Mapping[str, str] | None,
        idempotency_key: str | None = None,
    ) -> Any:
        """Perform a single attempt: build, send, parse, and map errors.

        Raises:
            CrawloraNetworkError: The transport raised before a response arrived.
            CrawloraClientError / CrawloraServerError: Non-2xx response.
            CrawloraError: A 2xx body that should be JSON could not be parsed.
        """
        url, body, body_headers = _build_request(self.base_url, operation, params)
        # Later sources win: client defaults < auth < user agent < body < per-call.
        request_headers = _merge_headers(
            self.headers,
            _auth_headers(operation.get("security", []), self.api_key, self.jwt_token),
            {"User-Agent": self.user_agent} if self.user_agent else {},
            body_headers,
            headers or {},
        )
        req_id = _ensure_request_id(request_headers) if self.request_id else _header_value(request_headers, "x-request-id") or None
        if idempotency_key and not _header_value(request_headers, "idempotency-key"):
            request_headers["Idempotency-Key"] = idempotency_key
        if self.before_request:
            ctx = {"operation": operation.get("id"), "method": operation["method"], "url": url, "headers": request_headers}
            _run_before_request(self.before_request, ctx)
            # Hooks may have replaced either entry.
            url, request_headers = ctx["url"], ctx["headers"]
        request = Request(url, data=body, headers=request_headers, method=operation["method"])
        request_timeout = timeout if timeout is not None else self.timeout
        try:
            if self._rate_limiter is not None:
                with self._rate_limiter:
                    response = self._transport(request, request_timeout)
            else:
                response = self._transport(request, request_timeout)
        except Exception as exc:
            message = "Crawlora request timed out" if _is_timeout_error(exc) else "Crawlora transport error"
            raise CrawloraNetworkError(message, request_id=req_id, cause=exc) from exc
        raw_body = response.body.decode(errors="replace")
        is_error = response.status < 200 or response.status >= 300
        if response_type == "stream" and not is_error:
            # Caller reads the file-like body; truly incremental streaming is
            # available on AsyncCrawloraClient (httpx).
            return io.BytesIO(response.body)
        # A failed stream request is parsed like "auto" so the error payload is usable.
        parse_mode = "auto" if response_type == "stream" else response_type
        try:
            parsed = _parse_response(response.body, _header_value(response.headers, "content-type"), parse_mode)
        except json.JSONDecodeError as exc:
            if not is_error:
                raise CrawloraError(
                    "Crawlora JSON parse error",
                    status=response.status,
                    raw_body=raw_body,
                    headers=response.headers,
                    request_id=req_id,
                    cause=exc,
                ) from exc
            # Error responses are frequently not JSON (e.g. a proxy's HTML page).
            # Keep the text and fall through so the caller still receives the
            # status-specific client/server error rather than a parse error.
            parsed = raw_body
        if is_error:
            code = parsed.get("code") if isinstance(parsed, dict) else None
            message = parsed.get("msg") if isinstance(parsed, dict) and parsed.get("msg") else f"HTTP {response.status}"
            error_class = _api_error_class(response.status)
            raise error_class(message, status=response.status, code=code, body=parsed, raw_body=raw_body, headers=response.headers, request_id=req_id)
        if self.after_response:
            parsed = _run_after_response(self.after_response, operation.get("id"), response.status, response.headers, parsed)
        return parsed

    def paginate(
        self,
        operation_id: str,
        params: Mapping[str, Any] | None = None,
        *,
        page_param: str | None = None,
        cursor_param: str | None = None,
        next_cursor: Callable[[Any], Any] | None = None,
        start: Any = None,
        step: int = 1,
        max_pages: int | None = None,
        response_type: ResponseType = "auto",
        timeout: float | None = None,
        headers: Mapping[str, str] | None = None,
    ):
        """Yield successive pages of a paginated operation.

        Numeric mode (default) advances the ``page``/``offset`` query parameter
        and stops on an empty page. Cursor mode (pass both ``cursor_param`` and a
        ``next_cursor`` extractor) sends the cursor parameter and stops when
        ``next_cursor`` returns a falsy value.

        Raises:
            ValueError: Unknown operation, incomplete cursor arguments, a
                ``cursor_param``/``page_param`` the operation does not declare,
                or no page parameter could be detected. Because this is a
                generator, the error surfaces on the first iteration.
        """
        operation = OPERATIONS.get(operation_id)
        if operation is None:
            raise ValueError(f"unknown Crawlora operation: {operation_id}")
        base_params = dict(params or {})
        query_names = {p["name"] for p in operation.get("queryParams", [])}

        if cursor_param or next_cursor:
            if not (cursor_param and next_cursor):
                raise ValueError("cursor pagination requires both cursor_param and next_cursor")
            if cursor_param not in query_names:
                raise ValueError(f"cursor_param {cursor_param!r} is not a query parameter of operation {operation_id}")
            cursor = start
            fetched = 0
            while max_pages is None or fetched < max_pages:
                page_params = dict(base_params)
                if cursor is not None:
                    page_params[cursor_param] = cursor
                response = self.request(operation_id, page_params, response_type=response_type, timeout=timeout, headers=headers)
                yield response
                fetched += 1
                cursor = next_cursor(response)
                if not cursor:
                    break
            return

        if page_param:
            # Undeclared parameters are silently dropped when the request is
            # built, so an unknown page_param would refetch the same page on
            # every iteration. Reject it up front, as cursor mode does.
            if page_param not in query_names | set(operation.get("pathParams", [])):
                raise ValueError(f"page_param {page_param!r} is not a query or path parameter of operation {operation_id}")
        else:
            page_param = detect_page_param(operation)
        if not page_param:
            raise ValueError(f"operation {operation_id} has no page or offset query parameter to paginate")
        page_value = default_start(page_param) if start is None else start
        fetched = 0
        while max_pages is None or fetched < max_pages:
            page_params = {**base_params, page_param: page_value}
            response = self.request(operation_id, page_params, response_type=response_type, timeout=timeout, headers=headers)
            yield response
            fetched += 1
            if page_is_empty(response):
                break
            page_value += step

    def paginate_items(
        self,
        operation_id: str,
        params: Mapping[str, Any] | None = None,
        *,
        items: Callable[[Any], Any] | None = None,
        **kwargs: Any,
    ):
        """Yield individual items across pages. ``items`` extracts the list from
        a page (default: the Crawlora ``data`` array). Remaining keyword
        arguments are passed to :meth:`paginate`."""
        extract = items or default_items
        for page in self.paginate(operation_id, params, **kwargs):
            yield from extract(page)

    @staticmethod
    def _urlopen_transport(request: Request, timeout: float) -> _Response:
        """One-shot ``urllib`` transport (no connection reuse).

        HTTP error statuses are returned as a ``_Response`` instead of raised;
        other ``URLError`` failures propagate to the caller.
        """
        try:
            with urlopen(request, timeout=timeout) as response:
                return _Response(response.status, dict(response.headers.items()), response.read())
        except HTTPError as exc:
            # HTTPError doubles as the response object and owns the underlying
            # connection; close it once drained so the socket is not leaked.
            try:
                return _Response(exc.code, dict(exc.headers.items()), exc.read())
            finally:
                exc.close()
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def _allowed_params(operation_id: str) -> set[str]:
    """Collect every parameter name an operation accepts.

    Combines path, query and form parameter names; when the operation declares
    a body parameter, both that name and the generic ``"body"`` alias are
    included. An unknown operation id yields an empty set.
    """
    spec = OPERATIONS.get(operation_id) or {}
    names = set(spec.get("pathParams", []))
    for section in ("queryParams", "formParams"):
        names.update(entry["name"] for entry in spec.get(section, []))
    body_name = spec.get("bodyParam")
    if body_name:
        names.update((body_name, "body"))
    return names
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
_REQUEST_OPTION_KWARGS = ("_response_type", "_timeout", "_headers")
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
class _OperationGroup:
|
|
451
|
+
def __init__(self, client: CrawloraClient, operations: Mapping[str, str]) -> None:
|
|
452
|
+
self._client = client
|
|
453
|
+
self._operations = operations
|
|
454
|
+
|
|
455
|
+
def __getattr__(self, name: str) -> Callable[..., Any]:
|
|
456
|
+
operation_id = self._operations.get(name)
|
|
457
|
+
if operation_id is None:
|
|
458
|
+
raise AttributeError(name)
|
|
459
|
+
allowed = _allowed_params(operation_id)
|
|
460
|
+
|
|
461
|
+
def call(**params: Any) -> Any:
|
|
462
|
+
response_type = params.pop("_response_type", "auto")
|
|
463
|
+
timeout = params.pop("_timeout", None)
|
|
464
|
+
headers = params.pop("_headers", None)
|
|
465
|
+
unknown = set(params) - allowed
|
|
466
|
+
if unknown:
|
|
467
|
+
raise TypeError(f"unexpected parameter(s) for {operation_id}: {', '.join(sorted(unknown))}")
|
|
468
|
+
return self._client.request(operation_id, params, response_type=response_type, timeout=timeout, headers=headers)
|
|
469
|
+
|
|
470
|
+
return call
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def _build_request(base_url: str, operation: Mapping[str, Any], params: dict[str, Any]) -> tuple[str, bytes | None, dict[str, str]]:
    """Turn an operation spec plus parameters into ``(url, body, body_headers)``.

    Path parameters are percent-encoded into the path, declared query
    parameters are appended (lists/tuples repeat the key), and the body is
    either multipart form data (when the operation declares ``formParams``) or
    JSON taken from the operation's body parameter or the ``"body"`` alias.

    Returns:
        The full URL, the encoded body (``None`` when there is none) and the
        headers describing that body.

    Raises:
        ValueError: A required parameter is missing or an enum value is invalid.
    """
    _validate_required_params(operation, params)
    _validate_enum_params(operation, params)
    path = operation["path"]
    for name in operation.get("pathParams", []):
        value = params.get(name)
        if value in (None, ""):
            raise ValueError(f"missing required path parameter: {name}")
        # safe="" so that "/" inside a value cannot introduce extra path segments.
        path = path.replace("{" + name + "}", quote(str(value), safe=""))

    query: list[tuple[str, Any]] = []
    for parameter in operation.get("queryParams", []):
        name = parameter["name"]
        value = params.get(name)
        if value in (None, ""):
            continue
        if isinstance(value, (list, tuple)):
            query.extend((name, _stringify_param(item)) for item in value)
        else:
            query.append((name, _stringify_param(value)))
    url = base_url + path
    if query:
        url += "?" + urlencode(query, doseq=True)

    if operation.get("formParams"):
        return url, *_multipart_body(operation["formParams"], params)

    body_param = operation.get("bodyParam")
    if body_param:
        value = params.get(body_param)
        if value is None:
            # An explicit None under the named key must not hide the generic
            # "body" alias, which validation accepts as an equivalent source.
            value = params.get("body")
        if value is not None:
            return url, json.dumps(value).encode(), {"content-type": "application/json"}

    return url, None, {}
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def _validate_required_params(operation: Mapping[str, Any], params: Mapping[str, Any]) -> None:
    """Ensure every required parameter of ``operation`` is present.

    Checks run in a fixed order: path parameters, required query parameters,
    required form parameters, then the body when ``bodyRequired`` is set. The
    body may be supplied under the operation's body parameter name or under
    ``"body"``.

    Raises:
        ValueError: Naming the first missing parameter.
    """
    for path_name in operation.get("pathParams", []):
        if _is_missing(params.get(path_name)):
            raise ValueError(f"missing required path parameter: {path_name}")

    for section in ("queryParams", "formParams"):
        for spec in operation.get(section, []):
            if not spec.get("required"):
                continue
            if not _is_missing(params.get(spec["name"])):
                continue
            where = spec.get("in", "request")
            raise ValueError(f"missing required {where} parameter: {spec['name']}")

    if not operation.get("bodyRequired"):
        return
    body_name = operation.get("bodyParam")
    named_missing = _is_missing(params.get(body_name))
    if named_missing and _is_missing(params.get("body")):
        raise ValueError(f"missing required body parameter: {body_name}")
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def _validate_enum_params(operation: Mapping[str, Any], params: Mapping[str, Any]) -> None:
    """Reject query/form values that are outside a parameter's ``enum``.

    Parameters without an enum, and parameters the caller left out, are
    skipped. List and tuple values are checked item by item; each item is
    compared in its stringified form.

    Raises:
        ValueError: Naming the offending parameter and the accepted values.
    """
    for section in ("queryParams", "formParams"):
        for spec in operation.get(section, []):
            accepted = spec.get("enum") or []
            supplied = params.get(spec["name"])
            if not accepted or _is_missing(supplied):
                continue
            candidates = supplied if isinstance(supplied, (list, tuple)) else [supplied]
            if any(_stringify_param(candidate) not in accepted for candidate in candidates):
                where = spec.get("in", "request")
                choices = ", ".join(accepted)
                raise ValueError(f"invalid {where} parameter {spec['name']}: expected one of {choices}")
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def _is_missing(value: Any) -> bool:
|
|
540
|
+
return value is None or value == "" or (isinstance(value, (list, tuple)) and len(value) == 0)
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def _escape_form_token(text: str) -> str:
    """Escape characters that would break out of a quoted Content-Disposition value."""
    return text.replace('"', "%22").replace("\r", "%0D").replace("\n", "%0A")


def _multipart_body(form_params: list[Mapping[str, Any]], params: Mapping[str, Any]) -> tuple[bytes, dict[str, str]]:
    """Encode the supplied form parameters as ``multipart/form-data``.

    Parameters absent from ``params`` (or set to ``None``) are skipped.
    ``type == "file"`` parameters are read through ``_read_file_value`` and sent
    with a guessed content type; other parameters are sent as text, one part
    per item when the value is a list or tuple.

    Returns:
        The encoded body and the matching ``content-type`` header (including
        the generated boundary).
    """
    boundary = f"crawlora-{uuid.uuid4().hex}"
    delimiter = f"--{boundary}\r\n".encode()
    chunks: list[bytes] = []
    for parameter in form_params:
        name = parameter["name"]
        value = params.get(name)
        if value is None:
            continue
        field = _escape_form_token(name)
        if parameter.get("type") == "file":
            filename, data = _read_file_value(value)
            content_type = mimetypes.guess_type(filename)[0] or "application/octet-stream"
            # File names come from caller-supplied paths; escape them so a quote
            # or line break cannot terminate the header early.
            header = (
                f'Content-Disposition: form-data; name="{field}"; filename="{_escape_form_token(filename)}"\r\n'
                f"Content-Type: {content_type}\r\n\r\n"
            )
            chunks += [delimiter, header.encode(), data, b"\r\n"]
            continue
        # Render values exactly as enum validation saw them (booleans become
        # "true"/"false") and send multi-valued fields as repeated parts rather
        # than as the repr of a Python list.
        items = value if isinstance(value, (list, tuple)) else [value]
        for item in items:
            chunks.append(delimiter)
            chunks.append(f'Content-Disposition: form-data; name="{field}"\r\n\r\n{_stringify_param(item)}\r\n'.encode())
    chunks.append(f"--{boundary}--\r\n".encode())
    return b"".join(chunks), {"content-type": f"multipart/form-data; boundary={boundary}"}
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _read_file_value(value: Any) -> tuple[str, bytes]:
|
|
568
|
+
if isinstance(value, (bytes, bytearray)):
|
|
569
|
+
return "upload.bin", bytes(value)
|
|
570
|
+
if isinstance(value, os.PathLike) or isinstance(value, str):
|
|
571
|
+
path = os.fspath(value)
|
|
572
|
+
with open(path, "rb") as file:
|
|
573
|
+
return os.path.basename(path), file.read()
|
|
574
|
+
name = os.path.basename(getattr(value, "name", "upload.bin"))
|
|
575
|
+
return name, value.read()
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def _auth_headers(security: list[str], api_key: str, jwt_token: str) -> dict[str, str]:
|
|
579
|
+
headers: dict[str, str] = {}
|
|
580
|
+
if "ApiKeyAuth" in security and api_key:
|
|
581
|
+
headers["x-api-key"] = api_key
|
|
582
|
+
if "JWTAuth" in security and jwt_token:
|
|
583
|
+
headers["Authorization"] = jwt_token if jwt_token.lower().startswith(("token ", "bearer ")) else f"Token {jwt_token}"
|
|
584
|
+
return headers
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def _merge_headers(*sources: Mapping[str, str]) -> dict[str, str]:
|
|
588
|
+
headers: dict[str, str] = {}
|
|
589
|
+
names: dict[str, str] = {}
|
|
590
|
+
for source in sources:
|
|
591
|
+
for name, value in source.items():
|
|
592
|
+
lower = name.lower()
|
|
593
|
+
existing = names.get(lower)
|
|
594
|
+
if existing and existing != name:
|
|
595
|
+
headers.pop(existing, None)
|
|
596
|
+
headers[name] = str(value)
|
|
597
|
+
names[lower] = name
|
|
598
|
+
return headers
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def _validate_response_type(response_type: str) -> ResponseType:
|
|
602
|
+
if response_type in ("auto", "json", "text", "stream"):
|
|
603
|
+
return response_type # type: ignore[return-value]
|
|
604
|
+
raise ValueError("invalid response_type: expected one of auto, json, text, stream")
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def _parse_response(body: bytes, content_type: str, response_type: str) -> Any:
|
|
608
|
+
if response_type == "text":
|
|
609
|
+
return body.decode()
|
|
610
|
+
if response_type == "json" or "application/json" in content_type.lower():
|
|
611
|
+
return json.loads(body.decode()) if body else None
|
|
612
|
+
return body.decode()
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def _stringify_param(value: Any) -> str:
|
|
616
|
+
if isinstance(value, bool):
|
|
617
|
+
return "true" if value else "false"
|
|
618
|
+
return str(value)
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def _should_retry(status: int) -> bool:
    """Default retry policy.

    Retries failures without a status (``0``), any status of 500 or above, and
    the statuses listed in ``DEFAULT_RETRY_STATUSES``.
    """
    if status == 0 or status >= 500:
        return True
    return status in DEFAULT_RETRY_STATUSES
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def _ensure_request_id(headers: dict[str, str]) -> str:
    """Return the request id carried by ``headers``, creating one if needed.

    An existing ``x-request-id`` (matched case-insensitively) is returned as
    is. Otherwise a random hex id is generated, stored in ``headers`` under
    ``x-request-id`` and returned.
    """
    current = _header_value(headers, "x-request-id")
    if not current:
        current = uuid.uuid4().hex
        headers["x-request-id"] = current
    return current
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def _retry_after_delay(headers: Mapping[str, str], cap: float) -> float | None:
    """Translate a ``Retry-After`` header into a delay in seconds.

    The header is first read as a number of seconds and, failing that (or when
    the number is not positive), as an HTTP date.

    Returns:
        The positive delay limited to ``cap``, or ``None`` when the header is
        absent, unparseable, or does not describe a positive wait.
    """
    raw = _header_value(headers, "retry-after")
    if not raw:
        return None

    try:
        numeric = float(raw)
    except ValueError:
        pass
    else:
        if numeric > 0:
            return min(numeric, cap)

    try:
        from email.utils import parsedate_to_datetime

        remaining = parsedate_to_datetime(raw).timestamp() - time.time()
    except (TypeError, ValueError, OverflowError):
        return None
    return min(remaining, cap) if remaining > 0 else None
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def _header_value(headers: Mapping[str, str], name: str) -> str:
|
|
657
|
+
for key, value in headers.items():
|
|
658
|
+
if key.lower() == name.lower():
|
|
659
|
+
return value
|
|
660
|
+
return ""
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def _is_timeout_error(exc: BaseException) -> bool:
|
|
664
|
+
if isinstance(exc, (TimeoutError, socket.timeout)):
|
|
665
|
+
return True
|
|
666
|
+
if isinstance(exc, URLError):
|
|
667
|
+
reason = exc.reason
|
|
668
|
+
if isinstance(reason, (TimeoutError, socket.timeout)):
|
|
669
|
+
return True
|
|
670
|
+
return "timed out" in str(reason).lower()
|
|
671
|
+
return "timed out" in str(exc).lower()
|