reader-py 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reader_py/__init__.py +87 -0
- reader_py/async_client.py +373 -0
- reader_py/client.py +434 -0
- reader_py/errors.py +135 -0
- reader_py/types.py +213 -0
- reader_py-0.2.0.dist-info/METADATA +138 -0
- reader_py-0.2.0.dist-info/RECORD +9 -0
- reader_py-0.2.0.dist-info/WHEEL +5 -0
- reader_py-0.2.0.dist-info/top_level.txt +1 -0
reader_py/__init__.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Reader Python SDK."""
|
|
2
|
+
|
|
3
|
+
from .async_client import AsyncReaderClient
|
|
4
|
+
from .client import ReaderClient
|
|
5
|
+
from .errors import (
|
|
6
|
+
ConcurrencyLimitedError,
|
|
7
|
+
ConflictError,
|
|
8
|
+
InsufficientCreditsError,
|
|
9
|
+
InternalServerError,
|
|
10
|
+
InvalidRequestError,
|
|
11
|
+
NotFoundError,
|
|
12
|
+
RateLimitedError,
|
|
13
|
+
ReaderApiError,
|
|
14
|
+
ReaderError,
|
|
15
|
+
ScrapeTimeoutError,
|
|
16
|
+
UnauthenticatedError,
|
|
17
|
+
UpstreamUnavailableError,
|
|
18
|
+
UrlBlockedError,
|
|
19
|
+
to_reader_api_error,
|
|
20
|
+
)
|
|
21
|
+
from .types import (
|
|
22
|
+
CreditInfo,
|
|
23
|
+
Credits,
|
|
24
|
+
DoneEvent,
|
|
25
|
+
ErrorEvent,
|
|
26
|
+
Job,
|
|
27
|
+
JobInfo,
|
|
28
|
+
JobReadResult,
|
|
29
|
+
Page,
|
|
30
|
+
PageEvent,
|
|
31
|
+
Pagination,
|
|
32
|
+
ProgressEvent,
|
|
33
|
+
ReadParams,
|
|
34
|
+
ReadResult,
|
|
35
|
+
ScrapeMetadata,
|
|
36
|
+
ScrapeReadResult,
|
|
37
|
+
ScrapeResult,
|
|
38
|
+
SessionInfo,
|
|
39
|
+
StopSessionResult,
|
|
40
|
+
StreamEvent,
|
|
41
|
+
UsageEntry,
|
|
42
|
+
WebhookConfig,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# Public API of the package: every name imported above is re-exported here.
__all__ = [
    # Clients
    "ReaderClient",
    "AsyncReaderClient",
    # Errors
    "ReaderApiError",
    "ReaderError",
    "InvalidRequestError",
    "UnauthenticatedError",
    "InsufficientCreditsError",
    "UrlBlockedError",
    "NotFoundError",
    "ConflictError",
    "RateLimitedError",
    "ConcurrencyLimitedError",
    "InternalServerError",
    "UpstreamUnavailableError",
    "ScrapeTimeoutError",
    "to_reader_api_error",
    # Types
    "ReadParams",
    "ReadResult",
    "ScrapeReadResult",
    "JobReadResult",
    "ScrapeResult",
    "ScrapeMetadata",
    "Page",
    "Job",
    "JobInfo",
    "Credits",
    "CreditInfo",
    "Pagination",
    "UsageEntry",
    # Stream events
    "StreamEvent",
    "ProgressEvent",
    "PageEvent",
    "ErrorEvent",
    "DoneEvent",
    # Webhooks
    "WebhookConfig",
    # Sessions
    "SessionInfo",
    "StopSessionResult",
]
|
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
"""Async Reader SDK client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import time
|
|
8
|
+
from typing import Any, AsyncIterator, Optional
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from .client import (
|
|
13
|
+
DEFAULT_BASE_URL,
|
|
14
|
+
DEFAULT_MAX_RETRIES,
|
|
15
|
+
DEFAULT_POLL_INTERVAL,
|
|
16
|
+
DEFAULT_POLL_TIMEOUT,
|
|
17
|
+
DEFAULT_STREAM_TIMEOUT,
|
|
18
|
+
DEFAULT_TIMEOUT,
|
|
19
|
+
_parse_sse_event,
|
|
20
|
+
_to_camel_case,
|
|
21
|
+
_to_snake_case,
|
|
22
|
+
)
|
|
23
|
+
from .errors import (
|
|
24
|
+
RateLimitedError,
|
|
25
|
+
ReaderApiError,
|
|
26
|
+
ScrapeTimeoutError,
|
|
27
|
+
to_reader_api_error,
|
|
28
|
+
)
|
|
29
|
+
from .types import (
|
|
30
|
+
Credits,
|
|
31
|
+
DoneEvent,
|
|
32
|
+
Job,
|
|
33
|
+
JobReadResult,
|
|
34
|
+
Page,
|
|
35
|
+
ReadParams,
|
|
36
|
+
ReadResult,
|
|
37
|
+
ScrapeReadResult,
|
|
38
|
+
ScrapeResult,
|
|
39
|
+
SessionInfo,
|
|
40
|
+
StopSessionResult,
|
|
41
|
+
StreamEvent,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AsyncReaderClient:
    """Async Reader API client.

    Wraps an ``httpx.AsyncClient`` that sends the API key in the
    ``x-api-key`` header, and exposes the read, job and credits endpoints
    plus a ``sessions`` sub-API. Use it as an async context manager (or call
    :meth:`close`) so the underlying HTTP client is released.

    Example::

        async with AsyncReaderClient(api_key="rdr_your_key") as client:
            result = await client.read(url="https://example.com")
            if result.kind == "scrape":
                print(result.data.markdown)
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = DEFAULT_BASE_URL,
        timeout: int = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ):
        """Create the client.

        Args:
            api_key: Reader API key; must be non-empty.
            base_url: API root; a trailing ``/`` is stripped.
            timeout: Timeout handed to ``httpx.AsyncClient`` for regular
                requests.
            max_retries: Number of retries after the first attempt for
                transient failures (see :meth:`_request`).

        Raises:
            ReaderApiError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ReaderApiError(
                "API key is required",
                code="unauthenticated",
                http_status=401,
            )

        self._api_key = api_key
        self._base_url = base_url.rstrip("/")
        self._max_retries = max_retries

        self._client = httpx.AsyncClient(
            base_url=self._base_url,
            headers={
                "x-api-key": api_key,
                "Content-Type": "application/json",
            },
            timeout=timeout,
        )

        self.sessions = AsyncSessionsAPI(self._request)

    async def read(self, **kwargs: Any) -> ReadResult:
        """Read (scrape, batch, or crawl) URLs.

        The keyword arguments are validated through :class:`ReadParams` and
        posted to ``/v1/read``. When the response describes a job (it has
        ``status`` and ``mode`` but no ``metadata``) the job is polled to
        completion and returned as a ``JobReadResult``; otherwise the
        response is returned as a ``ScrapeReadResult``.
        """
        params = ReadParams(**kwargs)
        body = params.model_dump(exclude_none=True)
        api_body = _to_camel_case(body)

        envelope = await self._request("POST", "/v1/read", json=api_body)
        data = envelope.get("data") or {}

        # Async job: data has status + mode, no markdown/metadata.
        if "status" in data and "mode" in data and "metadata" not in data:
            job = await self.wait_for_job(str(data["id"]))
            return JobReadResult(kind="job", data=job)

        # Sync scrape.
        scrape = ScrapeResult(**_to_snake_case(data))
        return ScrapeReadResult(kind="scrape", data=scrape)

    async def get_job(
        self,
        job_id: str,
        skip: Optional[int] = None,
        limit: Optional[int] = None,
    ) -> tuple[Job, bool]:
        """Get a single page of job results. Returns ``(job, has_more)``.

        ``skip`` and ``limit`` are sent as query parameters only when given.
        ``has_more`` mirrors ``pagination.hasMore`` in the response envelope
        and is ``False`` when the envelope carries no pagination.
        """
        params: dict[str, Any] = {}
        if skip is not None:
            params["skip"] = skip
        if limit is not None:
            params["limit"] = limit

        envelope = await self._request(
            "GET", f"/v1/jobs/{job_id}", params=params or None
        )
        job = Job(**_to_snake_case(envelope["data"]))
        pagination = envelope.get("pagination") or {}
        return job, bool(pagination.get("hasMore"))

    async def get_all_job_results(self, job_id: str) -> list[Page]:
        """Fetch every page result by following pagination."""
        pages: list[Page] = []
        skip = 0
        limit = 100
        while True:
            job, has_more = await self.get_job(job_id, skip=skip, limit=limit)
            pages.extend(job.results)
            # An empty page cannot make progress; stop even if the server
            # still reports more, otherwise this loop would never end.
            if not has_more or not job.results:
                break
            # Advance by what was actually returned so a server-side page
            # cap below `limit` does not cause results to be skipped.
            skip += len(job.results)
        return pages

    async def cancel_job(self, job_id: str) -> None:
        """Cancel a job by sending ``DELETE /v1/jobs/{job_id}``."""
        await self._request("DELETE", f"/v1/jobs/{job_id}")

    async def retry_job(self, job_id: str) -> dict[str, Any]:
        """Post to the job's ``/retry`` endpoint and return the ``data`` payload."""
        envelope = await self._request("POST", f"/v1/jobs/{job_id}/retry")
        return envelope["data"]

    async def get_credits(self) -> Credits:
        """Fetch ``/v1/usage/credits`` and return it as :class:`Credits`."""
        envelope = await self._request("GET", "/v1/usage/credits")
        return Credits(**_to_snake_case(envelope["data"]))

    async def wait_for_job(
        self,
        job_id: str,
        poll_interval: int = DEFAULT_POLL_INTERVAL,
        timeout: int = DEFAULT_POLL_TIMEOUT,
    ) -> Job:
        """Poll a job until it terminates. Collects all results when done.

        Args:
            job_id: Job to poll.
            poll_interval: Seconds to sleep between polls.
            timeout: Overall polling budget in seconds.

        Returns:
            The job once its status is ``completed``, ``failed`` or
            ``cancelled``. Only a completed job has its full result list
            fetched; the other terminal states return the polled page as-is.

        Raises:
            ScrapeTimeoutError: If no terminal status is seen within
                ``timeout`` seconds.
        """
        # Monotonic clock: a wall-clock adjustment must not shorten or
        # extend the polling budget.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            job, _ = await self.get_job(job_id, limit=1)
            if job.status in ("completed", "failed", "cancelled"):
                if job.status == "completed":
                    job.results = await self.get_all_job_results(job_id)
                return job
            await asyncio.sleep(poll_interval)

        raise ScrapeTimeoutError(
            f"Job {job_id} polling timed out after {timeout}s",
            code="scrape_timeout",
            http_status=504,
            details={"timeoutMs": timeout * 1000},
        )

    async def stream(
        self,
        job_id: str,
        timeout: int = DEFAULT_STREAM_TIMEOUT,
    ) -> AsyncIterator[StreamEvent]:
        """Stream real-time events for a running job via Server-Sent Events.

        Yields parsed :class:`StreamEvent` instances as the job makes progress.
        The stream closes automatically when the job reaches a terminal state.

        Raises:
            ReaderApiError: If the stream endpoint answers with status >= 400
                (a typed subclass when the body carries an error envelope).
            ScrapeTimeoutError: If connecting times out, if no data arrives
                for ``timeout`` seconds, or if the stream outlives ``timeout``.

        Example::

            async for event in client.stream(job_id):
                if event.type == "page":
                    print("page:", event.data.url)
                elif event.type == "done":
                    break
        """
        url = f"{self._base_url}/v1/jobs/{job_id}/stream"
        headers = {"x-api-key": self._api_key, "Accept": "text/event-stream"}

        # SSE is a long-lived connection, so the short default read timeout
        # is not used. The read timeout is bounded by the stream budget
        # instead of being disabled: the elapsed-time check in the parser
        # only runs when a line arrives, so a silent server would otherwise
        # block this generator forever.
        sse_timeout = httpx.Timeout(None, connect=10.0, read=timeout)

        try:
            async with httpx.AsyncClient(timeout=sse_timeout) as sse_client:
                async with sse_client.stream("GET", url, headers=headers) as response:
                    if response.status_code >= 400:
                        # Drain the body so we can parse an error envelope.
                        body_bytes = b"".join(
                            [chunk async for chunk in response.aiter_bytes()]
                        )
                        try:
                            body = json.loads(body_bytes.decode("utf-8"))
                        except (ValueError, UnicodeDecodeError):
                            body = None
                        request_id = response.headers.get("x-request-id")
                        if (
                            isinstance(body, dict)
                            and "error" in body
                            and isinstance(body["error"], dict)
                        ):
                            raise to_reader_api_error(
                                body["error"], response.status_code, request_id
                            )
                        raise ReaderApiError(
                            f"Stream failed with status {response.status_code}",
                            code="internal_error",
                            http_status=response.status_code,
                            request_id=request_id,
                        )

                    async for event in _parse_async_sse_stream(
                        response.aiter_lines(),
                        timeout,
                    ):
                        yield event
        except httpx.TimeoutException as exc:
            raise ScrapeTimeoutError(
                f"Job {job_id} stream timed out",
                code="scrape_timeout",
                http_status=504,
            ) from exc

    async def close(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._client.aclose()

    async def __aenter__(self):
        return self

    async def __aexit__(self, *args):
        await self.close()

    # ── Internal ─────────────────────────────────────────────────────

    async def _request(self, method: str, path: str, **kwargs: Any) -> dict[str, Any]:
        """Send a request with retries on transient failures.

        Retries on 5xx and 429 with exponential backoff (1s, 2s, 4s...).
        When a ``RateLimitedError`` carries ``retry_after_seconds`` that
        value is used as the delay instead of the backoff. Request timeouts
        and connection failures are retried as well. Client errors (4xx
        other than 429) are raised immediately.

        Returns:
            The decoded JSON object of a response with status < 400.

        Raises:
            ReaderApiError: For error responses, non-object success bodies,
                or once all attempts are exhausted (the last error seen).
        """
        last_error: Optional[Exception] = None
        # Always make at least one attempt, even if max_retries is negative.
        attempts = max(self._max_retries, 0) + 1

        for attempt in range(attempts):
            is_last_attempt = attempt == attempts - 1
            try:
                res = await self._client.request(method, path, **kwargs)
            except httpx.TimeoutException as exc:
                last_error = ScrapeTimeoutError(
                    "Request timed out",
                    code="scrape_timeout",
                    http_status=504,
                )
                last_error.__cause__ = exc
            except httpx.ConnectError as exc:
                last_error = ReaderApiError(
                    "Connection failed",
                    code="upstream_unavailable",
                    http_status=502,
                )
                last_error.__cause__ = exc
            else:
                request_id = res.headers.get("x-request-id")
                try:
                    data = res.json()
                except ValueError:
                    # Body is not valid JSON (or not decodable); handled below.
                    data = None

                if res.status_code < 400:
                    if not isinstance(data, dict):
                        raise ReaderApiError(
                            "Invalid response from Reader API",
                            code="internal_error",
                            http_status=res.status_code,
                            request_id=request_id,
                        )
                    return data

                # Error response — build typed exception.
                if isinstance(data, dict) and isinstance(data.get("error"), dict):
                    err: ReaderApiError = to_reader_api_error(
                        data["error"], res.status_code, request_id
                    )
                else:
                    err = ReaderApiError(
                        f"Request failed with status {res.status_code}",
                        code="internal_error",
                        http_status=res.status_code,
                        request_id=request_id,
                    )

                # Never retry 4xx except 429.
                if res.status_code < 500 and res.status_code != 429:
                    raise err

                last_error = err

                # A server-provided retry delay replaces the backoff.
                if isinstance(err, RateLimitedError) and err.retry_after_seconds:
                    if not is_last_attempt:
                        await asyncio.sleep(err.retry_after_seconds)
                    continue

            # Exponential backoff before next attempt.
            if not is_last_attempt:
                await asyncio.sleep(2 ** attempt)

        if last_error is None:
            # Defensive: every loop iteration either returns, raises or
            # records an error, so this is not expected to happen.
            raise ReaderApiError(
                "Request failed without a response",
                code="internal_error",
                http_status=500,
            )
        raise last_error
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
async def _parse_async_sse_stream(
|
|
308
|
+
lines: AsyncIterator[str],
|
|
309
|
+
timeout: int,
|
|
310
|
+
) -> AsyncIterator[StreamEvent]:
|
|
311
|
+
"""Async variant of client._parse_sse_stream — accumulates SSE lines
|
|
312
|
+
into frames and yields parsed StreamEvents. Uses the same frame parser
|
|
313
|
+
helper (_parse_sse_event) as the sync client.
|
|
314
|
+
"""
|
|
315
|
+
start = time.time()
|
|
316
|
+
current_event = ""
|
|
317
|
+
current_data: list[str] = []
|
|
318
|
+
|
|
319
|
+
async for line in lines:
|
|
320
|
+
if time.time() - start > timeout:
|
|
321
|
+
raise ScrapeTimeoutError(
|
|
322
|
+
"Stream read timed out",
|
|
323
|
+
code="scrape_timeout",
|
|
324
|
+
http_status=504,
|
|
325
|
+
)
|
|
326
|
+
|
|
327
|
+
if line == "":
|
|
328
|
+
if current_event and current_data:
|
|
329
|
+
parsed = _parse_sse_event(current_event, "\n".join(current_data))
|
|
330
|
+
if parsed is not None:
|
|
331
|
+
yield parsed
|
|
332
|
+
if isinstance(parsed, DoneEvent):
|
|
333
|
+
return
|
|
334
|
+
current_event = ""
|
|
335
|
+
current_data = []
|
|
336
|
+
continue
|
|
337
|
+
|
|
338
|
+
if line.startswith(":"):
|
|
339
|
+
continue
|
|
340
|
+
|
|
341
|
+
if line.startswith("event:"):
|
|
342
|
+
current_event = line[len("event:") :].strip()
|
|
343
|
+
elif line.startswith("data:"):
|
|
344
|
+
current_data.append(line[len("data:") :].strip())
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
class AsyncSessionsAPI:
    """Async wrapper around the ``/v1/sessions`` browser-session endpoints.

    All HTTP traffic goes through the awaitable request callable supplied
    at construction time.
    """

    def __init__(self, request_fn: Any):
        # Awaitable ``(method, path, **kwargs) -> envelope`` callable.
        self._request = request_fn

    async def create(self, **kwargs: Any) -> SessionInfo:
        """Create a browser session. Returns a CDP WebSocket URL."""
        payload = {} if not kwargs else _to_camel_case(kwargs)
        response = await self._request("POST", "/v1/sessions", json=payload)
        fields = _to_snake_case(response["data"])
        return SessionInfo(**fields)

    async def get(self, session_id: str) -> SessionInfo:
        """Get session status."""
        response = await self._request("GET", f"/v1/sessions/{session_id}")
        fields = _to_snake_case(response["data"])
        return SessionInfo(**fields)

    async def stop(self, session_id: str) -> StopSessionResult:
        """Stop a browser session."""
        response = await self._request("DELETE", f"/v1/sessions/{session_id}")
        fields = _to_snake_case(response["data"])
        return StopSessionResult(**fields)

    async def list(self) -> list[SessionInfo]:
        """List active sessions."""
        response = await self._request("GET", "/v1/sessions")
        sessions = []
        for entry in response["data"]:
            sessions.append(SessionInfo(**_to_snake_case(entry)))
        return sessions
|
reader_py/client.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
"""Synchronous Reader SDK client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from typing import Any, Iterator, Optional
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
|
|
11
|
+
from .errors import (
|
|
12
|
+
RateLimitedError,
|
|
13
|
+
ReaderApiError,
|
|
14
|
+
ScrapeTimeoutError,
|
|
15
|
+
to_reader_api_error,
|
|
16
|
+
)
|
|
17
|
+
from .types import (
|
|
18
|
+
Credits,
|
|
19
|
+
DoneEvent,
|
|
20
|
+
ErrorEvent,
|
|
21
|
+
Job,
|
|
22
|
+
JobReadResult,
|
|
23
|
+
Page,
|
|
24
|
+
PageEvent,
|
|
25
|
+
ProgressEvent,
|
|
26
|
+
ReadParams,
|
|
27
|
+
ReadResult,
|
|
28
|
+
ScrapeReadResult,
|
|
29
|
+
ScrapeResult,
|
|
30
|
+
SessionInfo,
|
|
31
|
+
StopSessionResult,
|
|
32
|
+
StreamEvent,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
DEFAULT_BASE_URL = "https://api.reader.dev"
DEFAULT_TIMEOUT = 60
DEFAULT_MAX_RETRIES = 2
DEFAULT_POLL_INTERVAL = 2
DEFAULT_POLL_TIMEOUT = 300
DEFAULT_STREAM_TIMEOUT = 600  # per-job stream can run longer than a poll


class ReaderClient:
    """Synchronous Reader API client.

    Wraps an ``httpx.Client`` that sends the API key in the ``x-api-key``
    header, and exposes the read, job and credits endpoints plus a
    ``sessions`` sub-API. Use it as a context manager (or call
    :meth:`close`) so the underlying HTTP client is released.

    Example::

        client = ReaderClient(api_key="rdr_your_key")
        result = client.read(url="https://example.com")
        if result.kind == "scrape":
            print(result.data.markdown)
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = DEFAULT_BASE_URL,
        timeout: int = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ):
        """Create the client.

        Args:
            api_key: Reader API key; must be non-empty.
            base_url: API root; a trailing ``/`` is stripped.
            timeout: Timeout handed to ``httpx.Client`` for regular requests.
            max_retries: Number of retries after the first attempt for
                transient failures (see :meth:`_request`).

        Raises:
            ReaderApiError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ReaderApiError(
                "API key is required",
                code="unauthenticated",
                http_status=401,
            )

        self._api_key = api_key
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._max_retries = max_retries

        self._client = httpx.Client(
            base_url=self._base_url,
            headers={
                "x-api-key": api_key,
                "Content-Type": "application/json",
            },
            timeout=timeout,
        )

        self.sessions = SessionsAPI(self._request)

    def read(self, **kwargs: Any) -> ReadResult:
        """Read (scrape, batch, or crawl) URLs.

        Single URL → sync scrape, returned immediately.
        Multiple URLs or ``max_depth``/``max_pages`` → async job, polled to
        completion.
        """
        params = ReadParams(**kwargs)
        body = params.model_dump(exclude_none=True)
        api_body = _to_camel_case(body)

        envelope = self._request("POST", "/v1/read", json=api_body)
        data = envelope.get("data") or {}

        # Async job: data has status + mode, no markdown/metadata.
        if "status" in data and "mode" in data and "metadata" not in data:
            job = self.wait_for_job(str(data["id"]))
            return JobReadResult(kind="job", data=job)

        # Sync scrape.
        scrape = ScrapeResult(**_to_snake_case(data))
        return ScrapeReadResult(kind="scrape", data=scrape)

    def get_job(
        self,
        job_id: str,
        skip: Optional[int] = None,
        limit: Optional[int] = None,
    ) -> tuple[Job, bool]:
        """Get a single page of job results. Returns ``(job, has_more)``.

        ``skip`` and ``limit`` are sent as query parameters only when given.
        ``has_more`` mirrors ``pagination.hasMore`` in the response envelope
        and is ``False`` when the envelope carries no pagination.
        """
        params: dict[str, Any] = {}
        if skip is not None:
            params["skip"] = skip
        if limit is not None:
            params["limit"] = limit

        envelope = self._request("GET", f"/v1/jobs/{job_id}", params=params or None)
        job = Job(**_to_snake_case(envelope["data"]))
        pagination = envelope.get("pagination") or {}
        return job, bool(pagination.get("hasMore"))

    def get_all_job_results(self, job_id: str) -> list[Page]:
        """Fetch every page result by following pagination."""
        pages: list[Page] = []
        skip = 0
        limit = 100
        while True:
            job, has_more = self.get_job(job_id, skip=skip, limit=limit)
            pages.extend(job.results)
            # An empty page cannot make progress; stop even if the server
            # still reports more, otherwise this loop would never end.
            if not has_more or not job.results:
                break
            # Advance by what was actually returned so a server-side page
            # cap below `limit` does not cause results to be skipped.
            skip += len(job.results)
        return pages

    def cancel_job(self, job_id: str) -> None:
        """Cancel a running job. Raises :class:`ConflictError` if terminal."""
        self._request("DELETE", f"/v1/jobs/{job_id}")

    def retry_job(self, job_id: str) -> dict[str, Any]:
        """Retry the failed URLs in a job."""
        envelope = self._request("POST", f"/v1/jobs/{job_id}/retry")
        return envelope["data"]

    def get_credits(self) -> Credits:
        """Get the current credit balance for this workspace."""
        envelope = self._request("GET", "/v1/usage/credits")
        return Credits(**_to_snake_case(envelope["data"]))

    def wait_for_job(
        self,
        job_id: str,
        poll_interval: int = DEFAULT_POLL_INTERVAL,
        timeout: int = DEFAULT_POLL_TIMEOUT,
    ) -> Job:
        """Poll a job until it terminates. Collects all results when done.

        Args:
            job_id: Job to poll.
            poll_interval: Seconds to sleep between polls.
            timeout: Overall polling budget in seconds.

        Returns:
            The job once its status is ``completed``, ``failed`` or
            ``cancelled``. Only a completed job has its full result list
            fetched; the other terminal states return the polled page as-is.

        Raises:
            ScrapeTimeoutError: If no terminal status is seen within
                ``timeout`` seconds.
        """
        # Monotonic clock: a wall-clock adjustment must not shorten or
        # extend the polling budget.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            job, _ = self.get_job(job_id, limit=1)
            if job.status in ("completed", "failed", "cancelled"):
                if job.status == "completed":
                    job.results = self.get_all_job_results(job_id)
                return job
            time.sleep(poll_interval)

        raise ScrapeTimeoutError(
            f"Job {job_id} polling timed out after {timeout}s",
            code="scrape_timeout",
            http_status=504,
            details={"timeoutMs": timeout * 1000},
        )

    def stream(
        self,
        job_id: str,
        timeout: int = DEFAULT_STREAM_TIMEOUT,
    ) -> Iterator[StreamEvent]:
        """Stream real-time events for a running job via Server-Sent Events.

        Yields parsed :class:`StreamEvent` instances as the job makes progress.
        The stream closes automatically when the job reaches a terminal state.

        Raises:
            ReaderApiError: If the stream endpoint answers with status >= 400
                (a typed subclass when the body carries an error envelope).
            ScrapeTimeoutError: If connecting times out, if no data arrives
                for ``timeout`` seconds, or if the stream outlives ``timeout``.

        Example::

            for event in client.stream(job_id):
                if event.type == "page":
                    print("page:", event.data.url)
                elif event.type == "done":
                    print("finished:", event.status)
                    break
        """
        url = f"{self._base_url}/v1/jobs/{job_id}/stream"
        headers = {"x-api-key": self._api_key, "Accept": "text/event-stream"}

        # SSE is a long-lived connection, so the short default read timeout
        # is not used. The read timeout is bounded by the stream budget
        # instead of being disabled: the elapsed-time check in the parser
        # only runs when a line arrives, so a silent server would otherwise
        # block this generator forever.
        sse_timeout = httpx.Timeout(None, connect=10.0, read=timeout)

        try:
            with httpx.Client(timeout=sse_timeout) as sse_client:
                with sse_client.stream("GET", url, headers=headers) as response:
                    if response.status_code >= 400:
                        # Drain the body so we can parse an error envelope.
                        body_bytes = b"".join(response.iter_bytes())
                        try:
                            body = json.loads(body_bytes.decode("utf-8"))
                        except (ValueError, UnicodeDecodeError):
                            body = None
                        request_id = response.headers.get("x-request-id")
                        if (
                            isinstance(body, dict)
                            and "error" in body
                            and isinstance(body["error"], dict)
                        ):
                            raise to_reader_api_error(
                                body["error"], response.status_code, request_id
                            )
                        raise ReaderApiError(
                            f"Stream failed with status {response.status_code}",
                            code="internal_error",
                            http_status=response.status_code,
                            request_id=request_id,
                        )

                    yield from _parse_sse_stream(response.iter_lines(), timeout)
        except httpx.TimeoutException as exc:
            raise ScrapeTimeoutError(
                f"Job {job_id} stream timed out",
                code="scrape_timeout",
                http_status=504,
            ) from exc

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    # ── Internal ─────────────────────────────────────────────────────

    def _request(self, method: str, path: str, **kwargs: Any) -> dict[str, Any]:
        """Send a request with retries on transient failures.

        Retries on 5xx and 429 with exponential backoff (1s, 2s, 4s...).
        When a ``RateLimitedError`` carries ``retry_after_seconds`` that
        value is used as the delay instead of the backoff. Request timeouts
        and connection failures are retried as well. Client errors (4xx
        other than 429) are raised immediately.

        Returns:
            The decoded JSON object of a response with status < 400.

        Raises:
            ReaderApiError: For error responses, non-object success bodies,
                or once all attempts are exhausted (the last error seen).
        """
        last_error: Optional[Exception] = None
        # Always make at least one attempt, even if max_retries is negative.
        attempts = max(self._max_retries, 0) + 1

        for attempt in range(attempts):
            is_last_attempt = attempt == attempts - 1
            try:
                res = self._client.request(method, path, **kwargs)
            except httpx.TimeoutException as exc:
                last_error = ScrapeTimeoutError(
                    "Request timed out",
                    code="scrape_timeout",
                    http_status=504,
                )
                last_error.__cause__ = exc
            except httpx.ConnectError as exc:
                last_error = ReaderApiError(
                    "Connection failed",
                    code="upstream_unavailable",
                    http_status=502,
                )
                last_error.__cause__ = exc
            else:
                request_id = res.headers.get("x-request-id")
                try:
                    data = res.json()
                except ValueError:
                    # Body is not valid JSON (or not decodable); handled below.
                    data = None

                if res.status_code < 400:
                    if not isinstance(data, dict):
                        raise ReaderApiError(
                            "Invalid response from Reader API",
                            code="internal_error",
                            http_status=res.status_code,
                            request_id=request_id,
                        )
                    return data

                # Error response — build typed exception.
                if isinstance(data, dict) and isinstance(data.get("error"), dict):
                    err: ReaderApiError = to_reader_api_error(
                        data["error"], res.status_code, request_id
                    )
                else:
                    err = ReaderApiError(
                        f"Request failed with status {res.status_code}",
                        code="internal_error",
                        http_status=res.status_code,
                        request_id=request_id,
                    )

                # Never retry 4xx except 429.
                if res.status_code < 500 and res.status_code != 429:
                    raise err

                last_error = err

                # A server-provided retry delay replaces the backoff.
                if isinstance(err, RateLimitedError) and err.retry_after_seconds:
                    if not is_last_attempt:
                        time.sleep(err.retry_after_seconds)
                    continue

            # Exponential backoff before next attempt.
            if not is_last_attempt:
                time.sleep(2 ** attempt)

        if last_error is None:
            # Defensive: every loop iteration either returns, raises or
            # records an error, so this is not expected to happen.
            raise ReaderApiError(
                "Request failed without a response",
                code="internal_error",
                http_status=500,
            )
        raise last_error
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
class SessionsAPI:
    """Synchronous wrapper around the ``/v1/sessions`` browser-session endpoints.

    All HTTP traffic goes through the request callable supplied at
    construction time.
    """

    def __init__(self, request_fn: Any):
        # ``(method, path, **kwargs) -> envelope`` callable.
        self._request = request_fn

    def create(self, **kwargs: Any) -> SessionInfo:
        """Create a browser session. Returns a CDP WebSocket URL."""
        payload = {} if not kwargs else _to_camel_case(kwargs)
        response = self._request("POST", "/v1/sessions", json=payload)
        fields = _to_snake_case(response["data"])
        return SessionInfo(**fields)

    def get(self, session_id: str) -> SessionInfo:
        """Get session status."""
        response = self._request("GET", f"/v1/sessions/{session_id}")
        fields = _to_snake_case(response["data"])
        return SessionInfo(**fields)

    def stop(self, session_id: str) -> StopSessionResult:
        """Stop a browser session."""
        response = self._request("DELETE", f"/v1/sessions/{session_id}")
        fields = _to_snake_case(response["data"])
        return StopSessionResult(**fields)

    def list(self) -> list[SessionInfo]:
        """List active sessions."""
        response = self._request("GET", "/v1/sessions")
        sessions = []
        for entry in response["data"]:
            sessions.append(SessionInfo(**_to_snake_case(entry)))
        return sessions
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _to_camel_case(data: dict[str, Any]) -> dict[str, Any]:
|
|
351
|
+
"""Convert snake_case dict keys to camelCase (top-level only)."""
|
|
352
|
+
result: dict[str, Any] = {}
|
|
353
|
+
for key, value in data.items():
|
|
354
|
+
parts = key.split("_")
|
|
355
|
+
camel = parts[0] + "".join(p.capitalize() for p in parts[1:])
|
|
356
|
+
result[camel] = value
|
|
357
|
+
return result
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _to_snake_case(data: Any) -> Any:
|
|
361
|
+
"""Recursively convert camelCase dict keys to snake_case."""
|
|
362
|
+
import re
|
|
363
|
+
|
|
364
|
+
if isinstance(data, dict):
|
|
365
|
+
result: dict[str, Any] = {}
|
|
366
|
+
for key, value in data.items():
|
|
367
|
+
snake = re.sub(r"(?<!^)(?=[A-Z])", "_", key).lower()
|
|
368
|
+
result[snake] = _to_snake_case(value)
|
|
369
|
+
return result
|
|
370
|
+
if isinstance(data, list):
|
|
371
|
+
return [_to_snake_case(v) for v in data]
|
|
372
|
+
return data
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _parse_sse_event(event_name: str, raw_data: str) -> Optional[StreamEvent]:
|
|
376
|
+
"""Parse a single SSE frame into a typed StreamEvent, or None if unknown."""
|
|
377
|
+
try:
|
|
378
|
+
payload = json.loads(raw_data)
|
|
379
|
+
except json.JSONDecodeError:
|
|
380
|
+
return None
|
|
381
|
+
|
|
382
|
+
snake = _to_snake_case(payload)
|
|
383
|
+
|
|
384
|
+
if event_name == "progress":
|
|
385
|
+
return ProgressEvent(**snake)
|
|
386
|
+
if event_name == "page":
|
|
387
|
+
return PageEvent(data=Page(**snake))
|
|
388
|
+
if event_name == "error":
|
|
389
|
+
return ErrorEvent(**snake)
|
|
390
|
+
if event_name == "done":
|
|
391
|
+
return DoneEvent(**snake)
|
|
392
|
+
return None
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def _parse_sse_stream(lines: Iterator[str], timeout: int) -> Iterator[StreamEvent]:
|
|
396
|
+
"""Accumulate SSE lines into frames and yield parsed StreamEvents.
|
|
397
|
+
|
|
398
|
+
SSE frames are separated by blank lines. Each frame has `event: <name>`
|
|
399
|
+
and `data: <json>` lines. Comment lines (starting with `:`) are keep-alive
|
|
400
|
+
pings and skipped. The generator completes when it yields a ``done`` event
|
|
401
|
+
or the stream closes.
|
|
402
|
+
"""
|
|
403
|
+
start = time.time()
|
|
404
|
+
current_event = ""
|
|
405
|
+
current_data: list[str] = []
|
|
406
|
+
|
|
407
|
+
for line in lines:
|
|
408
|
+
if time.time() - start > timeout:
|
|
409
|
+
raise ScrapeTimeoutError(
|
|
410
|
+
"Stream read timed out",
|
|
411
|
+
code="scrape_timeout",
|
|
412
|
+
http_status=504,
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
# Blank line = frame boundary
|
|
416
|
+
if line == "":
|
|
417
|
+
if current_event and current_data:
|
|
418
|
+
parsed = _parse_sse_event(current_event, "\n".join(current_data))
|
|
419
|
+
if parsed is not None:
|
|
420
|
+
yield parsed
|
|
421
|
+
if isinstance(parsed, DoneEvent):
|
|
422
|
+
return
|
|
423
|
+
current_event = ""
|
|
424
|
+
current_data = []
|
|
425
|
+
continue
|
|
426
|
+
|
|
427
|
+
# Comment / keep-alive
|
|
428
|
+
if line.startswith(":"):
|
|
429
|
+
continue
|
|
430
|
+
|
|
431
|
+
if line.startswith("event:"):
|
|
432
|
+
current_event = line[len("event:") :].strip()
|
|
433
|
+
elif line.startswith("data:"):
|
|
434
|
+
current_data.append(line[len("data:") :].strip())
|
reader_py/errors.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Typed error classes mirroring the reader-api error code catalog.
|
|
2
|
+
|
|
3
|
+
The API returns a stable ``code`` field on every error response. The SDK
|
|
4
|
+
branches on that code and raises a specific subclass so callers can write::
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
client.read(url=url)
|
|
8
|
+
except InsufficientCreditsError as err:
|
|
9
|
+
print(err.required, err.available, err.reset_at)
|
|
10
|
+
|
|
11
|
+
There is one subclass per code. Unknown codes fall through to the base
|
|
12
|
+
:class:`ReaderApiError`.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from typing import Any, Optional
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ReaderApiError(Exception):
    """Base error for all Reader API responses.

    The message is forwarded to ``Exception`` (so ``str(err)`` is the
    message); everything else is keyword-only and stored as attributes.

    Attributes:
        code: Error code string; defaults to ``"internal_error"``.
        http_status: HTTP status associated with the error; defaults to ``0``.
        details: Extra error data; a falsy value is replaced by a new ``{}``.
        docs_url: Optional documentation link.
        request_id: Optional request identifier.
    """

    def __init__(
        self,
        message: str,
        *,
        code: str = "internal_error",
        http_status: int = 0,
        details: Optional[dict[str, Any]] = None,
        docs_url: Optional[str] = None,
        request_id: Optional[str] = None,
    ) -> None:
        super().__init__(message)
        self.code = code
        self.http_status = http_status
        # Subclasses call `.get` on this, so never leave it as None.
        self.details = details if details else {}
        self.docs_url = docs_url
        self.request_id = request_id
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class InvalidRequestError(ReaderApiError):
    """Error type selected by ``to_reader_api_error`` for the ``invalid_request`` code."""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class UnauthenticatedError(ReaderApiError):
    """Error type selected by ``to_reader_api_error`` for the ``unauthenticated`` code."""
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class InsufficientCreditsError(ReaderApiError):
    """Error for the ``insufficient_credits`` code.

    Exposes ``required``, ``available`` and ``reset_at``, read from the
    ``required`` / ``available`` / ``resetAt`` keys of ``details``; each is
    ``None`` when its key is absent.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        info = self.details
        self.required: Optional[int] = info.get("required")
        self.available: Optional[int] = info.get("available")
        self.reset_at: Optional[str] = info.get("resetAt")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class UrlBlockedError(ReaderApiError):
    """Error for the ``url_blocked`` code.

    Exposes ``url`` and ``reason``, read from the same-named keys of
    ``details``; each is ``None`` when its key is absent.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        info = self.details
        self.url: Optional[str] = info.get("url")
        self.reason: Optional[str] = info.get("reason")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class NotFoundError(ReaderApiError):
    """Error type selected by ``to_reader_api_error`` for the ``not_found`` code."""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ConflictError(ReaderApiError):
    """Error type selected by ``to_reader_api_error`` for the ``conflict`` code."""
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class RateLimitedError(ReaderApiError):
    """Error for the ``rate_limited`` code.

    Exposes ``retry_after_seconds``, ``limit`` and ``window_seconds``, read
    from the ``retryAfterSeconds`` / ``limit`` / ``windowSeconds`` keys of
    ``details``; each is ``None`` when its key is absent.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        info = self.details
        self.retry_after_seconds: Optional[int] = info.get("retryAfterSeconds")
        self.limit: Optional[int] = info.get("limit")
        self.window_seconds: Optional[int] = info.get("windowSeconds")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class ConcurrencyLimitedError(ReaderApiError):
    """Error for the ``concurrency_limited`` code.

    Exposes ``active`` and ``max``, read from the same-named keys of
    ``details``; each is ``None`` when its key is absent.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        info = self.details
        self.active: Optional[int] = info.get("active")
        self.max: Optional[int] = info.get("max")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class InternalServerError(ReaderApiError):
    """Error type selected by ``to_reader_api_error`` for the ``internal_error`` code."""
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class UpstreamUnavailableError(ReaderApiError):
    """Error type selected by ``to_reader_api_error`` for the ``upstream_unavailable`` code."""
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class ScrapeTimeoutError(ReaderApiError):
    """Error for the ``scrape_timeout`` code.

    Exposes ``timeout_ms``, read from the ``timeoutMs`` key of ``details``
    (``None`` when the key is absent).
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        info = self.details
        self.timeout_ms: Optional[int] = info.get("timeoutMs")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Lookup table from the API's `code` string to the exception class that
# `to_reader_api_error` instantiates. Codes missing from this table fall back
# to the base `ReaderApiError` there.
_CODE_MAP: dict[str, type[ReaderApiError]] = {
    "invalid_request": InvalidRequestError,
    "unauthenticated": UnauthenticatedError,
    "insufficient_credits": InsufficientCreditsError,
    "url_blocked": UrlBlockedError,
    "not_found": NotFoundError,
    "conflict": ConflictError,
    "rate_limited": RateLimitedError,
    "concurrency_limited": ConcurrencyLimitedError,
    "internal_error": InternalServerError,
    "upstream_unavailable": UpstreamUnavailableError,
    "scrape_timeout": ScrapeTimeoutError,
}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def to_reader_api_error(
    body: dict[str, Any],
    http_status: int,
    request_id: Optional[str] = None,
) -> ReaderApiError:
    """Build the right error subclass from an error envelope body.

    Args:
        body: Decoded error body; the ``code``, ``message``, ``details`` and
            ``docsUrl`` keys are read, all optional.
        http_status: HTTP status to record on the error.
        request_id: Optional request identifier to record on the error.

    Returns:
        An instance of the class registered for ``code`` in ``_CODE_MAP``,
        or a plain ``ReaderApiError`` for an unregistered code. The error is
        returned, not raised.
    """
    # A missing, null, empty or non-string code is treated as
    # "internal_error" so `err.code` is always a usable string and the
    # table lookup can never hit an unhashable key.
    code = body.get("code")
    if not isinstance(code, str) or not code:
        code = "internal_error"

    # Subclasses call `.get` on details, so anything but a dict is dropped.
    details = body.get("details")
    if not isinstance(details, dict):
        details = {}

    cls = _CODE_MAP.get(code, ReaderApiError)
    return cls(
        # `or` also covers an explicit JSON null / empty message.
        body.get("message") or "Reader API error",
        code=code,
        http_status=http_status,
        details=details,
        docs_url=body.get("docsUrl"),
        request_id=request_id,
    )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Backwards-compat alias for users still catching the old class name.
# It is the same class object, so `except ReaderError` catches everything
# that `except ReaderApiError` does.
ReaderError = ReaderApiError
|
reader_py/types.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""Reader SDK types — mirror the reader-api envelope contract."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Annotated, Any, Literal, Optional, Union
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Proxy mode accepted on requests (ReadParams.proxy_mode).
ProxyMode = Literal["standard", "stealth", "auto"]
# Proxy mode reported on results (ScrapeMetadata, Page, UsageEntry); no "auto".
ResolvedProxyMode = Literal["standard", "stealth"]
# Status values shared by Job, ProgressEvent and DoneEvent.
JobStatus = Literal["queued", "processing", "completed", "failed", "cancelled"]
# Values allowed for Job.mode.
JobMode = Literal["scrape", "batch", "crawl"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class WebhookConfig(BaseModel):
    """Per-request webhook configuration for async job notifications.

    Only ``url`` is required.
    """

    url: str
    # NOTE(review): presumably the event names to be notified about — confirm.
    events: Optional[list[str]] = None
    # NOTE(review): presumably used to sign/verify deliveries — confirm.
    secret: Optional[str] = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ReadParams(BaseModel):
    """Parameters for a read request.

    Every field has a default, so ``ReadParams()`` validates on its own.
    NOTE(review): presumably serialized into the ``POST /v1/read`` body by
    the clients — confirm against the client code.
    """

    # Target(s) of the request; both default to None and nothing in this
    # model enforces that one of them is set.
    url: Optional[str] = None
    urls: Optional[list[str]] = None
    formats: list[str] = ["markdown"]
    only_main_content: bool = True
    include_tags: Optional[list[str]] = None
    exclude_tags: Optional[list[str]] = None
    wait_for_selector: Optional[str] = None
    timeout_ms: int = 30000
    proxy_mode: Optional[ProxyMode] = None
    # NOTE(review): max_depth / max_pages look like crawl limits — confirm.
    max_depth: Optional[int] = None
    max_pages: Optional[int] = None
    cache: bool = True
    webhook: Optional[WebhookConfig] = None
    batch_concurrency: Optional[int] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ScrapeMetadata(BaseModel):
    """Metadata attached to a ``ScrapeResult``.

    ``duration``, ``cached`` and ``scraped_at`` are required; the rest are
    optional.
    """

    title: Optional[str] = None
    description: Optional[str] = None
    status_code: Optional[int] = None
    # NOTE(review): unit is not stated here — presumably milliseconds; confirm.
    duration: int
    cached: bool
    proxy_mode: Optional[ResolvedProxyMode] = None
    proxy_escalated: Optional[bool] = None
    # Kept as a plain string; no datetime parsing happens in this model.
    scraped_at: str
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class ScrapeResult(BaseModel):
    """Result of a synchronous single-URL scrape.

    ``url`` and ``metadata`` are required; ``kind`` is fixed to ``"scrape"``.
    """

    kind: Literal["scrape"] = "scrape"
    url: str
    final_url: Optional[str] = None  # Present if URL redirected
    # Content fields; both default to None.
    markdown: Optional[str] = None
    html: Optional[str] = None
    metadata: ScrapeMetadata
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class Page(BaseModel):
    """An individual result inside a job's `results` array.

    Only ``url`` is required. Also used as the ``data`` payload of a
    ``PageEvent``.
    """

    url: str
    markdown: Optional[str] = None
    html: Optional[str] = None
    status_code: Optional[int] = None
    proxy_mode: Optional[ResolvedProxyMode] = None
    proxy_escalated: Optional[bool] = None
    credits: Optional[int] = None
    # Untyped here, unlike ScrapeResult.metadata which is a ScrapeMetadata.
    metadata: Optional[dict[str, Any]] = None
    error: Optional[str] = None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class Job(BaseModel):
    """Job as returned by GET /v1/jobs/:id (the `data` portion of the envelope).

    ``id``, ``status`` and ``mode`` are required; counters default to 0 and
    ``results`` to an empty list.
    """

    kind: Literal["job"] = "job"
    id: str
    status: JobStatus
    mode: JobMode
    completed: int = 0
    total: int = 0
    credits_used: int = 0
    error: Optional[str] = None
    # default_factory gives each instance its own list.
    results: list[Page] = Field(default_factory=list)
    # Timestamps are kept as plain strings; no datetime parsing here.
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    created_at: Optional[str] = None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class Pagination(BaseModel):
    """Pagination block; every field except ``next`` is required."""

    total: int
    skip: int
    limit: int
    has_more: bool
    # NOTE(review): presumably a link or cursor for the next page — confirm.
    next: Optional[str] = None
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class Credits(BaseModel):
    """Credit balance information; all fields are required."""

    balance: int
    limit: int
    used: int
    tier: str
    # Kept as a plain string; no datetime parsing happens in this model.
    reset_at: str
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class UsageEntry(BaseModel):
    """One usage record; only ``proxy_mode`` and ``error`` are optional."""

    id: str
    url: str
    # NOTE(review): unit is not stated here — presumably milliseconds; confirm.
    duration: int
    status: Literal["success", "error"]
    cached: bool
    proxy_mode: Optional[ResolvedProxyMode] = None
    credits: int
    error: Optional[str] = None
    created_at: str
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Backwards-compat aliases — older code used these names.
# Each alias is the same class object as its target, not a subclass.
JobInfo = Job
CreditInfo = Credits
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class ScrapeReadResult(BaseModel):
    """ReadResult variant returned by single-URL scrapes.

    ``kind`` has no default here, so it must be supplied as ``"scrape"``.
    """

    kind: Literal["scrape"]
    data: ScrapeResult
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class JobReadResult(BaseModel):
    """ReadResult variant returned by async batch and crawl jobs.

    ``kind`` has no default here, so it must be supplied as ``"job"``.
    """

    kind: Literal["job"]
    data: Job
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Discriminated union: the `kind` field selects the variant. The explicit
# Field(discriminator="kind") annotation tells Pydantic v2 to dispatch on
# `kind` at validation time (faster and gives better error messages than
# structural matching) and enables IDE / mypy narrowing on `result.data`
# once the caller has branched on `result.kind`.
# Note that this is a type alias, not a model class: it cannot be
# instantiated directly.
ReadResult = Annotated[
    Union[ScrapeReadResult, JobReadResult],
    Field(discriminator="kind"),
]
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ──────────────────────────────────────────────────────────────
|
|
153
|
+
# Streaming events (yielded by client.stream / AsyncReaderClient.stream)
|
|
154
|
+
# ──────────────────────────────────────────────────────────────
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class ProgressEvent(BaseModel):
    """Stream event of type ``"progress"`` carrying a status and two counters."""

    type: Literal["progress"] = "progress"
    status: JobStatus
    completed: int
    total: int
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class PageEvent(BaseModel):
    """Stream event of type ``"page"`` wrapping one ``Page`` under ``data``."""

    type: Literal["page"] = "page"
    data: Page
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class ErrorEvent(BaseModel):
    """Stream event of type ``"error"``; both ``url`` and ``error`` are required."""

    type: Literal["error"] = "error"
    url: str
    error: str
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class DoneEvent(BaseModel):
    """Stream event of type ``"done"``; has the same fields as ``ProgressEvent``."""

    type: Literal["done"] = "done"
    status: JobStatus
    completed: int
    total: int
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# Any of the four event models; each carries a distinct literal `type` value.
StreamEvent = Union[ProgressEvent, PageEvent, ErrorEvent, DoneEvent]


# ─── Browser Sessions ────────────────────────────────────────────────

# Values allowed for SessionInfo.status.
SessionStatus = Literal["active", "stopped", "expired"]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class SessionInfo(BaseModel):
    """Active browser session with a CDP WebSocket endpoint.

    All fields are required.
    """

    session_id: str
    ws_endpoint: str
    token: str
    status: SessionStatus
    # Timestamps are kept as plain strings; no datetime parsing here.
    created_at: str
    expires_at: str
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
class CreateSessionParams(BaseModel):
    """Options for creating a browser session."""

    # Optional; None when not supplied.
    max_duration_ms: Optional[int] = None
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
class StopSessionResult(BaseModel):
    """Result from stopping a browser session.

    All fields are required; ``status`` only accepts ``"stopped"``.
    """

    session_id: str
    status: Literal["stopped"]
    duration_ms: int
    credits_charged: int
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: reader-py
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python SDK for the Reader API
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: reader,scraper,web-scraping,markdown,llm,sdk
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: httpx>=0.25.0
|
|
10
|
+
Requires-Dist: pydantic>=2.0.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest; extra == "dev"
|
|
13
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
14
|
+
|
|
15
|
+
# reader-py
|
|
16
|
+
|
|
17
|
+
Python SDK for the [Reader API](https://reader.dev) — content extraction for LLMs. Wraps `POST /v1/read`, parses responses into Pydantic models, raises typed exceptions, and auto-polls async jobs to completion.
|
|
18
|
+
|
|
19
|
+
**Version:** 0.2.0 · **Python:** 3.9+
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install reader-py
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quick start (sync)
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import os
|
|
31
|
+
from reader_py import ReaderClient
|
|
32
|
+
|
|
33
|
+
reader = ReaderClient(api_key=os.environ["READER_KEY"])
|
|
34
|
+
|
|
35
|
+
result = reader.read(url="https://example.com")
|
|
36
|
+
if result.kind == "scrape":
|
|
37
|
+
print(result.data.markdown)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick start (async)
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import asyncio
|
|
44
|
+
import os
|
|
45
|
+
from reader_py import AsyncReaderClient
|
|
46
|
+
|
|
47
|
+
async def main():
|
|
48
|
+
async with AsyncReaderClient(api_key=os.environ["READER_KEY"]) as reader:
|
|
49
|
+
result = await reader.read(url="https://example.com")
|
|
50
|
+
if result.kind == "scrape":
|
|
51
|
+
print(result.data.markdown)
|
|
52
|
+
|
|
53
|
+
asyncio.run(main())
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
`reader.read(...)` returns a discriminated union (Pydantic):
|
|
57
|
+
|
|
58
|
+
- `ScrapeReadResult(kind="scrape", data=ScrapeResult)` — single-URL requests, returned immediately
|
|
59
|
+
- `JobReadResult(kind="job", data=Job)` — batch and crawl requests, auto-polled to completion
|
|
60
|
+
|
|
61
|
+
## Features
|
|
62
|
+
|
|
63
|
+
- **Sync and async clients** — `ReaderClient` (blocking, backed by `httpx.Client`) and `AsyncReaderClient` (backed by `httpx.AsyncClient`). Same method surface.
|
|
64
|
+
- **Typed errors for all 11 Reader error codes.** `InsufficientCreditsError`, `RateLimitedError`, `UrlBlockedError`, `ScrapeTimeoutError`, and more. Each subclass exposes the relevant fields (e.g. `err.required`, `err.retry_after_seconds`).
|
|
65
|
+
- **Automatic retries with exponential backoff** for transient codes. Honors the `Retry-After` header on 429.
|
|
66
|
+
- **Pagination-aware job collection.** `wait_for_job()` returns the full job with every page result.
|
|
67
|
+
- **SSE streaming.** `for event in reader.stream(job_id)` (sync) or `async for` (async) yields `ProgressEvent` / `PageEvent` / `ErrorEvent` / `DoneEvent`.
|
|
68
|
+
- **Pydantic models everywhere** — all responses are parsed into typed models with IDE autocomplete.
|
|
69
|
+
- **Request ID tracing.** Every error carries the `x-request-id` header value on `err.request_id` for support tickets.
|
|
70
|
+
|
|
71
|
+
## Browser Sessions
|
|
72
|
+
|
|
73
|
+
Launch a stealth Chrome session and connect Playwright to it over CDP:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
session = reader.sessions.create()
|
|
77
|
+
|
|
78
|
+
from playwright.sync_api import sync_playwright
|
|
79
|
+
with sync_playwright() as p:
|
|
80
|
+
browser = p.chromium.connect_over_cdp(session.ws_endpoint)
|
|
81
|
+
page = browser.contexts[0].new_page()
|
|
82
|
+
page.goto("https://example.com")
|
|
83
|
+
print(page.title())
|
|
84
|
+
browser.close()
|
|
85
|
+
|
|
86
|
+
reader.sessions.stop(session.session_id)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Async:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
session = await reader.sessions.create()
|
|
93
|
+
# ... use async playwright ...
|
|
94
|
+
await reader.sessions.stop(session.session_id)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Methods: `reader.sessions.create()`, `.get(id)`, `.stop(id)`, `.list()`
|
|
98
|
+
|
|
99
|
+
## Errors
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from reader_py import (
|
|
103
|
+
ReaderApiError,
|
|
104
|
+
InsufficientCreditsError,
|
|
105
|
+
RateLimitedError,
|
|
106
|
+
UrlBlockedError,
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
try:
|
|
110
|
+
reader.read(url=url)
|
|
111
|
+
except InsufficientCreditsError as err:
|
|
112
|
+
print(f"Need {err.required}, have {err.available}")
|
|
113
|
+
except RateLimitedError as err:
|
|
114
|
+
print(f"Retry after {err.retry_after_seconds}s")
|
|
115
|
+
except UrlBlockedError as err:
|
|
116
|
+
print(f"Blocked: {err.reason}")
|
|
117
|
+
except ReaderApiError as err:
|
|
118
|
+
print(f"[{err.code}] {err} — see {err.docs_url}")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
`ReaderError` is re-exported as an alias for `ReaderApiError` so code written against the 0.1 SDK continues to work. New code should use `ReaderApiError`.
|
|
122
|
+
|
|
123
|
+
Full catalog of error codes: https://reader.dev/docs/home/concepts/errors
|
|
124
|
+
|
|
125
|
+
## Links
|
|
126
|
+
|
|
127
|
+
- **Docs:** https://reader.dev/docs
|
|
128
|
+
- **SDK reference:** https://reader.dev/docs/sdk/python
|
|
129
|
+
- **API reference:** https://reader.dev/docs/api-reference/read
|
|
130
|
+
- **Discord:** https://discord.gg/6tjkq7J5WV
|
|
131
|
+
|
|
132
|
+
## Development
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
python -m venv .venv && source .venv/bin/activate
|
|
136
|
+
pip install -e ".[dev]"
|
|
137
|
+
pytest
|
|
138
|
+
```
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
reader_py/__init__.py,sha256=YUumSMGdwuL5iLtCnxQhft2-JYyI-Bk0s_F9yFyIico,1694
|
|
2
|
+
reader_py/async_client.py,sha256=u4atvVKRgyYdwuwD-Cyi58i3R7EkfEbM1lbV_Aoz4Eg,12846
|
|
3
|
+
reader_py/client.py,sha256=8I585bzIYVjw7357FHORYZwtWPQ_rmYfpPwf11w_fuk,15247
|
|
4
|
+
reader_py/errors.py,sha256=p6JeE4ga_4nCizDhQ4uv7CWiuiptoiIxk5k36Itoogs,4053
|
|
5
|
+
reader_py/types.py,sha256=sPHtQ47bMskzJZHcUAlIo5fDuNfd4XJf1VdXn6RJgWQ,5695
|
|
6
|
+
reader_py-0.2.0.dist-info/METADATA,sha256=T57wx2JXGnsFTO49ipN-wTfzJ9s0GmK0f5eXdbUWgWU,4280
|
|
7
|
+
reader_py-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
reader_py-0.2.0.dist-info/top_level.txt,sha256=7oV7DerbpeN6_gURVfw80HLHOoagNvXRd4pcrFopnBU,10
|
|
9
|
+
reader_py-0.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
reader_py
|