reader-py 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reader_py-0.2.0/PKG-INFO +138 -0
- reader_py-0.2.0/README.md +124 -0
- reader_py-0.2.0/pyproject.toml +19 -0
- reader_py-0.2.0/setup.cfg +4 -0
- reader_py-0.2.0/src/reader_py/__init__.py +87 -0
- reader_py-0.2.0/src/reader_py/async_client.py +373 -0
- reader_py-0.2.0/src/reader_py/client.py +434 -0
- reader_py-0.2.0/src/reader_py/errors.py +135 -0
- reader_py-0.2.0/src/reader_py/types.py +213 -0
- reader_py-0.2.0/src/reader_py.egg-info/PKG-INFO +138 -0
- reader_py-0.2.0/src/reader_py.egg-info/SOURCES.txt +13 -0
- reader_py-0.2.0/src/reader_py.egg-info/dependency_links.txt +1 -0
- reader_py-0.2.0/src/reader_py.egg-info/requires.txt +6 -0
- reader_py-0.2.0/src/reader_py.egg-info/top_level.txt +1 -0
- reader_py-0.2.0/tests/test_client.py +316 -0
reader_py-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: reader-py
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python SDK for the Reader API
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: reader,scraper,web-scraping,markdown,llm,sdk
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: httpx>=0.25.0
|
|
10
|
+
Requires-Dist: pydantic>=2.0.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest; extra == "dev"
|
|
13
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
14
|
+
|
|
15
|
+
# reader-py
|
|
16
|
+
|
|
17
|
+
Python SDK for the [Reader API](https://reader.dev) — content extraction for LLMs. Wraps `POST /v1/read`, parses responses into Pydantic models, raises typed exceptions, and auto-polls async jobs to completion.
|
|
18
|
+
|
|
19
|
+
**Version:** 0.2.0 · **Python:** 3.9+
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install reader-py
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quick start (sync)
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import os
|
|
31
|
+
from reader_py import ReaderClient
|
|
32
|
+
|
|
33
|
+
reader = ReaderClient(api_key=os.environ["READER_KEY"])
|
|
34
|
+
|
|
35
|
+
result = reader.read(url="https://example.com")
|
|
36
|
+
if result.kind == "scrape":
|
|
37
|
+
print(result.data.markdown)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick start (async)
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import asyncio
|
|
44
|
+
import os
|
|
45
|
+
from reader_py import AsyncReaderClient
|
|
46
|
+
|
|
47
|
+
async def main():
|
|
48
|
+
async with AsyncReaderClient(api_key=os.environ["READER_KEY"]) as reader:
|
|
49
|
+
result = await reader.read(url="https://example.com")
|
|
50
|
+
if result.kind == "scrape":
|
|
51
|
+
print(result.data.markdown)
|
|
52
|
+
|
|
53
|
+
asyncio.run(main())
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
`reader.read(...)` returns a discriminated union (Pydantic):
|
|
57
|
+
|
|
58
|
+
- `ScrapeReadResult(kind="scrape", data=ScrapeResult)` — single-URL requests, returned immediately
|
|
59
|
+
- `JobReadResult(kind="job", data=Job)` — batch and crawl requests, auto-polled to completion
|
|
60
|
+
|
|
61
|
+
## Features
|
|
62
|
+
|
|
63
|
+
- **Sync and async clients** — `ReaderClient` (blocking, backed by `httpx.Client`) and `AsyncReaderClient` (backed by `httpx.AsyncClient`). Same method surface.
|
|
64
|
+
- **Typed errors for all 11 Reader error codes.** `InsufficientCreditsError`, `RateLimitedError`, `UrlBlockedError`, `ScrapeTimeoutError`, and more. Each subclass exposes the relevant fields (e.g. `err.required`, `err.retry_after_seconds`).
|
|
65
|
+
- **Automatic retries with exponential backoff** for transient codes. Honors the `Retry-After` header on 429.
|
|
66
|
+
- **Pagination-aware job collection.** `wait_for_job()` returns the full job with every page result.
|
|
67
|
+
- **SSE streaming.** `for event in reader.stream(job_id)` (sync) or `async for` (async) yields `ProgressEvent` / `PageEvent` / `ErrorEvent` / `DoneEvent`.
|
|
68
|
+
- **Pydantic models everywhere** — all responses are parsed into typed models with IDE autocomplete.
|
|
69
|
+
- **Request ID tracing.** Every error carries the `x-request-id` header value on `err.request_id` for support tickets.
|
|
70
|
+
|
|
71
|
+
## Browser Sessions
|
|
72
|
+
|
|
73
|
+
Launch a stealthed Chrome session and connect to it with Playwright (install Playwright separately — it is not a dependency of `reader-py`):
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
session = reader.sessions.create()
|
|
77
|
+
|
|
78
|
+
from playwright.sync_api import sync_playwright
|
|
79
|
+
with sync_playwright() as p:
|
|
80
|
+
browser = p.chromium.connect_over_cdp(session.ws_endpoint)
|
|
81
|
+
page = browser.contexts[0].new_page()
|
|
82
|
+
page.goto("https://example.com")
|
|
83
|
+
print(page.title())
|
|
84
|
+
browser.close()
|
|
85
|
+
|
|
86
|
+
reader.sessions.stop(session.session_id)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Async:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
session = await reader.sessions.create()
|
|
93
|
+
# ... use async playwright ...
|
|
94
|
+
await reader.sessions.stop(session.session_id)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Methods: `reader.sessions.create()`, `.get(id)`, `.stop(id)`, `.list()`
|
|
98
|
+
|
|
99
|
+
## Errors
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from reader_py import (
|
|
103
|
+
ReaderApiError,
|
|
104
|
+
InsufficientCreditsError,
|
|
105
|
+
RateLimitedError,
|
|
106
|
+
UrlBlockedError,
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
try:
|
|
110
|
+
reader.read(url=url)
|
|
111
|
+
except InsufficientCreditsError as err:
|
|
112
|
+
print(f"Need {err.required}, have {err.available}")
|
|
113
|
+
except RateLimitedError as err:
|
|
114
|
+
print(f"Retry after {err.retry_after_seconds}s")
|
|
115
|
+
except UrlBlockedError as err:
|
|
116
|
+
print(f"Blocked: {err.reason}")
|
|
117
|
+
except ReaderApiError as err:
|
|
118
|
+
print(f"[{err.code}] {err} — see {err.docs_url}")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
`ReaderError` is re-exported as an alias for `ReaderApiError` so code written against the 0.1 SDK continues to work. New code should use `ReaderApiError`.
|
|
122
|
+
|
|
123
|
+
Full catalog of error codes: https://reader.dev/docs/home/concepts/errors
|
|
124
|
+
|
|
125
|
+
## Links
|
|
126
|
+
|
|
127
|
+
- **Docs:** https://reader.dev/docs
|
|
128
|
+
- **SDK reference:** https://reader.dev/docs/sdk/python
|
|
129
|
+
- **API reference:** https://reader.dev/docs/api-reference/read
|
|
130
|
+
- **Discord:** https://discord.gg/6tjkq7J5WV
|
|
131
|
+
|
|
132
|
+
## Development
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
python -m venv .venv && source .venv/bin/activate
|
|
136
|
+
pip install -e ".[dev]"
|
|
137
|
+
pytest
|
|
138
|
+
```
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# reader-py
|
|
2
|
+
|
|
3
|
+
Python SDK for the [Reader API](https://reader.dev) — content extraction for LLMs. Wraps `POST /v1/read`, parses responses into Pydantic models, raises typed exceptions, and auto-polls async jobs to completion.
|
|
4
|
+
|
|
5
|
+
**Version:** 0.2.0 · **Python:** 3.9+
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install reader-py
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Quick start (sync)
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
import os
|
|
17
|
+
from reader_py import ReaderClient
|
|
18
|
+
|
|
19
|
+
reader = ReaderClient(api_key=os.environ["READER_KEY"])
|
|
20
|
+
|
|
21
|
+
result = reader.read(url="https://example.com")
|
|
22
|
+
if result.kind == "scrape":
|
|
23
|
+
print(result.data.markdown)
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick start (async)
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import asyncio
|
|
30
|
+
import os
|
|
31
|
+
from reader_py import AsyncReaderClient
|
|
32
|
+
|
|
33
|
+
async def main():
|
|
34
|
+
async with AsyncReaderClient(api_key=os.environ["READER_KEY"]) as reader:
|
|
35
|
+
result = await reader.read(url="https://example.com")
|
|
36
|
+
if result.kind == "scrape":
|
|
37
|
+
print(result.data.markdown)
|
|
38
|
+
|
|
39
|
+
asyncio.run(main())
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
`reader.read(...)` returns a discriminated union (Pydantic):
|
|
43
|
+
|
|
44
|
+
- `ScrapeReadResult(kind="scrape", data=ScrapeResult)` — single-URL requests, returned immediately
|
|
45
|
+
- `JobReadResult(kind="job", data=Job)` — batch and crawl requests, auto-polled to completion
|
|
46
|
+
|
|
47
|
+
## Features
|
|
48
|
+
|
|
49
|
+
- **Sync and async clients** — `ReaderClient` (blocking, backed by `httpx.Client`) and `AsyncReaderClient` (backed by `httpx.AsyncClient`). Same method surface.
|
|
50
|
+
- **Typed errors for all 11 Reader error codes.** `InsufficientCreditsError`, `RateLimitedError`, `UrlBlockedError`, `ScrapeTimeoutError`, and more. Each subclass exposes the relevant fields (e.g. `err.required`, `err.retry_after_seconds`).
|
|
51
|
+
- **Automatic retries with exponential backoff** for transient codes. Honors the `Retry-After` header on 429.
|
|
52
|
+
- **Pagination-aware job collection.** `wait_for_job()` returns the full job with every page result.
|
|
53
|
+
- **SSE streaming.** `for event in reader.stream(job_id)` (sync) or `async for` (async) yields `ProgressEvent` / `PageEvent` / `ErrorEvent` / `DoneEvent`.
|
|
54
|
+
- **Pydantic models everywhere** — all responses are parsed into typed models with IDE autocomplete.
|
|
55
|
+
- **Request ID tracing.** Every error carries the `x-request-id` header value on `err.request_id` for support tickets.
|
|
56
|
+
|
|
57
|
+
## Browser Sessions
|
|
58
|
+
|
|
59
|
+
Launch a stealthed Chrome session and connect to it with Playwright (install Playwright separately — it is not a dependency of `reader-py`):
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
session = reader.sessions.create()
|
|
63
|
+
|
|
64
|
+
from playwright.sync_api import sync_playwright
|
|
65
|
+
with sync_playwright() as p:
|
|
66
|
+
browser = p.chromium.connect_over_cdp(session.ws_endpoint)
|
|
67
|
+
page = browser.contexts[0].new_page()
|
|
68
|
+
page.goto("https://example.com")
|
|
69
|
+
print(page.title())
|
|
70
|
+
browser.close()
|
|
71
|
+
|
|
72
|
+
reader.sessions.stop(session.session_id)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Async:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
session = await reader.sessions.create()
|
|
79
|
+
# ... use async playwright ...
|
|
80
|
+
await reader.sessions.stop(session.session_id)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Methods: `reader.sessions.create()`, `.get(id)`, `.stop(id)`, `.list()`
|
|
84
|
+
|
|
85
|
+
## Errors
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from reader_py import (
|
|
89
|
+
ReaderApiError,
|
|
90
|
+
InsufficientCreditsError,
|
|
91
|
+
RateLimitedError,
|
|
92
|
+
UrlBlockedError,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
try:
|
|
96
|
+
reader.read(url=url)
|
|
97
|
+
except InsufficientCreditsError as err:
|
|
98
|
+
print(f"Need {err.required}, have {err.available}")
|
|
99
|
+
except RateLimitedError as err:
|
|
100
|
+
print(f"Retry after {err.retry_after_seconds}s")
|
|
101
|
+
except UrlBlockedError as err:
|
|
102
|
+
print(f"Blocked: {err.reason}")
|
|
103
|
+
except ReaderApiError as err:
|
|
104
|
+
print(f"[{err.code}] {err} — see {err.docs_url}")
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
`ReaderError` is re-exported as an alias for `ReaderApiError` so code written against the 0.1 SDK continues to work. New code should use `ReaderApiError`.
|
|
108
|
+
|
|
109
|
+
Full catalog of error codes: https://reader.dev/docs/home/concepts/errors
|
|
110
|
+
|
|
111
|
+
## Links
|
|
112
|
+
|
|
113
|
+
- **Docs:** https://reader.dev/docs
|
|
114
|
+
- **SDK reference:** https://reader.dev/docs/sdk/python
|
|
115
|
+
- **API reference:** https://reader.dev/docs/api-reference/read
|
|
116
|
+
- **Discord:** https://discord.gg/6tjkq7J5WV
|
|
117
|
+
|
|
118
|
+
## Development
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
python -m venv .venv && source .venv/bin/activate
|
|
122
|
+
pip install -e ".[dev]"
|
|
123
|
+
pytest
|
|
124
|
+
```
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "reader-py"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Python SDK for the Reader API"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"httpx>=0.25.0",
|
|
14
|
+
"pydantic>=2.0.0",
|
|
15
|
+
]
|
|
16
|
+
keywords = ["reader", "scraper", "web-scraping", "markdown", "llm", "sdk"]
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
dev = ["pytest", "pytest-asyncio"]
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Reader Python SDK."""
|
|
2
|
+
|
|
3
|
+
from .async_client import AsyncReaderClient
|
|
4
|
+
from .client import ReaderClient
|
|
5
|
+
from .errors import (
|
|
6
|
+
ConcurrencyLimitedError,
|
|
7
|
+
ConflictError,
|
|
8
|
+
InsufficientCreditsError,
|
|
9
|
+
InternalServerError,
|
|
10
|
+
InvalidRequestError,
|
|
11
|
+
NotFoundError,
|
|
12
|
+
RateLimitedError,
|
|
13
|
+
ReaderApiError,
|
|
14
|
+
ReaderError,
|
|
15
|
+
ScrapeTimeoutError,
|
|
16
|
+
UnauthenticatedError,
|
|
17
|
+
UpstreamUnavailableError,
|
|
18
|
+
UrlBlockedError,
|
|
19
|
+
to_reader_api_error,
|
|
20
|
+
)
|
|
21
|
+
from .types import (
|
|
22
|
+
CreditInfo,
|
|
23
|
+
Credits,
|
|
24
|
+
DoneEvent,
|
|
25
|
+
ErrorEvent,
|
|
26
|
+
Job,
|
|
27
|
+
JobInfo,
|
|
28
|
+
JobReadResult,
|
|
29
|
+
Page,
|
|
30
|
+
PageEvent,
|
|
31
|
+
Pagination,
|
|
32
|
+
ProgressEvent,
|
|
33
|
+
ReadParams,
|
|
34
|
+
ReadResult,
|
|
35
|
+
ScrapeMetadata,
|
|
36
|
+
ScrapeReadResult,
|
|
37
|
+
ScrapeResult,
|
|
38
|
+
SessionInfo,
|
|
39
|
+
StopSessionResult,
|
|
40
|
+
StreamEvent,
|
|
41
|
+
UsageEntry,
|
|
42
|
+
WebhookConfig,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"ReaderClient",
|
|
47
|
+
"AsyncReaderClient",
|
|
48
|
+
# Errors
|
|
49
|
+
"ReaderApiError",
|
|
50
|
+
"ReaderError",
|
|
51
|
+
"InvalidRequestError",
|
|
52
|
+
"UnauthenticatedError",
|
|
53
|
+
"InsufficientCreditsError",
|
|
54
|
+
"UrlBlockedError",
|
|
55
|
+
"NotFoundError",
|
|
56
|
+
"ConflictError",
|
|
57
|
+
"RateLimitedError",
|
|
58
|
+
"ConcurrencyLimitedError",
|
|
59
|
+
"InternalServerError",
|
|
60
|
+
"UpstreamUnavailableError",
|
|
61
|
+
"ScrapeTimeoutError",
|
|
62
|
+
"to_reader_api_error",
|
|
63
|
+
# Types
|
|
64
|
+
"ReadParams",
|
|
65
|
+
"ReadResult",
|
|
66
|
+
"ScrapeReadResult",
|
|
67
|
+
"JobReadResult",
|
|
68
|
+
"ScrapeResult",
|
|
69
|
+
"ScrapeMetadata",
|
|
70
|
+
"Page",
|
|
71
|
+
"Job",
|
|
72
|
+
"JobInfo",
|
|
73
|
+
"Credits",
|
|
74
|
+
"CreditInfo",
|
|
75
|
+
"Pagination",
|
|
76
|
+
"UsageEntry",
|
|
77
|
+
# Stream events and webhook config
|
|
78
|
+
"StreamEvent",
|
|
79
|
+
"ProgressEvent",
|
|
80
|
+
"PageEvent",
|
|
81
|
+
"ErrorEvent",
|
|
82
|
+
"DoneEvent",
|
|
83
|
+
"WebhookConfig",
|
|
84
|
+
# Sessions
|
|
85
|
+
"SessionInfo",
|
|
86
|
+
"StopSessionResult",
|
|
87
|
+
]
|