crawlora 1.5.0.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlora-1.5.0.dev1/LICENSE +21 -0
- crawlora-1.5.0.dev1/MANIFEST.in +2 -0
- crawlora-1.5.0.dev1/PKG-INFO +213 -0
- crawlora-1.5.0.dev1/README.md +188 -0
- crawlora-1.5.0.dev1/crawlora/__init__.py +24 -0
- crawlora-1.5.0.dev1/crawlora/_pagination.py +44 -0
- crawlora-1.5.0.dev1/crawlora/_transport_sync.py +114 -0
- crawlora-1.5.0.dev1/crawlora/async_client.py +321 -0
- crawlora-1.5.0.dev1/crawlora/client.py +671 -0
- crawlora-1.5.0.dev1/crawlora/client.pyi +20710 -0
- crawlora-1.5.0.dev1/crawlora/operations.py +6784 -0
- crawlora-1.5.0.dev1/crawlora/py.typed +1 -0
- crawlora-1.5.0.dev1/crawlora.egg-info/PKG-INFO +213 -0
- crawlora-1.5.0.dev1/crawlora.egg-info/SOURCES.txt +29 -0
- crawlora-1.5.0.dev1/crawlora.egg-info/dependency_links.txt +1 -0
- crawlora-1.5.0.dev1/crawlora.egg-info/requires.txt +6 -0
- crawlora-1.5.0.dev1/crawlora.egg-info/top_level.txt +1 -0
- crawlora-1.5.0.dev1/docs/operations.md +338 -0
- crawlora-1.5.0.dev1/docs/recipes.md +157 -0
- crawlora-1.5.0.dev1/examples/async_search.py +26 -0
- crawlora-1.5.0.dev1/examples/bing_search.py +31 -0
- crawlora-1.5.0.dev1/examples/paginate.py +27 -0
- crawlora-1.5.0.dev1/examples/youtube_transcript.py +35 -0
- crawlora-1.5.0.dev1/pyproject.toml +43 -0
- crawlora-1.5.0.dev1/setup.cfg +4 -0
- crawlora-1.5.0.dev1/tests/test_async_httpx.py +95 -0
- crawlora-1.5.0.dev1/tests/test_client.py +316 -0
- crawlora-1.5.0.dev1/tests/test_examples.py +39 -0
- crawlora-1.5.0.dev1/tests/test_v14_features.py +141 -0
- crawlora-1.5.0.dev1/tests/test_v15_features.py +114 -0
- crawlora-1.5.0.dev1/tests/test_w2_features.py +121 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Crawlora
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlora
|
|
3
|
+
Version: 1.5.0.dev1
|
|
4
|
+
Summary: Python SDK for the public Crawlora API.
|
|
5
|
+
Author: Crawlora
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Crawlora-org/crawlora-python-sdk
|
|
8
|
+
Project-URL: Repository, https://github.com/Crawlora-org/crawlora-python-sdk
|
|
9
|
+
Project-URL: Issues, https://github.com/Crawlora-org/crawlora-python-sdk/issues
|
|
10
|
+
Keywords: crawlora,sdk,web-scraping,api-client
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: typing-extensions>=4.8; python_version < "3.11"
|
|
22
|
+
Provides-Extra: async
|
|
23
|
+
Requires-Dist: httpx>=0.27; extra == "async"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# Crawlora Python SDK
|
|
27
|
+
|
|
28
|
+
Python client for the public Crawlora API. Use it to call Crawlora scraping,
|
|
29
|
+
search, marketplace, media, maps, finance, and usage endpoints with generated
|
|
30
|
+
type stubs for editor and type-checker support.
|
|
31
|
+
|
|
32
|
+
- Runtime: Python 3.10+
|
|
33
|
+
- Auth: `x-api-key`
|
|
34
|
+
- Default API base URL: `https://api.crawlora.net/api/v1`
|
|
35
|
+
- Reference: [operations](docs/operations.md) and [recipes](docs/recipes.md)
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
The Python SDK is currently distributed from Git beta tags:
|
|
40
|
+
|
|
41
|
+
```sh
|
|
42
|
+
pip install "git+https://github.com/Crawlora-org/crawlora-python-sdk.git@latest"
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
For reproducible builds, pin a released tag:
|
|
46
|
+
|
|
47
|
+
```sh
|
|
48
|
+
pip install "git+https://github.com/Crawlora-org/crawlora-python-sdk.git@TAG"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## API Key
|
|
52
|
+
|
|
53
|
+
Create or sign in to your Crawlora account at [crawlora.net](https://crawlora.net),
|
|
54
|
+
then create an API key in the dashboard.
|
|
55
|
+
|
|
56
|
+
```sh
|
|
57
|
+
read -r CRAWLORA_API_KEY
|
|
58
|
+
export CRAWLORA_API_KEY
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## First Request
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import os
|
|
65
|
+
from crawlora import CrawloraClient
|
|
66
|
+
|
|
67
|
+
crawlora = CrawloraClient(api_key=os.environ["CRAWLORA_API_KEY"])
|
|
68
|
+
|
|
69
|
+
response = crawlora.bing.search(
|
|
70
|
+
q="coffee shops",
|
|
71
|
+
count=10,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
print(response["data"]["results"][0])
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Endpoint groups are generated from the public API contract, so common calls are
|
|
78
|
+
available as methods such as `crawlora.bing.search(...)`,
|
|
79
|
+
`crawlora.youtube.transcript(...)`, and `crawlora.google.map_search(...)`.
|
|
80
|
+
|
|
81
|
+
## Typed Dynamic Calls
|
|
82
|
+
|
|
83
|
+
You can also call by operation id. Literal operation ids are covered by the
|
|
84
|
+
generated `.pyi` stubs, so type checkers can infer the matching parameter and
|
|
85
|
+
response aliases:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
response = crawlora.request("bing-search", {
|
|
89
|
+
"q": "coffee shops",
|
|
90
|
+
"count": 10,
|
|
91
|
+
})
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Generated stubs include operation ids, endpoint groups, keyword parameters,
|
|
95
|
+
enum values, response aliases, and reserved request options.
|
|
96
|
+
|
|
97
|
+
## Configuration
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
crawlora = CrawloraClient(
|
|
101
|
+
api_key=os.environ["CRAWLORA_API_KEY"],
|
|
102
|
+
base_url="https://api.crawlora.net/api/v1",
|
|
103
|
+
timeout=30,
|
|
104
|
+
retries=2,
|
|
105
|
+
retry_delay=0.25,
|
|
106
|
+
headers={"x-client": "my-app"},
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Per-request options are available through reserved keyword arguments. Header
|
|
111
|
+
names are matched case-insensitively, so request headers can override default
|
|
112
|
+
auth, user-agent, and content headers without duplicating variants such as
|
|
113
|
+
`x-api-key` and `X-API-KEY`:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
response = crawlora.bing.search(
|
|
117
|
+
q="coffee shops",
|
|
118
|
+
_timeout=10,
|
|
119
|
+
_headers={"x-request-id": "search-001"},
|
|
120
|
+
)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Text Responses
|
|
124
|
+
|
|
125
|
+
Most endpoints return JSON. `_response_type` must be `auto`, `json`, or
|
|
126
|
+
`text`. Endpoints that support alternate text output, such as YouTube
|
|
127
|
+
transcripts, can opt into text mode:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
transcript = crawlora.youtube.transcript(
|
|
131
|
+
id="VIDEO_ID",
|
|
132
|
+
format="text",
|
|
133
|
+
_response_type="text",
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
print(transcript)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Errors
|
|
140
|
+
|
|
141
|
+
Failed API calls raise `CrawloraError`:
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
from crawlora import CrawloraError
|
|
145
|
+
|
|
146
|
+
try:
|
|
147
|
+
crawlora.bing.search(q="coffee shops")
|
|
148
|
+
except CrawloraError as error:
|
|
149
|
+
print(error.status, error.code, error.body)
|
|
150
|
+
raise
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
The error includes `status`, optional API `code`, parsed `body`, `raw_body`,
|
|
154
|
+
response `headers`, and the underlying parser or transport exception as
|
|
155
|
+
`__cause__` when available. Retryable responses honor positive `Retry-After`
|
|
156
|
+
headers, capped at 30 seconds. Timeout-like transport failures use the
|
|
157
|
+
`Crawlora request timed out` SDK message.
|
|
158
|
+
|
|
159
|
+
`CrawloraError` has three subclasses for branching on the failure kind:
|
|
160
|
+
`CrawloraClientError` (4xx, request rejected), `CrawloraServerError` (5xx), and
|
|
161
|
+
`CrawloraNetworkError` (transport failure or timeout before a response).
|
|
162
|
+
|
|
163
|
+
## Async
|
|
164
|
+
|
|
165
|
+
`AsyncCrawloraClient` mirrors the synchronous client for asyncio applications:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
from crawlora import AsyncCrawloraClient
|
|
169
|
+
|
|
170
|
+
crawlora = AsyncCrawloraClient(api_key="YOUR_API_KEY")
|
|
171
|
+
result = await crawlora.bing.search(q="coffee shops")
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
It reuses the same validation, retries, and `Retry-After` handling, running each
|
|
175
|
+
request in a worker thread so the package stays dependency-free.
|
|
176
|
+
|
|
177
|
+
## Pagination
|
|
178
|
+
|
|
179
|
+
`client.paginate` yields successive pages, advancing the page/offset query
|
|
180
|
+
parameter and stopping when a page returns no data:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
for page in crawlora.paginate("ebay-seller-feedback", {"seller": "acme"}):
|
|
184
|
+
for review in page["data"]:
|
|
185
|
+
print(review)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
`AsyncCrawloraClient.paginate` is the `async for` equivalent. Override the auto-detected
|
|
189
|
+
paging parameter and defaults with `page_param`, `start`, `step`, and `max_pages`.
|
|
190
|
+
|
|
191
|
+
## Examples
|
|
192
|
+
|
|
193
|
+
Runnable examples live under `examples/` and skip cleanly when required
|
|
194
|
+
environment variables are missing:
|
|
195
|
+
|
|
196
|
+
```sh
|
|
197
|
+
python3 examples/bing_search.py
|
|
198
|
+
python3 examples/youtube_transcript.py
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Set `CRAWLORA_BASE_URL` to point examples at a staging or local API.
|
|
202
|
+
|
|
203
|
+
## Package Notes
|
|
204
|
+
|
|
205
|
+
The import name is `crawlora`:
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
from crawlora import CrawloraClient
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
The future PyPI package target is also `crawlora`, but registry publication is
|
|
212
|
+
not enabled yet. Until then, install from an explicit Git beta tag or the
|
|
213
|
+
moving `latest` tag as shown above.
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# Crawlora Python SDK
|
|
2
|
+
|
|
3
|
+
Python client for the public Crawlora API. Use it to call Crawlora scraping,
|
|
4
|
+
search, marketplace, media, maps, finance, and usage endpoints with generated
|
|
5
|
+
type stubs for editor and type-checker support.
|
|
6
|
+
|
|
7
|
+
- Runtime: Python 3.10+
|
|
8
|
+
- Auth: `x-api-key`
|
|
9
|
+
- Default API base URL: `https://api.crawlora.net/api/v1`
|
|
10
|
+
- Reference: [operations](docs/operations.md) and [recipes](docs/recipes.md)
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
The Python SDK is currently distributed from Git beta tags:
|
|
15
|
+
|
|
16
|
+
```sh
|
|
17
|
+
pip install "git+https://github.com/Crawlora-org/crawlora-python-sdk.git@latest"
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
For reproducible builds, pin a released tag:
|
|
21
|
+
|
|
22
|
+
```sh
|
|
23
|
+
pip install "git+https://github.com/Crawlora-org/crawlora-python-sdk.git@TAG"
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## API Key
|
|
27
|
+
|
|
28
|
+
Create or sign in to your Crawlora account at [crawlora.net](https://crawlora.net),
|
|
29
|
+
then create an API key in the dashboard.
|
|
30
|
+
|
|
31
|
+
```sh
|
|
32
|
+
read -r CRAWLORA_API_KEY
|
|
33
|
+
export CRAWLORA_API_KEY
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## First Request
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import os
|
|
40
|
+
from crawlora import CrawloraClient
|
|
41
|
+
|
|
42
|
+
crawlora = CrawloraClient(api_key=os.environ["CRAWLORA_API_KEY"])
|
|
43
|
+
|
|
44
|
+
response = crawlora.bing.search(
|
|
45
|
+
q="coffee shops",
|
|
46
|
+
count=10,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
print(response["data"]["results"][0])
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Endpoint groups are generated from the public API contract, so common calls are
|
|
53
|
+
available as methods such as `crawlora.bing.search(...)`,
|
|
54
|
+
`crawlora.youtube.transcript(...)`, and `crawlora.google.map_search(...)`.
|
|
55
|
+
|
|
56
|
+
## Typed Dynamic Calls
|
|
57
|
+
|
|
58
|
+
You can also call by operation id. Literal operation ids are covered by the
|
|
59
|
+
generated `.pyi` stubs, so type checkers can infer the matching parameter and
|
|
60
|
+
response aliases:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
response = crawlora.request("bing-search", {
|
|
64
|
+
"q": "coffee shops",
|
|
65
|
+
"count": 10,
|
|
66
|
+
})
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Generated stubs include operation ids, endpoint groups, keyword parameters,
|
|
70
|
+
enum values, response aliases, and reserved request options.
|
|
71
|
+
|
|
72
|
+
## Configuration
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
crawlora = CrawloraClient(
|
|
76
|
+
api_key=os.environ["CRAWLORA_API_KEY"],
|
|
77
|
+
base_url="https://api.crawlora.net/api/v1",
|
|
78
|
+
timeout=30,
|
|
79
|
+
retries=2,
|
|
80
|
+
retry_delay=0.25,
|
|
81
|
+
headers={"x-client": "my-app"},
|
|
82
|
+
)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Per-request options are available through reserved keyword arguments. Header
|
|
86
|
+
names are matched case-insensitively, so request headers can override default
|
|
87
|
+
auth, user-agent, and content headers without duplicating variants such as
|
|
88
|
+
`x-api-key` and `X-API-KEY`:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
response = crawlora.bing.search(
|
|
92
|
+
q="coffee shops",
|
|
93
|
+
_timeout=10,
|
|
94
|
+
_headers={"x-request-id": "search-001"},
|
|
95
|
+
)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Text Responses
|
|
99
|
+
|
|
100
|
+
Most endpoints return JSON. `_response_type` must be `auto`, `json`, or
|
|
101
|
+
`text`. Endpoints that support alternate text output, such as YouTube
|
|
102
|
+
transcripts, can opt into text mode:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
transcript = crawlora.youtube.transcript(
|
|
106
|
+
id="VIDEO_ID",
|
|
107
|
+
format="text",
|
|
108
|
+
_response_type="text",
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
print(transcript)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Errors
|
|
115
|
+
|
|
116
|
+
Failed API calls raise `CrawloraError`:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from crawlora import CrawloraError
|
|
120
|
+
|
|
121
|
+
try:
|
|
122
|
+
crawlora.bing.search(q="coffee shops")
|
|
123
|
+
except CrawloraError as error:
|
|
124
|
+
print(error.status, error.code, error.body)
|
|
125
|
+
raise
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
The error includes `status`, optional API `code`, parsed `body`, `raw_body`,
|
|
129
|
+
response `headers`, and the underlying parser or transport exception as
|
|
130
|
+
`__cause__` when available. Retryable responses honor positive `Retry-After`
|
|
131
|
+
headers, capped at 30 seconds. Timeout-like transport failures use the
|
|
132
|
+
`Crawlora request timed out` SDK message.
|
|
133
|
+
|
|
134
|
+
`CrawloraError` has three subclasses for branching on the failure kind:
|
|
135
|
+
`CrawloraClientError` (4xx, request rejected), `CrawloraServerError` (5xx), and
|
|
136
|
+
`CrawloraNetworkError` (transport failure or timeout before a response).
|
|
137
|
+
|
|
138
|
+
## Async
|
|
139
|
+
|
|
140
|
+
`AsyncCrawloraClient` mirrors the synchronous client for asyncio applications:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from crawlora import AsyncCrawloraClient
|
|
144
|
+
|
|
145
|
+
crawlora = AsyncCrawloraClient(api_key="YOUR_API_KEY")
|
|
146
|
+
result = await crawlora.bing.search(q="coffee shops")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
It reuses the same validation, retries, and `Retry-After` handling, running each
|
|
150
|
+
request in a worker thread so the package stays dependency-free.
|
|
151
|
+
|
|
152
|
+
## Pagination
|
|
153
|
+
|
|
154
|
+
`client.paginate` yields successive pages, advancing the page/offset query
|
|
155
|
+
parameter and stopping when a page returns no data:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
for page in crawlora.paginate("ebay-seller-feedback", {"seller": "acme"}):
|
|
159
|
+
for review in page["data"]:
|
|
160
|
+
print(review)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
`AsyncCrawloraClient.paginate` is the `async for` equivalent. Override the auto-detected
|
|
164
|
+
paging parameter and defaults with `page_param`, `start`, `step`, and `max_pages`.
|
|
165
|
+
|
|
166
|
+
## Examples
|
|
167
|
+
|
|
168
|
+
Runnable examples live under `examples/` and skip cleanly when required
|
|
169
|
+
environment variables are missing:
|
|
170
|
+
|
|
171
|
+
```sh
|
|
172
|
+
python3 examples/bing_search.py
|
|
173
|
+
python3 examples/youtube_transcript.py
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Set `CRAWLORA_BASE_URL` to point examples at a staging or local API.
|
|
177
|
+
|
|
178
|
+
## Package Notes
|
|
179
|
+
|
|
180
|
+
The import name is `crawlora`:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
from crawlora import CrawloraClient
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
The future PyPI package target is also `crawlora`, but registry publication is
|
|
187
|
+
not enabled yet. Until then, install from an explicit Git beta tag or the
|
|
188
|
+
moving `latest` tag as shown above.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from .async_client import AsyncCrawloraClient
|
|
2
|
+
from .client import (
|
|
3
|
+
VERSION,
|
|
4
|
+
CrawloraClient,
|
|
5
|
+
CrawloraClientError,
|
|
6
|
+
CrawloraError,
|
|
7
|
+
CrawloraNetworkError,
|
|
8
|
+
CrawloraServerError,
|
|
9
|
+
)
|
|
10
|
+
from .operations import GROUPS, OPERATION_COUNT, OPERATIONS, OperationId
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"AsyncCrawloraClient",
|
|
14
|
+
"CrawloraClient",
|
|
15
|
+
"CrawloraError",
|
|
16
|
+
"CrawloraClientError",
|
|
17
|
+
"CrawloraServerError",
|
|
18
|
+
"CrawloraNetworkError",
|
|
19
|
+
"GROUPS",
|
|
20
|
+
"OPERATIONS",
|
|
21
|
+
"OPERATION_COUNT",
|
|
22
|
+
"OperationId",
|
|
23
|
+
"VERSION",
|
|
24
|
+
]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Shared pagination helpers used by the sync and async clients.
|
|
2
|
+
|
|
3
|
+
This module deliberately has no `.pyi` stub so type checkers read its inline
|
|
4
|
+
annotations directly (the `client.pyi` stub shadows `client.py`).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Mapping
|
|
10
|
+
|
|
11
|
+
# Query parameter names treated as pagination parameters, in priority order.
PAGE_PARAM_NAMES = ("page", "offset")


def detect_page_param(operation: Mapping[str, Any]) -> str | None:
    """Return the pagination query parameter declared by *operation*, if any.

    The ``name`` of every entry under the operation's ``queryParams`` key is
    collected and compared against ``PAGE_PARAM_NAMES`` in order; the first
    candidate that is declared wins. ``None`` is returned when the operation
    declares none of them (or has no ``queryParams`` at all).
    """
    declared = set()
    for spec in operation.get("queryParams", []):
        declared.add(spec["name"])
    return next((name for name in PAGE_PARAM_NAMES if name in declared), None)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def page_is_empty(response: Any) -> bool:
    """Report whether *response* carries no data.

    When *response* is a mapping with a ``data`` key, that value is inspected;
    otherwise the response itself is. ``None`` counts as empty, lists, tuples,
    dicts and strings are empty when their length is zero, and any other value
    is empty when it is falsy.
    """
    if isinstance(response, Mapping) and "data" in response:
        payload = response["data"]
    else:
        payload = response

    if payload is None:
        return True
    if not isinstance(payload, (list, tuple, dict, str)):
        return not payload
    return len(payload) == 0
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def default_start(page_param: str) -> int:
    """Return the first value to send for *page_param*.

    ``"offset"`` starts at 0; every other parameter name starts at 1.
    """
    if page_param == "offset":
        return 0
    return 1
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def default_items(response: Any) -> list[Any]:
    """Extract the items of a page as a new list.

    A mapping whose ``data`` value is a list yields a copy of that list, a
    response that is itself a list yields a copy of it, and anything else
    yields an empty list.
    """
    if isinstance(response, Mapping):
        payload = response.get("data")
        if isinstance(payload, list):
            return list(payload)
    if isinstance(response, list):
        return list(response)
    return []
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Keep-alive HTTP transport for the synchronous client (standard library only).
|
|
2
|
+
|
|
3
|
+
Maintains a small pool of reusable connections per ``(scheme, host, port)`` so
|
|
4
|
+
the sync client avoids a fresh TCP + TLS handshake on every request. Each
|
|
5
|
+
request checks out its own connection, so the transport is safe to use from
|
|
6
|
+
multiple threads (e.g. under ``max_concurrency``). This module is stub-free so
|
|
7
|
+
type checkers read its inline annotations directly.
|
|
8
|
+
|
|
9
|
+
The transport returns a lightweight response object exposing ``status``,
|
|
10
|
+
``headers`` (a dict), and ``body`` (bytes) — the only fields the client reads.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import http.client
|
|
16
|
+
import threading
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Any, Mapping
|
|
19
|
+
from urllib.parse import urlsplit
|
|
20
|
+
from urllib.request import Request
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _title_case(name: str) -> str:
    """Return *name* with each ``-``-separated part capitalized.

    ``str.capitalize`` upper-cases the first character of a part and
    lower-cases the rest, so ``"x-api-key"`` and ``"X-API-KEY"`` both become
    ``"X-Api-Key"``.
    """
    pieces = name.split("-")
    return "-".join(map(str.capitalize, pieces))
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class _PooledResponse:
    """Immutable response object returned by ``KeepAliveTransport``.

    The transport builds it after the response body has been read in full.
    """

    # HTTP status code taken from the underlying response.
    status: int
    # Response headers; the transport passes ``dict(response.getheaders())``.
    headers: Mapping[str, str]
    # Complete response payload as returned by ``response.read()``.
    body: bytes
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class KeepAliveTransport:
    """Connection-pooling transport. Drop-in for the urlopen transport: callable
    as ``transport(request, timeout) -> response``.

    Idle connections are kept per ``(scheme, host, port)`` key, at most
    ``max_per_host`` of them per key. Every call checks a connection out of the
    pool (or opens a new one) and either returns it to the pool or closes it
    when the exchange is finished, so a connection is never shared between two
    in-flight requests.
    """

    def __init__(self, max_per_host: int = 8) -> None:
        """Create an empty pool.

        Args:
            max_per_host: Maximum number of idle connections retained per
                ``(scheme, host, port)`` key; surplus connections are closed
                when they are checked back in.
        """
        # Guards every read and write of ``_pools``.
        self._lock = threading.Lock()
        self._pools: dict[tuple, list[http.client.HTTPConnection]] = {}
        self._max_per_host = max_per_host

    def __call__(self, request: Request, timeout: float) -> _PooledResponse:
        """Send *request* and return the fully read response.

        A failure on a connection that came out of the pool is assumed to be a
        stale keep-alive connection the server already closed, so the request
        is retried exactly once on a brand-new connection. Failures on a new
        connection and timeouts are not retried here: they are not symptoms of
        a stale connection, and retrying a timeout would double the time the
        caller waits.

        Args:
            request: The ``urllib.request.Request`` to send.
            timeout: Socket timeout in seconds for this request.

        Returns:
            A ``_PooledResponse`` holding the status, headers, and body.

        Raises:
            http.client.HTTPException: On a protocol-level failure.
            OSError: On a transport failure (including ``TimeoutError``).
        """
        parts = urlsplit(request.full_url)
        key = (parts.scheme, parts.hostname, parts.port)
        path = parts.path or "/"
        if parts.query:
            path = f"{path}?{parts.query}"
        method = request.get_method()
        # Send canonical HTTP title-case header names (matching the urlopen
        # transport's behavior), so receivers see e.g. "X-Api-Key".
        headers = {_title_case(name): value for name, value in request.header_items()}
        body = request.data

        conn = self._checkout(key, parts, timeout)
        # A connection with an open socket was reused from the pool; a new
        # one only connects lazily on its first request.
        reused = getattr(conn, "sock", None) is not None
        try:
            return self._exchange(key, conn, method, path, body, headers)
        except TimeoutError:
            raise
        except (http.client.HTTPException, OSError):
            if not reused:
                raise
        # Bypass the pool for the retry so another stale connection cannot be
        # picked up again.
        return self._exchange(key, self._new(parts, timeout), method, path, body, headers)

    def _exchange(
        self,
        key: tuple,
        conn: http.client.HTTPConnection,
        method: str,
        path: str,
        body: Any,
        headers: Mapping[str, str],
    ) -> _PooledResponse:
        """Run one request/response cycle on *conn* and release the connection.

        The connection is closed when anything goes wrong or when the server
        asked to close it; otherwise it is returned to the pool under *key*.
        """
        try:
            conn.request(method, path, body=body, headers=headers)
            response = conn.getresponse()
            data = response.read()
            result = _PooledResponse(response.status, dict(response.getheaders()), data)
        except BaseException:
            # The connection is in an unknown state; never return it to the pool.
            self._close(conn)
            raise
        if response.will_close:
            self._close(conn)
        else:
            self._checkin(key, conn)
        return result

    def close(self) -> None:
        """Close every idle pooled connection and empty the pool."""
        with self._lock:
            pools = list(self._pools.values())
            self._pools.clear()
        # Close outside the lock so slow socket teardown does not block callers.
        for pool in pools:
            for conn in pool:
                self._close(conn)

    def _checkout(self, key: tuple, parts: Any, timeout: float) -> http.client.HTTPConnection:
        """Return an idle connection for *key*, or a new one when none is pooled.

        Args:
            key: Pool key, ``(scheme, host, port)``.
            parts: Result of ``urlsplit`` for the request URL; used to open a
                new connection.
            timeout: Socket timeout in seconds to apply to the connection.
        """
        with self._lock:
            pool = self._pools.get(key)
            conn = pool.pop() if pool else None
        if conn is None:
            return self._new(parts, timeout)
        conn.timeout = timeout
        # ``HTTPConnection.timeout`` is only consulted when connecting, so the
        # already-open socket has to be updated explicitly.
        sock = getattr(conn, "sock", None)
        if sock is not None:
            try:
                sock.settimeout(timeout)
            except OSError:
                # The pooled socket is unusable; replace the connection.
                self._close(conn)
                return self._new(parts, timeout)
        return conn

    def _checkin(self, key: tuple, conn: http.client.HTTPConnection) -> None:
        """Return *conn* to the pool for *key*, or close it when the pool is full."""
        with self._lock:
            pool = self._pools.setdefault(key, [])
            if len(pool) < self._max_per_host:
                pool.append(conn)
                return
        self._close(conn)

    @staticmethod
    def _new(parts: Any, timeout: float) -> http.client.HTTPConnection:
        """Create an unconnected HTTP(S) connection for the URL in *parts*.

        ``https`` URLs get an ``HTTPSConnection`` (default port 443); every
        other scheme gets a plain ``HTTPConnection`` (default port 80).
        """
        if parts.scheme == "https":
            return http.client.HTTPSConnection(parts.hostname, parts.port or 443, timeout=timeout)
        return http.client.HTTPConnection(parts.hostname, parts.port or 80, timeout=timeout)

    @staticmethod
    def _close(conn: http.client.HTTPConnection) -> None:
        """Close *conn*, ignoring any error raised while doing so."""
        try:
            conn.close()
        except Exception:
            # Deliberate best effort: the connection is being discarded, so a
            # failure to close it cleanly must not mask the caller's outcome.
            pass
|