scrapio-py 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapio_py-1.0.0/.gitignore +8 -0
- scrapio_py-1.0.0/PKG-INFO +173 -0
- scrapio_py-1.0.0/README.md +159 -0
- scrapio_py-1.0.0/pyproject.toml +24 -0
- scrapio_py-1.0.0/src/scrapio/__init__.py +21 -0
- scrapio_py-1.0.0/src/scrapio/_http.py +159 -0
- scrapio_py-1.0.0/src/scrapio/client.py +81 -0
- scrapio_py-1.0.0/src/scrapio/errors.py +30 -0
- scrapio_py-1.0.0/src/scrapio/resources/__init__.py +0 -0
- scrapio_py-1.0.0/src/scrapio/resources/amazon.py +53 -0
- scrapio_py-1.0.0/src/scrapio/resources/fetch.py +29 -0
- scrapio_py-1.0.0/src/scrapio/resources/google.py +27 -0
- scrapio_py-1.0.0/src/scrapio/resources/jobs.py +73 -0
- scrapio_py-1.0.0/src/scrapio/resources/walmart.py +53 -0
- scrapio_py-1.0.0/src/scrapio/resources/youtube.py +76 -0
- scrapio_py-1.0.0/src/scrapio/types.py +136 -0
- scrapio_py-1.0.0/tests/__init__.py +0 -0
- scrapio_py-1.0.0/tests/test_integration.py +177 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapio-py
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Official Python SDK for the Scrapio API
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Requires-Dist: httpx>=0.25.0
|
|
8
|
+
Requires-Dist: pydantic>=2.0.0
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: anyio[trio]; extra == 'dev'
|
|
11
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
|
|
12
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# scrapio
|
|
16
|
+
|
|
17
|
+
Official Python SDK for [Scrapio](https://scrapio.dev) — fetch, crawl, search, and extract structured data from any URL.
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install scrapio-py
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Requires Python 3.9 or later.
|
|
26
|
+
|
|
27
|
+
## Quickstart
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from scrapio import ApiClient, FetchRequest
|
|
31
|
+
|
|
32
|
+
client = ApiClient(api_key="YOUR_API_KEY")
|
|
33
|
+
|
|
34
|
+
result = client.fetch.fetch(FetchRequest(
|
|
35
|
+
url="https://example.com",
|
|
36
|
+
output=["markdown"],
|
|
37
|
+
))
|
|
38
|
+
|
|
39
|
+
print(result.outputs["markdown"])
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
### Fetch a page
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
result = client.fetch.fetch(FetchRequest(
|
|
48
|
+
url="https://news.ycombinator.com",
|
|
49
|
+
render_js=True,
|
|
50
|
+
output=["markdown"],
|
|
51
|
+
))
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Google Search
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from scrapio import GoogleSearchParams
|
|
58
|
+
|
|
59
|
+
results = client.google.search(GoogleSearchParams(
|
|
60
|
+
search="best web scraping API 2025",
|
|
61
|
+
country_code="us",
|
|
62
|
+
))
|
|
63
|
+
print(results.organic_results)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Amazon product
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
product = client.amazon.get_product("B08N5WRWNW")
|
|
70
|
+
print(product.title, product.price)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Walmart search
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
items = client.walmart.search("headphones")
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### YouTube transcript
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
video = client.youtube.get_video("dQw4w9WgXcQ")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Browser automation
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
result = client.interact.interact({
|
|
89
|
+
"url": "https://example.com",
|
|
90
|
+
"actions": [
|
|
91
|
+
{"type": "click", "selector": "#login"},
|
|
92
|
+
{"type": "type", "selector": "#email", "text": "user@example.com"},
|
|
93
|
+
],
|
|
94
|
+
})
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Crawl a site
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
result = client.crawl.crawl({
|
|
101
|
+
"seeds": ["https://docs.example.com"],
|
|
102
|
+
"max_pages": 50,
|
|
103
|
+
})
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Async jobs
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from scrapio import CreateJobRequest
|
|
110
|
+
|
|
111
|
+
job = client.jobs.create(CreateJobRequest(
|
|
112
|
+
job_type="fetch",
|
|
113
|
+
payload={"url": "https://example.com", "output": ["markdown"]},
|
|
114
|
+
))
|
|
115
|
+
result = client.jobs.wait_for_completion(job.job_id, poll_interval=2.0, timeout=120.0)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Async client
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
import asyncio
|
|
122
|
+
from scrapio import AsyncApiClient, FetchRequest
|
|
123
|
+
|
|
124
|
+
async def main():
|
|
125
|
+
async with AsyncApiClient(api_key="YOUR_API_KEY") as client:
|
|
126
|
+
result = await client.fetch.fetch(FetchRequest(
|
|
127
|
+
url="https://example.com",
|
|
128
|
+
output=["markdown"],
|
|
129
|
+
))
|
|
130
|
+
print(result.outputs["markdown"])
|
|
131
|
+
|
|
132
|
+
asyncio.run(main())
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Configuration
|
|
136
|
+
|
|
137
|
+
| Option | Type | Default | Description |
|
|
138
|
+
|--------|------|---------|-------------|
|
|
139
|
+
| `api_key` | `str` | required | Your API key |
|
|
140
|
+
| `base_url` | `str` | `https://api.scrapio.dev` | Override for local/staging |
|
|
141
|
+
| `timeout` | `float` | `30.0` | Per-request timeout (seconds) |
|
|
142
|
+
| `max_retries` | `int` | `3` | Max retries on 429/503 |
|
|
143
|
+
|
|
144
|
+
## Error handling
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from scrapio import (
|
|
148
|
+
ApiClient, FetchRequest,
|
|
149
|
+
AuthError, RateLimitError, CreditsExhaustedError, ApiError,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
try:
|
|
153
|
+
client.fetch.fetch(FetchRequest(url="https://example.com"))
|
|
154
|
+
except AuthError:
|
|
155
|
+
print("Invalid API key")
|
|
156
|
+
except CreditsExhaustedError:
|
|
157
|
+
print("No credits remaining")
|
|
158
|
+
except RateLimitError:
|
|
159
|
+
print("Rate limited — back off and retry")
|
|
160
|
+
except ApiError as e:
|
|
161
|
+
print(f"API error {e.status_code}: {e}")
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Links
|
|
165
|
+
|
|
166
|
+
- [Documentation](https://scrapio.dev/docs)
|
|
167
|
+
- [API Reference](https://scrapio.dev/docs/api-reference/fetch)
|
|
168
|
+
- [Dashboard](https://app.scrapio.dev)
|
|
169
|
+
- [Get an API key](https://scrapio.dev#pricing)
|
|
170
|
+
|
|
171
|
+
## License
|
|
172
|
+
|
|
173
|
+
MIT
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# scrapio
|
|
2
|
+
|
|
3
|
+
Official Python SDK for [Scrapio](https://scrapio.dev) — fetch, crawl, search, and extract structured data from any URL.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install scrapio-py
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Requires Python 3.9 or later.
|
|
12
|
+
|
|
13
|
+
## Quickstart
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from scrapio import ApiClient, FetchRequest
|
|
17
|
+
|
|
18
|
+
client = ApiClient(api_key="YOUR_API_KEY")
|
|
19
|
+
|
|
20
|
+
result = client.fetch.fetch(FetchRequest(
|
|
21
|
+
url="https://example.com",
|
|
22
|
+
output=["markdown"],
|
|
23
|
+
))
|
|
24
|
+
|
|
25
|
+
print(result.outputs["markdown"])
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
### Fetch a page
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
result = client.fetch.fetch(FetchRequest(
|
|
34
|
+
url="https://news.ycombinator.com",
|
|
35
|
+
render_js=True,
|
|
36
|
+
output=["markdown"],
|
|
37
|
+
))
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Google Search
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from scrapio import GoogleSearchParams
|
|
44
|
+
|
|
45
|
+
results = client.google.search(GoogleSearchParams(
|
|
46
|
+
search="best web scraping API 2025",
|
|
47
|
+
country_code="us",
|
|
48
|
+
))
|
|
49
|
+
print(results.organic_results)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Amazon product
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
product = client.amazon.get_product("B08N5WRWNW")
|
|
56
|
+
print(product.title, product.price)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Walmart search
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
items = client.walmart.search("headphones")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### YouTube transcript
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
video = client.youtube.get_video("dQw4w9WgXcQ")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Browser automation
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
result = client.interact.interact({
|
|
75
|
+
"url": "https://example.com",
|
|
76
|
+
"actions": [
|
|
77
|
+
{"type": "click", "selector": "#login"},
|
|
78
|
+
{"type": "type", "selector": "#email", "text": "user@example.com"},
|
|
79
|
+
],
|
|
80
|
+
})
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Crawl a site
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
result = client.crawl.crawl({
|
|
87
|
+
"seeds": ["https://docs.example.com"],
|
|
88
|
+
"max_pages": 50,
|
|
89
|
+
})
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Async jobs
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from scrapio import CreateJobRequest
|
|
96
|
+
|
|
97
|
+
job = client.jobs.create(CreateJobRequest(
|
|
98
|
+
job_type="fetch",
|
|
99
|
+
payload={"url": "https://example.com", "output": ["markdown"]},
|
|
100
|
+
))
|
|
101
|
+
result = client.jobs.wait_for_completion(job.job_id, poll_interval=2.0, timeout=120.0)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Async client
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
import asyncio
|
|
108
|
+
from scrapio import AsyncApiClient, FetchRequest
|
|
109
|
+
|
|
110
|
+
async def main():
|
|
111
|
+
async with AsyncApiClient(api_key="YOUR_API_KEY") as client:
|
|
112
|
+
result = await client.fetch.fetch(FetchRequest(
|
|
113
|
+
url="https://example.com",
|
|
114
|
+
output=["markdown"],
|
|
115
|
+
))
|
|
116
|
+
print(result.outputs["markdown"])
|
|
117
|
+
|
|
118
|
+
asyncio.run(main())
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Configuration
|
|
122
|
+
|
|
123
|
+
| Option | Type | Default | Description |
|
|
124
|
+
|--------|------|---------|-------------|
|
|
125
|
+
| `api_key` | `str` | required | Your API key |
|
|
126
|
+
| `base_url` | `str` | `https://api.scrapio.dev` | Override for local/staging |
|
|
127
|
+
| `timeout` | `float` | `30.0` | Per-request timeout (seconds) |
|
|
128
|
+
| `max_retries` | `int` | `3` | Max retries on 429/503 |
|
|
129
|
+
|
|
130
|
+
## Error handling
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from scrapio import (
|
|
134
|
+
ApiClient, FetchRequest,
|
|
135
|
+
AuthError, RateLimitError, CreditsExhaustedError, ApiError,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
try:
|
|
139
|
+
client.fetch.fetch(FetchRequest(url="https://example.com"))
|
|
140
|
+
except AuthError:
|
|
141
|
+
print("Invalid API key")
|
|
142
|
+
except CreditsExhaustedError:
|
|
143
|
+
print("No credits remaining")
|
|
144
|
+
except RateLimitError:
|
|
145
|
+
print("Rate limited — back off and retry")
|
|
146
|
+
except ApiError as e:
|
|
147
|
+
print(f"API error {e.status_code}: {e}")
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Links
|
|
151
|
+
|
|
152
|
+
- [Documentation](https://scrapio.dev/docs)
|
|
153
|
+
- [API Reference](https://scrapio.dev/docs/api-reference/fetch)
|
|
154
|
+
- [Dashboard](https://app.scrapio.dev)
|
|
155
|
+
- [Get an API key](https://scrapio.dev#pricing)
|
|
156
|
+
|
|
157
|
+
## License
|
|
158
|
+
|
|
159
|
+
MIT
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "scrapio-py"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Official Python SDK for the Scrapio API"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
dependencies = [
|
|
13
|
+
"httpx>=0.25.0",
|
|
14
|
+
"pydantic>=2.0.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.optional-dependencies]
|
|
18
|
+
dev = ["pytest>=7.0", "pytest-asyncio>=0.21", "anyio[trio]"]
|
|
19
|
+
|
|
20
|
+
[tool.hatch.build.targets.wheel]
|
|
21
|
+
packages = ["src/scrapio"]
|
|
22
|
+
|
|
23
|
+
[tool.pytest.ini_options]
|
|
24
|
+
asyncio_mode = "auto"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from .client import ApiClient, AsyncApiClient
|
|
2
|
+
from .errors import ApiError, AuthError, RateLimitError, CreditsExhaustedError
|
|
3
|
+
from .types import (
|
|
4
|
+
FetchRequest, FetchResponse,
|
|
5
|
+
CreateJobRequest, Job, JobResult,
|
|
6
|
+
GoogleSearchParams, GoogleSearchResponse,
|
|
7
|
+
AmazonProductResponse, AmazonSearchResponse,
|
|
8
|
+
WalmartProductResponse, WalmartSearchResponse,
|
|
9
|
+
YouTubeVideoResponse, YouTubeSearchResponse, YouTubeSubtitleResponse,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"ApiClient", "AsyncApiClient",
|
|
14
|
+
"ApiError", "AuthError", "RateLimitError", "CreditsExhaustedError",
|
|
15
|
+
"FetchRequest", "FetchResponse",
|
|
16
|
+
"CreateJobRequest", "Job", "JobResult",
|
|
17
|
+
"GoogleSearchParams", "GoogleSearchResponse",
|
|
18
|
+
"AmazonProductResponse", "AmazonSearchResponse",
|
|
19
|
+
"WalmartProductResponse", "WalmartSearchResponse",
|
|
20
|
+
"YouTubeVideoResponse", "YouTubeSearchResponse", "YouTubeSubtitleResponse",
|
|
21
|
+
]
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, Optional, Type, TypeVar
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from .errors import ApiError, AuthError, CreditsExhaustedError, RateLimitError
|
|
11
|
+
|
|
12
|
+
T = TypeVar("T", bound=BaseModel)
|
|
13
|
+
|
|
14
|
+
RETRYABLE_STATUS = {429, 503}
|
|
15
|
+
DEFAULT_TIMEOUT = 30.0
|
|
16
|
+
DEFAULT_MAX_RETRIES = 3
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _raise_for_status(status_code: int, body: dict[str, Any]) -> None:
    """Raise the SDK exception that matches a failed HTTP response.

    Args:
        status_code: HTTP status code of the failed response.
        body: Decoded JSON error body. The error code is read from
            ``body["error"]["code"]`` when that path exists.

    Raises:
        AuthError: If ``status_code`` is 401.
        RateLimitError: If ``status_code`` is 429.
        CreditsExhaustedError: If ``status_code`` is 402 or the error code is
            ``"credits_exhausted"``.
        ApiError: For every other status code.
    """
    # A proxy or gateway may answer with JSON that is not the expected
    # ``{"error": {...}}`` object (a list, a bare string, ``"error": "..."``).
    # Normalise it so the HTTP failure is reported as an SDK error instead of
    # being masked by an AttributeError on ``.get``.
    if not isinstance(body, dict):
        body = {
            "request_id": "",
            "error": {"code": "unknown", "message": str(body)},
        }
    error = body.get("error", {})
    if not isinstance(error, dict):
        error = {
            "code": "unknown",
            "message": str(error) if error else "Unknown error",
        }
        # Copy rather than mutate the caller's dict.
        body = {**body, "error": error}

    code = error.get("code", "")
    if status_code == 401:
        raise AuthError(body)
    if status_code == 429:
        raise RateLimitError(body)
    if status_code == 402 or code == "credits_exhausted":
        raise CreditsExhaustedError(body)
    raise ApiError(status_code, body)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SyncHttpClient:
    """Blocking HTTP transport used by the synchronous resource classes.

    Wraps an ``httpx.Client`` configured with the base URL and a bearer
    ``Authorization`` header, retries responses whose status is in
    ``RETRYABLE_STATUS`` and validates successful JSON bodies against a
    pydantic model.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Create the underlying ``httpx.Client``.

        Args:
            base_url: API root; trailing slashes are stripped.
            api_key: Sent as ``Authorization: Bearer <api_key>``.
            timeout: Default per-request timeout passed to httpx.
            max_retries: Number of retries after the first attempt.
        """
        self._base_url = base_url.rstrip("/")
        self._headers = {"Authorization": f"Bearer {api_key}"}
        self._timeout = timeout
        self._max_retries = max_retries
        self._client = httpx.Client(
            base_url=self._base_url,
            headers=self._headers,
            timeout=self._timeout,
        )

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()

    def __enter__(self) -> "SyncHttpClient":
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()

    def request(
        self,
        method: str,
        path: str,
        *,
        params: Optional[dict[str, Any]] = None,
        json: Optional[Any] = None,
        response_model: Type[T],
        timeout: Optional[float] = None,
    ) -> T:
        """Send one request, retrying retryable statuses, and parse the reply.

        Args:
            method: HTTP method name.
            path: Request path, resolved against the client's base URL.
            params: Query parameters; entries whose value is ``None`` are
                dropped.
            json: JSON request body, if any.
            response_model: Pydantic model used to validate a successful
                response body.
            timeout: Per-request timeout; ``None`` uses the client default.

        Returns:
            The validated ``response_model`` instance.

        Raises:
            ApiError: (or a subclass) via ``_raise_for_status`` for any
                non-success response that is not retried.
        """
        clean_params = {k: v for k, v in (params or {}).items() if v is not None}
        # ``timeout or self._timeout`` would silently discard an explicit 0.
        effective_timeout = self._timeout if timeout is None else timeout

        for attempt in range(self._max_retries + 1):
            res = self._client.request(
                method,
                path,
                params=clean_params or None,
                json=json,
                timeout=effective_timeout,
            )
            if res.is_success:
                return response_model.model_validate(res.json())

            if res.status_code in RETRYABLE_STATUS and attempt < self._max_retries:
                # Exponential backoff: 1s, 2s, 4s, ... capped at 8s.
                time.sleep(min(1.0 * (2**attempt), 8.0))
                continue

            # Decode the error body only when it is about to be reported.
            # JSON and text decoding failures are ValueError subclasses.
            try:
                body = res.json()
            except ValueError:
                body = None
            if not isinstance(body, dict):
                # Non-JSON, or JSON that is not an object (list, string, ...).
                body = {
                    "request_id": "",
                    "error": {"code": "unknown", "message": res.text},
                }

            _raise_for_status(res.status_code, body)

        raise RuntimeError("Unexpected end of retry loop")  # unreachable
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class AsyncHttpClient:
    """Asynchronous HTTP transport used by the async resource classes.

    Wraps an ``httpx.AsyncClient`` configured with the base URL and a bearer
    ``Authorization`` header, retries responses whose status is in
    ``RETRYABLE_STATUS`` and validates successful JSON bodies against a
    pydantic model.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Create the underlying ``httpx.AsyncClient``.

        Args:
            base_url: API root; trailing slashes are stripped.
            api_key: Sent as ``Authorization: Bearer <api_key>``.
            timeout: Default per-request timeout passed to httpx.
            max_retries: Number of retries after the first attempt.
        """
        self._base_url = base_url.rstrip("/")
        self._headers = {"Authorization": f"Bearer {api_key}"}
        self._timeout = timeout
        self._max_retries = max_retries
        self._client = httpx.AsyncClient(
            base_url=self._base_url,
            headers=self._headers,
            timeout=self._timeout,
        )

    async def aclose(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._client.aclose()

    async def __aenter__(self) -> "AsyncHttpClient":
        return self

    async def __aexit__(self, *args: Any) -> None:
        await self.aclose()

    async def request(
        self,
        method: str,
        path: str,
        *,
        params: Optional[dict[str, Any]] = None,
        json: Optional[Any] = None,
        response_model: Type[T],
        timeout: Optional[float] = None,
    ) -> T:
        """Send one request, retrying retryable statuses, and parse the reply.

        Args:
            method: HTTP method name.
            path: Request path, resolved against the client's base URL.
            params: Query parameters; entries whose value is ``None`` are
                dropped.
            json: JSON request body, if any.
            response_model: Pydantic model used to validate a successful
                response body.
            timeout: Per-request timeout; ``None`` uses the client default.

        Returns:
            The validated ``response_model`` instance.

        Raises:
            ApiError: (or a subclass) via ``_raise_for_status`` for any
                non-success response that is not retried.
        """
        clean_params = {k: v for k, v in (params or {}).items() if v is not None}
        # ``timeout or self._timeout`` would silently discard an explicit 0.
        effective_timeout = self._timeout if timeout is None else timeout

        for attempt in range(self._max_retries + 1):
            res = await self._client.request(
                method,
                path,
                params=clean_params or None,
                json=json,
                timeout=effective_timeout,
            )
            if res.is_success:
                return response_model.model_validate(res.json())

            if res.status_code in RETRYABLE_STATUS and attempt < self._max_retries:
                # Exponential backoff: 1s, 2s, 4s, ... capped at 8s.
                await asyncio.sleep(min(1.0 * (2**attempt), 8.0))
                continue

            # Decode the error body only when it is about to be reported.
            # JSON and text decoding failures are ValueError subclasses.
            try:
                body = res.json()
            except ValueError:
                body = None
            if not isinstance(body, dict):
                # Non-JSON, or JSON that is not an object (list, string, ...).
                body = {
                    "request_id": "",
                    "error": {"code": "unknown", "message": res.text},
                }

            _raise_for_status(res.status_code, body)

        raise RuntimeError("Unexpected end of retry loop")  # unreachable
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from ._http import SyncHttpClient, AsyncHttpClient, DEFAULT_TIMEOUT, DEFAULT_MAX_RETRIES
|
|
4
|
+
from .resources.fetch import FetchResource, AsyncFetchResource
|
|
5
|
+
from .resources.jobs import JobsResource, AsyncJobsResource
|
|
6
|
+
from .resources.google import GoogleResource, AsyncGoogleResource
|
|
7
|
+
from .resources.amazon import AmazonResource, AsyncAmazonResource
|
|
8
|
+
from .resources.walmart import WalmartResource, AsyncWalmartResource
|
|
9
|
+
from .resources.youtube import YouTubeResource, AsyncYouTubeResource
|
|
10
|
+
|
|
11
|
+
# Base URL used by ApiClient / AsyncApiClient when ``base_url`` is not passed.
# NOTE(review): the README configuration table lists https://api.scrapio.dev
# as the default -- confirm which host is correct and align the two.
DEFAULT_BASE_URL = "https://api.webdataapi.com"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ApiClient:
    """Synchronous client exposing one attribute per API resource group.

    All resources share a single ``SyncHttpClient``. Call ``close()`` or use
    the client as a context manager to release it.
    """

    # NOTE(review): the README examples also call ``client.interact`` and
    # ``client.crawl``; no such attributes are defined on this class.
    fetch: FetchResource
    jobs: JobsResource
    google: GoogleResource
    amazon: AmazonResource
    walmart: WalmartResource
    youtube: YouTubeResource

    def __init__(
        self,
        api_key: str,
        *,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Build the shared transport and attach every resource to it.

        Args:
            api_key: API key forwarded to the transport.
            base_url: API root URL.
            timeout: Default per-request timeout.
            max_retries: Retry budget forwarded to the transport.
        """
        self._http = SyncHttpClient(
            base_url,
            api_key,
            timeout=timeout,
            max_retries=max_retries,
        )
        self.fetch = FetchResource(self._http)
        self.jobs = JobsResource(self._http)
        self.google = GoogleResource(self._http)
        self.amazon = AmazonResource(self._http)
        self.walmart = WalmartResource(self._http)
        self.youtube = YouTubeResource(self._http)

    def close(self) -> None:
        """Close the shared HTTP transport."""
        self._http.close()

    def __enter__(self) -> "ApiClient":
        return self

    def __exit__(self, *args: object) -> None:
        self.close()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class AsyncApiClient:
    """Asynchronous client exposing one attribute per API resource group.

    All resources share a single ``AsyncHttpClient``. Await ``aclose()`` or
    use the client with ``async with`` to release it.
    """

    fetch: AsyncFetchResource
    jobs: AsyncJobsResource
    google: AsyncGoogleResource
    amazon: AsyncAmazonResource
    walmart: AsyncWalmartResource
    youtube: AsyncYouTubeResource

    def __init__(
        self,
        api_key: str,
        *,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Build the shared transport and attach every resource to it.

        Args:
            api_key: API key forwarded to the transport.
            base_url: API root URL.
            timeout: Default per-request timeout.
            max_retries: Retry budget forwarded to the transport.
        """
        self._http = AsyncHttpClient(
            base_url,
            api_key,
            timeout=timeout,
            max_retries=max_retries,
        )
        self.fetch = AsyncFetchResource(self._http)
        self.jobs = AsyncJobsResource(self._http)
        self.google = AsyncGoogleResource(self._http)
        self.amazon = AsyncAmazonResource(self._http)
        self.walmart = AsyncWalmartResource(self._http)
        self.youtube = AsyncYouTubeResource(self._http)

    async def aclose(self) -> None:
        """Close the shared HTTP transport."""
        await self._http.aclose()

    async def __aenter__(self) -> "AsyncApiClient":
        return self

    async def __aexit__(self, *args: object) -> None:
        await self.aclose()
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ApiError(Exception):
    """Base exception for non-success API responses.

    The exception message is the ``message`` field of the response's
    ``error`` object, or ``"Unknown error"`` when none is available.
    """

    # HTTP status code of the failed response.
    status_code: int
    # ``request_id`` field of the error body ("" when absent).
    request_id: str
    # ``error.code`` field of the error body ("unknown" when absent).
    code: str

    def __init__(self, status_code: int, body: dict[str, Any]) -> None:
        """Build the exception from a status code and decoded error body.

        Args:
            status_code: HTTP status code of the failed response.
            body: Decoded JSON body, expected to look like
                ``{"request_id": ..., "error": {"code": ..., "message": ...}}``.
                Other shapes are tolerated so that constructing the error
                never raises on its own.
        """
        payload = body if isinstance(body, dict) else {}
        error = payload.get("error", {})
        if isinstance(error, dict):
            message = error.get("message", "Unknown error")
            code = error.get("code", "unknown")
        else:
            # e.g. ``{"error": "gateway down"}`` or ``{"error": null}``.
            message = str(error) if error else "Unknown error"
            code = "unknown"

        super().__init__(message)
        self.status_code = status_code
        self.request_id = payload.get("request_id", "")
        self.code = code
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AuthError(ApiError):
    """``ApiError`` whose status code is fixed to 401."""

    def __init__(self, body: dict[str, Any]) -> None:
        super().__init__(status_code=401, body=body)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RateLimitError(ApiError):
    """``ApiError`` whose status code is fixed to 429."""

    def __init__(self, body: dict[str, Any]) -> None:
        super().__init__(status_code=429, body=body)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class CreditsExhaustedError(ApiError):
    """``ApiError`` whose status code is fixed to 402."""

    def __init__(self, body: dict[str, Any]) -> None:
        super().__init__(status_code=402, body=body)
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from .._http import SyncHttpClient, AsyncHttpClient
|
|
4
|
+
from ..types import AmazonProductResponse, AmazonSearchResponse
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AmazonResource:
    """Synchronous wrapper around the ``/v1/amazon/*`` endpoints."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def get_product(self, asin: str, *, country: Optional[str] = None) -> AmazonProductResponse:
        """GET ``/v1/amazon/product`` for ``asin``; ``country`` is optional."""
        query_params = {"asin": asin, "country": country}
        return self._http.request(
            "GET",
            "/v1/amazon/product",
            params=query_params,
            response_model=AmazonProductResponse,
        )

    def search(self, query: str, *, country: Optional[str] = None, page: Optional[int] = None) -> AmazonSearchResponse:
        """GET ``/v1/amazon/search`` with optional ``country`` and ``page``."""
        query_params = {"query": query, "country": country, "page": page}
        return self._http.request(
            "GET",
            "/v1/amazon/search",
            params=query_params,
            response_model=AmazonSearchResponse,
        )

    def queue_search_crawl(self, query: str, *, country: Optional[str] = None) -> dict:
        """GET ``/v1/amazon/search/crawl`` and return the JSON object as a dict."""
        from typing import Any

        from pydantic import RootModel

        # The transport needs a pydantic model, so wrap the untyped JSON
        # object in a root model and unwrap it before returning.
        class _R(RootModel[dict[str, Any]]):
            pass

        wrapped = self._http.request(
            "GET",
            "/v1/amazon/search/crawl",
            params={"query": query, "country": country},
            response_model=_R,
        )
        return wrapped.root
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class AsyncAmazonResource:
    """Asynchronous wrapper around the ``/v1/amazon/*`` endpoints."""

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def get_product(self, asin: str, *, country: Optional[str] = None) -> AmazonProductResponse:
        """GET ``/v1/amazon/product`` for ``asin``; ``country`` is optional."""
        return await self._http.request(
            "GET", "/v1/amazon/product",
            params={"asin": asin, "country": country},
            response_model=AmazonProductResponse,
        )

    async def search(self, query: str, *, country: Optional[str] = None, page: Optional[int] = None) -> AmazonSearchResponse:
        """GET ``/v1/amazon/search`` with optional ``country`` and ``page``."""
        return await self._http.request(
            "GET", "/v1/amazon/search",
            params={"query": query, "country": country, "page": page},
            response_model=AmazonSearchResponse,
        )

    async def queue_search_crawl(self, query: str, *, country: Optional[str] = None) -> dict:
        """GET ``/v1/amazon/search/crawl`` and return the JSON object as a dict.

        Added for parity with ``AmazonResource.queue_search_crawl``; it uses
        the same endpoint, parameters and return shape.
        """
        from typing import Any

        from pydantic import RootModel

        # The transport needs a pydantic model, so wrap the untyped JSON
        # object in a root model and unwrap it before returning.
        class _R(RootModel[dict[str, Any]]):
            pass

        result = await self._http.request(
            "GET", "/v1/amazon/search/crawl",
            params={"query": query, "country": country},
            response_model=_R,
        )
        return result.root
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from .._http import SyncHttpClient, AsyncHttpClient
|
|
3
|
+
from ..types import FetchRequest, FetchResponse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FetchResource:
    """Synchronous wrapper around ``POST /v1/fetch``."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def fetch(self, request: FetchRequest, *, timeout: float | None = None) -> FetchResponse:
        """POST ``request`` to ``/v1/fetch``.

        Fields of ``request`` that are ``None`` are left out of the JSON
        body. ``timeout`` is forwarded to the transport as a per-request
        override.
        """
        payload = request.model_dump(exclude_none=True)
        return self._http.request(
            "POST",
            "/v1/fetch",
            json=payload,
            response_model=FetchResponse,
            timeout=timeout,
        )
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AsyncFetchResource:
    """Asynchronous wrapper around ``POST /v1/fetch``."""

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def fetch(self, request: FetchRequest, *, timeout: float | None = None) -> FetchResponse:
        """POST ``request`` to ``/v1/fetch``.

        Fields of ``request`` that are ``None`` are left out of the JSON
        body. ``timeout`` is forwarded to the transport as a per-request
        override.
        """
        payload = request.model_dump(exclude_none=True)
        return await self._http.request(
            "POST",
            "/v1/fetch",
            json=payload,
            response_model=FetchResponse,
            timeout=timeout,
        )
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from .._http import SyncHttpClient, AsyncHttpClient
|
|
3
|
+
from ..types import GoogleSearchParams, GoogleSearchResponse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class GoogleResource:
    """Synchronous wrapper around ``GET /v1/google/search``."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def search(self, params: GoogleSearchParams) -> GoogleSearchResponse:
        """Run a search; ``None`` fields of ``params`` are not sent."""
        query_params = params.model_dump(exclude_none=True)
        return self._http.request(
            "GET",
            "/v1/google/search",
            params=query_params,
            response_model=GoogleSearchResponse,
        )
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AsyncGoogleResource:
    """Asynchronous wrapper around ``GET /v1/google/search``."""

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def search(self, params: GoogleSearchParams) -> GoogleSearchResponse:
        """Run a search; ``None`` fields of ``params`` are not sent."""
        query_params = params.model_dump(exclude_none=True)
        return await self._http.request(
            "GET",
            "/v1/google/search",
            params=query_params,
            response_model=GoogleSearchResponse,
        )
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import asyncio
|
|
3
|
+
import time
|
|
4
|
+
from .._http import SyncHttpClient, AsyncHttpClient
|
|
5
|
+
from ..types import CreateJobRequest, Job, JobResult
|
|
6
|
+
|
|
7
|
+
# Job statuses at which wait_for_completion stops polling and fetches the result.
TERMINAL = {"completed", "partial", "failed", "cancelled"}
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class JobsResource:
    """Synchronous wrapper around the ``/v1/jobs`` endpoints."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def create(self, request: CreateJobRequest) -> Job:
        """POST ``request`` to ``/v1/jobs``; ``None`` fields are not sent."""
        return self._http.request(
            "POST", "/v1/jobs",
            json=request.model_dump(exclude_none=True),
            response_model=Job,
        )

    def get(self, job_id: str) -> Job:
        """GET ``/v1/jobs/{job_id}``."""
        return self._http.request("GET", f"/v1/jobs/{job_id}", response_model=Job)

    def get_result(self, job_id: str) -> JobResult:
        """GET ``/v1/jobs/{job_id}/result``."""
        return self._http.request("GET", f"/v1/jobs/{job_id}/result", response_model=JobResult)

    def wait_for_completion(
        self,
        job_id: str,
        *,
        poll_interval: float = 2.0,
        timeout: float = 300.0,
    ) -> JobResult:
        """Poll a job until its status is in ``TERMINAL``, then return its result.

        The job is always checked at least once, and once more when the
        deadline is reached, so a job that finishes during the last sleep is
        not reported as timed out.

        Args:
            job_id: Identifier of the job to wait for.
            poll_interval: Seconds to sleep between status checks.
            timeout: Maximum number of seconds to keep polling.

        Returns:
            The result fetched with ``get_result`` once the job is terminal.

        Raises:
            TimeoutError: If the job is still not terminal at the deadline.
        """
        deadline = time.monotonic() + timeout
        while True:
            job = self.get(job_id)
            if job.status in TERMINAL:
                return self.get_result(job_id)
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(f"Job {job_id} did not complete within {timeout}s")
            # Never sleep past the deadline.
            time.sleep(min(poll_interval, remaining))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class AsyncJobsResource:
    """Asynchronous wrapper around the ``/v1/jobs`` endpoints."""

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def create(self, request: CreateJobRequest) -> Job:
        """POST ``request`` to ``/v1/jobs``; ``None`` fields are not sent."""
        return await self._http.request(
            "POST", "/v1/jobs",
            json=request.model_dump(exclude_none=True),
            response_model=Job,
        )

    async def get(self, job_id: str) -> Job:
        """GET ``/v1/jobs/{job_id}``."""
        return await self._http.request("GET", f"/v1/jobs/{job_id}", response_model=Job)

    async def get_result(self, job_id: str) -> JobResult:
        """GET ``/v1/jobs/{job_id}/result``."""
        return await self._http.request("GET", f"/v1/jobs/{job_id}/result", response_model=JobResult)

    async def wait_for_completion(
        self,
        job_id: str,
        *,
        poll_interval: float = 2.0,
        timeout: float = 300.0,
    ) -> JobResult:
        """Poll a job until its status is in ``TERMINAL``, then return its result.

        The job is always checked at least once, and once more when the
        deadline is reached, so a job that finishes during the last sleep is
        not reported as timed out.

        Args:
            job_id: Identifier of the job to wait for.
            poll_interval: Seconds to sleep between status checks.
            timeout: Maximum number of seconds to keep polling.

        Returns:
            The result fetched with ``get_result`` once the job is terminal.

        Raises:
            TimeoutError: If the job is still not terminal at the deadline.
        """
        # Inside a coroutine the running loop is the right clock source;
        # ``get_event_loop()`` is the legacy spelling.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout
        while True:
            job = await self.get(job_id)
            if job.status in TERMINAL:
                return await self.get_result(job_id)
            remaining = deadline - loop.time()
            if remaining <= 0:
                raise TimeoutError(f"Job {job_id} did not complete within {timeout}s")
            # Never sleep past the deadline.
            await asyncio.sleep(min(poll_interval, remaining))
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from .._http import SyncHttpClient, AsyncHttpClient
|
|
4
|
+
from ..types import WalmartProductResponse, WalmartSearchResponse
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class WalmartResource:
    """Synchronous wrapper around the ``/v1/walmart/*`` endpoints."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def get_product(self, product_id: str, *, country: Optional[str] = None) -> WalmartProductResponse:
        """GET ``/v1/walmart/product`` for ``product_id``; ``country`` is optional."""
        query_params = {"product_id": product_id, "country": country}
        return self._http.request(
            "GET",
            "/v1/walmart/product",
            params=query_params,
            response_model=WalmartProductResponse,
        )

    def search(self, query: str, *, country: Optional[str] = None, page: Optional[int] = None) -> WalmartSearchResponse:
        """GET ``/v1/walmart/search`` with optional ``country`` and ``page``."""
        query_params = {"query": query, "country": country, "page": page}
        return self._http.request(
            "GET",
            "/v1/walmart/search",
            params=query_params,
            response_model=WalmartSearchResponse,
        )

    def queue_search_crawl(self, query: str, *, country: Optional[str] = None) -> dict:
        """GET ``/v1/walmart/search/crawl`` and return the JSON object as a dict."""
        from typing import Any

        from pydantic import RootModel

        # The transport needs a pydantic model, so wrap the untyped JSON
        # object in a root model and unwrap it before returning.
        class _R(RootModel[dict[str, Any]]):
            pass

        wrapped = self._http.request(
            "GET",
            "/v1/walmart/search/crawl",
            params={"query": query, "country": country},
            response_model=_R,
        )
        return wrapped.root
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class AsyncWalmartResource:
    """Awaitable accessors for the ``/v1/walmart`` endpoints.

    Mirrors ``WalmartResource`` method for method, including
    ``queue_search_crawl``, so sync and async callers have the same surface.
    """

    def __init__(self, http: AsyncHttpClient) -> None:
        # Transport shared by all methods of this resource.
        self._http = http

    async def get_product(self, product_id: str, *, country: Optional[str] = None) -> WalmartProductResponse:
        """Issue ``GET /v1/walmart/product`` for ``product_id``."""
        return await self._http.request(
            "GET", "/v1/walmart/product",
            params={"product_id": product_id, "country": country},
            response_model=WalmartProductResponse,
        )

    async def search(self, query: str, *, country: Optional[str] = None, page: Optional[int] = None) -> WalmartSearchResponse:
        """Issue ``GET /v1/walmart/search`` with ``query``, ``country`` and ``page``."""
        return await self._http.request(
            "GET", "/v1/walmart/search",
            params={"query": query, "country": country, "page": page},
            response_model=WalmartSearchResponse,
        )

    async def queue_search_crawl(self, query: str, *, country: Optional[str] = None) -> dict:
        """Issue ``GET /v1/walmart/search/crawl`` and return the JSON object as a dict.

        Async counterpart of ``WalmartResource.queue_search_crawl``; it uses
        the same method, path and parameters. The response is validated only
        as a ``dict[str, Any]`` and unwrapped before being returned.
        """
        from typing import Any

        from pydantic import RootModel

        # Local root model so the untyped payload can go through the same
        # ``response_model`` mechanism as the typed endpoints.
        class _R(RootModel[dict[str, Any]]):
            pass

        result = await self._http.request(
            "GET", "/v1/walmart/search/crawl",
            params={"query": query, "country": country},
            response_model=_R,
        )
        return result.root
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from .._http import SyncHttpClient, AsyncHttpClient
|
|
4
|
+
from ..types import YouTubeVideoResponse, YouTubeSearchResponse, YouTubeSubtitleResponse
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class YouTubeResource:
    """Blocking accessors for the ``/v1/youtube`` endpoints.

    Every method forwards exactly one call to ``http.request`` and hands back
    what that call produces.
    """

    def __init__(self, http: SyncHttpClient) -> None:
        # Transport shared by all methods of this resource.
        self._http = http

    def search(self, query: str, *, page: Optional[int] = None, country: Optional[str] = None, language: Optional[str] = None) -> YouTubeSearchResponse:
        """Issue ``GET /v1/youtube/search`` with the given query and filters."""
        return self._http.request(
            "GET", "/v1/youtube/search",
            params={"query": query, "page": page, "country": country, "language": language},
            response_model=YouTubeSearchResponse,
        )

    def get_video(self, video_id: str) -> YouTubeVideoResponse:
        """Issue ``GET /v1/youtube/videos/{video_id}``.

        ``video_id`` is percent-encoded before it is placed in the path, so an
        id containing ``/``, ``?`` or ``#`` cannot redirect the request to a
        different route. Letters, digits and ``_.-~`` are left untouched.
        """
        from urllib.parse import quote

        # safe="" also escapes "/", which quote() leaves alone by default.
        encoded_id = quote(video_id, safe="")
        return self._http.request(
            "GET", f"/v1/youtube/videos/{encoded_id}",
            response_model=YouTubeVideoResponse,
        )

    def get_subtitles(self, video_id: str, *, language: Optional[str] = None) -> YouTubeSubtitleResponse:
        """Issue ``GET /v1/youtube/subtitles`` for ``video_id`` and ``language``."""
        return self._http.request(
            "GET", "/v1/youtube/subtitles",
            params={"video_id": video_id, "language": language},
            response_model=YouTubeSubtitleResponse,
        )

    def queue_search_crawl(self, query: str, *, page: Optional[int] = None) -> dict:
        """Issue ``POST /v1/youtube/search/crawl`` and return the JSON object as a dict.

        ``query`` and ``page`` are sent as the JSON body. The response is
        validated only as a ``dict[str, Any]`` and unwrapped before return.
        """
        from typing import Any

        from pydantic import RootModel

        # Local root model so the untyped payload can go through the same
        # ``response_model`` mechanism as the typed endpoints.
        class _R(RootModel[dict[str, Any]]):
            pass

        result = self._http.request(
            "POST", "/v1/youtube/search/crawl",
            json={"query": query, "page": page},
            response_model=_R,
        )
        return result.root
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class AsyncYouTubeResource:
    """Awaitable accessors for the ``/v1/youtube`` endpoints.

    Every method awaits exactly one call to ``http.request`` and hands back
    what that call produces.
    """

    def __init__(self, http: AsyncHttpClient) -> None:
        # Transport shared by all methods of this resource.
        self._http = http

    async def search(self, query: str, *, page: Optional[int] = None, country: Optional[str] = None, language: Optional[str] = None) -> YouTubeSearchResponse:
        """Issue ``GET /v1/youtube/search`` with the given query and filters."""
        return await self._http.request(
            "GET", "/v1/youtube/search",
            params={"query": query, "page": page, "country": country, "language": language},
            response_model=YouTubeSearchResponse,
        )

    async def get_video(self, video_id: str) -> YouTubeVideoResponse:
        """Issue ``GET /v1/youtube/videos/{video_id}``.

        ``video_id`` is percent-encoded before it is placed in the path, so an
        id containing ``/``, ``?`` or ``#`` cannot redirect the request to a
        different route. Letters, digits and ``_.-~`` are left untouched.
        """
        from urllib.parse import quote

        # safe="" also escapes "/", which quote() leaves alone by default.
        encoded_id = quote(video_id, safe="")
        return await self._http.request(
            "GET", f"/v1/youtube/videos/{encoded_id}",
            response_model=YouTubeVideoResponse,
        )

    async def get_subtitles(self, video_id: str, *, language: Optional[str] = None) -> YouTubeSubtitleResponse:
        """Issue ``GET /v1/youtube/subtitles`` for ``video_id`` and ``language``."""
        return await self._http.request(
            "GET", "/v1/youtube/subtitles",
            params={"video_id": video_id, "language": language},
            response_model=YouTubeSubtitleResponse,
        )

    async def queue_search_crawl(self, query: str, *, page: Optional[int] = None) -> dict:
        """Issue ``POST /v1/youtube/search/crawl`` and return the JSON object as a dict.

        ``query`` and ``page`` are sent as the JSON body. The response is
        validated only as a ``dict[str, Any]`` and unwrapped before return.
        """
        from typing import Any

        from pydantic import RootModel

        # Local root model so the untyped payload can go through the same
        # ``response_model`` mechanism as the typed endpoints.
        class _R(RootModel[dict[str, Any]]):
            pass

        result = await self._http.request(
            "POST", "/v1/youtube/search/crawl",
            json={"query": query, "page": page},
            response_model=_R,
        )
        return result.root
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Any, Literal, Optional, Union
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# ---- Fetch ----
|
|
7
|
+
|
|
8
|
+
class FetchSession(BaseModel):
    # Single required string field; this model is the type of
    # ``FetchRequest.session``.
    id: str
|
|
10
|
+
|
|
11
|
+
class FetchRequest(BaseModel):
    # Request model: ``url`` is the only required field, every other field
    # defaults to ``None``.

    url: str

    # Optional switches and selectors.
    render_js: Optional[bool] = None
    device: Optional[Literal["desktop", "mobile", "tablet"]] = None
    session: Optional[FetchSession] = None

    # Free-form collections; element and value shapes are not constrained here.
    output: Optional[list[str]] = None
    extract: Optional[dict[str, Any]] = None
    actions: Optional[list[Any]] = None

    # NOTE(review): the unit of ``timeout`` is not visible in this file — confirm.
    timeout: Optional[int] = None
    proxy: Optional[str] = None
    country: Optional[str] = None
|
|
22
|
+
|
|
23
|
+
class FetchResponse(BaseModel):
    # Response model: everything except ``diagnostics`` is required.

    request_id: str
    url: str
    status_code: int

    # String-keyed mapping with unconstrained values.
    outputs: dict[str, Any]

    # Optional string-keyed mapping; ``None`` when absent.
    diagnostics: Optional[dict[str, Any]] = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---- Jobs ----
|
|
32
|
+
|
|
33
|
+
# Closed set of job status strings.
JobStatus = Literal[
    "queued",
    "running",
    "completed",
    "partial",
    "failed",
    "cancelled",
]
|
|
34
|
+
|
|
35
|
+
class CreateJobRequest(BaseModel):
    # Job creation request: ``job_type`` and ``payload`` are required,
    # ``webhook_url`` is optional.

    job_type: str

    # String-keyed mapping with unconstrained values.
    payload: dict[str, Any]

    webhook_url: Optional[str] = None
|
|
39
|
+
|
|
40
|
+
class Job(BaseModel):
    # Job record; also the base class of ``JobResult``.

    request_id: str
    job_id: str
    job_type: str

    # NOTE(review): typed as a plain ``str`` although the ``JobStatus`` literal
    # is defined in this module — confirm whether that is deliberate.
    status: str

    # Timestamps are kept as strings; no datetime parsing is declared here.
    created_at: str
    updated_at: Optional[str] = None

    webhook_url: Optional[str] = None
|
|
48
|
+
|
|
49
|
+
class JobError(BaseModel):
    # Error detail with two required strings; this model is the type of
    # ``JobResult.error``.
    code: str
    message: str
|
|
52
|
+
|
|
53
|
+
class JobResult(Job):
    # ``Job`` plus two optional fields, both defaulting to ``None``.

    # Unconstrained payload.
    result: Optional[Any] = None

    # Structured error detail.
    error: Optional[JobError] = None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ---- Google ----
|
|
59
|
+
|
|
60
|
+
# Allowed values for ``GoogleSearchParams.search_type``.
GoogleSearchType = Literal[
    "classic",
    "news",
    "maps",
    "images",
    "lens",
    "shopping",
    "ai_mode",
    "ads",
]

# Allowed values for ``GoogleSearchParams.device``.
GoogleDevice = Literal["desktop", "mobile"]

# Allowed values for ``GoogleSearchParams.date_range``.
GoogleDateRange = Literal[
    "past_hour",
    "past_day",
    "past_week",
    "past_month",
    "past_year",
]

# Allowed values for ``GoogleSearchParams.sort_by``.
GoogleSortBy = Literal[
    "relevance",
    "reviews",
    "price_asc",
    "price_desc",
]
|
|
64
|
+
|
|
65
|
+
class GoogleSearchParams(BaseModel):
    # Search parameters: ``search`` is required, every other field defaults
    # to ``None``.

    search: str

    # Fields restricted to the literal aliases declared in this module.
    search_type: Optional[GoogleSearchType] = None
    country_code: Optional[str] = None
    language: Optional[str] = None
    device: Optional[GoogleDevice] = None

    # Accepts either an int or a string.
    page: Optional[Union[int, str]] = None
    date_range: Optional[GoogleDateRange] = None

    # Each of these accepts either a float or a string.
    latitude: Optional[Union[float, str]] = None
    longitude: Optional[Union[float, str]] = None
    radius: Optional[Union[float, str]] = None

    sort_by: Optional[GoogleSortBy] = None
|
|
77
|
+
|
|
78
|
+
class GoogleSearchResponse(BaseModel):
    # Search response: ``request_id`` and ``results`` are required.

    request_id: str

    # Items are not modeled; any element type is accepted.
    results: list[Any]

    # Optional string-keyed mapping; ``None`` when absent.
    pagination: Optional[dict[str, Any]] = None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ---- Amazon ----
|
|
85
|
+
|
|
86
|
+
class AmazonProductResponse(BaseModel):
    # Product model; also the element type of ``AmazonSearchResponse.results``.
    # Required: provider, asin, title, url. Everything else defaults to None.

    provider: str
    asin: str
    title: str

    brand: Optional[str] = None
    price: Optional[float] = None
    currency: Optional[str] = None
    availability: Optional[str] = None
    rating: Optional[float] = None
    review_count: Optional[int] = None
    images: Optional[list[str]] = None
    bullet_points: Optional[list[str]] = None

    url: str

    # Keep fields that are not declared above instead of dropping them.
    model_config = {"extra": "allow"}
|
|
100
|
+
|
|
101
|
+
class AmazonSearchResponse(BaseModel):
    # Search response; both fields are required.
    request_id: str

    # Each element is validated as an ``AmazonProductResponse``.
    results: list[AmazonProductResponse]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ---- Walmart ----
|
|
107
|
+
|
|
108
|
+
class WalmartProductResponse(BaseModel):
    # Product model; also the element type of ``WalmartSearchResponse.results``.
    # Required: provider, product_id, title, url. The rest defaults to None.

    provider: str
    product_id: str
    title: str

    brand: Optional[str] = None
    price: Optional[float] = None
    availability: Optional[str] = None

    url: str

    # Keep fields that are not declared above instead of dropping them.
    model_config = {"extra": "allow"}
|
|
117
|
+
|
|
118
|
+
class WalmartSearchResponse(BaseModel):
    # Search response; both fields are required.
    request_id: str

    # Each element is validated as a ``WalmartProductResponse``.
    results: list[WalmartProductResponse]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# ---- YouTube ----
|
|
124
|
+
|
|
125
|
+
class YouTubeVideoResponse(BaseModel):
    # Video response; both fields are required.
    request_id: str

    # String-keyed mapping with unconstrained values; the video's fields are
    # not modeled individually.
    video: dict[str, Any]
|
|
128
|
+
|
|
129
|
+
class YouTubeSearchResponse(BaseModel):
    # Search response; both fields are required.
    request_id: str

    # Items are not modeled; any element type is accepted.
    results: list[Any]
|
|
132
|
+
|
|
133
|
+
class YouTubeSubtitleResponse(BaseModel):
    # Subtitle response; all three fields are required.
    request_id: str
    video_id: str

    # Items are not modeled; any element type is accepted.
    subtitles: list[Any]
|
|
File without changes
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SDK integration tests — require a running API stack.
|
|
3
|
+
|
|
4
|
+
Set WDA_API_KEY and WDA_BASE_URL, then run:
|
|
5
|
+
WDA_API_KEY=sk-test WDA_BASE_URL=http://localhost:3000 pytest tests/
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import pytest
|
|
10
|
+
import pytest_asyncio
|
|
11
|
+
|
|
12
|
+
# Connection settings are read from the environment once, at import time.
API_KEY = os.getenv("WDA_API_KEY", "")
BASE_URL = os.getenv("WDA_BASE_URL", "http://localhost:3000")

# Marker for tests that need a real key: they are skipped when WDA_API_KEY is
# unset or empty.
skip_if_no_key = pytest.mark.skipif(
    not API_KEY,
    reason="WDA_API_KEY not set",
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ---- Sync client tests ----
|
|
19
|
+
|
|
20
|
+
@skip_if_no_key
class TestSyncFetch:
    """Fetch endpoint check using the synchronous client."""

    def setup_method(self):
        # NOTE(review): imports target ``webdataapi`` while this distribution
        # ships its code under ``src/scrapio`` — confirm the package name.
        from webdataapi import ApiClient

        self.client = ApiClient(base_url=BASE_URL, api_key=API_KEY)

    def teardown_method(self):
        # Release the client created in setup_method.
        self.client.close()

    def test_fetch_returns_markdown(self):
        """A markdown fetch returns a request id, status 200 and a markdown output."""
        from webdataapi import FetchRequest

        request = FetchRequest(url="https://example.com", output=["markdown"])
        res = self.client.fetch.fetch(request)

        assert res.request_id
        assert res.status_code == 200
        assert "markdown" in res.outputs
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@skip_if_no_key
class TestSyncJobs:
    """Job creation, lookup and waiting using the synchronous client."""

    def setup_method(self):
        # NOTE(review): imports target ``webdataapi`` while this distribution
        # ships its code under ``src/scrapio`` — confirm the package name.
        from webdataapi import ApiClient

        self.client = ApiClient(base_url=BASE_URL, api_key=API_KEY)

    def teardown_method(self):
        # Release the client created in setup_method.
        self.client.close()

    def test_create_and_poll_job(self):
        """A created job has an id and a non-failed status, and can be fetched by id."""
        from webdataapi import CreateJobRequest

        request = CreateJobRequest(
            job_type="fetch",
            payload={"url": "https://example.com", "output": ["markdown"]},
        )
        created = self.client.jobs.create(request)
        assert created.job_id
        assert created.status in ("queued", "running", "completed", "partial")

        fetched = self.client.jobs.get(created.job_id)
        assert fetched.job_id == created.job_id

    def test_wait_for_completion(self):
        """Waiting on a job ends in one of the four finished statuses."""
        from webdataapi import CreateJobRequest

        request = CreateJobRequest(
            job_type="fetch",
            payload={"url": "https://example.com", "output": ["markdown"]},
        )
        created = self.client.jobs.create(request)

        finished = self.client.jobs.wait_for_completion(
            created.job_id,
            poll_interval=1.0,
            timeout=60.0,
        )
        assert finished.status in ("completed", "partial", "failed", "cancelled")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@skip_if_no_key
class TestSyncGoogle:
    """Google search check using the synchronous client."""

    def setup_method(self):
        # NOTE(review): imports target ``webdataapi`` while this distribution
        # ships its code under ``src/scrapio`` — confirm the package name.
        from webdataapi import ApiClient

        self.client = ApiClient(base_url=BASE_URL, api_key=API_KEY)

    def teardown_method(self):
        # Release the client created in setup_method.
        self.client.close()

    def test_search_returns_results(self):
        """A search returns a request id and a list of results."""
        from webdataapi import GoogleSearchParams

        params = GoogleSearchParams(search="python web scraping")
        res = self.client.google.search(params)

        assert res.request_id
        assert isinstance(res.results, list)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@skip_if_no_key
class TestSyncAmazon:
    """Amazon search check using the synchronous client."""

    def setup_method(self):
        # NOTE(review): imports target ``webdataapi`` while this distribution
        # ships its code under ``src/scrapio`` — confirm the package name.
        from webdataapi import ApiClient

        self.client = ApiClient(base_url=BASE_URL, api_key=API_KEY)

    def teardown_method(self):
        # Release the client created in setup_method.
        self.client.close()

    def test_search_amazon(self):
        """A search returns a request id and a list of results."""
        amazon = self.client.amazon
        res = amazon.search("laptop")

        assert res.request_id
        assert isinstance(res.results, list)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@skip_if_no_key
class TestSyncWalmart:
    """Walmart search check using the synchronous client."""

    def setup_method(self):
        # NOTE(review): imports target ``webdataapi`` while this distribution
        # ships its code under ``src/scrapio`` — confirm the package name.
        from webdataapi import ApiClient

        self.client = ApiClient(base_url=BASE_URL, api_key=API_KEY)

    def teardown_method(self):
        # Release the client created in setup_method.
        self.client.close()

    def test_search_walmart(self):
        """A search returns a request id and a list of results."""
        walmart = self.client.walmart
        res = walmart.search("headphones")

        assert res.request_id
        assert isinstance(res.results, list)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@skip_if_no_key
class TestSyncYouTube:
    """YouTube search check using the synchronous client."""

    def setup_method(self):
        # NOTE(review): imports target ``webdataapi`` while this distribution
        # ships its code under ``src/scrapio`` — confirm the package name.
        from webdataapi import ApiClient

        self.client = ApiClient(base_url=BASE_URL, api_key=API_KEY)

    def teardown_method(self):
        # Release the client created in setup_method.
        self.client.close()

    def test_search_youtube(self):
        """A search returns a request id and a list of results."""
        youtube = self.client.youtube
        res = youtube.search("python tutorial")

        assert res.request_id
        assert isinstance(res.results, list)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ---- Error types (sync) ----
|
|
128
|
+
|
|
129
|
+
class TestSyncErrors:
    """Error-type checks for the synchronous client.

    This class carries no skip marker. The first test sends a request to
    ``BASE_URL`` with a deliberately invalid key; the other two only construct
    error objects and need no server.
    """

    # NOTE(review): imports target ``webdataapi`` while this distribution ships
    # its code under ``src/scrapio`` — confirm the package name.

    def test_auth_error_on_bad_key(self):
        """An invalid key raises ``AuthError`` carrying status code 401."""
        from webdataapi import ApiClient, AuthError, FetchRequest

        client = ApiClient(api_key="invalid", base_url=BASE_URL)
        try:
            with pytest.raises(AuthError) as exc_info:
                client.fetch.fetch(FetchRequest(url="https://example.com"))
        finally:
            # Always release the client, even if the expected error never
            # shows up and pytest.raises fails the test.
            client.close()
        assert exc_info.value.status_code == 401

    def test_credits_exhausted_error_is_catchable(self):
        """``CreditsExhaustedError`` reports 402 and can be raised and caught."""
        from webdataapi import CreditsExhaustedError

        body = {"request_id": "r1", "error": {"code": "credits_exhausted", "message": "no credits"}}
        err = CreditsExhaustedError(body)
        assert err.status_code == 402
        # Raise and catch it for real; an isinstance check against the
        # object's own class would pass unconditionally.
        with pytest.raises(CreditsExhaustedError):
            raise err

    def test_rate_limit_error_is_catchable(self):
        """``RateLimitError`` reports 429 and can be raised and caught."""
        from webdataapi import RateLimitError

        body = {"request_id": "r2", "error": {"code": "rate_limited", "message": "slow down"}}
        err = RateLimitError(body)
        assert err.status_code == 429
        # Raise and catch it for real; an isinstance check against the
        # object's own class would pass unconditionally.
        with pytest.raises(RateLimitError):
            raise err
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---- Async client tests ----
|
|
153
|
+
|
|
154
|
+
@skip_if_no_key
@pytest.mark.asyncio
class TestAsyncFetch:
    """Fetch, job-wait and auth-error checks using the asynchronous client."""

    # NOTE(review): imports target ``webdataapi`` while this distribution ships
    # its code under ``src/scrapio`` — confirm the package name.

    async def test_fetch_returns_markdown(self):
        """A markdown fetch returns a request id and status 200."""
        from webdataapi import AsyncApiClient, FetchRequest

        async with AsyncApiClient(base_url=BASE_URL, api_key=API_KEY) as client:
            request = FetchRequest(url="https://example.com", output=["markdown"])
            res = await client.fetch.fetch(request)

            assert res.request_id
            assert res.status_code == 200

    async def test_wait_for_completion_async(self):
        """Waiting on a job ends in one of the four finished statuses."""
        from webdataapi import AsyncApiClient, CreateJobRequest

        async with AsyncApiClient(base_url=BASE_URL, api_key=API_KEY) as client:
            request = CreateJobRequest(
                job_type="fetch",
                payload={"url": "https://example.com", "output": ["markdown"]},
            )
            created = await client.jobs.create(request)

            finished = await client.jobs.wait_for_completion(
                created.job_id,
                poll_interval=1.0,
                timeout=60.0,
            )
            assert finished.status in ("completed", "partial", "failed", "cancelled")

    async def test_auth_error_async(self):
        """An invalid key makes the fetch call raise ``AuthError``."""
        from webdataapi import AsyncApiClient, AuthError, FetchRequest

        async with AsyncApiClient(base_url=BASE_URL, api_key="invalid") as client:
            with pytest.raises(AuthError):
                request = FetchRequest(url="https://example.com")
                await client.fetch.fetch(request)
|