scrapio-py 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapio/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ from .client import ApiClient, AsyncApiClient
2
+ from .errors import ApiError, AuthError, RateLimitError, CreditsExhaustedError
3
+ from .types import (
4
+ FetchRequest, FetchResponse,
5
+ CreateJobRequest, Job, JobResult,
6
+ GoogleSearchParams, GoogleSearchResponse,
7
+ AmazonProductResponse, AmazonSearchResponse,
8
+ WalmartProductResponse, WalmartSearchResponse,
9
+ YouTubeVideoResponse, YouTubeSearchResponse, YouTubeSubtitleResponse,
10
+ )
11
+
12
# Public API of the package. Every name listed here is imported at the top of
# this module from .client, .errors or .types, so `from scrapio import X`
# works for each of them.
__all__ = [
    # Clients
    "ApiClient", "AsyncApiClient",
    # Exceptions
    "ApiError", "AuthError", "RateLimitError", "CreditsExhaustedError",
    # Request / response models
    "FetchRequest", "FetchResponse",
    "CreateJobRequest", "Job", "JobResult",
    "GoogleSearchParams", "GoogleSearchResponse",
    "AmazonProductResponse", "AmazonSearchResponse",
    "WalmartProductResponse", "WalmartSearchResponse",
    "YouTubeVideoResponse", "YouTubeSearchResponse", "YouTubeSubtitleResponse",
]
scrapio/_http.py ADDED
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import time
5
+ from typing import Any, Optional, Type, TypeVar
6
+
7
+ import httpx
8
+ from pydantic import BaseModel
9
+
10
+ from .errors import ApiError, AuthError, CreditsExhaustedError, RateLimitError
11
+
12
# Type variable for the pydantic model a successful response is validated into.
T = TypeVar("T", bound=BaseModel)

# HTTP status codes for which the clients below sleep and retry the request.
RETRYABLE_STATUS = {429, 503}
# Default timeout handed to httpx for each request.
DEFAULT_TIMEOUT = 30.0
# Default number of retries after the first attempt (max_retries + 1 attempts in total).
DEFAULT_MAX_RETRIES = 3
17
+
18
+
19
def _raise_for_status(status_code: int, body: dict[str, Any]) -> None:
    """Raise the exception type that matches a failed API response.

    Args:
        status_code: HTTP status of the failed response.
        body: Decoded error payload, expected to look like
            ``{"request_id": ..., "error": {"code": ..., "message": ...}}``.
            Payloads of any other shape are normalised first so that the
            real API failure is reported instead of an ``AttributeError``.

    Raises:
        AuthError: for status 401.
        RateLimitError: for status 429.
        CreditsExhaustedError: for status 402 or error code
            ``"credits_exhausted"``.
        ApiError: for every other status. This function never returns.
    """
    if not isinstance(body, dict):
        # e.g. a JSON list or bare string returned by a proxy or gateway.
        body = {"request_id": "", "error": {"code": "unknown", "message": str(body)}}

    error = body.get("error")
    if not isinstance(error, dict):
        # "error" may be missing, null, or a plain string; rebuild it as a
        # mapping so the exception constructors can read code and message.
        message = error if isinstance(error, str) and error else "Unknown error"
        error = {"code": "unknown", "message": message}
        body = {**body, "error": error}

    code = error.get("code", "")
    if status_code == 401:
        raise AuthError(body)
    if status_code == 429:
        raise RateLimitError(body)
    if status_code == 402 or code == "credits_exhausted":
        raise CreditsExhaustedError(body)
    raise ApiError(status_code, body)
28
+
29
+
30
class SyncHttpClient:
    """Blocking HTTP transport used by the synchronous resource classes.

    Wraps an ``httpx.Client`` configured with the API base URL and a bearer
    token. Responses whose status is in ``RETRYABLE_STATUS`` are retried with
    exponential backoff; successful JSON payloads are validated into the
    requested pydantic model; everything else is turned into an exception by
    ``_raise_for_status``.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Create the underlying ``httpx.Client``.

        Args:
            base_url: API origin; a trailing slash is stripped.
            api_key: Sent as ``Authorization: Bearer <api_key>``.
            timeout: Default timeout applied to every request.
            max_retries: Retries after the first attempt. Negative values are
                treated as 0 so that at least one request is always sent.
        """
        self._base_url = base_url.rstrip("/")
        self._headers = {"Authorization": f"Bearer {api_key}"}
        self._timeout = timeout
        # A negative count would make the request loop run zero times.
        self._max_retries = max(0, max_retries)
        self._client = httpx.Client(
            base_url=self._base_url,
            headers=self._headers,
            timeout=self._timeout,
        )

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()

    def __enter__(self) -> "SyncHttpClient":
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()

    @staticmethod
    def _error_body(res: httpx.Response) -> dict[str, Any]:
        """Return the payload of a failed response as a dict.

        Falls back to a synthetic ``unknown`` error carrying the raw response
        text when the body is not JSON or is JSON of another shape.
        """
        try:
            parsed = res.json()
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        return {"request_id": "", "error": {"code": "unknown", "message": res.text}}

    def request(
        self,
        method: str,
        path: str,
        *,
        params: Optional[dict[str, Any]] = None,
        json: Optional[Any] = None,
        response_model: Type[T],
        timeout: Optional[float] = None,
    ) -> T:
        """Send one API request, retrying retryable statuses.

        Args:
            method: HTTP method.
            path: Path relative to the base URL.
            params: Query parameters; entries whose value is ``None`` are dropped.
            json: JSON request body, passed through unchanged.
            response_model: Pydantic model used to validate a successful body.
            timeout: Per-request override; ``None`` means use the client default.

        Returns:
            The validated ``response_model`` instance.

        Raises:
            ApiError: (or a subclass) for a non-success response once retries
                are exhausted or the status is not retryable.
        """
        clean_params = {k: v for k, v in (params or {}).items() if v is not None}
        # Compare against None so an explicit falsy override is not discarded.
        effective_timeout = self._timeout if timeout is None else timeout

        for attempt in range(self._max_retries + 1):
            res = self._client.request(
                method,
                path,
                params=clean_params or None,
                json=json,
                timeout=effective_timeout,
            )
            if res.is_success:
                return response_model.model_validate(res.json())

            if res.status_code in RETRYABLE_STATUS and attempt < self._max_retries:
                # Exponential backoff: 1s, 2s, 4s, ... capped at 8s.
                time.sleep(min(1.0 * (2**attempt), 8.0))
                continue

            _raise_for_status(res.status_code, self._error_body(res))

        raise RuntimeError("Unexpected end of retry loop")  # unreachable
94
+
95
+
96
class AsyncHttpClient:
    """Asyncio HTTP transport used by the asynchronous resource classes.

    Wraps an ``httpx.AsyncClient`` configured with the API base URL and a
    bearer token. Responses whose status is in ``RETRYABLE_STATUS`` are
    retried with exponential backoff; successful JSON payloads are validated
    into the requested pydantic model; everything else is turned into an
    exception by ``_raise_for_status``.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Create the underlying ``httpx.AsyncClient``.

        Args:
            base_url: API origin; a trailing slash is stripped.
            api_key: Sent as ``Authorization: Bearer <api_key>``.
            timeout: Default timeout applied to every request.
            max_retries: Retries after the first attempt. Negative values are
                treated as 0 so that at least one request is always sent.
        """
        self._base_url = base_url.rstrip("/")
        self._headers = {"Authorization": f"Bearer {api_key}"}
        self._timeout = timeout
        # A negative count would make the request loop run zero times.
        self._max_retries = max(0, max_retries)
        self._client = httpx.AsyncClient(
            base_url=self._base_url,
            headers=self._headers,
            timeout=self._timeout,
        )

    async def aclose(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._client.aclose()

    async def __aenter__(self) -> "AsyncHttpClient":
        return self

    async def __aexit__(self, *args: Any) -> None:
        await self.aclose()

    @staticmethod
    def _error_body(res: httpx.Response) -> dict[str, Any]:
        """Return the payload of a failed response as a dict.

        Falls back to a synthetic ``unknown`` error carrying the raw response
        text when the body is not JSON or is JSON of another shape.
        """
        try:
            parsed = res.json()
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        return {"request_id": "", "error": {"code": "unknown", "message": res.text}}

    async def request(
        self,
        method: str,
        path: str,
        *,
        params: Optional[dict[str, Any]] = None,
        json: Optional[Any] = None,
        response_model: Type[T],
        timeout: Optional[float] = None,
    ) -> T:
        """Send one API request, retrying retryable statuses.

        Args:
            method: HTTP method.
            path: Path relative to the base URL.
            params: Query parameters; entries whose value is ``None`` are dropped.
            json: JSON request body, passed through unchanged.
            response_model: Pydantic model used to validate a successful body.
            timeout: Per-request override; ``None`` means use the client default.

        Returns:
            The validated ``response_model`` instance.

        Raises:
            ApiError: (or a subclass) for a non-success response once retries
                are exhausted or the status is not retryable.
        """
        clean_params = {k: v for k, v in (params or {}).items() if v is not None}
        # Compare against None so an explicit falsy override is not discarded.
        effective_timeout = self._timeout if timeout is None else timeout

        for attempt in range(self._max_retries + 1):
            res = await self._client.request(
                method,
                path,
                params=clean_params or None,
                json=json,
                timeout=effective_timeout,
            )
            if res.is_success:
                return response_model.model_validate(res.json())

            if res.status_code in RETRYABLE_STATUS and attempt < self._max_retries:
                # Exponential backoff: 1s, 2s, 4s, ... capped at 8s.
                await asyncio.sleep(min(1.0 * (2**attempt), 8.0))
                continue

            _raise_for_status(res.status_code, self._error_body(res))

        raise RuntimeError("Unexpected end of retry loop")  # unreachable
scrapio/client.py ADDED
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+ from typing import Optional
3
+ from ._http import SyncHttpClient, AsyncHttpClient, DEFAULT_TIMEOUT, DEFAULT_MAX_RETRIES
4
+ from .resources.fetch import FetchResource, AsyncFetchResource
5
+ from .resources.jobs import JobsResource, AsyncJobsResource
6
+ from .resources.google import GoogleResource, AsyncGoogleResource
7
+ from .resources.amazon import AmazonResource, AsyncAmazonResource
8
+ from .resources.walmart import WalmartResource, AsyncWalmartResource
9
+ from .resources.youtube import YouTubeResource, AsyncYouTubeResource
10
+
11
# API origin used by ApiClient / AsyncApiClient when no base_url is passed.
# NOTE(review): this host is on a different domain from the scrapio.dev links
# in the package documentation — confirm it is the intended production endpoint.
DEFAULT_BASE_URL = "https://api.webdataapi.com"
12
+
13
+
14
class ApiClient:
    """Synchronous SDK entry point.

    Builds one ``SyncHttpClient`` and hands it to every resource object, so
    all resources share the same connection pool, credentials and retry
    settings. Usable as a context manager; leaving the ``with`` block closes
    the shared transport.
    """

    fetch: FetchResource
    jobs: JobsResource
    google: GoogleResource
    amazon: AmazonResource
    walmart: WalmartResource
    youtube: YouTubeResource

    def __init__(
        self,
        api_key: str,
        *,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Create the shared transport and one resource object per API area."""
        transport = SyncHttpClient(
            base_url,
            api_key,
            timeout=timeout,
            max_retries=max_retries,
        )
        self._http = transport

        # Every resource talks through the same transport instance.
        self.fetch = FetchResource(transport)
        self.jobs = JobsResource(transport)
        self.google = GoogleResource(transport)
        self.amazon = AmazonResource(transport)
        self.walmart = WalmartResource(transport)
        self.youtube = YouTubeResource(transport)

    def close(self) -> None:
        """Close the shared HTTP transport."""
        self._http.close()

    def __enter__(self) -> "ApiClient":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()
47
+
48
+
49
class AsyncApiClient:
    """Asynchronous SDK entry point.

    Builds one ``AsyncHttpClient`` and hands it to every async resource
    object, so all resources share the same connection pool, credentials and
    retry settings. Usable with ``async with``; leaving the block closes the
    shared transport.
    """

    fetch: AsyncFetchResource
    jobs: AsyncJobsResource
    google: AsyncGoogleResource
    amazon: AsyncAmazonResource
    walmart: AsyncWalmartResource
    youtube: AsyncYouTubeResource

    def __init__(
        self,
        api_key: str,
        *,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Create the shared transport and one resource object per API area."""
        transport = AsyncHttpClient(
            base_url,
            api_key,
            timeout=timeout,
            max_retries=max_retries,
        )
        self._http = transport

        # Every resource talks through the same transport instance.
        self.fetch = AsyncFetchResource(transport)
        self.jobs = AsyncJobsResource(transport)
        self.google = AsyncGoogleResource(transport)
        self.amazon = AsyncAmazonResource(transport)
        self.walmart = AsyncWalmartResource(transport)
        self.youtube = AsyncYouTubeResource(transport)

    async def aclose(self) -> None:
        """Close the shared HTTP transport."""
        await self._http.aclose()

    async def __aenter__(self) -> "AsyncApiClient":
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        await self.aclose()
scrapio/errors.py ADDED
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+ from typing import Any
3
+
4
+
5
class ApiError(Exception):
    """Base class for every error response returned by the API.

    The exception message is the ``error.message`` field of the response
    body. Malformed bodies (not a dict, or an ``error`` entry that is null,
    missing or a plain string) are tolerated so that constructing the
    exception never hides the original failure behind an ``AttributeError``.
    """

    status_code: int  # HTTP status of the failed response
    request_id: str  # "request_id" from the body, "" when absent
    code: str  # "error.code" from the body, "unknown" when absent

    def __init__(self, status_code: int, body: dict[str, Any]) -> None:
        """Build the error from an HTTP status and the decoded error body.

        Args:
            status_code: HTTP status of the failed response.
            body: Decoded payload, normally
                ``{"request_id": ..., "error": {"code": ..., "message": ...}}``.
        """
        payload = body if isinstance(body, dict) else {}
        error = payload.get("error")
        if isinstance(error, dict):
            message = error.get("message", "Unknown error")
            code = error.get("code", "unknown")
        else:
            # Some gateways send {"error": "text"}; keep that text as the message.
            message = error if isinstance(error, str) and error else "Unknown error"
            code = "unknown"

        super().__init__(message)
        self.status_code = status_code
        self.request_id = payload.get("request_id", "")
        self.code = code


class AuthError(ApiError):
    """Raised for HTTP 401 responses."""

    def __init__(self, body: dict[str, Any]) -> None:
        super().__init__(401, body)


class RateLimitError(ApiError):
    """Raised for HTTP 429 responses once retries are exhausted."""

    def __init__(self, body: dict[str, Any]) -> None:
        super().__init__(429, body)


class CreditsExhaustedError(ApiError):
    """Raised for HTTP 402 or the ``credits_exhausted`` error code.

    ``status_code`` is always reported as 402.
    """

    def __init__(self, body: dict[str, Any]) -> None:
        super().__init__(402, body)
File without changes
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+ from typing import Optional
3
+ from .._http import SyncHttpClient, AsyncHttpClient
4
+ from ..types import AmazonProductResponse, AmazonSearchResponse
5
+
6
+
7
class AmazonResource:
    """Synchronous wrapper around the ``/v1/amazon`` endpoints."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def get_product(self, asin: str, *, country: Optional[str] = None) -> AmazonProductResponse:
        """GET ``/v1/amazon/product`` for one ASIN, optionally scoped to a country."""
        query_params = {"asin": asin, "country": country}
        return self._http.request(
            "GET",
            "/v1/amazon/product",
            params=query_params,
            response_model=AmazonProductResponse,
        )

    def search(self, query: str, *, country: Optional[str] = None, page: Optional[int] = None) -> AmazonSearchResponse:
        """GET ``/v1/amazon/search`` for a query, with optional country and page."""
        query_params = {"query": query, "country": country, "page": page}
        return self._http.request(
            "GET",
            "/v1/amazon/search",
            params=query_params,
            response_model=AmazonSearchResponse,
        )

    def queue_search_crawl(self, query: str, *, country: Optional[str] = None) -> dict:
        """GET ``/v1/amazon/search/crawl`` and return the JSON object as a plain dict."""
        from typing import Any

        from pydantic import RootModel

        # The transport validates into a pydantic model, so wrap an arbitrary
        # JSON object in a throwaway root model and unwrap it afterwards.
        class _R(RootModel[dict[str, Any]]):
            pass

        query_params = {"query": query, "country": country}
        wrapped = self._http.request(
            "GET",
            "/v1/amazon/search/crawl",
            params=query_params,
            response_model=_R,
        )
        return wrapped.root
35
+
36
+
37
class AsyncAmazonResource:
    """Asynchronous wrapper around the ``/v1/amazon`` endpoints.

    Offers the same methods as ``AmazonResource``, including
    ``queue_search_crawl``, which was previously available only on the
    synchronous class.
    """

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def get_product(self, asin: str, *, country: Optional[str] = None) -> AmazonProductResponse:
        """GET ``/v1/amazon/product`` for one ASIN, optionally scoped to a country."""
        return await self._http.request(
            "GET", "/v1/amazon/product",
            params={"asin": asin, "country": country},
            response_model=AmazonProductResponse,
        )

    async def search(self, query: str, *, country: Optional[str] = None, page: Optional[int] = None) -> AmazonSearchResponse:
        """GET ``/v1/amazon/search`` for a query, with optional country and page."""
        return await self._http.request(
            "GET", "/v1/amazon/search",
            params={"query": query, "country": country, "page": page},
            response_model=AmazonSearchResponse,
        )

    async def queue_search_crawl(self, query: str, *, country: Optional[str] = None) -> dict:
        """GET ``/v1/amazon/search/crawl`` and return the JSON object as a plain dict.

        Uses the same method, path and parameters as
        ``AmazonResource.queue_search_crawl``.
        """
        from typing import Any

        from pydantic import RootModel

        # The transport validates into a pydantic model, so wrap an arbitrary
        # JSON object in a throwaway root model and unwrap it afterwards.
        class _R(RootModel[dict[str, Any]]):
            pass

        result = await self._http.request(
            "GET", "/v1/amazon/search/crawl",
            params={"query": query, "country": country},
            response_model=_R,
        )
        return result.root
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+ from .._http import SyncHttpClient, AsyncHttpClient
3
+ from ..types import FetchRequest, FetchResponse
4
+
5
+
6
class FetchResource:
    """Synchronous wrapper around ``POST /v1/fetch``."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def fetch(self, request: FetchRequest, *, timeout: float | None = None) -> FetchResponse:
        """Submit a fetch request.

        Fields of ``request`` that are ``None`` are left out of the JSON body.
        ``timeout`` is forwarded to the transport as a per-request override.
        """
        payload = request.model_dump(exclude_none=True)
        return self._http.request(
            "POST",
            "/v1/fetch",
            json=payload,
            response_model=FetchResponse,
            timeout=timeout,
        )
17
+
18
+
19
class AsyncFetchResource:
    """Asynchronous wrapper around ``POST /v1/fetch``."""

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def fetch(self, request: FetchRequest, *, timeout: float | None = None) -> FetchResponse:
        """Submit a fetch request.

        Fields of ``request`` that are ``None`` are left out of the JSON body.
        ``timeout`` is forwarded to the transport as a per-request override.
        """
        payload = request.model_dump(exclude_none=True)
        response = await self._http.request(
            "POST",
            "/v1/fetch",
            json=payload,
            response_model=FetchResponse,
            timeout=timeout,
        )
        return response
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+ from .._http import SyncHttpClient, AsyncHttpClient
3
+ from ..types import GoogleSearchParams, GoogleSearchResponse
4
+
5
+
6
class GoogleResource:
    """Synchronous wrapper around ``GET /v1/google/search``."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def search(self, params: GoogleSearchParams) -> GoogleSearchResponse:
        """Run a search; unset (``None``) fields of ``params`` are not sent."""
        query_params = params.model_dump(exclude_none=True)
        return self._http.request(
            "GET",
            "/v1/google/search",
            params=query_params,
            response_model=GoogleSearchResponse,
        )
16
+
17
+
18
class AsyncGoogleResource:
    """Asynchronous wrapper around ``GET /v1/google/search``."""

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def search(self, params: GoogleSearchParams) -> GoogleSearchResponse:
        """Run a search; unset (``None``) fields of ``params`` are not sent."""
        query_params = params.model_dump(exclude_none=True)
        response = await self._http.request(
            "GET",
            "/v1/google/search",
            params=query_params,
            response_model=GoogleSearchResponse,
        )
        return response
@@ -0,0 +1,73 @@
1
+ from __future__ import annotations
2
+ import asyncio
3
+ import time
4
+ from .._http import SyncHttpClient, AsyncHttpClient
5
+ from ..types import CreateJobRequest, Job, JobResult
6
+
7
# Job statuses after which a job no longer changes; polling stops on any of them.
TERMINAL = {"completed", "partial", "failed", "cancelled"}


class JobsResource:
    """Synchronous wrapper around the ``/v1/jobs`` endpoints."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def create(self, request: CreateJobRequest) -> Job:
        """POST ``/v1/jobs``; ``None`` fields of ``request`` are left out of the body."""
        return self._http.request(
            "POST", "/v1/jobs",
            json=request.model_dump(exclude_none=True),
            response_model=Job,
        )

    def get(self, job_id: str) -> Job:
        """GET ``/v1/jobs/{job_id}``."""
        return self._http.request("GET", f"/v1/jobs/{job_id}", response_model=Job)

    def get_result(self, job_id: str) -> JobResult:
        """GET ``/v1/jobs/{job_id}/result``."""
        return self._http.request("GET", f"/v1/jobs/{job_id}/result", response_model=JobResult)

    def wait_for_completion(
        self,
        job_id: str,
        *,
        poll_interval: float = 2.0,
        timeout: float = 300.0,
    ) -> JobResult:
        """Poll a job until its status is in ``TERMINAL`` and return its result.

        The job is always polled at least once, even when ``timeout`` is zero
        or negative, and the wait between polls is shortened so it never runs
        past the deadline.

        Args:
            job_id: Identifier of the job to wait for.
            poll_interval: Seconds to sleep between status checks.
            timeout: Maximum number of seconds to wait overall.

        Returns:
            The result fetched via ``get_result`` once the job is terminal.

        Raises:
            TimeoutError: if the job is still not terminal at the deadline.
        """
        deadline = time.monotonic() + timeout
        while True:
            job = self.get(job_id)
            if job.status in TERMINAL:
                return self.get_result(job_id)

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(f"Job {job_id} did not complete within {timeout}s")
            # Do not oversleep past the deadline.
            time.sleep(min(poll_interval, remaining))
41
+
42
+
43
class AsyncJobsResource:
    """Asynchronous wrapper around the ``/v1/jobs`` endpoints."""

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def create(self, request: CreateJobRequest) -> Job:
        """POST ``/v1/jobs``; ``None`` fields of ``request`` are left out of the body."""
        return await self._http.request(
            "POST", "/v1/jobs",
            json=request.model_dump(exclude_none=True),
            response_model=Job,
        )

    async def get(self, job_id: str) -> Job:
        """GET ``/v1/jobs/{job_id}``."""
        return await self._http.request("GET", f"/v1/jobs/{job_id}", response_model=Job)

    async def get_result(self, job_id: str) -> JobResult:
        """GET ``/v1/jobs/{job_id}/result``."""
        return await self._http.request("GET", f"/v1/jobs/{job_id}/result", response_model=JobResult)

    async def wait_for_completion(
        self,
        job_id: str,
        *,
        poll_interval: float = 2.0,
        timeout: float = 300.0,
    ) -> JobResult:
        """Poll a job until its status is in ``TERMINAL`` and return its result.

        The job is always polled at least once, even when ``timeout`` is zero
        or negative, and the wait between polls is shortened so it never runs
        past the deadline.

        Args:
            job_id: Identifier of the job to wait for.
            poll_interval: Seconds to sleep between status checks.
            timeout: Maximum number of seconds to wait overall.

        Returns:
            The result fetched via ``get_result`` once the job is terminal.

        Raises:
            TimeoutError: if the job is still not terminal at the deadline.
        """
        # Inside a coroutine the running loop is the one whose clock matters.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout
        while True:
            job = await self.get(job_id)
            if job.status in TERMINAL:
                return await self.get_result(job_id)

            remaining = deadline - loop.time()
            if remaining <= 0:
                raise TimeoutError(f"Job {job_id} did not complete within {timeout}s")
            # Do not oversleep past the deadline.
            await asyncio.sleep(min(poll_interval, remaining))
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+ from typing import Optional
3
+ from .._http import SyncHttpClient, AsyncHttpClient
4
+ from ..types import WalmartProductResponse, WalmartSearchResponse
5
+
6
+
7
class WalmartResource:
    """Synchronous wrapper around the ``/v1/walmart`` endpoints."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def get_product(self, product_id: str, *, country: Optional[str] = None) -> WalmartProductResponse:
        """GET ``/v1/walmart/product`` for one product id, optionally scoped to a country."""
        query_params = {"product_id": product_id, "country": country}
        return self._http.request(
            "GET",
            "/v1/walmart/product",
            params=query_params,
            response_model=WalmartProductResponse,
        )

    def search(self, query: str, *, country: Optional[str] = None, page: Optional[int] = None) -> WalmartSearchResponse:
        """GET ``/v1/walmart/search`` for a query, with optional country and page."""
        query_params = {"query": query, "country": country, "page": page}
        return self._http.request(
            "GET",
            "/v1/walmart/search",
            params=query_params,
            response_model=WalmartSearchResponse,
        )

    def queue_search_crawl(self, query: str, *, country: Optional[str] = None) -> dict:
        """GET ``/v1/walmart/search/crawl`` and return the JSON object as a plain dict."""
        from typing import Any

        from pydantic import RootModel

        # The transport validates into a pydantic model, so wrap an arbitrary
        # JSON object in a throwaway root model and unwrap it afterwards.
        class _R(RootModel[dict[str, Any]]):
            pass

        query_params = {"query": query, "country": country}
        wrapped = self._http.request(
            "GET",
            "/v1/walmart/search/crawl",
            params=query_params,
            response_model=_R,
        )
        return wrapped.root
35
+
36
+
37
class AsyncWalmartResource:
    """Asynchronous wrapper around the ``/v1/walmart`` endpoints.

    Offers the same methods as ``WalmartResource``, including
    ``queue_search_crawl``, which was previously available only on the
    synchronous class.
    """

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def get_product(self, product_id: str, *, country: Optional[str] = None) -> WalmartProductResponse:
        """GET ``/v1/walmart/product`` for one product id, optionally scoped to a country."""
        return await self._http.request(
            "GET", "/v1/walmart/product",
            params={"product_id": product_id, "country": country},
            response_model=WalmartProductResponse,
        )

    async def search(self, query: str, *, country: Optional[str] = None, page: Optional[int] = None) -> WalmartSearchResponse:
        """GET ``/v1/walmart/search`` for a query, with optional country and page."""
        return await self._http.request(
            "GET", "/v1/walmart/search",
            params={"query": query, "country": country, "page": page},
            response_model=WalmartSearchResponse,
        )

    async def queue_search_crawl(self, query: str, *, country: Optional[str] = None) -> dict:
        """GET ``/v1/walmart/search/crawl`` and return the JSON object as a plain dict.

        Uses the same method, path and parameters as
        ``WalmartResource.queue_search_crawl``.
        """
        from typing import Any

        from pydantic import RootModel

        # The transport validates into a pydantic model, so wrap an arbitrary
        # JSON object in a throwaway root model and unwrap it afterwards.
        class _R(RootModel[dict[str, Any]]):
            pass

        result = await self._http.request(
            "GET", "/v1/walmart/search/crawl",
            params={"query": query, "country": country},
            response_model=_R,
        )
        return result.root
@@ -0,0 +1,76 @@
1
+ from __future__ import annotations
2
+ from typing import Optional
3
+ from .._http import SyncHttpClient, AsyncHttpClient
4
+ from ..types import YouTubeVideoResponse, YouTubeSearchResponse, YouTubeSubtitleResponse
5
+
6
+
7
class YouTubeResource:
    """Synchronous wrapper around the ``/v1/youtube`` endpoints."""

    def __init__(self, http: SyncHttpClient) -> None:
        self._http = http

    def search(self, query: str, *, page: Optional[int] = None, country: Optional[str] = None, language: Optional[str] = None) -> YouTubeSearchResponse:
        """GET ``/v1/youtube/search`` with optional page, country and language."""
        return self._http.request(
            "GET", "/v1/youtube/search",
            params={"query": query, "page": page, "country": country, "language": language},
            response_model=YouTubeSearchResponse,
        )

    def get_video(self, video_id: str) -> YouTubeVideoResponse:
        """GET ``/v1/youtube/videos/{video_id}``."""
        return self._http.request(
            "GET", f"/v1/youtube/videos/{video_id}",
            response_model=YouTubeVideoResponse,
        )

    def get_subtitles(self, video_id: str, *, language: Optional[str] = None) -> YouTubeSubtitleResponse:
        """GET ``/v1/youtube/subtitles`` for a video, optionally in one language."""
        return self._http.request(
            "GET", "/v1/youtube/subtitles",
            params={"video_id": video_id, "language": language},
            response_model=YouTubeSubtitleResponse,
        )

    def queue_search_crawl(self, query: str, *, page: Optional[int] = None) -> dict:
        """POST ``/v1/youtube/search/crawl`` and return the JSON object as a plain dict.

        ``page`` is left out of the JSON body when it is ``None``, matching how
        every other request in this SDK drops unset values instead of sending
        an explicit ``null``.
        """
        from typing import Any

        from pydantic import RootModel

        # The transport validates into a pydantic model, so wrap an arbitrary
        # JSON object in a throwaway root model and unwrap it afterwards.
        class _R(RootModel[dict[str, Any]]):
            pass

        payload = {k: v for k, v in {"query": query, "page": page}.items() if v is not None}
        result = self._http.request(
            "POST", "/v1/youtube/search/crawl",
            json=payload,
            response_model=_R,
        )
        return result.root
41
+
42
+
43
class AsyncYouTubeResource:
    """Asynchronous wrapper around the ``/v1/youtube`` endpoints."""

    def __init__(self, http: AsyncHttpClient) -> None:
        self._http = http

    async def search(self, query: str, *, page: Optional[int] = None, country: Optional[str] = None, language: Optional[str] = None) -> YouTubeSearchResponse:
        """GET ``/v1/youtube/search`` with optional page, country and language."""
        return await self._http.request(
            "GET", "/v1/youtube/search",
            params={"query": query, "page": page, "country": country, "language": language},
            response_model=YouTubeSearchResponse,
        )

    async def get_video(self, video_id: str) -> YouTubeVideoResponse:
        """GET ``/v1/youtube/videos/{video_id}``."""
        return await self._http.request(
            "GET", f"/v1/youtube/videos/{video_id}",
            response_model=YouTubeVideoResponse,
        )

    async def get_subtitles(self, video_id: str, *, language: Optional[str] = None) -> YouTubeSubtitleResponse:
        """GET ``/v1/youtube/subtitles`` for a video, optionally in one language."""
        return await self._http.request(
            "GET", "/v1/youtube/subtitles",
            params={"video_id": video_id, "language": language},
            response_model=YouTubeSubtitleResponse,
        )

    async def queue_search_crawl(self, query: str, *, page: Optional[int] = None) -> dict:
        """POST ``/v1/youtube/search/crawl`` and return the JSON object as a plain dict.

        ``page`` is left out of the JSON body when it is ``None``, matching how
        every other request in this SDK drops unset values instead of sending
        an explicit ``null``.
        """
        from typing import Any

        from pydantic import RootModel

        # The transport validates into a pydantic model, so wrap an arbitrary
        # JSON object in a throwaway root model and unwrap it afterwards.
        class _R(RootModel[dict[str, Any]]):
            pass

        payload = {k: v for k, v in {"query": query, "page": page}.items() if v is not None}
        result = await self._http.request(
            "POST", "/v1/youtube/search/crawl",
            json=payload,
            response_model=_R,
        )
        return result.root
scrapio/types.py ADDED
@@ -0,0 +1,136 @@
1
+ from __future__ import annotations
2
+ from typing import Any, Literal, Optional, Union
3
+ from pydantic import BaseModel
4
+
5
+
6
+ # ---- Fetch ----
7
+
8
class FetchSession(BaseModel):
    """Session reference nested inside a FetchRequest."""

    id: str


class FetchRequest(BaseModel):
    """JSON body for POST /v1/fetch.

    Only ``url`` is required. The fetch resources serialise this model with
    ``exclude_none=True``, so fields left as ``None`` are not sent.
    """

    url: str
    render_js: Optional[bool] = None
    device: Optional[Literal["desktop", "mobile", "tablet"]] = None
    session: Optional[FetchSession] = None
    output: Optional[list[str]] = None  # the packaged README passes e.g. ["markdown"]
    extract: Optional[dict[str, Any]] = None
    actions: Optional[list[Any]] = None
    timeout: Optional[int] = None  # part of the request body; separate from the client's HTTP timeout
    proxy: Optional[str] = None
    country: Optional[str] = None


class FetchResponse(BaseModel):
    """Validated body of a successful POST /v1/fetch response."""

    request_id: str
    url: str
    status_code: int
    outputs: dict[str, Any]  # the README reads outputs["markdown"]; presumably keyed by requested output name
    diagnostics: Optional[dict[str, Any]] = None
29
+
30
+
31
+ # ---- Jobs ----
32
+
33
# Known job status values. Job.status below is typed as plain str, so this
# alias is not enforced during validation.
JobStatus = Literal["queued", "running", "completed", "partial", "failed", "cancelled"]


class CreateJobRequest(BaseModel):
    """JSON body for POST /v1/jobs; None fields are omitted when it is sent."""

    job_type: str
    payload: dict[str, Any]
    webhook_url: Optional[str] = None


class Job(BaseModel):
    """Job record returned by the create and get job endpoints."""

    request_id: str
    job_id: str
    job_type: str
    status: str  # compared against the TERMINAL set in resources/jobs.py while polling
    created_at: str
    updated_at: Optional[str] = None
    webhook_url: Optional[str] = None


class JobError(BaseModel):
    """Error details attached to a JobResult."""

    code: str
    message: str


class JobResult(Job):
    """Job record plus its outcome, returned by GET /v1/jobs/{job_id}/result."""

    result: Optional[Any] = None
    error: Optional[JobError] = None
56
+
57
+
58
+ # ---- Google ----
59
+
60
# Closed sets of values accepted by the matching GoogleSearchParams fields.
GoogleSearchType = Literal["classic", "news", "maps", "images", "lens", "shopping", "ai_mode", "ads"]
GoogleDevice = Literal["desktop", "mobile"]
GoogleDateRange = Literal["past_hour", "past_day", "past_week", "past_month", "past_year"]
GoogleSortBy = Literal["relevance", "reviews", "price_asc", "price_desc"]


class GoogleSearchParams(BaseModel):
    """Query parameters for GET /v1/google/search.

    Only ``search`` is required. The Google resources serialise this model
    with ``exclude_none=True``, so unset fields are not sent.
    """

    search: str
    search_type: Optional[GoogleSearchType] = None
    country_code: Optional[str] = None
    language: Optional[str] = None
    device: Optional[GoogleDevice] = None
    page: Optional[Union[int, str]] = None
    date_range: Optional[GoogleDateRange] = None
    latitude: Optional[Union[float, str]] = None
    longitude: Optional[Union[float, str]] = None
    radius: Optional[Union[float, str]] = None
    sort_by: Optional[GoogleSortBy] = None


class GoogleSearchResponse(BaseModel):
    """Validated body of a successful GET /v1/google/search response."""

    request_id: str
    results: list[Any]
    pagination: Optional[dict[str, Any]] = None
82
+
83
+
84
+ # ---- Amazon ----
85
+
86
class AmazonProductResponse(BaseModel):
    """Amazon product record.

    Used as the response of GET /v1/amazon/product and as the item type of
    AmazonSearchResponse.results.
    """

    provider: str
    asin: str
    title: str
    brand: Optional[str] = None
    price: Optional[float] = None
    currency: Optional[str] = None
    availability: Optional[str] = None
    rating: Optional[float] = None
    review_count: Optional[int] = None
    images: Optional[list[str]] = None
    bullet_points: Optional[list[str]] = None
    url: str
    # Keep fields the API returns that are not declared above.
    model_config = {"extra": "allow"}


class AmazonSearchResponse(BaseModel):
    """Validated body of a successful GET /v1/amazon/search response."""

    request_id: str
    results: list[AmazonProductResponse]
104
+
105
+
106
+ # ---- Walmart ----
107
+
108
class WalmartProductResponse(BaseModel):
    """Walmart product record.

    Used as the response of GET /v1/walmart/product and as the item type of
    WalmartSearchResponse.results.
    """

    provider: str
    product_id: str
    title: str
    brand: Optional[str] = None
    price: Optional[float] = None
    availability: Optional[str] = None
    url: str
    # Keep fields the API returns that are not declared above.
    model_config = {"extra": "allow"}


class WalmartSearchResponse(BaseModel):
    """Validated body of a successful GET /v1/walmart/search response."""

    request_id: str
    results: list[WalmartProductResponse]
121
+
122
+
123
+ # ---- YouTube ----
124
+
125
class YouTubeVideoResponse(BaseModel):
    """Validated body of GET /v1/youtube/videos/{video_id}."""

    request_id: str
    video: dict[str, Any]  # untyped video object; its keys are not declared in this SDK


class YouTubeSearchResponse(BaseModel):
    """Validated body of GET /v1/youtube/search."""

    request_id: str
    results: list[Any]


class YouTubeSubtitleResponse(BaseModel):
    """Validated body of GET /v1/youtube/subtitles."""

    request_id: str
    video_id: str
    subtitles: list[Any]
@@ -0,0 +1,173 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapio-py
3
+ Version: 1.0.0
4
+ Summary: Official Python SDK for the Scrapio API
5
+ License: MIT
6
+ Requires-Python: >=3.9
7
+ Requires-Dist: httpx>=0.25.0
8
+ Requires-Dist: pydantic>=2.0.0
9
+ Provides-Extra: dev
10
+ Requires-Dist: anyio[trio]; extra == 'dev'
11
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
12
+ Requires-Dist: pytest>=7.0; extra == 'dev'
13
+ Description-Content-Type: text/markdown
14
+
15
+ # scrapio
16
+
17
+ Official Python SDK for [Scrapio](https://scrapio.dev) — fetch, crawl, search, and extract structured data from any URL.
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install scrapio-py
23
+ ```
24
+
25
+ Requires Python 3.9 or later.
26
+
27
+ ## Quickstart
28
+
29
+ ```python
30
+ from scrapio import ApiClient, FetchRequest
31
+
32
+ client = ApiClient(api_key="YOUR_API_KEY")
33
+
34
+ result = client.fetch.fetch(FetchRequest(
35
+ url="https://example.com",
36
+ output=["markdown"],
37
+ ))
38
+
39
+ print(result.outputs["markdown"])
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ### Fetch a page
45
+
46
+ ```python
47
+ result = client.fetch.fetch(FetchRequest(
48
+ url="https://news.ycombinator.com",
49
+ render_js=True,
50
+ output=["markdown"],
51
+ ))
52
+ ```
53
+
54
+ ### Google Search
55
+
56
+ ```python
57
+ from scrapio import GoogleSearchParams
58
+
59
+ results = client.google.search(GoogleSearchParams(
60
+ search="best web scraping API 2025",
61
+ country_code="us",
62
+ ))
63
+ print(results.results)
64
+ ```
65
+
66
+ ### Amazon product
67
+
68
+ ```python
69
+ product = client.amazon.get_product("B08N5WRWNW")
70
+ print(product.title, product.price)
71
+ ```
72
+
73
+ ### Walmart search
74
+
75
+ ```python
76
+ items = client.walmart.search("headphones")
77
+ ```
78
+
79
+ ### YouTube transcript
80
+
81
+ ```python
82
+ video = client.youtube.get_video("dQw4w9WgXcQ")
+ transcript = client.youtube.get_subtitles("dQw4w9WgXcQ", language="en")
83
+ ```
84
+
85
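+ ### Browser automation
+
+ > **Note:** `ApiClient` in version 1.0.0 exposes only the `fetch`, `jobs`, `google`, `amazon`, `walmart` and `youtube` resources. It has no `interact` attribute, so the example below does not run against this release.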
86
+
87
+ ```python
88
+ result = client.interact.interact({
89
+ "url": "https://example.com",
90
+ "actions": [
91
+ {"type": "click", "selector": "#login"},
92
+ {"type": "type", "selector": "#email", "text": "user@example.com"},
93
+ ],
94
+ })
95
+ ```
96
+
97
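+ ### Crawl a site
+
+ > **Note:** `ApiClient` in version 1.0.0 exposes only the `fetch`, `jobs`, `google`, `amazon`, `walmart` and `youtube` resources. It has no `crawl` attribute, so the example below does not run against this release.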
98
+
99
+ ```python
100
+ result = client.crawl.crawl({
101
+ "seeds": ["https://docs.example.com"],
102
+ "max_pages": 50,
103
+ })
104
+ ```
105
+
106
+ ### Async jobs
107
+
108
+ ```python
109
+ from scrapio import CreateJobRequest
110
+
111
+ job = client.jobs.create(CreateJobRequest(
112
+ job_type="fetch",
113
+ payload={"url": "https://example.com", "output": ["markdown"]},
114
+ ))
115
+ result = client.jobs.wait_for_completion(job.job_id, poll_interval=2.0, timeout=120.0)
116
+ ```
117
+
118
+ ### Async client
119
+
120
+ ```python
121
+ import asyncio
122
+ from scrapio import AsyncApiClient, FetchRequest
123
+
124
+ async def main():
125
+ async with AsyncApiClient(api_key="YOUR_API_KEY") as client:
126
+ result = await client.fetch.fetch(FetchRequest(
127
+ url="https://example.com",
128
+ output=["markdown"],
129
+ ))
130
+ print(result.outputs["markdown"])
131
+
132
+ asyncio.run(main())
133
+ ```
134
+
135
+ ## Configuration
136
+
137
+ | Option | Type | Default | Description |
138
+ |--------|------|---------|-------------|
139
+ | `api_key` | `str` | required | Your API key |
140
+ | `base_url` | `str` | `https://api.webdataapi.com` | Override for local/staging |
141
+ | `timeout` | `float` | `30.0` | Per-request timeout (seconds) |
142
+ | `max_retries` | `int` | `3` | Max retries on 429/503 |
143
+
144
+ ## Error handling
145
+
146
+ ```python
147
+ from scrapio import (
148
+ ApiClient, FetchRequest,
149
+ AuthError, RateLimitError, CreditsExhaustedError, ApiError,
150
+ )
151
+
152
+ try:
153
+ client.fetch.fetch(FetchRequest(url="https://example.com"))
154
+ except AuthError:
155
+ print("Invalid API key")
156
+ except CreditsExhaustedError:
157
+ print("No credits remaining")
158
+ except RateLimitError:
159
+ print("Rate limited — back off and retry")
160
+ except ApiError as e:
161
+ print(f"API error {e.status_code}: {e}")
162
+ ```
163
+
164
+ ## Links
165
+
166
+ - [Documentation](https://scrapio.dev/docs)
167
+ - [API Reference](https://scrapio.dev/docs/api-reference/fetch)
168
+ - [Dashboard](https://app.scrapio.dev)
169
+ - [Get an API key](https://scrapio.dev#pricing)
170
+
171
+ ## License
172
+
173
+ MIT
@@ -0,0 +1,15 @@
1
+ scrapio/__init__.py,sha256=-iCqn0QoLJbAP_kOp6fnOhno9r1eZHC8YTscelSM8wc,880
2
+ scrapio/_http.py,sha256=3-TE7q5rNEqstKonl9qpRb9dauXzW4y4kQjBFQrYDHM,4757
3
+ scrapio/client.py,sha256=2huKExDSASsRMylWJoipGN9bIw0-FJkhxxpR-HvK-q8,2604
4
+ scrapio/errors.py,sha256=UFYyZZPA8uyZpLs-DnD06JuxgzHsHe6Uqmg9zCgyUTg,832
5
+ scrapio/types.py,sha256=ullzSFMNhPF-t9G5fdvXNqPt4XQhysXGTdljOoVj8Mc,3528
6
+ scrapio/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ scrapio/resources/amazon.py,sha256=pjPvRJlmY-cV9TB3lntQQQcSaOd0KzmO5lfhqXIe9Ys,2064
8
+ scrapio/resources/fetch.py,sha256=ZsGY31HtrmLoXB7bUEkK0b-RMvdfl6de8ADi-dCQgzU,959
9
+ scrapio/resources/google.py,sha256=AJ97IjU9bQOokUqhvmy33NqIUmIQrepR_Jz4bsbt02U,906
10
+ scrapio/resources/jobs.py,sha256=mTzYqfUvrEqe7qNgUknPj-MZJtKnl9Fe6HUlqP1yxRE,2500
11
+ scrapio/resources/walmart.py,sha256=1UV0_9z-5RwhXWelUve-a59HMTcAVdX1rLz4iBd1-QA,2117
12
+ scrapio/resources/youtube.py,sha256=8VI7RpgtrDxzesWt3tsnjiT_M4iPrSDLBU1SBX8fyBs,3100
13
+ scrapio_py-1.0.0.dist-info/METADATA,sha256=lONsXnzZXmbXDWyXOTz6Do3kNNGIYyg9NpKLBcSLn8g,3658
14
+ scrapio_py-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
15
+ scrapio_py-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any