webcrawlerapi 2.0.0__tar.gz → 2.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/PKG-INFO +7 -11
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/README.md +6 -10
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/setup.py +1 -1
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi/__init__.py +4 -2
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi/client.py +129 -18
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi/models.py +51 -14
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/PKG-INFO +7 -11
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/setup.cfg +0 -0
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: webcrawlerapi
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 2.0.3
|
|
4
4
|
Summary: Python SDK for WebCrawler API
|
|
5
5
|
Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
|
|
6
6
|
Author: Andrew
|
|
@@ -102,19 +102,15 @@ print(f"Cancellation response: {cancel_response['message']}")
|
|
|
102
102
|
|
|
103
103
|
### Scraping
|
|
104
104
|
|
|
105
|
-
Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
|
|
106
|
-
|
|
107
105
|
```python
|
|
108
106
|
# Returns structured data directly
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
input_data={
|
|
112
|
-
"url": "https://example.com" # Scraper-specific input parameters. Check scraper description
|
|
113
|
-
},
|
|
114
|
-
webhook_url="https://yourserver.com/webhook", # Optional webhook
|
|
115
|
-
max_polls=20 # Optional: maximum number of status checks
|
|
107
|
+
response = crawler.scrape(
|
|
108
|
+
url="https://webcrawlerapi.com"
|
|
116
109
|
)
|
|
117
|
-
|
|
110
|
+
if response.success:
|
|
111
|
+
print(response.markdown)
|
|
112
|
+
else:
|
|
113
|
+
print(f"Code: {response.error_code} Error: {response.error_message}")
|
|
118
114
|
```
|
|
119
115
|
|
|
120
116
|
## API Methods
|
|
@@ -81,19 +81,15 @@ print(f"Cancellation response: {cancel_response['message']}")
|
|
|
81
81
|
|
|
82
82
|
### Scraping
|
|
83
83
|
|
|
84
|
-
Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
|
|
85
|
-
|
|
86
84
|
```python
|
|
87
85
|
# Returns structured data directly
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
input_data={
|
|
91
|
-
"url": "https://example.com" # Scraper-specific input parameters. Check scraper description
|
|
92
|
-
},
|
|
93
|
-
webhook_url="https://yourserver.com/webhook", # Optional webhook
|
|
94
|
-
max_polls=20 # Optional: maximum number of status checks
|
|
86
|
+
response = crawler.scrape(
|
|
87
|
+
url="https://webcrawlerapi.com"
|
|
95
88
|
)
|
|
96
|
-
|
|
89
|
+
if response.success:
|
|
90
|
+
print(response.markdown)
|
|
91
|
+
else:
|
|
92
|
+
print(f"Code: {response.error_code} Error: {response.error_message}")
|
|
97
93
|
```
|
|
98
94
|
|
|
99
95
|
## API Methods
|
|
@@ -24,7 +24,8 @@ Basic usage:
|
|
|
24
24
|
|
|
25
25
|
from .models import (
|
|
26
26
|
CrawlResponse,
|
|
27
|
-
|
|
27
|
+
ScrapeId,
|
|
28
|
+
ScrapeResponse,
|
|
28
29
|
ScrapeResponseError,
|
|
29
30
|
Job,
|
|
30
31
|
JobItem,
|
|
@@ -39,7 +40,8 @@ __all__ = [
|
|
|
39
40
|
"Job",
|
|
40
41
|
"JobItem",
|
|
41
42
|
"CrawlResponse",
|
|
42
|
-
"
|
|
43
|
+
"ScrapeId",
|
|
44
|
+
"ScrapeResponse",
|
|
43
45
|
"ScrapeResponseError",
|
|
44
46
|
"Action",
|
|
45
47
|
"UploadS3Action",
|
|
@@ -5,7 +5,8 @@ import time
|
|
|
5
5
|
|
|
6
6
|
from .models import (
|
|
7
7
|
CrawlResponse,
|
|
8
|
-
|
|
8
|
+
ScrapeId,
|
|
9
|
+
ScrapeResponse,
|
|
9
10
|
ScrapeResponseError,
|
|
10
11
|
Job,
|
|
11
12
|
Action,
|
|
@@ -17,14 +18,14 @@ class WebCrawlerAPI:
|
|
|
17
18
|
|
|
18
19
|
DEFAULT_POLL_DELAY_SECONDS = 5
|
|
19
20
|
|
|
20
|
-
def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "
|
|
21
|
+
def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "v1"):
|
|
21
22
|
"""
|
|
22
23
|
Initialize the WebCrawler API client.
|
|
23
24
|
|
|
24
25
|
Args:
|
|
25
26
|
api_key (str): Your API key for authentication
|
|
26
27
|
base_url (str): The base URL of the API (optional)
|
|
27
|
-
version (str): API version to use (optional, defaults to '
|
|
28
|
+
version (str): API version to use (optional, defaults to 'v1')
|
|
28
29
|
"""
|
|
29
30
|
self.api_key = api_key
|
|
30
31
|
self.base_url = base_url.rstrip('/')
|
|
@@ -201,26 +202,28 @@ class WebCrawlerAPI:
|
|
|
201
202
|
# Return the last known state if max_polls is reached
|
|
202
203
|
return job
|
|
203
204
|
|
|
204
|
-
def
|
|
205
|
+
def scrape_async(
|
|
205
206
|
self,
|
|
206
207
|
url: str,
|
|
207
208
|
output_format: str = "markdown",
|
|
208
209
|
webhook_url: Optional[str] = None,
|
|
209
210
|
clean_selectors: Optional[str] = None,
|
|
211
|
+
prompt: Optional[str] = None,
|
|
210
212
|
actions: Optional[Union[Action, List[Action]]] = None
|
|
211
|
-
) ->
|
|
213
|
+
) -> ScrapeId:
|
|
212
214
|
"""
|
|
213
|
-
|
|
215
|
+
Start a new scraping job asynchronously.
|
|
214
216
|
|
|
215
217
|
Args:
|
|
216
218
|
url (str): The URL to scrape
|
|
217
219
|
output_format (str): Output format (markdown, cleaned, html)
|
|
218
220
|
webhook_url (str, optional): URL to receive a POST request when scraping is complete
|
|
219
221
|
clean_selectors (str, optional): CSS selectors to clean from the content
|
|
220
|
-
|
|
222
|
+
prompt (str, optional): Prompt to guide the AI response
|
|
223
|
+
actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
|
|
221
224
|
|
|
222
225
|
Returns:
|
|
223
|
-
|
|
226
|
+
ScrapeId: Response containing the scrape job ID
|
|
224
227
|
|
|
225
228
|
Raises:
|
|
226
229
|
requests.exceptions.RequestException: If the API request fails
|
|
@@ -234,6 +237,8 @@ class WebCrawlerAPI:
|
|
|
234
237
|
payload["webhook_url"] = webhook_url
|
|
235
238
|
if clean_selectors:
|
|
236
239
|
payload["clean_selectors"] = clean_selectors
|
|
240
|
+
if prompt:
|
|
241
|
+
payload["prompt"] = prompt
|
|
237
242
|
if actions:
|
|
238
243
|
# Convert single action to list if needed
|
|
239
244
|
action_list = [actions] if not isinstance(actions, list) else actions
|
|
@@ -241,25 +246,131 @@ class WebCrawlerAPI:
|
|
|
241
246
|
payload["actions"] = [vars(action) for action in action_list]
|
|
242
247
|
|
|
243
248
|
response = self.session.post(
|
|
244
|
-
urljoin(self.base_url, f"/{self.version}/scrape"),
|
|
249
|
+
urljoin(self.base_url, f"/{self.version}/scrape?async=true"),
|
|
245
250
|
json=payload
|
|
246
251
|
)
|
|
247
252
|
|
|
253
|
+
if not response.ok:
|
|
254
|
+
try:
|
|
255
|
+
error_data = response.json()
|
|
256
|
+
raise requests.exceptions.HTTPError(
|
|
257
|
+
f"{response.status_code} {response.reason}: {error_data.get('error', 'Unknown error')}"
|
|
258
|
+
)
|
|
259
|
+
except ValueError:
|
|
260
|
+
# If response is not JSON, raise with status and text
|
|
261
|
+
raise requests.exceptions.HTTPError(
|
|
262
|
+
f"{response.status_code} {response.reason}: {response.text}"
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
response.raise_for_status()
|
|
266
|
+
return ScrapeId(id=response.json()["id"])
|
|
267
|
+
|
|
268
|
+
def get_scrape(self, scrape_id: str) -> Union[ScrapeResponse, ScrapeResponseError]:
|
|
269
|
+
"""
|
|
270
|
+
Get the status and result of a specific scrape job.
|
|
271
|
+
|
|
272
|
+
Args:
|
|
273
|
+
scrape_id (str): The unique identifier of the scrape job
|
|
274
|
+
|
|
275
|
+
Returns:
|
|
276
|
+
Union[ScrapeResponse, ScrapeResponseError]: The scrape result or error
|
|
277
|
+
|
|
278
|
+
Raises:
|
|
279
|
+
requests.exceptions.RequestException: If the API request fails
|
|
280
|
+
"""
|
|
281
|
+
response = self.session.get(
|
|
282
|
+
urljoin(self.base_url, f"/{self.version}/scrape/{scrape_id}")
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
response.raise_for_status()
|
|
248
286
|
response_data = response.json()
|
|
249
287
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
288
|
+
status = response_data.get("status")
|
|
289
|
+
|
|
290
|
+
if status == "done":
|
|
291
|
+
return ScrapeResponse(
|
|
292
|
+
success=response_data.get("success", True),
|
|
293
|
+
status=status,
|
|
254
294
|
markdown=response_data.get("markdown"),
|
|
255
295
|
cleaned_content=response_data.get("cleaned_content"),
|
|
256
296
|
raw_content=response_data.get("raw_content"),
|
|
257
297
|
page_status_code=response_data.get("page_status_code", 0),
|
|
258
|
-
page_title=response_data.get("page_title")
|
|
298
|
+
page_title=response_data.get("page_title"),
|
|
299
|
+
structured_data=response_data.get("structured_data")
|
|
259
300
|
)
|
|
260
|
-
|
|
301
|
+
elif status == "error":
|
|
261
302
|
return ScrapeResponseError(
|
|
262
|
-
success=
|
|
303
|
+
success=False,
|
|
263
304
|
error_code=response_data.get("error_code", "unknown"),
|
|
264
|
-
error_message=response_data.get("error_message", "
|
|
265
|
-
|
|
305
|
+
error_message=response_data.get("error_message", "Scraping failed"),
|
|
306
|
+
status=status
|
|
307
|
+
)
|
|
308
|
+
else: # in_progress or any other status
|
|
309
|
+
return ScrapeResponse(
|
|
310
|
+
success=False,
|
|
311
|
+
status=status
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
def scrape(
|
|
315
|
+
self,
|
|
316
|
+
url: str,
|
|
317
|
+
output_format: str = "markdown",
|
|
318
|
+
webhook_url: Optional[str] = None,
|
|
319
|
+
clean_selectors: Optional[str] = None,
|
|
320
|
+
prompt: Optional[str] = None,
|
|
321
|
+
actions: Optional[Union[Action, List[Action]]] = None,
|
|
322
|
+
max_polls: int = 100
|
|
323
|
+
) -> Union[ScrapeResponse, ScrapeResponseError]:
|
|
324
|
+
"""
|
|
325
|
+
Scrape a single URL and wait for completion.
|
|
326
|
+
|
|
327
|
+
This method will start a scraping job and continuously poll its status
|
|
328
|
+
until it reaches a terminal state (done or error) or until
|
|
329
|
+
the maximum number of polls is reached.
|
|
330
|
+
|
|
331
|
+
Args:
|
|
332
|
+
url (str): The URL to scrape
|
|
333
|
+
output_format (str): Output format (markdown, cleaned, html)
|
|
334
|
+
webhook_url (str, optional): URL to receive a POST request when scraping is complete
|
|
335
|
+
clean_selectors (str, optional): CSS selectors to clean from the content
|
|
336
|
+
prompt (str, optional): Prompt to guide the AI response
|
|
337
|
+
actions (Action or List[Action], optional): Actions to perform during scraping
|
|
338
|
+
max_polls (int): Maximum number of status checks before returning (default: 100)
|
|
339
|
+
|
|
340
|
+
Returns:
|
|
341
|
+
Union[ScrapeResponse, ScrapeResponseError]: The final scrape result
|
|
342
|
+
|
|
343
|
+
Raises:
|
|
344
|
+
requests.exceptions.RequestException: If any API request fails
|
|
345
|
+
"""
|
|
346
|
+
# Start the scraping job
|
|
347
|
+
response = self.scrape_async(
|
|
348
|
+
url=url,
|
|
349
|
+
output_format=output_format,
|
|
350
|
+
webhook_url=webhook_url,
|
|
351
|
+
clean_selectors=clean_selectors,
|
|
352
|
+
prompt=prompt,
|
|
353
|
+
actions=actions
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
scrape_id = response.id
|
|
357
|
+
polls = 0
|
|
358
|
+
|
|
359
|
+
while polls < max_polls:
|
|
360
|
+
result = self.get_scrape(scrape_id)
|
|
361
|
+
|
|
362
|
+
# Return immediately if scrape is done
|
|
363
|
+
if isinstance(result, ScrapeResponse) and result.status == "done":
|
|
364
|
+
return result
|
|
365
|
+
|
|
366
|
+
# Return immediately if there's an error
|
|
367
|
+
if isinstance(result, ScrapeResponseError):
|
|
368
|
+
return result
|
|
369
|
+
|
|
370
|
+
# Continue polling if status is in_progress or any other non-terminal status
|
|
371
|
+
# Wait before next poll
|
|
372
|
+
time.sleep(self.DEFAULT_POLL_DELAY_SECONDS)
|
|
373
|
+
polls += 1
|
|
374
|
+
|
|
375
|
+
# Return the last known state if max_polls is reached
|
|
376
|
+
return result
|
|
@@ -1,6 +1,34 @@
|
|
|
1
1
|
from typing import Optional, Dict, Any, List
|
|
2
2
|
from datetime import datetime
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_datetime(datetime_str: str) -> datetime:
|
|
8
|
+
"""
|
|
9
|
+
Parse datetime string from API response, handling various microsecond formats.
|
|
10
|
+
|
|
11
|
+
Args:
|
|
12
|
+
datetime_str (str): Datetime string from API
|
|
13
|
+
|
|
14
|
+
Returns:
|
|
15
|
+
datetime: Parsed datetime object
|
|
16
|
+
"""
|
|
17
|
+
# Replace 'Z' with '+00:00' for timezone
|
|
18
|
+
datetime_str = datetime_str.replace('Z', '+00:00')
|
|
19
|
+
|
|
20
|
+
# Handle microseconds - pad to 6 digits or remove if present
|
|
21
|
+
# Pattern matches: YYYY-MM-DDTHH:MM:SS.microseconds followed by timezone or end
|
|
22
|
+
pattern = r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
|
|
23
|
+
match = re.match(pattern, datetime_str)
|
|
24
|
+
|
|
25
|
+
if match:
|
|
26
|
+
base_time, microseconds, timezone_part = match.groups()
|
|
27
|
+
# Pad microseconds to 6 digits or truncate if longer
|
|
28
|
+
microseconds = microseconds.ljust(6, '0')[:6]
|
|
29
|
+
datetime_str = f"{base_time}.{microseconds}{timezone_part}"
|
|
30
|
+
|
|
31
|
+
return datetime.fromisoformat(datetime_str)
|
|
4
32
|
|
|
5
33
|
|
|
6
34
|
@dataclass
|
|
@@ -10,14 +38,22 @@ class CrawlResponse:
|
|
|
10
38
|
|
|
11
39
|
|
|
12
40
|
@dataclass
|
|
13
|
-
class
|
|
14
|
-
"""Response from
|
|
41
|
+
class ScrapeId:
|
|
42
|
+
"""Response from an asynchronous scrape request."""
|
|
43
|
+
id: str
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
|
|
47
|
+
class ScrapeResponse:
|
|
48
|
+
"""Response from a scrape request."""
|
|
15
49
|
success: bool
|
|
50
|
+
status: Optional[str] = None
|
|
16
51
|
markdown: Optional[str] = None
|
|
17
52
|
cleaned_content: Optional[str] = None
|
|
18
53
|
raw_content: Optional[str] = None
|
|
19
54
|
page_status_code: int = 0
|
|
20
55
|
page_title: Optional[str] = None
|
|
56
|
+
structured_data: Optional[Dict[str, Any]] = None
|
|
21
57
|
|
|
22
58
|
|
|
23
59
|
@dataclass
|
|
@@ -26,6 +62,7 @@ class ScrapeResponseError:
|
|
|
26
62
|
success: bool
|
|
27
63
|
error_code: str
|
|
28
64
|
error_message: str
|
|
65
|
+
status: Optional[str] = None
|
|
29
66
|
|
|
30
67
|
|
|
31
68
|
@dataclass
|
|
@@ -69,11 +106,11 @@ class JobItem:
|
|
|
69
106
|
self.page_status_code: int = data["page_status_code"]
|
|
70
107
|
self.status: str = data["status"]
|
|
71
108
|
self.title: str = data["title"]
|
|
72
|
-
self.created_at: datetime =
|
|
73
|
-
self.updated_at: datetime =
|
|
74
|
-
self.cost: int = data
|
|
75
|
-
self.referred_url: str = data
|
|
76
|
-
self.last_error: str = data
|
|
109
|
+
self.created_at: datetime = parse_datetime(data["created_at"])
|
|
110
|
+
self.updated_at: datetime = parse_datetime(data["updated_at"])
|
|
111
|
+
self.cost: int = data.get("cost", 0)
|
|
112
|
+
self.referred_url: Optional[str] = data.get("referred_url")
|
|
113
|
+
self.last_error: Optional[str] = data.get("last_error")
|
|
77
114
|
self.error_code: Optional[str] = data.get("error_code")
|
|
78
115
|
|
|
79
116
|
# Optional content URLs based on scrape_type
|
|
@@ -146,19 +183,19 @@ class Job:
|
|
|
146
183
|
self.url: str = data["url"]
|
|
147
184
|
self.status: str = data["status"]
|
|
148
185
|
self.scrape_type: str = data["scrape_type"]
|
|
149
|
-
self.whitelist_regexp: str = data
|
|
150
|
-
self.blacklist_regexp: str = data
|
|
151
|
-
self.allow_subdomains: bool = data
|
|
186
|
+
self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
|
|
187
|
+
self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
|
|
188
|
+
self.allow_subdomains: bool = data.get("allow_subdomains", False)
|
|
152
189
|
self.items_limit: int = data["items_limit"]
|
|
153
|
-
self.created_at: datetime =
|
|
154
|
-
self.updated_at: datetime =
|
|
155
|
-
self.webhook_url: str = data
|
|
190
|
+
self.created_at: datetime = parse_datetime(data["created_at"])
|
|
191
|
+
self.updated_at: datetime = parse_datetime(data["updated_at"])
|
|
192
|
+
self.webhook_url: Optional[str] = data.get("webhook_url")
|
|
156
193
|
self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
|
|
157
194
|
|
|
158
195
|
# Optional fields
|
|
159
196
|
self.finished_at: Optional[datetime] = None
|
|
160
197
|
if data.get("finished_at"):
|
|
161
|
-
self.finished_at =
|
|
198
|
+
self.finished_at = parse_datetime(data["finished_at"])
|
|
162
199
|
|
|
163
200
|
self.webhook_status: Optional[str] = data.get("webhook_status")
|
|
164
201
|
self.webhook_error: Optional[str] = data.get("webhook_error")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: webcrawlerapi
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 2.0.3
|
|
4
4
|
Summary: Python SDK for WebCrawler API
|
|
5
5
|
Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
|
|
6
6
|
Author: Andrew
|
|
@@ -102,19 +102,15 @@ print(f"Cancellation response: {cancel_response['message']}")
|
|
|
102
102
|
|
|
103
103
|
### Scraping
|
|
104
104
|
|
|
105
|
-
Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
|
|
106
|
-
|
|
107
105
|
```python
|
|
108
106
|
# Returns structured data directly
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
input_data={
|
|
112
|
-
"url": "https://example.com" # Scraper-specific input parameters. Check scraper description
|
|
113
|
-
},
|
|
114
|
-
webhook_url="https://yourserver.com/webhook", # Optional webhook
|
|
115
|
-
max_polls=20 # Optional: maximum number of status checks
|
|
107
|
+
response = crawler.scrape(
|
|
108
|
+
url="https://webcrawlerapi.com"
|
|
116
109
|
)
|
|
117
|
-
|
|
110
|
+
if response.success:
|
|
111
|
+
print(response.markdown)
|
|
112
|
+
else:
|
|
113
|
+
print(f"Code: {response.error_code} Error: {response.error_message}")
|
|
118
114
|
```
|
|
119
115
|
|
|
120
116
|
## API Methods
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|