webcrawlerapi 2.0.0__tar.gz → 2.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.0
3
+ Version: 2.0.3
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -102,19 +102,15 @@ print(f"Cancellation response: {cancel_response['message']}")
102
102
 
103
103
  ### Scraping
104
104
 
105
- Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
106
-
107
105
  ```python
108
106
  # Returns structured data directly
109
- structured_data = crawler.scrape(
110
- crawler_id="webcrawler/url-to-md", # ID of the scraper
111
- input_data={
112
- "url": "https://example.com" # Scraper-specific input parameters. Check scraper description
113
- },
114
- webhook_url="https://yourserver.com/webhook", # Optional webhook
115
- max_polls=20 # Optional: maximum number of status checks
107
+ response = crawler.scrape(
108
+ url="https://webcrawlerapi.com"
116
109
  )
117
- print(structured_data) # Direct access to scraped data
110
+ if response.success:
111
+ print(response.markdown)
112
+ else:
113
+ print(f"Code: {response.error_code} Error: {response.error_message}")
118
114
  ```
119
115
 
120
116
  ## API Methods
@@ -81,19 +81,15 @@ print(f"Cancellation response: {cancel_response['message']}")
81
81
 
82
82
  ### Scraping
83
83
 
84
- Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
85
-
86
84
  ```python
87
85
  # Returns structured data directly
88
- structured_data = crawler.scrape(
89
- crawler_id="webcrawler/url-to-md", # ID of the scraper
90
- input_data={
91
- "url": "https://example.com" # Scraper-specific input parameters. Check scraper description
92
- },
93
- webhook_url="https://yourserver.com/webhook", # Optional webhook
94
- max_polls=20 # Optional: maximum number of status checks
86
+ response = crawler.scrape(
87
+ url="https://webcrawlerapi.com"
95
88
  )
96
- print(structured_data) # Direct access to scraped data
89
+ if response.success:
90
+ print(response.markdown)
91
+ else:
92
+ print(f"Code: {response.error_code} Error: {response.error_message}")
97
93
  ```
98
94
 
99
95
  ## API Methods
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="webcrawlerapi",
5
- version="2.0.0",
5
+ version="2.0.3",
6
6
  packages=find_packages(),
7
7
  install_requires=[
8
8
  "requests>=2.25.0",
@@ -24,7 +24,8 @@ Basic usage:
24
24
 
25
25
  from .models import (
26
26
  CrawlResponse,
27
- ScrapeResponseV2,
27
+ ScrapeId,
28
+ ScrapeResponse,
28
29
  ScrapeResponseError,
29
30
  Job,
30
31
  JobItem,
@@ -39,7 +40,8 @@ __all__ = [
39
40
  "Job",
40
41
  "JobItem",
41
42
  "CrawlResponse",
42
- "ScrapeResponseV2",
43
+ "ScrapeId",
44
+ "ScrapeResponse",
43
45
  "ScrapeResponseError",
44
46
  "Action",
45
47
  "UploadS3Action",
@@ -5,7 +5,8 @@ import time
5
5
 
6
6
  from .models import (
7
7
  CrawlResponse,
8
- ScrapeResponseV2,
8
+ ScrapeId,
9
+ ScrapeResponse,
9
10
  ScrapeResponseError,
10
11
  Job,
11
12
  Action,
@@ -17,14 +18,14 @@ class WebCrawlerAPI:
17
18
 
18
19
  DEFAULT_POLL_DELAY_SECONDS = 5
19
20
 
20
- def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "v2"):
21
+ def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "v1"):
21
22
  """
22
23
  Initialize the WebCrawler API client.
23
24
 
24
25
  Args:
25
26
  api_key (str): Your API key for authentication
26
27
  base_url (str): The base URL of the API (optional)
27
- version (str): API version to use (optional, defaults to 'v2')
28
+ version (str): API version to use (optional, defaults to 'v1')
28
29
  """
29
30
  self.api_key = api_key
30
31
  self.base_url = base_url.rstrip('/')
@@ -201,26 +202,28 @@ class WebCrawlerAPI:
201
202
  # Return the last known state if max_polls is reached
202
203
  return job
203
204
 
204
- def scrape(
205
+ def scrape_async(
205
206
  self,
206
207
  url: str,
207
208
  output_format: str = "markdown",
208
209
  webhook_url: Optional[str] = None,
209
210
  clean_selectors: Optional[str] = None,
211
+ prompt: Optional[str] = None,
210
212
  actions: Optional[Union[Action, List[Action]]] = None
211
- ) -> Union[ScrapeResponseV2, ScrapeResponseError]:
213
+ ) -> ScrapeId:
212
214
  """
213
- Scrape a single URL synchronously.
215
+ Start a new scraping job asynchronously.
214
216
 
215
217
  Args:
216
218
  url (str): The URL to scrape
217
219
  output_format (str): Output format (markdown, cleaned, html)
218
220
  webhook_url (str, optional): URL to receive a POST request when scraping is complete
219
221
  clean_selectors (str, optional): CSS selectors to clean from the content
220
- actions (Action or List[Action], optional): Actions to perform during scraping
222
+ prompt (str, optional): Prompt to guide the AI response
223
+ actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
221
224
 
222
225
  Returns:
223
- Union[ScrapeResponseV2, ScrapeResponseError]: Response containing the scraped content or error
226
+ ScrapeId: Response containing the scrape job ID
224
227
 
225
228
  Raises:
226
229
  requests.exceptions.RequestException: If the API request fails
@@ -234,6 +237,8 @@ class WebCrawlerAPI:
234
237
  payload["webhook_url"] = webhook_url
235
238
  if clean_selectors:
236
239
  payload["clean_selectors"] = clean_selectors
240
+ if prompt:
241
+ payload["prompt"] = prompt
237
242
  if actions:
238
243
  # Convert single action to list if needed
239
244
  action_list = [actions] if not isinstance(actions, list) else actions
@@ -241,25 +246,131 @@ class WebCrawlerAPI:
241
246
  payload["actions"] = [vars(action) for action in action_list]
242
247
 
243
248
  response = self.session.post(
244
- urljoin(self.base_url, f"/{self.version}/scrape"),
249
+ urljoin(self.base_url, f"/{self.version}/scrape?async=true"),
245
250
  json=payload
246
251
  )
247
252
 
253
+ if not response.ok:
254
+ try:
255
+ error_data = response.json()
256
+ raise requests.exceptions.HTTPError(
257
+ f"{response.status_code} {response.reason}: {error_data.get('error', 'Unknown error')}"
258
+ )
259
+ except ValueError:
260
+ # If response is not JSON, raise with status and text
261
+ raise requests.exceptions.HTTPError(
262
+ f"{response.status_code} {response.reason}: {response.text}"
263
+ )
264
+
265
+ response.raise_for_status()
266
+ return ScrapeId(id=response.json()["id"])
267
+
268
+ def get_scrape(self, scrape_id: str) -> Union[ScrapeResponse, ScrapeResponseError]:
269
+ """
270
+ Get the status and result of a specific scrape job.
271
+
272
+ Args:
273
+ scrape_id (str): The unique identifier of the scrape job
274
+
275
+ Returns:
276
+ Union[ScrapeResponse, ScrapeResponseError]: The scrape result or error
277
+
278
+ Raises:
279
+ requests.exceptions.RequestException: If the API request fails
280
+ """
281
+ response = self.session.get(
282
+ urljoin(self.base_url, f"/{self.version}/scrape/{scrape_id}")
283
+ )
284
+
285
+ response.raise_for_status()
248
286
  response_data = response.json()
249
287
 
250
- # Check if the response indicates success or error
251
- if response_data.get("success", False):
252
- return ScrapeResponseV2(
253
- success=response_data["success"],
288
+ status = response_data.get("status")
289
+
290
+ if status == "done":
291
+ return ScrapeResponse(
292
+ success=response_data.get("success", True),
293
+ status=status,
254
294
  markdown=response_data.get("markdown"),
255
295
  cleaned_content=response_data.get("cleaned_content"),
256
296
  raw_content=response_data.get("raw_content"),
257
297
  page_status_code=response_data.get("page_status_code", 0),
258
- page_title=response_data.get("page_title")
298
+ page_title=response_data.get("page_title"),
299
+ structured_data=response_data.get("structured_data")
259
300
  )
260
- else:
301
+ elif status == "error":
261
302
  return ScrapeResponseError(
262
- success=response_data.get("success", False),
303
+ success=False,
263
304
  error_code=response_data.get("error_code", "unknown"),
264
- error_message=response_data.get("error_message", "Unknown error")
265
- )
305
+ error_message=response_data.get("error_message", "Scraping failed"),
306
+ status=status
307
+ )
308
+ else: # in_progress or any other status
309
+ return ScrapeResponse(
310
+ success=False,
311
+ status=status
312
+ )
313
+
314
def scrape(
    self,
    url: str,
    output_format: str = "markdown",
    webhook_url: Optional[str] = None,
    clean_selectors: Optional[str] = None,
    prompt: Optional[str] = None,
    actions: Optional[Union[Action, List[Action]]] = None,
    max_polls: int = 100
) -> Union[ScrapeResponse, ScrapeResponseError]:
    """
    Scrape a single URL and wait for completion.

    Starts a scraping job via ``scrape_async`` and then polls ``get_scrape``
    until the job reports a terminal state (``done`` or an error) or the
    poll budget is exhausted.

    Args:
        url (str): The URL to scrape
        output_format (str): Output format (markdown, cleaned, html)
        webhook_url (str, optional): URL to receive a POST request when scraping is complete
        clean_selectors (str, optional): CSS selectors to clean from the content
        prompt (str, optional): Prompt to guide the AI response
        actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
        max_polls (int): Maximum number of status checks before returning (default: 100).
            Values below 1 are treated as 1 so that a result is always available.

    Returns:
        Union[ScrapeResponse, ScrapeResponseError]: The final scrape result, or the
        last observed (non-terminal) state if the poll budget runs out.

    Raises:
        requests.exceptions.RequestException: If any API request fails
    """
    # Start the scraping job
    scrape_id = self.scrape_async(
        url=url,
        output_format=output_format,
        webhook_url=webhook_url,
        clean_selectors=clean_selectors,
        prompt=prompt,
        actions=actions
    ).id

    # Always perform at least one status check: with max_polls <= 0 the
    # previous loop never ran and `return result` hit an unbound local.
    polls_left = max(max_polls, 1)

    while True:
        result = self.get_scrape(scrape_id)

        # An error response is terminal
        if isinstance(result, ScrapeResponseError):
            return result

        # A finished scrape is terminal
        if isinstance(result, ScrapeResponse) and result.status == "done":
            return result

        polls_left -= 1
        if polls_left <= 0:
            # Budget exhausted: return the last known state right away
            # instead of sleeping once more for a poll that will never happen.
            return result

        # Still in progress (or another non-terminal status): wait, then poll again
        time.sleep(self.DEFAULT_POLL_DELAY_SECONDS)
@@ -1,6 +1,34 @@
1
1
  from typing import Optional, Dict, Any, List
2
2
  from datetime import datetime
3
3
  from dataclasses import dataclass
4
+ import re
5
+
6
+
7
# Splits a timestamp into (seconds-precision prefix, fractional digits, remainder).
# Compiled once at import time because parse_datetime runs for every job/item parsed.
_FRACTIONAL_SECONDS_RE = re.compile(
    r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
)


def parse_datetime(datetime_str: str) -> datetime:
    """
    Parse a datetime string from an API response.

    The string is normalised before being handed to ``datetime.fromisoformat``:

    * a trailing UTC designator (``Z`` or ``z``) becomes ``+00:00``;
    * a fractional-seconds part of any length is padded with zeros or
      truncated to exactly six digits (microseconds).

    Args:
        datetime_str (str): Datetime string from API

    Returns:
        datetime: Parsed datetime object (timezone-aware when the input
        carries an offset or UTC designator, naive otherwise)

    Raises:
        ValueError: If the normalised string is not accepted by
            ``datetime.fromisoformat``
    """
    # Only the final character can be the UTC designator; rewriting just that
    # one (rather than every 'Z' in the string) keeps malformed input malformed.
    if datetime_str[-1:] in ("Z", "z"):
        datetime_str = f"{datetime_str[:-1]}+00:00"

    match = _FRACTIONAL_SECONDS_RE.match(datetime_str)
    if match:
        base_time, fraction, timezone_part = match.groups()
        # Pad to 6 digits, or truncate if longer (e.g. nanosecond precision)
        fraction = fraction.ljust(6, '0')[:6]
        datetime_str = f"{base_time}.{fraction}{timezone_part}"

    return datetime.fromisoformat(datetime_str)
4
32
 
5
33
 
6
34
  @dataclass
@@ -10,14 +38,22 @@ class CrawlResponse:
10
38
 
11
39
 
12
40
  @dataclass
13
- class ScrapeResponseV2:
14
- """Response from a synchronous scrape request."""
41
+ class ScrapeId:
42
+ """Response from an asynchronous scrape request."""
43
+ id: str
44
+
45
+
46
+ @dataclass
47
+ class ScrapeResponse:
48
+ """Response from a scrape request."""
15
49
  success: bool
50
+ status: Optional[str] = None
16
51
  markdown: Optional[str] = None
17
52
  cleaned_content: Optional[str] = None
18
53
  raw_content: Optional[str] = None
19
54
  page_status_code: int = 0
20
55
  page_title: Optional[str] = None
56
+ structured_data: Optional[Dict[str, Any]] = None
21
57
 
22
58
 
23
59
  @dataclass
@@ -26,6 +62,7 @@ class ScrapeResponseError:
26
62
  success: bool
27
63
  error_code: str
28
64
  error_message: str
65
+ status: Optional[str] = None
29
66
 
30
67
 
31
68
  @dataclass
@@ -69,11 +106,11 @@ class JobItem:
69
106
  self.page_status_code: int = data["page_status_code"]
70
107
  self.status: str = data["status"]
71
108
  self.title: str = data["title"]
72
- self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
73
- self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
74
- self.cost: int = data["cost"]
75
- self.referred_url: str = data["referred_url"]
76
- self.last_error: str = data["last_error"]
109
+ self.created_at: datetime = parse_datetime(data["created_at"])
110
+ self.updated_at: datetime = parse_datetime(data["updated_at"])
111
+ self.cost: int = data.get("cost", 0)
112
+ self.referred_url: Optional[str] = data.get("referred_url")
113
+ self.last_error: Optional[str] = data.get("last_error")
77
114
  self.error_code: Optional[str] = data.get("error_code")
78
115
 
79
116
  # Optional content URLs based on scrape_type
@@ -146,19 +183,19 @@ class Job:
146
183
  self.url: str = data["url"]
147
184
  self.status: str = data["status"]
148
185
  self.scrape_type: str = data["scrape_type"]
149
- self.whitelist_regexp: str = data["whitelist_regexp"]
150
- self.blacklist_regexp: str = data["blacklist_regexp"]
151
- self.allow_subdomains: bool = data["allow_subdomains"]
186
+ self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
187
+ self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
188
+ self.allow_subdomains: bool = data.get("allow_subdomains", False)
152
189
  self.items_limit: int = data["items_limit"]
153
- self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
154
- self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
155
- self.webhook_url: str = data["webhook_url"]
190
+ self.created_at: datetime = parse_datetime(data["created_at"])
191
+ self.updated_at: datetime = parse_datetime(data["updated_at"])
192
+ self.webhook_url: Optional[str] = data.get("webhook_url")
156
193
  self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
157
194
 
158
195
  # Optional fields
159
196
  self.finished_at: Optional[datetime] = None
160
197
  if data.get("finished_at"):
161
- self.finished_at = datetime.fromisoformat(data["finished_at"].replace('Z', '+00:00'))
198
+ self.finished_at = parse_datetime(data["finished_at"])
162
199
 
163
200
  self.webhook_status: Optional[str] = data.get("webhook_status")
164
201
  self.webhook_error: Optional[str] = data.get("webhook_error")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.0
3
+ Version: 2.0.3
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -102,19 +102,15 @@ print(f"Cancellation response: {cancel_response['message']}")
102
102
 
103
103
  ### Scraping
104
104
 
105
- Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
106
-
107
105
  ```python
108
106
  # Returns structured data directly
109
- structured_data = crawler.scrape(
110
- crawler_id="webcrawler/url-to-md", # ID of the scraper
111
- input_data={
112
- "url": "https://example.com" # Scraper-specific input parameters. Check scraper description
113
- },
114
- webhook_url="https://yourserver.com/webhook", # Optional webhook
115
- max_polls=20 # Optional: maximum number of status checks
107
+ response = crawler.scrape(
108
+ url="https://webcrawlerapi.com"
116
109
  )
117
- print(structured_data) # Direct access to scraped data
110
+ if response.success:
111
+ print(response.markdown)
112
+ else:
113
+ print(f"Code: {response.error_code} Error: {response.error_message}")
118
114
  ```
119
115
 
120
116
  ## API Methods
File without changes