webcrawlerapi 2.0.1__tar.gz → 2.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.1
3
+ Version: 2.0.3
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="webcrawlerapi",
5
- version="2.0.1",
5
+ version="2.0.3",
6
6
  packages=find_packages(),
7
7
  install_requires=[
8
8
  "requests>=2.25.0",
@@ -25,7 +25,7 @@ class WebCrawlerAPI:
25
25
  Args:
26
26
  api_key (str): Your API key for authentication
27
27
  base_url (str): The base URL of the API (optional)
28
- version (str): API version to use (optional, defaults to 'v2')
28
+ version (str): API version to use (optional, defaults to 'v1')
29
29
  """
30
30
  self.api_key = api_key
31
31
  self.base_url = base_url.rstrip('/')
@@ -208,6 +208,7 @@ class WebCrawlerAPI:
208
208
  output_format: str = "markdown",
209
209
  webhook_url: Optional[str] = None,
210
210
  clean_selectors: Optional[str] = None,
211
+ prompt: Optional[str] = None,
211
212
  actions: Optional[Union[Action, List[Action]]] = None
212
213
  ) -> ScrapeId:
213
214
  """
@@ -218,6 +219,7 @@ class WebCrawlerAPI:
218
219
  output_format (str): Output format (markdown, cleaned, html)
219
220
  webhook_url (str, optional): URL to receive a POST request when scraping is complete
220
221
  clean_selectors (str, optional): CSS selectors to clean from the content
222
+ prompt (str, optional): Prompt to guide the AI response
221
223
  actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
222
224
 
223
225
  Returns:
@@ -235,6 +237,8 @@ class WebCrawlerAPI:
235
237
  payload["webhook_url"] = webhook_url
236
238
  if clean_selectors:
237
239
  payload["clean_selectors"] = clean_selectors
240
+ if prompt:
241
+ payload["prompt"] = prompt
238
242
  if actions:
239
243
  # Convert single action to list if needed
240
244
  action_list = [actions] if not isinstance(actions, list) else actions
@@ -291,7 +295,8 @@ class WebCrawlerAPI:
291
295
  cleaned_content=response_data.get("cleaned_content"),
292
296
  raw_content=response_data.get("raw_content"),
293
297
  page_status_code=response_data.get("page_status_code", 0),
294
- page_title=response_data.get("page_title")
298
+ page_title=response_data.get("page_title"),
299
+ structured_data=response_data.get("structured_data")
295
300
  )
296
301
  elif status == "error":
297
302
  return ScrapeResponseError(
@@ -312,6 +317,7 @@ class WebCrawlerAPI:
312
317
  output_format: str = "markdown",
313
318
  webhook_url: Optional[str] = None,
314
319
  clean_selectors: Optional[str] = None,
320
+ prompt: Optional[str] = None,
315
321
  actions: Optional[Union[Action, List[Action]]] = None,
316
322
  max_polls: int = 100
317
323
  ) -> Union[ScrapeResponse, ScrapeResponseError]:
@@ -327,6 +333,7 @@ class WebCrawlerAPI:
327
333
  output_format (str): Output format (markdown, cleaned, html)
328
334
  webhook_url (str, optional): URL to receive a POST request when scraping is complete
329
335
  clean_selectors (str, optional): CSS selectors to clean from the content
336
+ prompt (str, optional): Prompt to guide the AI response
330
337
  actions (Action or List[Action], optional): Actions to perform during scraping
331
338
  max_polls (int): Maximum number of status checks before returning (default: 100)
332
339
 
@@ -342,6 +349,7 @@ class WebCrawlerAPI:
342
349
  output_format=output_format,
343
350
  webhook_url=webhook_url,
344
351
  clean_selectors=clean_selectors,
352
+ prompt=prompt,
345
353
  actions=actions
346
354
  )
347
355
 
@@ -1,6 +1,34 @@
1
1
  from typing import Optional, Dict, Any, List
2
2
  from datetime import datetime
3
3
  from dataclasses import dataclass
4
+ import re
5
+
6
+
7
def parse_datetime(datetime_str: str) -> datetime:
    """
    Parse an ISO 8601 / RFC 3339 datetime string from an API response.

    ``datetime.fromisoformat`` on Python versions before 3.11 rejects the
    ``Z`` UTC designator and any fractional-second part that is not exactly
    3 or 6 digits long. This helper normalises both before parsing so the
    result is the same on every supported interpreter.

    Args:
        datetime_str (str): Datetime string from API, e.g.
            ``"2024-01-15T10:30:45.1234567Z"``.

    Returns:
        datetime: Parsed datetime object. It is timezone-aware when the input
        carries a UTC designator or an explicit offset, naive otherwise.

    Raises:
        ValueError: If the normalised string is still not a valid ISO format.
    """
    # Rewrite only a *trailing* UTC designator. RFC 3339 allows it in either
    # case, and replacing every 'Z' in the string could corrupt other parts.
    if datetime_str.endswith(('Z', 'z')):
        datetime_str = datetime_str[:-1] + '+00:00'

    # Split into: date/time up to whole seconds, fractional digits, and
    # whatever follows (a UTC offset or nothing). The date/time separator may
    # be 'T', 't' or a space, all of which fromisoformat itself accepts.
    pattern = r'(\d{4}-\d{2}-\d{2}[Tt ]\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
    match = re.match(pattern, datetime_str)

    if match:
        base_time, fraction, timezone_part = match.groups()
        # Right-pad short fractions with zeros and truncate anything finer
        # than microseconds, giving exactly 6 digits.
        fraction = fraction.ljust(6, '0')[:6]
        datetime_str = f"{base_time}.{fraction}{timezone_part}"

    return datetime.fromisoformat(datetime_str)
4
32
 
5
33
 
6
34
  @dataclass
@@ -25,6 +53,7 @@ class ScrapeResponse:
25
53
  raw_content: Optional[str] = None
26
54
  page_status_code: int = 0
27
55
  page_title: Optional[str] = None
56
+ structured_data: Optional[Dict[str, Any]] = None
28
57
 
29
58
 
30
59
  @dataclass
@@ -77,8 +106,8 @@ class JobItem:
77
106
  self.page_status_code: int = data["page_status_code"]
78
107
  self.status: str = data["status"]
79
108
  self.title: str = data["title"]
80
- self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
81
- self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
109
+ self.created_at: datetime = parse_datetime(data["created_at"])
110
+ self.updated_at: datetime = parse_datetime(data["updated_at"])
82
111
  self.cost: int = data.get("cost", 0)
83
112
  self.referred_url: Optional[str] = data.get("referred_url")
84
113
  self.last_error: Optional[str] = data.get("last_error")
@@ -156,17 +185,17 @@ class Job:
156
185
  self.scrape_type: str = data["scrape_type"]
157
186
  self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
158
187
  self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
159
- self.allow_subdomains: bool = data["allow_subdomains"]
188
+ self.allow_subdomains: bool = data.get("allow_subdomains", False)
160
189
  self.items_limit: int = data["items_limit"]
161
- self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
162
- self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
190
+ self.created_at: datetime = parse_datetime(data["created_at"])
191
+ self.updated_at: datetime = parse_datetime(data["updated_at"])
163
192
  self.webhook_url: Optional[str] = data.get("webhook_url")
164
193
  self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
165
194
 
166
195
  # Optional fields
167
196
  self.finished_at: Optional[datetime] = None
168
197
  if data.get("finished_at"):
169
- self.finished_at = datetime.fromisoformat(data["finished_at"].replace('Z', '+00:00'))
198
+ self.finished_at = parse_datetime(data["finished_at"])
170
199
 
171
200
  self.webhook_status: Optional[str] = data.get("webhook_status")
172
201
  self.webhook_error: Optional[str] = data.get("webhook_error")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.1
3
+ Version: 2.0.3
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
File without changes
File without changes