webcrawlerapi 2.0.1__tar.gz → 2.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/PKG-INFO +1 -1
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/setup.py +1 -1
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi/client.py +10 -2
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi/models.py +35 -6
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/PKG-INFO +1 -1
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/README.md +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/setup.cfg +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi/__init__.py +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/top_level.txt +0 -0
|
@@ -25,7 +25,7 @@ class WebCrawlerAPI:
|
|
|
25
25
|
Args:
|
|
26
26
|
api_key (str): Your API key for authentication
|
|
27
27
|
base_url (str): The base URL of the API (optional)
|
|
28
|
-
version (str): API version to use (optional, defaults to '
|
|
28
|
+
version (str): API version to use (optional, defaults to 'v1')
|
|
29
29
|
"""
|
|
30
30
|
self.api_key = api_key
|
|
31
31
|
self.base_url = base_url.rstrip('/')
|
|
@@ -208,6 +208,7 @@ class WebCrawlerAPI:
|
|
|
208
208
|
output_format: str = "markdown",
|
|
209
209
|
webhook_url: Optional[str] = None,
|
|
210
210
|
clean_selectors: Optional[str] = None,
|
|
211
|
+
prompt: Optional[str] = None,
|
|
211
212
|
actions: Optional[Union[Action, List[Action]]] = None
|
|
212
213
|
) -> ScrapeId:
|
|
213
214
|
"""
|
|
@@ -218,6 +219,7 @@ class WebCrawlerAPI:
|
|
|
218
219
|
output_format (str): Output format (markdown, cleaned, html)
|
|
219
220
|
webhook_url (str, optional): URL to receive a POST request when scraping is complete
|
|
220
221
|
clean_selectors (str, optional): CSS selectors to clean from the content
|
|
222
|
+
prompt (str, optional): Prompt to guide the AI response
|
|
221
223
|
actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
|
|
222
224
|
|
|
223
225
|
Returns:
|
|
@@ -235,6 +237,8 @@ class WebCrawlerAPI:
|
|
|
235
237
|
payload["webhook_url"] = webhook_url
|
|
236
238
|
if clean_selectors:
|
|
237
239
|
payload["clean_selectors"] = clean_selectors
|
|
240
|
+
if prompt:
|
|
241
|
+
payload["prompt"] = prompt
|
|
238
242
|
if actions:
|
|
239
243
|
# Convert single action to list if needed
|
|
240
244
|
action_list = [actions] if not isinstance(actions, list) else actions
|
|
@@ -291,7 +295,8 @@ class WebCrawlerAPI:
|
|
|
291
295
|
cleaned_content=response_data.get("cleaned_content"),
|
|
292
296
|
raw_content=response_data.get("raw_content"),
|
|
293
297
|
page_status_code=response_data.get("page_status_code", 0),
|
|
294
|
-
page_title=response_data.get("page_title")
|
|
298
|
+
page_title=response_data.get("page_title"),
|
|
299
|
+
structured_data=response_data.get("structured_data")
|
|
295
300
|
)
|
|
296
301
|
elif status == "error":
|
|
297
302
|
return ScrapeResponseError(
|
|
@@ -312,6 +317,7 @@ class WebCrawlerAPI:
|
|
|
312
317
|
output_format: str = "markdown",
|
|
313
318
|
webhook_url: Optional[str] = None,
|
|
314
319
|
clean_selectors: Optional[str] = None,
|
|
320
|
+
prompt: Optional[str] = None,
|
|
315
321
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
316
322
|
max_polls: int = 100
|
|
317
323
|
) -> Union[ScrapeResponse, ScrapeResponseError]:
|
|
@@ -327,6 +333,7 @@ class WebCrawlerAPI:
|
|
|
327
333
|
output_format (str): Output format (markdown, cleaned, html)
|
|
328
334
|
webhook_url (str, optional): URL to receive a POST request when scraping is complete
|
|
329
335
|
clean_selectors (str, optional): CSS selectors to clean from the content
|
|
336
|
+
prompt (str, optional): Prompt to guide the AI response
|
|
330
337
|
actions (Action or List[Action], optional): Actions to perform during scraping
|
|
331
338
|
max_polls (int): Maximum number of status checks before returning (default: 100)
|
|
332
339
|
|
|
@@ -342,6 +349,7 @@ class WebCrawlerAPI:
|
|
|
342
349
|
output_format=output_format,
|
|
343
350
|
webhook_url=webhook_url,
|
|
344
351
|
clean_selectors=clean_selectors,
|
|
352
|
+
prompt=prompt,
|
|
345
353
|
actions=actions
|
|
346
354
|
)
|
|
347
355
|
|
|
@@ -1,6 +1,34 @@
|
|
|
1
1
|
from typing import Optional, Dict, Any, List
|
|
2
2
|
from datetime import datetime
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_datetime(datetime_str: str) -> datetime:
|
|
8
|
+
"""
|
|
9
|
+
Parse datetime string from API response, handling various microsecond formats.
|
|
10
|
+
|
|
11
|
+
Args:
|
|
12
|
+
datetime_str (str): Datetime string from API
|
|
13
|
+
|
|
14
|
+
Returns:
|
|
15
|
+
datetime: Parsed datetime object
|
|
16
|
+
"""
|
|
17
|
+
# Replace 'Z' with '+00:00' for timezone
|
|
18
|
+
datetime_str = datetime_str.replace('Z', '+00:00')
|
|
19
|
+
|
|
20
|
+
# Handle fractional seconds - pad to 6 digits, or truncate if longer
|
|
21
|
+
# Pattern matches: YYYY-MM-DDTHH:MM:SS.microseconds followed by timezone or end
|
|
22
|
+
pattern = r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
|
|
23
|
+
match = re.match(pattern, datetime_str)
|
|
24
|
+
|
|
25
|
+
if match:
|
|
26
|
+
base_time, microseconds, timezone_part = match.groups()
|
|
27
|
+
# Pad microseconds to 6 digits or truncate if longer
|
|
28
|
+
microseconds = microseconds.ljust(6, '0')[:6]
|
|
29
|
+
datetime_str = f"{base_time}.{microseconds}{timezone_part}"
|
|
30
|
+
|
|
31
|
+
return datetime.fromisoformat(datetime_str)
|
|
4
32
|
|
|
5
33
|
|
|
6
34
|
@dataclass
|
|
@@ -25,6 +53,7 @@ class ScrapeResponse:
|
|
|
25
53
|
raw_content: Optional[str] = None
|
|
26
54
|
page_status_code: int = 0
|
|
27
55
|
page_title: Optional[str] = None
|
|
56
|
+
structured_data: Optional[Dict[str, Any]] = None
|
|
28
57
|
|
|
29
58
|
|
|
30
59
|
@dataclass
|
|
@@ -77,8 +106,8 @@ class JobItem:
|
|
|
77
106
|
self.page_status_code: int = data["page_status_code"]
|
|
78
107
|
self.status: str = data["status"]
|
|
79
108
|
self.title: str = data["title"]
|
|
80
|
-
self.created_at: datetime =
|
|
81
|
-
self.updated_at: datetime =
|
|
109
|
+
self.created_at: datetime = parse_datetime(data["created_at"])
|
|
110
|
+
self.updated_at: datetime = parse_datetime(data["updated_at"])
|
|
82
111
|
self.cost: int = data.get("cost", 0)
|
|
83
112
|
self.referred_url: Optional[str] = data.get("referred_url")
|
|
84
113
|
self.last_error: Optional[str] = data.get("last_error")
|
|
@@ -156,17 +185,17 @@ class Job:
|
|
|
156
185
|
self.scrape_type: str = data["scrape_type"]
|
|
157
186
|
self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
|
|
158
187
|
self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
|
|
159
|
-
self.allow_subdomains: bool = data
|
|
188
|
+
self.allow_subdomains: bool = data.get("allow_subdomains", False)
|
|
160
189
|
self.items_limit: int = data["items_limit"]
|
|
161
|
-
self.created_at: datetime =
|
|
162
|
-
self.updated_at: datetime =
|
|
190
|
+
self.created_at: datetime = parse_datetime(data["created_at"])
|
|
191
|
+
self.updated_at: datetime = parse_datetime(data["updated_at"])
|
|
163
192
|
self.webhook_url: Optional[str] = data.get("webhook_url")
|
|
164
193
|
self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
|
|
165
194
|
|
|
166
195
|
# Optional fields
|
|
167
196
|
self.finished_at: Optional[datetime] = None
|
|
168
197
|
if data.get("finished_at"):
|
|
169
|
-
self.finished_at =
|
|
198
|
+
self.finished_at = parse_datetime(data["finished_at"])
|
|
170
199
|
|
|
171
200
|
self.webhook_status: Optional[str] = data.get("webhook_status")
|
|
172
201
|
self.webhook_error: Optional[str] = data.get("webhook_error")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|