webcrawlerapi 2.0.1__tar.gz → 2.0.4__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/PKG-INFO +1 -1
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/setup.py +1 -1
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/webcrawlerapi/client.py +18 -9
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/webcrawlerapi/models.py +1 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/PKG-INFO +1 -1
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/README.md +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/setup.cfg +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/webcrawlerapi/__init__.py +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.1 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/top_level.txt +0 -0
|
@@ -12,24 +12,25 @@ from .models import (
|
|
|
12
12
|
Action,
|
|
13
13
|
)
|
|
14
14
|
|
|
15
|
+
CRAWLER_VERSION = "v1"
|
|
16
|
+
SCRAPER_VERSION = "v2"
|
|
15
17
|
|
|
16
18
|
class WebCrawlerAPI:
|
|
17
19
|
"""Python SDK for WebCrawler API."""
|
|
18
20
|
|
|
19
21
|
DEFAULT_POLL_DELAY_SECONDS = 5
|
|
20
22
|
|
|
21
|
-
def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com"
|
|
23
|
+
def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com"):
|
|
22
24
|
"""
|
|
23
25
|
Initialize the WebCrawler API client.
|
|
24
26
|
|
|
25
27
|
Args:
|
|
26
28
|
api_key (str): Your API key for authentication
|
|
27
29
|
base_url (str): The base URL of the API (optional)
|
|
28
|
-
version (str): API version to use (optional, defaults to '
|
|
30
|
+
version (str): API version to use (optional, defaults to 'v1')
|
|
29
31
|
"""
|
|
30
32
|
self.api_key = api_key
|
|
31
33
|
self.base_url = base_url.rstrip('/')
|
|
32
|
-
self.version = version
|
|
33
34
|
self.session = requests.Session()
|
|
34
35
|
self.session.headers.update({
|
|
35
36
|
'Authorization': f'Bearer {api_key}',
|
|
@@ -86,7 +87,7 @@ class WebCrawlerAPI:
|
|
|
86
87
|
payload["actions"] = [vars(action) for action in action_list]
|
|
87
88
|
|
|
88
89
|
response = self.session.post(
|
|
89
|
-
urljoin(self.base_url, f"/{self.version}/crawl"),
|
|
90
|
+
urljoin(self.base_url, f"/{CRAWLER_VERSION}/crawl"),
|
|
90
91
|
json=payload
|
|
91
92
|
)
|
|
92
93
|
response.raise_for_status()
|
|
@@ -106,7 +107,7 @@ class WebCrawlerAPI:
|
|
|
106
107
|
requests.exceptions.RequestException: If the API request fails
|
|
107
108
|
"""
|
|
108
109
|
response = self.session.get(
|
|
109
|
-
urljoin(self.base_url, f"/{self.version}/job/{job_id}")
|
|
110
|
+
urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}")
|
|
110
111
|
)
|
|
111
112
|
response.raise_for_status()
|
|
112
113
|
return Job(response.json())
|
|
@@ -126,7 +127,7 @@ class WebCrawlerAPI:
|
|
|
126
127
|
requests.exceptions.RequestException: If the API request fails
|
|
127
128
|
"""
|
|
128
129
|
response = self.session.put(
|
|
129
|
-
urljoin(self.base_url, f"/{self.version}/job/{job_id}/cancel")
|
|
130
|
+
urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/cancel")
|
|
130
131
|
)
|
|
131
132
|
response.raise_for_status()
|
|
132
133
|
return response.json()
|
|
@@ -208,6 +209,7 @@ class WebCrawlerAPI:
|
|
|
208
209
|
output_format: str = "markdown",
|
|
209
210
|
webhook_url: Optional[str] = None,
|
|
210
211
|
clean_selectors: Optional[str] = None,
|
|
212
|
+
prompt: Optional[str] = None,
|
|
211
213
|
actions: Optional[Union[Action, List[Action]]] = None
|
|
212
214
|
) -> ScrapeId:
|
|
213
215
|
"""
|
|
@@ -218,6 +220,7 @@ class WebCrawlerAPI:
|
|
|
218
220
|
output_format (str): Output format (markdown, cleaned, html)
|
|
219
221
|
webhook_url (str, optional): URL to receive a POST request when scraping is complete
|
|
220
222
|
clean_selectors (str, optional): CSS selectors to clean from the content
|
|
223
|
+
prompt (str, optional): Prompt to guide the AI response
|
|
221
224
|
actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
|
|
222
225
|
|
|
223
226
|
Returns:
|
|
@@ -235,6 +238,8 @@ class WebCrawlerAPI:
|
|
|
235
238
|
payload["webhook_url"] = webhook_url
|
|
236
239
|
if clean_selectors:
|
|
237
240
|
payload["clean_selectors"] = clean_selectors
|
|
241
|
+
if prompt:
|
|
242
|
+
payload["prompt"] = prompt
|
|
238
243
|
if actions:
|
|
239
244
|
# Convert single action to list if needed
|
|
240
245
|
action_list = [actions] if not isinstance(actions, list) else actions
|
|
@@ -242,7 +247,7 @@ class WebCrawlerAPI:
|
|
|
242
247
|
payload["actions"] = [vars(action) for action in action_list]
|
|
243
248
|
|
|
244
249
|
response = self.session.post(
|
|
245
|
-
urljoin(self.base_url, f"/{
|
|
250
|
+
urljoin(self.base_url, f"/{SCRAPER_VERSION}/scrape?async=true"),
|
|
246
251
|
json=payload
|
|
247
252
|
)
|
|
248
253
|
|
|
@@ -275,7 +280,7 @@ class WebCrawlerAPI:
|
|
|
275
280
|
requests.exceptions.RequestException: If the API request fails
|
|
276
281
|
"""
|
|
277
282
|
response = self.session.get(
|
|
278
|
-
urljoin(self.base_url, f"/{
|
|
283
|
+
urljoin(self.base_url, f"/{SCRAPER_VERSION}/scrape/{scrape_id}")
|
|
279
284
|
)
|
|
280
285
|
|
|
281
286
|
response.raise_for_status()
|
|
@@ -291,7 +296,8 @@ class WebCrawlerAPI:
|
|
|
291
296
|
cleaned_content=response_data.get("cleaned_content"),
|
|
292
297
|
raw_content=response_data.get("raw_content"),
|
|
293
298
|
page_status_code=response_data.get("page_status_code", 0),
|
|
294
|
-
page_title=response_data.get("page_title")
|
|
299
|
+
page_title=response_data.get("page_title"),
|
|
300
|
+
structured_data=response_data.get("structured_data")
|
|
295
301
|
)
|
|
296
302
|
elif status == "error":
|
|
297
303
|
return ScrapeResponseError(
|
|
@@ -312,6 +318,7 @@ class WebCrawlerAPI:
|
|
|
312
318
|
output_format: str = "markdown",
|
|
313
319
|
webhook_url: Optional[str] = None,
|
|
314
320
|
clean_selectors: Optional[str] = None,
|
|
321
|
+
prompt: Optional[str] = None,
|
|
315
322
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
316
323
|
max_polls: int = 100
|
|
317
324
|
) -> Union[ScrapeResponse, ScrapeResponseError]:
|
|
@@ -327,6 +334,7 @@ class WebCrawlerAPI:
|
|
|
327
334
|
output_format (str): Output format (markdown, cleaned, html)
|
|
328
335
|
webhook_url (str, optional): URL to receive a POST request when scraping is complete
|
|
329
336
|
clean_selectors (str, optional): CSS selectors to clean from the content
|
|
337
|
+
prompt (str, optional): Prompt to guide the AI response
|
|
330
338
|
actions (Action or List[Action], optional): Actions to perform during scraping
|
|
331
339
|
max_polls (int): Maximum number of status checks before returning (default: 100)
|
|
332
340
|
|
|
@@ -342,6 +350,7 @@ class WebCrawlerAPI:
|
|
|
342
350
|
output_format=output_format,
|
|
343
351
|
webhook_url=webhook_url,
|
|
344
352
|
clean_selectors=clean_selectors,
|
|
353
|
+
prompt=prompt,
|
|
345
354
|
actions=actions
|
|
346
355
|
)
|
|
347
356
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|