webcrawlerapi 2.0.1__tar.gz → 2.0.4__tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.1
3
+ Version: 2.0.4
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="webcrawlerapi",
5
- version="2.0.1",
5
+ version="2.0.4",
6
6
  packages=find_packages(),
7
7
  install_requires=[
8
8
  "requests>=2.25.0",
@@ -12,24 +12,25 @@ from .models import (
12
12
  Action,
13
13
  )
14
14
 
15
+ CRAWLER_VERSION = "v1"
16
+ SCRAPER_VERSION = "v2"
15
17
 
16
18
  class WebCrawlerAPI:
17
19
  """Python SDK for WebCrawler API."""
18
20
 
19
21
  DEFAULT_POLL_DELAY_SECONDS = 5
20
22
 
21
- def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "v1"):
23
+ def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com"):
22
24
  """
23
25
  Initialize the WebCrawler API client.
24
26
 
25
27
  Args:
26
28
  api_key (str): Your API key for authentication
27
29
  base_url (str): The base URL of the API (optional)
28
- version (str): API version to use (optional, defaults to 'v2')
30
+ version (str): API version to use (optional, defaults to 'v1')
29
31
  """
30
32
  self.api_key = api_key
31
33
  self.base_url = base_url.rstrip('/')
32
- self.version = version
33
34
  self.session = requests.Session()
34
35
  self.session.headers.update({
35
36
  'Authorization': f'Bearer {api_key}',
@@ -86,7 +87,7 @@ class WebCrawlerAPI:
86
87
  payload["actions"] = [vars(action) for action in action_list]
87
88
 
88
89
  response = self.session.post(
89
- urljoin(self.base_url, f"/{self.version}/crawl"),
90
+ urljoin(self.base_url, f"/{CRAWLER_VERSION}/crawl"),
90
91
  json=payload
91
92
  )
92
93
  response.raise_for_status()
@@ -106,7 +107,7 @@ class WebCrawlerAPI:
106
107
  requests.exceptions.RequestException: If the API request fails
107
108
  """
108
109
  response = self.session.get(
109
- urljoin(self.base_url, f"/{self.version}/job/{job_id}")
110
+ urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}")
110
111
  )
111
112
  response.raise_for_status()
112
113
  return Job(response.json())
@@ -126,7 +127,7 @@ class WebCrawlerAPI:
126
127
  requests.exceptions.RequestException: If the API request fails
127
128
  """
128
129
  response = self.session.put(
129
- urljoin(self.base_url, f"/{self.version}/job/{job_id}/cancel")
130
+ urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/cancel")
130
131
  )
131
132
  response.raise_for_status()
132
133
  return response.json()
@@ -208,6 +209,7 @@ class WebCrawlerAPI:
208
209
  output_format: str = "markdown",
209
210
  webhook_url: Optional[str] = None,
210
211
  clean_selectors: Optional[str] = None,
212
+ prompt: Optional[str] = None,
211
213
  actions: Optional[Union[Action, List[Action]]] = None
212
214
  ) -> ScrapeId:
213
215
  """
@@ -218,6 +220,7 @@ class WebCrawlerAPI:
218
220
  output_format (str): Output format (markdown, cleaned, html)
219
221
  webhook_url (str, optional): URL to receive a POST request when scraping is complete
220
222
  clean_selectors (str, optional): CSS selectors to clean from the content
223
+ prompt (str, optional): Prompt to guide the AI response
221
224
  actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
222
225
 
223
226
  Returns:
@@ -235,6 +238,8 @@ class WebCrawlerAPI:
235
238
  payload["webhook_url"] = webhook_url
236
239
  if clean_selectors:
237
240
  payload["clean_selectors"] = clean_selectors
241
+ if prompt:
242
+ payload["prompt"] = prompt
238
243
  if actions:
239
244
  # Convert single action to list if needed
240
245
  action_list = [actions] if not isinstance(actions, list) else actions
@@ -242,7 +247,7 @@ class WebCrawlerAPI:
242
247
  payload["actions"] = [vars(action) for action in action_list]
243
248
 
244
249
  response = self.session.post(
245
- urljoin(self.base_url, f"/{self.version}/scrape?async=true"),
250
+ urljoin(self.base_url, f"/{SCRAPER_VERSION}/scrape?async=true"),
246
251
  json=payload
247
252
  )
248
253
 
@@ -275,7 +280,7 @@ class WebCrawlerAPI:
275
280
  requests.exceptions.RequestException: If the API request fails
276
281
  """
277
282
  response = self.session.get(
278
- urljoin(self.base_url, f"/{self.version}/scrape/{scrape_id}")
283
+ urljoin(self.base_url, f"/{SCRAPER_VERSION}/scrape/{scrape_id}")
279
284
  )
280
285
 
281
286
  response.raise_for_status()
@@ -291,7 +296,8 @@ class WebCrawlerAPI:
291
296
  cleaned_content=response_data.get("cleaned_content"),
292
297
  raw_content=response_data.get("raw_content"),
293
298
  page_status_code=response_data.get("page_status_code", 0),
294
- page_title=response_data.get("page_title")
299
+ page_title=response_data.get("page_title"),
300
+ structured_data=response_data.get("structured_data")
295
301
  )
296
302
  elif status == "error":
297
303
  return ScrapeResponseError(
@@ -312,6 +318,7 @@ class WebCrawlerAPI:
312
318
  output_format: str = "markdown",
313
319
  webhook_url: Optional[str] = None,
314
320
  clean_selectors: Optional[str] = None,
321
+ prompt: Optional[str] = None,
315
322
  actions: Optional[Union[Action, List[Action]]] = None,
316
323
  max_polls: int = 100
317
324
  ) -> Union[ScrapeResponse, ScrapeResponseError]:
@@ -327,6 +334,7 @@ class WebCrawlerAPI:
327
334
  output_format (str): Output format (markdown, cleaned, html)
328
335
  webhook_url (str, optional): URL to receive a POST request when scraping is complete
329
336
  clean_selectors (str, optional): CSS selectors to clean from the content
337
+ prompt (str, optional): Prompt to guide the AI response
330
338
  actions (Action or List[Action], optional): Actions to perform during scraping
331
339
  max_polls (int): Maximum number of status checks before returning (default: 100)
332
340
 
@@ -342,6 +350,7 @@ class WebCrawlerAPI:
342
350
  output_format=output_format,
343
351
  webhook_url=webhook_url,
344
352
  clean_selectors=clean_selectors,
353
+ prompt=prompt,
345
354
  actions=actions
346
355
  )
347
356
 
@@ -25,6 +25,7 @@ class ScrapeResponse:
25
25
  raw_content: Optional[str] = None
26
26
  page_status_code: int = 0
27
27
  page_title: Optional[str] = None
28
+ structured_data: Optional[Dict[str, Any]] = None
28
29
 
29
30
 
30
31
  @dataclass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.1
3
+ Version: 2.0.4
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
File without changes
File without changes