webcrawlerapi 2.0.6__tar.gz → 2.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.6
3
+ Version: 2.0.7
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -26,7 +26,7 @@ known_first_party = ["webcrawlerapi"]
26
26
  known_third_party = ["requests", "pytest", "responses"]
27
27
 
28
28
  [tool.mypy]
29
- python_version = "3.7"
29
+ python_version = "3.11"
30
30
  warn_return_any = true
31
31
  warn_unused_configs = true
32
32
  disallow_untyped_defs = false
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="webcrawlerapi",
5
- version="2.0.6",
5
+ version="2.0.7",
6
6
  packages=find_packages(),
7
7
  install_requires=[
8
8
  "requests>=2.25.0",
@@ -1,5 +1,5 @@
1
1
  import time
2
- from typing import Any, Dict, List, Optional, Union
2
+ from typing import Any, Dict, List, Optional, Union, cast
3
3
  from urllib.parse import urljoin
4
4
 
5
5
  import requests
@@ -49,6 +49,7 @@ class WebCrawlerAPI:
49
49
  blacklist_regexp: Optional[str] = None,
50
50
  actions: Optional[Union[Action, List[Action]]] = None,
51
51
  respect_robots_txt: bool = False,
52
+ main_content_only: bool = False,
52
53
  ) -> CrawlResponse:
53
54
  """
54
55
  Start a new crawling job asynchronously.
@@ -63,6 +64,7 @@ class WebCrawlerAPI:
63
64
  blacklist_regexp (str, optional): Regex pattern for URL blacklist
64
65
  actions (Action or List[Action], optional): Actions to perform during crawling
65
66
  respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
67
+ main_content_only (bool): Whether to extract only main content (default: False)
66
68
 
67
69
  Returns:
68
70
  CrawlResponse: Response containing the job ID
@@ -76,6 +78,7 @@ class WebCrawlerAPI:
76
78
  "items_limit": items_limit,
77
79
  "allow_subdomains": allow_subdomains,
78
80
  "respect_robots_txt": respect_robots_txt,
81
+ "main_content_only": main_content_only,
79
82
  }
80
83
 
81
84
  if webhook_url:
@@ -133,7 +136,7 @@ class WebCrawlerAPI:
133
136
  urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/cancel")
134
137
  )
135
138
  response.raise_for_status()
136
- return response.json()
139
+ return cast(Dict[str, str], response.json())
137
140
 
138
141
  def crawl(
139
142
  self,
@@ -146,6 +149,7 @@ class WebCrawlerAPI:
146
149
  blacklist_regexp: Optional[str] = None,
147
150
  actions: Optional[Union[Action, List[Action]]] = None,
148
151
  respect_robots_txt: bool = False,
152
+ main_content_only: bool = False,
149
153
  max_polls: int = 100,
150
154
  ) -> Job:
151
155
  """
@@ -165,6 +169,7 @@ class WebCrawlerAPI:
165
169
  blacklist_regexp (str, optional): Regex pattern for URL blacklist
166
170
  actions (Action or List[Action], optional): Actions to perform during crawling
167
171
  respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
172
+ main_content_only (bool): Whether to extract only main content (default: False)
168
173
  max_polls (int): Maximum number of status checks before returning (default: 100)
169
174
 
170
175
  Returns:
@@ -184,6 +189,7 @@ class WebCrawlerAPI:
184
189
  blacklist_regexp=blacklist_regexp,
185
190
  actions=actions,
186
191
  respect_robots_txt=respect_robots_txt,
192
+ main_content_only=main_content_only,
187
193
  )
188
194
 
189
195
  job_id = response.id
@@ -218,6 +224,7 @@ class WebCrawlerAPI:
218
224
  prompt: Optional[str] = None,
219
225
  actions: Optional[Union[Action, List[Action]]] = None,
220
226
  respect_robots_txt: bool = False,
227
+ main_content_only: bool = False,
221
228
  ) -> ScrapeId:
222
229
  """
223
230
  Start a new scraping job asynchronously.
@@ -230,6 +237,7 @@ class WebCrawlerAPI:
230
237
  prompt (str, optional): Prompt to guide the AI response
231
238
  actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
232
239
  respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
240
+ main_content_only (bool): Whether to extract only main content (default: False)
233
241
 
234
242
  Returns:
235
243
  ScrapeId: Response containing the scrape job ID
@@ -241,6 +249,7 @@ class WebCrawlerAPI:
241
249
  "url": url,
242
250
  "output_format": output_format,
243
251
  "respect_robots_txt": respect_robots_txt,
252
+ "main_content_only": main_content_only,
244
253
  }
245
254
 
246
255
  if webhook_url:
@@ -327,6 +336,7 @@ class WebCrawlerAPI:
327
336
  prompt: Optional[str] = None,
328
337
  actions: Optional[Union[Action, List[Action]]] = None,
329
338
  respect_robots_txt: bool = False,
339
+ main_content_only: bool = False,
330
340
  max_polls: int = 100,
331
341
  ) -> Union[ScrapeResponse, ScrapeResponseError]:
332
342
  """
@@ -344,6 +354,7 @@ class WebCrawlerAPI:
344
354
  prompt (str, optional): Prompt to guide the AI response
345
355
  actions (Action or List[Action], optional): Actions to perform during scraping
346
356
  respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
357
+ main_content_only (bool): Whether to extract only main content (default: False)
347
358
  max_polls (int): Maximum number of status checks before returning (default: 100)
348
359
 
349
360
  Returns:
@@ -361,6 +372,7 @@ class WebCrawlerAPI:
361
372
  prompt=prompt,
362
373
  actions=actions,
363
374
  respect_robots_txt=respect_robots_txt,
375
+ main_content_only=main_content_only,
364
376
  )
365
377
 
366
378
  scrape_id = response.id
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.6
3
+ Version: 2.0.7
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
File without changes
File without changes