PyPI - webcrawlerapi - Versions diffs - 2.0.6__tar.gz → 2.0.7__tar.gz - Mend

webcrawlerapi-2.0.6.tar.gz → webcrawlerapi-2.0.7.tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

{webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.6
+Version: 2.0.7
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew

{webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/pyproject.toml RENAMED Viewed

@@ -26,7 +26,7 @@ known_first_party = ["webcrawlerapi"]
 known_third_party = ["requests", "pytest", "responses"]
 [tool.mypy]
-python_version = "3.7"
+python_version = "3.11"
 warn_return_any = true
 warn_unused_configs = true
 disallow_untyped_defs = false

{webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
 setup(
     name="webcrawlerapi",
-    version="2.0.6",
+    version="2.0.7",
     packages=find_packages(),
     install_requires=[
         "requests>=2.25.0",

{webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi/client.py RENAMED Viewed

@@ -1,5 +1,5 @@
 import time
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, cast
 from urllib.parse import urljoin
 import requests
@@ -49,6 +49,7 @@ class WebCrawlerAPI:
         blacklist_regexp: Optional[str] = None,
         actions: Optional[Union[Action, List[Action]]] = None,
         respect_robots_txt: bool = False,
+        main_content_only: bool = False,
     ) -> CrawlResponse:
         """
         Start a new crawling job asynchronously.
@@ -63,6 +64,7 @@ class WebCrawlerAPI:
             blacklist_regexp (str, optional): Regex pattern for URL blacklist
             actions (Action or List[Action], optional): Actions to perform during crawling
             respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
+            main_content_only (bool): Whether to extract only main content (default: False)
         Returns:
             CrawlResponse: Response containing the job ID
@@ -76,6 +78,7 @@ class WebCrawlerAPI:
             "items_limit": items_limit,
             "allow_subdomains": allow_subdomains,
             "respect_robots_txt": respect_robots_txt,
+            "main_content_only": main_content_only,
         }
         if webhook_url:
@@ -133,7 +136,7 @@ class WebCrawlerAPI:
             urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/cancel")
         )
         response.raise_for_status()
-        return response.json()
+        return cast(Dict[str, str], response.json())
     def crawl(
         self,
@@ -146,6 +149,7 @@ class WebCrawlerAPI:
         blacklist_regexp: Optional[str] = None,
         actions: Optional[Union[Action, List[Action]]] = None,
         respect_robots_txt: bool = False,
+        main_content_only: bool = False,
         max_polls: int = 100,
     ) -> Job:
         """
@@ -165,6 +169,7 @@ class WebCrawlerAPI:
             blacklist_regexp (str, optional): Regex pattern for URL blacklist
             actions (Action or List[Action], optional): Actions to perform during crawling
             respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
+            main_content_only (bool): Whether to extract only main content (default: False)
             max_polls (int): Maximum number of status checks before returning (default: 100)
         Returns:
@@ -184,6 +189,7 @@ class WebCrawlerAPI:
             blacklist_regexp=blacklist_regexp,
             actions=actions,
             respect_robots_txt=respect_robots_txt,
+            main_content_only=main_content_only,
         )
         job_id = response.id
@@ -218,6 +224,7 @@ class WebCrawlerAPI:
         prompt: Optional[str] = None,
         actions: Optional[Union[Action, List[Action]]] = None,
         respect_robots_txt: bool = False,
+        main_content_only: bool = False,
     ) -> ScrapeId:
         """
         Start a new scraping job asynchronously.
@@ -230,6 +237,7 @@ class WebCrawlerAPI:
             prompt (str, optional): Prompt to guide the AI response
             actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
             respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
+            main_content_only (bool): Whether to extract only main content (default: False)
         Returns:
             ScrapeId: Response containing the scrape job ID
@@ -241,6 +249,7 @@ class WebCrawlerAPI:
             "url": url,
             "output_format": output_format,
             "respect_robots_txt": respect_robots_txt,
+            "main_content_only": main_content_only,
         }
         if webhook_url:
@@ -327,6 +336,7 @@ class WebCrawlerAPI:
         prompt: Optional[str] = None,
         actions: Optional[Union[Action, List[Action]]] = None,
         respect_robots_txt: bool = False,
+        main_content_only: bool = False,
         max_polls: int = 100,
     ) -> Union[ScrapeResponse, ScrapeResponseError]:
         """
@@ -344,6 +354,7 @@ class WebCrawlerAPI:
             prompt (str, optional): Prompt to guide the AI response
             actions (Action or List[Action], optional): Actions to perform during scraping
             respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
+            main_content_only (bool): Whether to extract only main content (default: False)
             max_polls (int): Maximum number of status checks before returning (default: 100)
         Returns:
@@ -361,6 +372,7 @@ class WebCrawlerAPI:
             prompt=prompt,
             actions=actions,
             respect_robots_txt=respect_robots_txt,
+            main_content_only=main_content_only,
         )
         scrape_id = response.id

{webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.6
+Version: 2.0.7
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew