PyPI - webcrawlerapi - Versions diffs - 2.0.7__tar.gz → 2.0.8__tar.gz - Mend

webcrawlerapi 2.0.7.tar.gz → 2.0.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

{webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.7
+Version: 2.0.8
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew

{webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
 setup(
     name="webcrawlerapi",
-    version="2.0.7",
+    version="2.0.8",
     packages=find_packages(),
     install_requires=[
         "requests>=2.25.0",

{webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi/client.py RENAMED Viewed

@@ -50,6 +50,7 @@ class WebCrawlerAPI:
         actions: Optional[Union[Action, List[Action]]] = None,
         respect_robots_txt: bool = False,
         main_content_only: bool = False,
+        max_depth: Optional[int] = None,
     ) -> CrawlResponse:
         """
         Start a new crawling job asynchronously.
@@ -65,6 +66,7 @@ class WebCrawlerAPI:
             actions (Action or List[Action], optional): Actions to perform during crawling
             respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
             main_content_only (bool): Whether to extract only main content (default: False)
+            max_depth (int, optional): Maximum depth of crawl (0 for seed URL only, 1 for one level deep, etc.)
         Returns:
             CrawlResponse: Response containing the job ID
@@ -87,6 +89,8 @@ class WebCrawlerAPI:
             payload["whitelist_regexp"] = whitelist_regexp
         if blacklist_regexp:
             payload["blacklist_regexp"] = blacklist_regexp
+        if max_depth is not None:
+            payload["max_depth"] = max_depth
         if actions:
             # Convert single action to list if needed
             action_list = [actions] if not isinstance(actions, list) else actions
@@ -150,6 +154,7 @@ class WebCrawlerAPI:
         actions: Optional[Union[Action, List[Action]]] = None,
         respect_robots_txt: bool = False,
         main_content_only: bool = False,
+        max_depth: Optional[int] = None,
         max_polls: int = 100,
     ) -> Job:
         """
@@ -170,6 +175,7 @@ class WebCrawlerAPI:
             actions (Action or List[Action], optional): Actions to perform during crawling
             respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
             main_content_only (bool): Whether to extract only main content (default: False)
+            max_depth (int, optional): Maximum depth of crawl (0 for seed URL only, 1 for one level deep, etc.)
             max_polls (int): Maximum number of status checks before returning (default: 100)
         Returns:
@@ -190,6 +196,7 @@ class WebCrawlerAPI:
             actions=actions,
             respect_robots_txt=respect_robots_txt,
             main_content_only=main_content_only,
+            max_depth=max_depth,
         )
         job_id = response.id

{webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi/models.py RENAMED Viewed

@@ -125,6 +125,7 @@ class JobItem:
         self.referred_url: Optional[str] = data.get("referred_url")
         self.last_error: Optional[str] = data.get("last_error")
         self.error_code: Optional[str] = data.get("error_code")
+        self.depth: Optional[int] = data.get("depth")
         # Optional content URLs based on scrape_type
         self.raw_content_url: Optional[str] = data.get("raw_content_url")
@@ -201,6 +202,7 @@ class Job:
         self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
         self.allow_subdomains: bool = data.get("allow_subdomains", False)
         self.items_limit: int = data["items_limit"]
+        self.max_depth: Optional[int] = data.get("max_depth")
         self.created_at: datetime = parse_datetime(data["created_at"])
         self.updated_at: datetime = parse_datetime(data["updated_at"])
         self.webhook_url: Optional[str] = data.get("webhook_url")

{webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.7
+Version: 2.0.8
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew