PyPI - webcrawlerapi - Versions diffs - 2.0.0__tar.gz → 2.0.3__tar.gz - Mend

webcrawlerapi 2.0.0__tar.gz → 2.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.0
+Version: 2.0.3
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew
@@ -102,19 +102,15 @@ print(f"Cancellation response: {cancel_response['message']}")
 ### Scraping
-Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
 ```python
 # Returns structured data directly
-structured_data = crawler.scrape(
-    crawler_id="webcrawler/url-to-md",  # ID of the scraper
-    input_data={
-        "url": "https://example.com"  # Scraper-specific input parameters. Check scraper description
-    },
-    webhook_url="https://yourserver.com/webhook",  # Optional webhook
-    max_polls=20  # Optional: maximum number of status checks
+response = crawler.scrape(
+    url="https://webcrawlerapi.com"
 )
-print(structured_data)  # Direct access to scraped data
+if response.success:
+    print(response.markdown)
+else:
+    print(f"Code: {response.error_code} Error: {response.error_message}")
 ```
 ## API Methods

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/README.md RENAMED Viewed

@@ -81,19 +81,15 @@ print(f"Cancellation response: {cancel_response['message']}")
 ### Scraping
-Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
 ```python
 # Returns structured data directly
-structured_data = crawler.scrape(
-    crawler_id="webcrawler/url-to-md",  # ID of the scraper
-    input_data={
-        "url": "https://example.com"  # Scraper-specific input parameters. Check scraper description
-    },
-    webhook_url="https://yourserver.com/webhook",  # Optional webhook
-    max_polls=20  # Optional: maximum number of status checks
+response = crawler.scrape(
+    url="https://webcrawlerapi.com"
 )
-print(structured_data)  # Direct access to scraped data
+if response.success:
+    print(response.markdown)
+else:
+    print(f"Code: {response.error_code} Error: {response.error_message}")
 ```
 ## API Methods

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="webcrawlerapi",
-    version="2.0.0",
+    version="2.0.3",
     packages=find_packages(),
     install_requires=[
         "requests>=2.25.0",

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi/__init__.py RENAMED Viewed

@@ -24,7 +24,8 @@ Basic usage:
 from .models import (
     CrawlResponse,
-    ScrapeResponseV2,
+    ScrapeId,
+    ScrapeResponse,
     ScrapeResponseError,
     Job,
     JobItem,
@@ -39,7 +40,8 @@ __all__ = [
     "Job",
     "JobItem",
     "CrawlResponse",
-    "ScrapeResponseV2",
+    "ScrapeId",
+    "ScrapeResponse",
     "ScrapeResponseError",
     "Action",
     "UploadS3Action",

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi/client.py RENAMED Viewed

@@ -5,7 +5,8 @@ import time
 from .models import (
     CrawlResponse,
-    ScrapeResponseV2,
+    ScrapeId,
+    ScrapeResponse,
     ScrapeResponseError,
     Job,
     Action,
@@ -17,14 +18,14 @@ class WebCrawlerAPI:
     DEFAULT_POLL_DELAY_SECONDS = 5
-    def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "v2"):
+    def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "v1"):
         """
         Initialize the WebCrawler API client.
         Args:
             api_key (str): Your API key for authentication
             base_url (str): The base URL of the API (optional)
-            version (str): API version to use (optional, defaults to 'v2')
+            version (str): API version to use (optional, defaults to 'v1')
         """
         self.api_key = api_key
         self.base_url = base_url.rstrip('/')
@@ -201,26 +202,28 @@ class WebCrawlerAPI:
         # Return the last known state if max_polls is reached
         return job
-    def scrape(
+    def scrape_async(
         self,
         url: str,
         output_format: str = "markdown",
         webhook_url: Optional[str] = None,
         clean_selectors: Optional[str] = None,
+        prompt: Optional[str] = None,
         actions: Optional[Union[Action, List[Action]]] = None
-    ) -> Union[ScrapeResponseV2, ScrapeResponseError]:
+    ) -> ScrapeId:
         """
-        Scrape a single URL synchronously.
+        Start a new scraping job asynchronously.
         Args:
             url (str): The URL to scrape
             output_format (str): Output format (markdown, cleaned, html)
             webhook_url (str, optional): URL to receive a POST request when scraping is complete
             clean_selectors (str, optional): CSS selectors to clean from the content
-            actions (Action or List[Action], optional): Actions to perform during scraping
+            prompt (str, optional): Prompt to guide the AI response
+            actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
         Returns:
-            Union[ScrapeResponseV2, ScrapeResponseError]: Response containing the scraped content or error
+            ScrapeId: Response containing the scrape job ID
         Raises:
             requests.exceptions.RequestException: If the API request fails
@@ -234,6 +237,8 @@ class WebCrawlerAPI:
             payload["webhook_url"] = webhook_url
         if clean_selectors:
             payload["clean_selectors"] = clean_selectors
+        if prompt:
+            payload["prompt"] = prompt
         if actions:
             # Convert single action to list if needed
             action_list = [actions] if not isinstance(actions, list) else actions
@@ -241,25 +246,131 @@ class WebCrawlerAPI:
             payload["actions"] = [vars(action) for action in action_list]
         response = self.session.post(
-            urljoin(self.base_url, f"/{self.version}/scrape"),
+            urljoin(self.base_url, f"/{self.version}/scrape?async=true"),
             json=payload
         )
+        if not response.ok:
+            try:
+                error_data = response.json()
+                raise requests.exceptions.HTTPError(
+                    f"{response.status_code} {response.reason}: {error_data.get('error', 'Unknown error')}"
+                )
+            except ValueError:
+                # If response is not JSON, raise with status and text
+                raise requests.exceptions.HTTPError(
+                    f"{response.status_code} {response.reason}: {response.text}"
+                )
+        response.raise_for_status()
+        return ScrapeId(id=response.json()["id"])
+    def get_scrape(self, scrape_id: str) -> Union[ScrapeResponse, ScrapeResponseError]:
+        """
+        Get the status and result of a specific scrape job.
+        Args:
+            scrape_id (str): The unique identifier of the scrape job
+        Returns:
+            Union[ScrapeResponse, ScrapeResponseError]: The scrape result or error
+        Raises:
+            requests.exceptions.RequestException: If the API request fails
+        """
+        response = self.session.get(
+            urljoin(self.base_url, f"/{self.version}/scrape/{scrape_id}")
+        )
+        response.raise_for_status()
         response_data = response.json()
-        # Check if the response indicates success or error
-        if response_data.get("success", False):
-            return ScrapeResponseV2(
-                success=response_data["success"],
+        status = response_data.get("status")
+        if status == "done":
+            return ScrapeResponse(
+                success=response_data.get("success", True),
+                status=status,
                 markdown=response_data.get("markdown"),
                 cleaned_content=response_data.get("cleaned_content"),
                 raw_content=response_data.get("raw_content"),
                 page_status_code=response_data.get("page_status_code", 0),
-                page_title=response_data.get("page_title")
+                page_title=response_data.get("page_title"),
+                structured_data=response_data.get("structured_data")
             )
-        else:
+        elif status == "error":
             return ScrapeResponseError(
-                success=response_data.get("success", False),
+                success=False,
                 error_code=response_data.get("error_code", "unknown"),
-                error_message=response_data.get("error_message", "Unknown error")
-            )
+                error_message=response_data.get("error_message", "Scraping failed"),
+                status=status
+            )
+        else:  # in_progress or any other status
+            return ScrapeResponse(
+                success=False,
+                status=status
+            )
+    def scrape(
+        self,
+        url: str,
+        output_format: str = "markdown",
+        webhook_url: Optional[str] = None,
+        clean_selectors: Optional[str] = None,
+        prompt: Optional[str] = None,
+        actions: Optional[Union[Action, List[Action]]] = None,
+        max_polls: int = 100
+    ) -> Union[ScrapeResponse, ScrapeResponseError]:
+        """
+        Scrape a single URL and wait for completion.
+        This method will start a scraping job and continuously poll its status
+        until it reaches a terminal state (done or error) or until
+        the maximum number of polls is reached.
+        Args:
+            url (str): The URL to scrape
+            output_format (str): Output format (markdown, cleaned, html)
+            webhook_url (str, optional): URL to receive a POST request when scraping is complete
+            clean_selectors (str, optional): CSS selectors to clean from the content
+            prompt (str, optional): Prompt to guide the AI response
+            actions (Action or List[Action], optional): Actions to perform during scraping
+            max_polls (int): Maximum number of status checks before returning (default: 100)
+        Returns:
+            Union[ScrapeResponse, ScrapeResponseError]: The final scrape result
+        Raises:
+            requests.exceptions.RequestException: If any API request fails
+        """
+        # Start the scraping job
+        response = self.scrape_async(
+            url=url,
+            output_format=output_format,
+            webhook_url=webhook_url,
+            clean_selectors=clean_selectors,
+            prompt=prompt,
+            actions=actions
+        )
+        scrape_id = response.id
+        polls = 0
+        while polls < max_polls:
+            result = self.get_scrape(scrape_id)
+            # Return immediately if scrape is done
+            if isinstance(result, ScrapeResponse) and result.status == "done":
+                return result
+            # Return immediately if there's an error
+            if isinstance(result, ScrapeResponseError):
+                return result
+            # Continue polling if status is in_progress or any other non-terminal status
+            # Wait before next poll
+            time.sleep(self.DEFAULT_POLL_DELAY_SECONDS)
+            polls += 1
+        # Return the last known state if max_polls is reached
+        return result

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi/models.py RENAMED Viewed

@@ -1,6 +1,34 @@
 from typing import Optional, Dict, Any, List
 from datetime import datetime
 from dataclasses import dataclass
+import re
+def parse_datetime(datetime_str: str) -> datetime:
+    """
+    Parse datetime string from API response, handling various microsecond formats.
+    Args:
+        datetime_str (str): Datetime string from API
+    Returns:
+        datetime: Parsed datetime object
+    """
+    # Replace 'Z' with '+00:00' for timezone
+    datetime_str = datetime_str.replace('Z', '+00:00')
+    # Handle microseconds - pad to 6 digits or remove if present
+    # Pattern matches: YYYY-MM-DDTHH:MM:SS.microseconds followed by timezone or end
+    pattern = r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
+    match = re.match(pattern, datetime_str)
+    if match:
+        base_time, microseconds, timezone_part = match.groups()
+        # Pad microseconds to 6 digits or truncate if longer
+        microseconds = microseconds.ljust(6, '0')[:6]
+        datetime_str = f"{base_time}.{microseconds}{timezone_part}"
+    return datetime.fromisoformat(datetime_str)
 @dataclass
@@ -10,14 +38,22 @@ class CrawlResponse:
 @dataclass
-class ScrapeResponseV2:
-    """Response from a synchronous scrape request."""
+class ScrapeId:
+    """Response from an asynchronous scrape request."""
+    id: str
+@dataclass
+class ScrapeResponse:
+    """Response from a scrape request."""
     success: bool
+    status: Optional[str] = None
     markdown: Optional[str] = None
     cleaned_content: Optional[str] = None
     raw_content: Optional[str] = None
     page_status_code: int = 0
     page_title: Optional[str] = None
+    structured_data: Optional[Dict[str, Any]] = None
 @dataclass
@@ -26,6 +62,7 @@ class ScrapeResponseError:
     success: bool
     error_code: str
     error_message: str
+    status: Optional[str] = None
 @dataclass
@@ -69,11 +106,11 @@ class JobItem:
         self.page_status_code: int = data["page_status_code"]
         self.status: str = data["status"]
         self.title: str = data["title"]
-        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
-        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
-        self.cost: int = data["cost"]
-        self.referred_url: str = data["referred_url"]
-        self.last_error: str = data["last_error"]
+        self.created_at: datetime = parse_datetime(data["created_at"])
+        self.updated_at: datetime = parse_datetime(data["updated_at"])
+        self.cost: int = data.get("cost", 0)
+        self.referred_url: Optional[str] = data.get("referred_url")
+        self.last_error: Optional[str] = data.get("last_error")
         self.error_code: Optional[str] = data.get("error_code")
         # Optional content URLs based on scrape_type
@@ -146,19 +183,19 @@ class Job:
         self.url: str = data["url"]
         self.status: str = data["status"]
         self.scrape_type: str = data["scrape_type"]
-        self.whitelist_regexp: str = data["whitelist_regexp"]
-        self.blacklist_regexp: str = data["blacklist_regexp"]
-        self.allow_subdomains: bool = data["allow_subdomains"]
+        self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
+        self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
+        self.allow_subdomains: bool = data.get("allow_subdomains", False)
         self.items_limit: int = data["items_limit"]
-        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
-        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
-        self.webhook_url: str = data["webhook_url"]
+        self.created_at: datetime = parse_datetime(data["created_at"])
+        self.updated_at: datetime = parse_datetime(data["updated_at"])
+        self.webhook_url: Optional[str] = data.get("webhook_url")
         self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
         # Optional fields
         self.finished_at: Optional[datetime] = None
         if data.get("finished_at"):
-            self.finished_at = datetime.fromisoformat(data["finished_at"].replace('Z', '+00:00'))
+            self.finished_at = parse_datetime(data["finished_at"])
         self.webhook_status: Optional[str] = data.get("webhook_status")
         self.webhook_error: Optional[str] = data.get("webhook_error")

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.0
+Version: 2.0.3
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew
@@ -102,19 +102,15 @@ print(f"Cancellation response: {cancel_response['message']}")
 ### Scraping
-Find the list of available scrapers [here](https://webcrawlerapi.com/scrapers).
 ```python
 # Returns structured data directly
-structured_data = crawler.scrape(
-    crawler_id="webcrawler/url-to-md",  # ID of the scraper
-    input_data={
-        "url": "https://example.com"  # Scraper-specific input parameters. Check scraper description
-    },
-    webhook_url="https://yourserver.com/webhook",  # Optional webhook
-    max_polls=20  # Optional: maximum number of status checks
+response = crawler.scrape(
+    url="https://webcrawlerapi.com"
 )
-print(structured_data)  # Direct access to scraped data
+if response.success:
+    print(response.markdown)
+else:
+    print(f"Code: {response.error_code} Error: {response.error_message}")
 ```
 ## API Methods

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/setup.cfg RENAMED Viewed

File without changes

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/requires.txt RENAMED Viewed

File without changes

{webcrawlerapi-2.0.0 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/top_level.txt RENAMED Viewed

File without changes

webcrawlerapi 2.0.0__tar.gz → 2.0.3__tar.gz

webcrawlerapi 2.0.0__tar.gz → 2.0.3__tar.gz