PyPI - webcrawlerapi - Versions diffs - 2.0.1__tar.gz → 2.0.3__tar.gz - Mend

webcrawlerapi 2.0.1__tar.gz → 2.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.1
+Version: 2.0.3
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="webcrawlerapi",
-    version="2.0.1",
+    version="2.0.3",
     packages=find_packages(),
     install_requires=[
         "requests>=2.25.0",

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi/client.py RENAMED Viewed

@@ -25,7 +25,7 @@ class WebCrawlerAPI:
         Args:
             api_key (str): Your API key for authentication
             base_url (str): The base URL of the API (optional)
-            version (str): API version to use (optional, defaults to 'v2')
+            version (str): API version to use (optional, defaults to 'v1')
         """
         self.api_key = api_key
         self.base_url = base_url.rstrip('/')
@@ -208,6 +208,7 @@ class WebCrawlerAPI:
         output_format: str = "markdown",
         webhook_url: Optional[str] = None,
         clean_selectors: Optional[str] = None,
+        prompt: Optional[str] = None,
         actions: Optional[Union[Action, List[Action]]] = None
     ) -> ScrapeId:
         """
@@ -218,6 +219,7 @@ class WebCrawlerAPI:
             output_format (str): Output format (markdown, cleaned, html)
             webhook_url (str, optional): URL to receive a POST request when scraping is complete
             clean_selectors (str, optional): CSS selectors to clean from the content
+            prompt (str, optional): Prompt to guide the AI response
             actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
         Returns:
@@ -235,6 +237,8 @@ class WebCrawlerAPI:
             payload["webhook_url"] = webhook_url
         if clean_selectors:
             payload["clean_selectors"] = clean_selectors
+        if prompt:
+            payload["prompt"] = prompt
         if actions:
             # Convert single action to list if needed
             action_list = [actions] if not isinstance(actions, list) else actions
@@ -291,7 +295,8 @@ class WebCrawlerAPI:
                 cleaned_content=response_data.get("cleaned_content"),
                 raw_content=response_data.get("raw_content"),
                 page_status_code=response_data.get("page_status_code", 0),
-                page_title=response_data.get("page_title")
+                page_title=response_data.get("page_title"),
+                structured_data=response_data.get("structured_data")
             )
         elif status == "error":
             return ScrapeResponseError(
@@ -312,6 +317,7 @@ class WebCrawlerAPI:
         output_format: str = "markdown",
         webhook_url: Optional[str] = None,
         clean_selectors: Optional[str] = None,
+        prompt: Optional[str] = None,
         actions: Optional[Union[Action, List[Action]]] = None,
         max_polls: int = 100
     ) -> Union[ScrapeResponse, ScrapeResponseError]:
@@ -327,6 +333,7 @@ class WebCrawlerAPI:
             output_format (str): Output format (markdown, cleaned, html)
             webhook_url (str, optional): URL to receive a POST request when scraping is complete
             clean_selectors (str, optional): CSS selectors to clean from the content
+            prompt (str, optional): Prompt to guide the AI response
             actions (Action or List[Action], optional): Actions to perform during scraping
             max_polls (int): Maximum number of status checks before returning (default: 100)
@@ -342,6 +349,7 @@ class WebCrawlerAPI:
             output_format=output_format,
             webhook_url=webhook_url,
             clean_selectors=clean_selectors,
+            prompt=prompt,
             actions=actions
         )

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi/models.py RENAMED Viewed

@@ -1,6 +1,34 @@
 from typing import Optional, Dict, Any, List
 from datetime import datetime
 from dataclasses import dataclass
+import re
+def parse_datetime(datetime_str: str) -> datetime:
+    """
+    Parse datetime string from API response, handling various microsecond formats.
+    Args:
+        datetime_str (str): Datetime string from API
+    Returns:
+        datetime: Parsed datetime object
+    """
+    # Replace 'Z' with '+00:00' for timezone
+    datetime_str = datetime_str.replace('Z', '+00:00')
+    # Handle microseconds - pad to 6 digits or remove if present
+    # Pattern matches: YYYY-MM-DDTHH:MM:SS.microseconds followed by timezone or end
+    pattern = r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
+    match = re.match(pattern, datetime_str)
+    if match:
+        base_time, microseconds, timezone_part = match.groups()
+        # Pad microseconds to 6 digits or truncate if longer
+        microseconds = microseconds.ljust(6, '0')[:6]
+        datetime_str = f"{base_time}.{microseconds}{timezone_part}"
+    return datetime.fromisoformat(datetime_str)
 @dataclass
@@ -25,6 +53,7 @@ class ScrapeResponse:
     raw_content: Optional[str] = None
     page_status_code: int = 0
     page_title: Optional[str] = None
+    structured_data: Optional[Dict[str, Any]] = None
 @dataclass
@@ -77,8 +106,8 @@ class JobItem:
         self.page_status_code: int = data["page_status_code"]
         self.status: str = data["status"]
         self.title: str = data["title"]
-        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
-        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
+        self.created_at: datetime = parse_datetime(data["created_at"])
+        self.updated_at: datetime = parse_datetime(data["updated_at"])
         self.cost: int = data.get("cost", 0)
         self.referred_url: Optional[str] = data.get("referred_url")
         self.last_error: Optional[str] = data.get("last_error")
@@ -156,17 +185,17 @@ class Job:
         self.scrape_type: str = data["scrape_type"]
         self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
         self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
-        self.allow_subdomains: bool = data["allow_subdomains"]
+        self.allow_subdomains: bool = data.get("allow_subdomains", False)
         self.items_limit: int = data["items_limit"]
-        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
-        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
+        self.created_at: datetime = parse_datetime(data["created_at"])
+        self.updated_at: datetime = parse_datetime(data["updated_at"])
         self.webhook_url: Optional[str] = data.get("webhook_url")
         self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
         # Optional fields
         self.finished_at: Optional[datetime] = None
         if data.get("finished_at"):
-            self.finished_at = datetime.fromisoformat(data["finished_at"].replace('Z', '+00:00'))
+            self.finished_at = parse_datetime(data["finished_at"])
         self.webhook_status: Optional[str] = data.get("webhook_status")
         self.webhook_error: Optional[str] = data.get("webhook_error")

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.1
+Version: 2.0.3
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/README.md RENAMED Viewed

File without changes

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/setup.cfg RENAMED Viewed

File without changes

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi/__init__.py RENAMED Viewed

File without changes

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/requires.txt RENAMED Viewed

File without changes

{webcrawlerapi-2.0.1 → webcrawlerapi-2.0.3}/webcrawlerapi.egg-info/top_level.txt RENAMED Viewed

File without changes

webcrawlerapi 2.0.1__tar.gz → 2.0.3__tar.gz

webcrawlerapi 2.0.1__tar.gz → 2.0.3__tar.gz