PyPI - webcrawlerapi - Versions diffs - 2.0.10__tar.gz → 2.0.12__tar.gz - Mend

webcrawlerapi 2.0.10 (tar.gz) → 2.0.12 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

{webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.10
+Version: 2.0.12
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew

{webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
 setup(
     name="webcrawlerapi",
-    version="2.0.10",
+    version="2.0.12",
     packages=find_packages(),
     install_requires=[
         "requests>=2.25.0",

{webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi/__init__.py RENAMED Viewed

@@ -28,6 +28,7 @@ from .models import (
     CrawlResponse,
     Job,
     JobItem,
+    JobMarkdownResponse,
     ScrapeId,
     ScrapeResponse,
     ScrapeResponseError,
@@ -39,6 +40,7 @@ __all__ = [
     "WebCrawlerAPI",
     "Job",
     "JobItem",
+    "JobMarkdownResponse",
     "CrawlResponse",
     "ScrapeId",
     "ScrapeResponse",

{webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi/client.py RENAMED Viewed

@@ -8,6 +8,7 @@ from .models import (
     Action,
     CrawlResponse,
     Job,
+    JobMarkdownResponse,
     ScrapeId,
     ScrapeResponse,
     ScrapeResponseError,
@@ -123,15 +124,15 @@ class WebCrawlerAPI:
         response.raise_for_status()
         return Job(response.json())
-    def get_job_markdown(self, job_id: str) -> str:
+    def get_job_markdown(self, job_id: str) -> JobMarkdownResponse:
         """
-        Get combined markdown content for a completed markdown job.
+        Get the URL to the combined markdown file for a completed markdown job.
         Args:
             job_id (str): The unique identifier of the job
         Returns:
-            str: Combined markdown content
+            JobMarkdownResponse: Response containing the content_url to the markdown file
         Raises:
             requests.exceptions.RequestException: If the API request fails
@@ -139,6 +140,26 @@ class WebCrawlerAPI:
         response = self.session.get(
             urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown")
         )
+        response.raise_for_status()
+        data = response.json()
+        return JobMarkdownResponse(content_url=data["content_url"])
+    def get_job_markdown_content(self, job_id: str) -> str:
+        """
+        Download the combined markdown content for a completed markdown job.
+        Args:
+            job_id (str): The unique identifier of the job
+        Returns:
+            str: Combined markdown content as plain text
+        Raises:
+            requests.exceptions.RequestException: If the API request fails
+        """
+        response = self.session.get(
+            urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown/content")
+        )
         if not response.ok:
             try:
@@ -303,7 +324,7 @@ class WebCrawlerAPI:
                 f"Job finished with status {job.status}"
             )
-        return self.get_job_markdown(job.id)
+        return self.get_job_markdown_content(job.id)
     def scrape_async(
         self,
@@ -312,6 +333,7 @@ class WebCrawlerAPI:
         webhook_url: Optional[str] = None,
         clean_selectors: Optional[str] = None,
         prompt: Optional[str] = None,
+        response_schema: Optional[Dict[str, Any]] = None,
         actions: Optional[Union[Action, List[Action]]] = None,
         respect_robots_txt: bool = False,
         main_content_only: bool = False,
@@ -326,6 +348,7 @@ class WebCrawlerAPI:
             webhook_url (str, optional): URL to receive a POST request when scraping is complete
             clean_selectors (str, optional): CSS selectors to clean from the content
             prompt (str, optional): Prompt to guide the AI response
+            response_schema (dict, optional): JSON Schema for structured output format. Works with the prompt parameter.
             actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
             respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
             main_content_only (bool): Whether to extract only main content (default: False)
@@ -350,6 +373,8 @@ class WebCrawlerAPI:
             payload["clean_selectors"] = clean_selectors
         if prompt:
             payload["prompt"] = prompt
+        if response_schema is not None:
+            payload["response_schema"] = response_schema
         if max_age is not None:
             payload["max_age"] = max_age
         if actions:
@@ -410,6 +435,7 @@ class WebCrawlerAPI:
                 page_status_code=response_data.get("page_status_code", 0),
                 page_title=response_data.get("page_title"),
                 structured_data=response_data.get("structured_data"),
+                links=response_data.get("links"),
             )
         elif status == "error":
             return ScrapeResponseError(
@@ -428,6 +454,7 @@ class WebCrawlerAPI:
         webhook_url: Optional[str] = None,
         clean_selectors: Optional[str] = None,
         prompt: Optional[str] = None,
+        response_schema: Optional[Dict[str, Any]] = None,
         actions: Optional[Union[Action, List[Action]]] = None,
         respect_robots_txt: bool = False,
         main_content_only: bool = False,
@@ -447,6 +474,7 @@ class WebCrawlerAPI:
             webhook_url (str, optional): URL to receive a POST request when scraping is complete
             clean_selectors (str, optional): CSS selectors to clean from the content
             prompt (str, optional): Prompt to guide the AI response
+            response_schema (dict, optional): JSON Schema for structured output format. Works with the prompt parameter.
             actions (Action or List[Action], optional): Actions to perform during scraping
             respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
             main_content_only (bool): Whether to extract only main content (default: False)
@@ -466,6 +494,7 @@ class WebCrawlerAPI:
             webhook_url=webhook_url,
             clean_selectors=clean_selectors,
             prompt=prompt,
+            response_schema=response_schema,
             actions=actions,
             respect_robots_txt=respect_robots_txt,
             main_content_only=main_content_only,

{webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi/models.py RENAMED Viewed

@@ -38,6 +38,13 @@ class CrawlResponse:
     id: str
+@dataclass
+class JobMarkdownResponse:
+    """Response from the get_job_markdown endpoint."""
+    content_url: str
 @dataclass
 class ScrapeId:
     """Response from an asynchronous scrape request."""
@@ -57,6 +64,7 @@ class ScrapeResponse:
     page_status_code: int = 0
     page_title: Optional[str] = None
     structured_data: Optional[Dict[str, Any]] = None
+    links: Optional[List[str]] = None
 @dataclass

{webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.10
+Version: 2.0.12
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew