webcrawlerapi 2.0.11__tar.gz → 2.0.12__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.11
3
+ Version: 2.0.12
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="webcrawlerapi",
5
- version="2.0.11",
5
+ version="2.0.12",
6
6
  packages=find_packages(),
7
7
  install_requires=[
8
8
  "requests>=2.25.0",
@@ -28,6 +28,7 @@ from .models import (
28
28
  CrawlResponse,
29
29
  Job,
30
30
  JobItem,
31
+ JobMarkdownResponse,
31
32
  ScrapeId,
32
33
  ScrapeResponse,
33
34
  ScrapeResponseError,
@@ -39,6 +40,7 @@ __all__ = [
39
40
  "WebCrawlerAPI",
40
41
  "Job",
41
42
  "JobItem",
43
+ "JobMarkdownResponse",
42
44
  "CrawlResponse",
43
45
  "ScrapeId",
44
46
  "ScrapeResponse",
@@ -8,6 +8,7 @@ from .models import (
8
8
  Action,
9
9
  CrawlResponse,
10
10
  Job,
11
+ JobMarkdownResponse,
11
12
  ScrapeId,
12
13
  ScrapeResponse,
13
14
  ScrapeResponseError,
@@ -123,15 +124,15 @@ class WebCrawlerAPI:
123
124
  response.raise_for_status()
124
125
  return Job(response.json())
125
126
 
126
- def get_job_markdown(self, job_id: str) -> str:
127
+ def get_job_markdown(self, job_id: str) -> JobMarkdownResponse:
127
128
  """
128
- Get combined markdown content for a completed markdown job.
129
+ Get the URL to the combined markdown file for a completed markdown job.
129
130
 
130
131
  Args:
131
132
  job_id (str): The unique identifier of the job
132
133
 
133
134
  Returns:
134
- str: Combined markdown content
135
+ JobMarkdownResponse: Response containing the content_url to the markdown file
135
136
 
136
137
  Raises:
137
138
  requests.exceptions.RequestException: If the API request fails
@@ -139,6 +140,26 @@ class WebCrawlerAPI:
139
140
  response = self.session.get(
140
141
  urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown")
141
142
  )
143
+ response.raise_for_status()
144
+ data = response.json()
145
+ return JobMarkdownResponse(content_url=data["content_url"])
146
+
147
+ def get_job_markdown_content(self, job_id: str) -> str:
148
+ """
149
+ Download the combined markdown content for a completed markdown job.
150
+
151
+ Args:
152
+ job_id (str): The unique identifier of the job
153
+
154
+ Returns:
155
+ str: Combined markdown content as plain text
156
+
157
+ Raises:
158
+ requests.exceptions.RequestException: If the API request fails
159
+ """
160
+ response = self.session.get(
161
+ urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown/content")
162
+ )
142
163
 
143
164
  if not response.ok:
144
165
  try:
@@ -303,7 +324,7 @@ class WebCrawlerAPI:
303
324
  f"Job finished with status {job.status}"
304
325
  )
305
326
 
306
- return self.get_job_markdown(job.id)
327
+ return self.get_job_markdown_content(job.id)
307
328
 
308
329
  def scrape_async(
309
330
  self,
@@ -38,6 +38,13 @@ class CrawlResponse:
38
38
  id: str
39
39
 
40
40
 
41
+ @dataclass
42
+ class JobMarkdownResponse:
43
+ """Response from the get_job_markdown endpoint."""
44
+
45
+ content_url: str
46
+
47
+
41
48
  @dataclass
42
49
  class ScrapeId:
43
50
  """Response from an asynchronous scrape request."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.11
3
+ Version: 2.0.12
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
File without changes
File without changes