webcrawlerapi 2.0.11__tar.gz → 2.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/PKG-INFO +1 -1
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/setup.py +1 -1
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/webcrawlerapi/__init__.py +2 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/webcrawlerapi/client.py +25 -4
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/webcrawlerapi/models.py +7 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/PKG-INFO +1 -1
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/README.md +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/pyproject.toml +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/setup.cfg +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/tests/__init__.py +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/tests/test_client.py +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/tests/test_models.py +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.11 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/top_level.txt +0 -0
|
@@ -28,6 +28,7 @@ from .models import (
|
|
|
28
28
|
CrawlResponse,
|
|
29
29
|
Job,
|
|
30
30
|
JobItem,
|
|
31
|
+
JobMarkdownResponse,
|
|
31
32
|
ScrapeId,
|
|
32
33
|
ScrapeResponse,
|
|
33
34
|
ScrapeResponseError,
|
|
@@ -39,6 +40,7 @@ __all__ = [
|
|
|
39
40
|
"WebCrawlerAPI",
|
|
40
41
|
"Job",
|
|
41
42
|
"JobItem",
|
|
43
|
+
"JobMarkdownResponse",
|
|
42
44
|
"CrawlResponse",
|
|
43
45
|
"ScrapeId",
|
|
44
46
|
"ScrapeResponse",
|
|
@@ -8,6 +8,7 @@ from .models import (
|
|
|
8
8
|
Action,
|
|
9
9
|
CrawlResponse,
|
|
10
10
|
Job,
|
|
11
|
+
JobMarkdownResponse,
|
|
11
12
|
ScrapeId,
|
|
12
13
|
ScrapeResponse,
|
|
13
14
|
ScrapeResponseError,
|
|
@@ -123,15 +124,15 @@ class WebCrawlerAPI:
|
|
|
123
124
|
response.raise_for_status()
|
|
124
125
|
return Job(response.json())
|
|
125
126
|
|
|
126
|
-
def get_job_markdown(self, job_id: str) ->
|
|
127
|
+
def get_job_markdown(self, job_id: str) -> JobMarkdownResponse:
|
|
127
128
|
"""
|
|
128
|
-
Get combined markdown
|
|
129
|
+
Get the URL to the combined markdown file for a completed markdown job.
|
|
129
130
|
|
|
130
131
|
Args:
|
|
131
132
|
job_id (str): The unique identifier of the job
|
|
132
133
|
|
|
133
134
|
Returns:
|
|
134
|
-
|
|
135
|
+
JobMarkdownResponse: Response containing the content_url to the markdown file
|
|
135
136
|
|
|
136
137
|
Raises:
|
|
137
138
|
requests.exceptions.RequestException: If the API request fails
|
|
@@ -139,6 +140,26 @@ class WebCrawlerAPI:
|
|
|
139
140
|
response = self.session.get(
|
|
140
141
|
urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown")
|
|
141
142
|
)
|
|
143
|
+
response.raise_for_status()
|
|
144
|
+
data = response.json()
|
|
145
|
+
return JobMarkdownResponse(content_url=data["content_url"])
|
|
146
|
+
|
|
147
|
+
def get_job_markdown_content(self, job_id: str) -> str:
|
|
148
|
+
"""
|
|
149
|
+
Download the combined markdown content for a completed markdown job.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
job_id (str): The unique identifier of the job
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
str: Combined markdown content as plain text
|
|
156
|
+
|
|
157
|
+
Raises:
|
|
158
|
+
requests.exceptions.RequestException: If the API request fails
|
|
159
|
+
"""
|
|
160
|
+
response = self.session.get(
|
|
161
|
+
urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown/content")
|
|
162
|
+
)
|
|
142
163
|
|
|
143
164
|
if not response.ok:
|
|
144
165
|
try:
|
|
@@ -303,7 +324,7 @@ class WebCrawlerAPI:
|
|
|
303
324
|
f"Job finished with status {job.status}"
|
|
304
325
|
)
|
|
305
326
|
|
|
306
|
-
return self.
|
|
327
|
+
return self.get_job_markdown_content(job.id)
|
|
307
328
|
|
|
308
329
|
def scrape_async(
|
|
309
330
|
self,
|
|
@@ -38,6 +38,13 @@ class CrawlResponse:
|
|
|
38
38
|
id: str
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
@dataclass
|
|
42
|
+
class JobMarkdownResponse:
|
|
43
|
+
"""Response from the get_job_markdown endpoint."""
|
|
44
|
+
|
|
45
|
+
content_url: str
|
|
46
|
+
|
|
47
|
+
|
|
41
48
|
@dataclass
|
|
42
49
|
class ScrapeId:
|
|
43
50
|
"""Response from an asynchronous scrape request."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|