webcrawlerapi 2.0.10__tar.gz → 2.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/PKG-INFO +1 -1
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/setup.py +1 -1
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi/__init__.py +2 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi/client.py +33 -4
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi/models.py +8 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/PKG-INFO +1 -1
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/README.md +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/pyproject.toml +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/setup.cfg +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/tests/__init__.py +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/tests/test_client.py +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/tests/test_models.py +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.10 → webcrawlerapi-2.0.12}/webcrawlerapi.egg-info/top_level.txt +0 -0
|
@@ -28,6 +28,7 @@ from .models import (
|
|
|
28
28
|
CrawlResponse,
|
|
29
29
|
Job,
|
|
30
30
|
JobItem,
|
|
31
|
+
JobMarkdownResponse,
|
|
31
32
|
ScrapeId,
|
|
32
33
|
ScrapeResponse,
|
|
33
34
|
ScrapeResponseError,
|
|
@@ -39,6 +40,7 @@ __all__ = [
|
|
|
39
40
|
"WebCrawlerAPI",
|
|
40
41
|
"Job",
|
|
41
42
|
"JobItem",
|
|
43
|
+
"JobMarkdownResponse",
|
|
42
44
|
"CrawlResponse",
|
|
43
45
|
"ScrapeId",
|
|
44
46
|
"ScrapeResponse",
|
|
@@ -8,6 +8,7 @@ from .models import (
|
|
|
8
8
|
Action,
|
|
9
9
|
CrawlResponse,
|
|
10
10
|
Job,
|
|
11
|
+
JobMarkdownResponse,
|
|
11
12
|
ScrapeId,
|
|
12
13
|
ScrapeResponse,
|
|
13
14
|
ScrapeResponseError,
|
|
@@ -123,15 +124,15 @@ class WebCrawlerAPI:
|
|
|
123
124
|
response.raise_for_status()
|
|
124
125
|
return Job(response.json())
|
|
125
126
|
|
|
126
|
-
def get_job_markdown(self, job_id: str) ->
|
|
127
|
+
def get_job_markdown(self, job_id: str) -> JobMarkdownResponse:
|
|
127
128
|
"""
|
|
128
|
-
Get combined markdown
|
|
129
|
+
Get the URL to the combined markdown file for a completed markdown job.
|
|
129
130
|
|
|
130
131
|
Args:
|
|
131
132
|
job_id (str): The unique identifier of the job
|
|
132
133
|
|
|
133
134
|
Returns:
|
|
134
|
-
|
|
135
|
+
JobMarkdownResponse: Response containing the content_url to the markdown file
|
|
135
136
|
|
|
136
137
|
Raises:
|
|
137
138
|
requests.exceptions.RequestException: If the API request fails
|
|
@@ -139,6 +140,26 @@ class WebCrawlerAPI:
|
|
|
139
140
|
response = self.session.get(
|
|
140
141
|
urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown")
|
|
141
142
|
)
|
|
143
|
+
response.raise_for_status()
|
|
144
|
+
data = response.json()
|
|
145
|
+
return JobMarkdownResponse(content_url=data["content_url"])
|
|
146
|
+
|
|
147
|
+
def get_job_markdown_content(self, job_id: str) -> str:
|
|
148
|
+
"""
|
|
149
|
+
Download the combined markdown content for a completed markdown job.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
job_id (str): The unique identifier of the job
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
str: Combined markdown content as plain text
|
|
156
|
+
|
|
157
|
+
Raises:
|
|
158
|
+
requests.exceptions.RequestException: If the API request fails
|
|
159
|
+
"""
|
|
160
|
+
response = self.session.get(
|
|
161
|
+
urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown/content")
|
|
162
|
+
)
|
|
142
163
|
|
|
143
164
|
if not response.ok:
|
|
144
165
|
try:
|
|
@@ -303,7 +324,7 @@ class WebCrawlerAPI:
|
|
|
303
324
|
f"Job finished with status {job.status}"
|
|
304
325
|
)
|
|
305
326
|
|
|
306
|
-
return self.
|
|
327
|
+
return self.get_job_markdown_content(job.id)
|
|
307
328
|
|
|
308
329
|
def scrape_async(
|
|
309
330
|
self,
|
|
@@ -312,6 +333,7 @@ class WebCrawlerAPI:
|
|
|
312
333
|
webhook_url: Optional[str] = None,
|
|
313
334
|
clean_selectors: Optional[str] = None,
|
|
314
335
|
prompt: Optional[str] = None,
|
|
336
|
+
response_schema: Optional[Dict[str, Any]] = None,
|
|
315
337
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
316
338
|
respect_robots_txt: bool = False,
|
|
317
339
|
main_content_only: bool = False,
|
|
@@ -326,6 +348,7 @@ class WebCrawlerAPI:
|
|
|
326
348
|
webhook_url (str, optional): URL to receive a POST request when scraping is complete
|
|
327
349
|
clean_selectors (str, optional): CSS selectors to clean from the content
|
|
328
350
|
prompt (str, optional): Prompt to guide the AI response
|
|
351
|
+
response_schema (dict, optional): JSON Schema for structured output format. Works with the prompt parameter.
|
|
329
352
|
actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
|
|
330
353
|
respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
|
|
331
354
|
main_content_only (bool): Whether to extract only main content (default: False)
|
|
@@ -350,6 +373,8 @@ class WebCrawlerAPI:
|
|
|
350
373
|
payload["clean_selectors"] = clean_selectors
|
|
351
374
|
if prompt:
|
|
352
375
|
payload["prompt"] = prompt
|
|
376
|
+
if response_schema is not None:
|
|
377
|
+
payload["response_schema"] = response_schema
|
|
353
378
|
if max_age is not None:
|
|
354
379
|
payload["max_age"] = max_age
|
|
355
380
|
if actions:
|
|
@@ -410,6 +435,7 @@ class WebCrawlerAPI:
|
|
|
410
435
|
page_status_code=response_data.get("page_status_code", 0),
|
|
411
436
|
page_title=response_data.get("page_title"),
|
|
412
437
|
structured_data=response_data.get("structured_data"),
|
|
438
|
+
links=response_data.get("links"),
|
|
413
439
|
)
|
|
414
440
|
elif status == "error":
|
|
415
441
|
return ScrapeResponseError(
|
|
@@ -428,6 +454,7 @@ class WebCrawlerAPI:
|
|
|
428
454
|
webhook_url: Optional[str] = None,
|
|
429
455
|
clean_selectors: Optional[str] = None,
|
|
430
456
|
prompt: Optional[str] = None,
|
|
457
|
+
response_schema: Optional[Dict[str, Any]] = None,
|
|
431
458
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
432
459
|
respect_robots_txt: bool = False,
|
|
433
460
|
main_content_only: bool = False,
|
|
@@ -447,6 +474,7 @@ class WebCrawlerAPI:
|
|
|
447
474
|
webhook_url (str, optional): URL to receive a POST request when scraping is complete
|
|
448
475
|
clean_selectors (str, optional): CSS selectors to clean from the content
|
|
449
476
|
prompt (str, optional): Prompt to guide the AI response
|
|
477
|
+
response_schema (dict, optional): JSON Schema for structured output format. Works with the prompt parameter.
|
|
450
478
|
actions (Action or List[Action], optional): Actions to perform during scraping
|
|
451
479
|
respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
|
|
452
480
|
main_content_only (bool): Whether to extract only main content (default: False)
|
|
@@ -466,6 +494,7 @@ class WebCrawlerAPI:
|
|
|
466
494
|
webhook_url=webhook_url,
|
|
467
495
|
clean_selectors=clean_selectors,
|
|
468
496
|
prompt=prompt,
|
|
497
|
+
response_schema=response_schema,
|
|
469
498
|
actions=actions,
|
|
470
499
|
respect_robots_txt=respect_robots_txt,
|
|
471
500
|
main_content_only=main_content_only,
|
|
@@ -38,6 +38,13 @@ class CrawlResponse:
|
|
|
38
38
|
id: str
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
@dataclass
|
|
42
|
+
class JobMarkdownResponse:
|
|
43
|
+
"""Response from the get_job_markdown endpoint."""
|
|
44
|
+
|
|
45
|
+
content_url: str
|
|
46
|
+
|
|
47
|
+
|
|
41
48
|
@dataclass
|
|
42
49
|
class ScrapeId:
|
|
43
50
|
"""Response from an asynchronous scrape request."""
|
|
@@ -57,6 +64,7 @@ class ScrapeResponse:
|
|
|
57
64
|
page_status_code: int = 0
|
|
58
65
|
page_title: Optional[str] = None
|
|
59
66
|
structured_data: Optional[Dict[str, Any]] = None
|
|
67
|
+
links: Optional[List[str]] = None
|
|
60
68
|
|
|
61
69
|
|
|
62
70
|
@dataclass
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|