webcrawlerapi 2.0.10__tar.gz → 2.0.12__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.10
3
+ Version: 2.0.12
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="webcrawlerapi",
5
- version="2.0.10",
5
+ version="2.0.12",
6
6
  packages=find_packages(),
7
7
  install_requires=[
8
8
  "requests>=2.25.0",
@@ -28,6 +28,7 @@ from .models import (
28
28
  CrawlResponse,
29
29
  Job,
30
30
  JobItem,
31
+ JobMarkdownResponse,
31
32
  ScrapeId,
32
33
  ScrapeResponse,
33
34
  ScrapeResponseError,
@@ -39,6 +40,7 @@ __all__ = [
39
40
  "WebCrawlerAPI",
40
41
  "Job",
41
42
  "JobItem",
43
+ "JobMarkdownResponse",
42
44
  "CrawlResponse",
43
45
  "ScrapeId",
44
46
  "ScrapeResponse",
@@ -8,6 +8,7 @@ from .models import (
8
8
  Action,
9
9
  CrawlResponse,
10
10
  Job,
11
+ JobMarkdownResponse,
11
12
  ScrapeId,
12
13
  ScrapeResponse,
13
14
  ScrapeResponseError,
@@ -123,15 +124,15 @@ class WebCrawlerAPI:
123
124
  response.raise_for_status()
124
125
  return Job(response.json())
125
126
 
126
- def get_job_markdown(self, job_id: str) -> str:
127
+ def get_job_markdown(self, job_id: str) -> JobMarkdownResponse:
127
128
  """
128
- Get combined markdown content for a completed markdown job.
129
+ Get the URL to the combined markdown file for a completed markdown job.
129
130
 
130
131
  Args:
131
132
  job_id (str): The unique identifier of the job
132
133
 
133
134
  Returns:
134
- str: Combined markdown content
135
+ JobMarkdownResponse: Response containing the content_url to the markdown file
135
136
 
136
137
  Raises:
137
138
  requests.exceptions.RequestException: If the API request fails
@@ -139,6 +140,26 @@ class WebCrawlerAPI:
139
140
  response = self.session.get(
140
141
  urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown")
141
142
  )
143
+ response.raise_for_status()
144
+ data = response.json()
145
+ return JobMarkdownResponse(content_url=data["content_url"])
146
+
147
+ def get_job_markdown_content(self, job_id: str) -> str:
148
+ """
149
+ Download the combined markdown content for a completed markdown job.
150
+
151
+ Args:
152
+ job_id (str): The unique identifier of the job
153
+
154
+ Returns:
155
+ str: Combined markdown content as plain text
156
+
157
+ Raises:
158
+ requests.exceptions.RequestException: If the API request fails
159
+ """
160
+ response = self.session.get(
161
+ urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/markdown/content")
162
+ )
142
163
 
143
164
  if not response.ok:
144
165
  try:
@@ -303,7 +324,7 @@ class WebCrawlerAPI:
303
324
  f"Job finished with status {job.status}"
304
325
  )
305
326
 
306
- return self.get_job_markdown(job.id)
327
+ return self.get_job_markdown_content(job.id)
307
328
 
308
329
  def scrape_async(
309
330
  self,
@@ -312,6 +333,7 @@ class WebCrawlerAPI:
312
333
  webhook_url: Optional[str] = None,
313
334
  clean_selectors: Optional[str] = None,
314
335
  prompt: Optional[str] = None,
336
+ response_schema: Optional[Dict[str, Any]] = None,
315
337
  actions: Optional[Union[Action, List[Action]]] = None,
316
338
  respect_robots_txt: bool = False,
317
339
  main_content_only: bool = False,
@@ -326,6 +348,7 @@ class WebCrawlerAPI:
326
348
  webhook_url (str, optional): URL to receive a POST request when scraping is complete
327
349
  clean_selectors (str, optional): CSS selectors to clean from the content
328
350
  prompt (str, optional): Prompt to guide the AI response
351
+ response_schema (dict, optional): JSON Schema for structured output format. Works with the prompt parameter.
329
352
  actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
330
353
  respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
331
354
  main_content_only (bool): Whether to extract only main content (default: False)
@@ -350,6 +373,8 @@ class WebCrawlerAPI:
350
373
  payload["clean_selectors"] = clean_selectors
351
374
  if prompt:
352
375
  payload["prompt"] = prompt
376
+ if response_schema is not None:
377
+ payload["response_schema"] = response_schema
353
378
  if max_age is not None:
354
379
  payload["max_age"] = max_age
355
380
  if actions:
@@ -410,6 +435,7 @@ class WebCrawlerAPI:
410
435
  page_status_code=response_data.get("page_status_code", 0),
411
436
  page_title=response_data.get("page_title"),
412
437
  structured_data=response_data.get("structured_data"),
438
+ links=response_data.get("links"),
413
439
  )
414
440
  elif status == "error":
415
441
  return ScrapeResponseError(
@@ -428,6 +454,7 @@ class WebCrawlerAPI:
428
454
  webhook_url: Optional[str] = None,
429
455
  clean_selectors: Optional[str] = None,
430
456
  prompt: Optional[str] = None,
457
+ response_schema: Optional[Dict[str, Any]] = None,
431
458
  actions: Optional[Union[Action, List[Action]]] = None,
432
459
  respect_robots_txt: bool = False,
433
460
  main_content_only: bool = False,
@@ -447,6 +474,7 @@ class WebCrawlerAPI:
447
474
  webhook_url (str, optional): URL to receive a POST request when scraping is complete
448
475
  clean_selectors (str, optional): CSS selectors to clean from the content
449
476
  prompt (str, optional): Prompt to guide the AI response
477
+ response_schema (dict, optional): JSON Schema for structured output format. Works with the prompt parameter.
450
478
  actions (Action or List[Action], optional): Actions to perform during scraping
451
479
  respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
452
480
  main_content_only (bool): Whether to extract only main content (default: False)
@@ -466,6 +494,7 @@ class WebCrawlerAPI:
466
494
  webhook_url=webhook_url,
467
495
  clean_selectors=clean_selectors,
468
496
  prompt=prompt,
497
+ response_schema=response_schema,
469
498
  actions=actions,
470
499
  respect_robots_txt=respect_robots_txt,
471
500
  main_content_only=main_content_only,
@@ -38,6 +38,13 @@ class CrawlResponse:
38
38
  id: str
39
39
 
40
40
 
41
+ @dataclass
42
+ class JobMarkdownResponse:
43
+ """Response from the get_job_markdown endpoint."""
44
+
45
+ content_url: str
46
+
47
+
41
48
  @dataclass
42
49
  class ScrapeId:
43
50
  """Response from an asynchronous scrape request."""
@@ -57,6 +64,7 @@ class ScrapeResponse:
57
64
  page_status_code: int = 0
58
65
  page_title: Optional[str] = None
59
66
  structured_data: Optional[Dict[str, Any]] = None
67
+ links: Optional[List[str]] = None
60
68
 
61
69
 
62
70
  @dataclass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.10
3
+ Version: 2.0.12
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
File without changes
File without changes