webcrawlerapi 2.0.6__tar.gz → 2.0.7__tar.gz
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/PKG-INFO +1 -1
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/pyproject.toml +1 -1
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/setup.py +1 -1
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi/client.py +14 -2
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi.egg-info/PKG-INFO +1 -1
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/README.md +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/setup.cfg +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/tests/__init__.py +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/tests/test_client.py +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/tests/test_models.py +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi/__init__.py +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi/models.py +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.6 → webcrawlerapi-2.0.7}/webcrawlerapi.egg-info/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import time
|
|
2
|
-
from typing import Any, Dict, List, Optional, Union
|
|
2
|
+
from typing import Any, Dict, List, Optional, Union, cast
|
|
3
3
|
from urllib.parse import urljoin
|
|
4
4
|
|
|
5
5
|
import requests
|
|
@@ -49,6 +49,7 @@ class WebCrawlerAPI:
|
|
|
49
49
|
blacklist_regexp: Optional[str] = None,
|
|
50
50
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
51
51
|
respect_robots_txt: bool = False,
|
|
52
|
+
main_content_only: bool = False,
|
|
52
53
|
) -> CrawlResponse:
|
|
53
54
|
"""
|
|
54
55
|
Start a new crawling job asynchronously.
|
|
@@ -63,6 +64,7 @@ class WebCrawlerAPI:
|
|
|
63
64
|
blacklist_regexp (str, optional): Regex pattern for URL blacklist
|
|
64
65
|
actions (Action or List[Action], optional): Actions to perform during crawling
|
|
65
66
|
respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
|
|
67
|
+
main_content_only (bool): Whether to extract only main content (default: False)
|
|
66
68
|
|
|
67
69
|
Returns:
|
|
68
70
|
CrawlResponse: Response containing the job ID
|
|
@@ -76,6 +78,7 @@ class WebCrawlerAPI:
|
|
|
76
78
|
"items_limit": items_limit,
|
|
77
79
|
"allow_subdomains": allow_subdomains,
|
|
78
80
|
"respect_robots_txt": respect_robots_txt,
|
|
81
|
+
"main_content_only": main_content_only,
|
|
79
82
|
}
|
|
80
83
|
|
|
81
84
|
if webhook_url:
|
|
@@ -133,7 +136,7 @@ class WebCrawlerAPI:
|
|
|
133
136
|
urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/cancel")
|
|
134
137
|
)
|
|
135
138
|
response.raise_for_status()
|
|
136
|
-
return response.json()
|
|
139
|
+
return cast(Dict[str, str], response.json())
|
|
137
140
|
|
|
138
141
|
def crawl(
|
|
139
142
|
self,
|
|
@@ -146,6 +149,7 @@ class WebCrawlerAPI:
|
|
|
146
149
|
blacklist_regexp: Optional[str] = None,
|
|
147
150
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
148
151
|
respect_robots_txt: bool = False,
|
|
152
|
+
main_content_only: bool = False,
|
|
149
153
|
max_polls: int = 100,
|
|
150
154
|
) -> Job:
|
|
151
155
|
"""
|
|
@@ -165,6 +169,7 @@ class WebCrawlerAPI:
|
|
|
165
169
|
blacklist_regexp (str, optional): Regex pattern for URL blacklist
|
|
166
170
|
actions (Action or List[Action], optional): Actions to perform during crawling
|
|
167
171
|
respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
|
|
172
|
+
main_content_only (bool): Whether to extract only main content (default: False)
|
|
168
173
|
max_polls (int): Maximum number of status checks before returning (default: 100)
|
|
169
174
|
|
|
170
175
|
Returns:
|
|
@@ -184,6 +189,7 @@ class WebCrawlerAPI:
|
|
|
184
189
|
blacklist_regexp=blacklist_regexp,
|
|
185
190
|
actions=actions,
|
|
186
191
|
respect_robots_txt=respect_robots_txt,
|
|
192
|
+
main_content_only=main_content_only,
|
|
187
193
|
)
|
|
188
194
|
|
|
189
195
|
job_id = response.id
|
|
@@ -218,6 +224,7 @@ class WebCrawlerAPI:
|
|
|
218
224
|
prompt: Optional[str] = None,
|
|
219
225
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
220
226
|
respect_robots_txt: bool = False,
|
|
227
|
+
main_content_only: bool = False,
|
|
221
228
|
) -> ScrapeId:
|
|
222
229
|
"""
|
|
223
230
|
Start a new scraping job asynchronously.
|
|
@@ -230,6 +237,7 @@ class WebCrawlerAPI:
|
|
|
230
237
|
prompt (str, optional): Prompt to guide the AI response
|
|
231
238
|
actions (Action or List[Action], optional): Actions to perform after scraping (for example S3 upload)
|
|
232
239
|
respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
|
|
240
|
+
main_content_only (bool): Whether to extract only main content (default: False)
|
|
233
241
|
|
|
234
242
|
Returns:
|
|
235
243
|
ScrapeId: Response containing the scrape job ID
|
|
@@ -241,6 +249,7 @@ class WebCrawlerAPI:
|
|
|
241
249
|
"url": url,
|
|
242
250
|
"output_format": output_format,
|
|
243
251
|
"respect_robots_txt": respect_robots_txt,
|
|
252
|
+
"main_content_only": main_content_only,
|
|
244
253
|
}
|
|
245
254
|
|
|
246
255
|
if webhook_url:
|
|
@@ -327,6 +336,7 @@ class WebCrawlerAPI:
|
|
|
327
336
|
prompt: Optional[str] = None,
|
|
328
337
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
329
338
|
respect_robots_txt: bool = False,
|
|
339
|
+
main_content_only: bool = False,
|
|
330
340
|
max_polls: int = 100,
|
|
331
341
|
) -> Union[ScrapeResponse, ScrapeResponseError]:
|
|
332
342
|
"""
|
|
@@ -344,6 +354,7 @@ class WebCrawlerAPI:
|
|
|
344
354
|
prompt (str, optional): Prompt to guide the AI response
|
|
345
355
|
actions (Action or List[Action], optional): Actions to perform during scraping
|
|
346
356
|
respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
|
|
357
|
+
main_content_only (bool): Whether to extract only main content (default: False)
|
|
347
358
|
max_polls (int): Maximum number of status checks before returning (default: 100)
|
|
348
359
|
|
|
349
360
|
Returns:
|
|
@@ -361,6 +372,7 @@ class WebCrawlerAPI:
|
|
|
361
372
|
prompt=prompt,
|
|
362
373
|
actions=actions,
|
|
363
374
|
respect_robots_txt=respect_robots_txt,
|
|
375
|
+
main_content_only=main_content_only,
|
|
364
376
|
)
|
|
365
377
|
|
|
366
378
|
scrape_id = response.id
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|