crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_logging_utils.py
CHANGED
@@ -49,7 +49,11 @@ def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:

 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+    elif 'playwright._impl._errors.Error' in str(error.__class__):
+        # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
+        # point to deep internals.
+        return ''
     else:
         traceback_parts = _get_traceback_parts_for_innermost_exception(error)
         # Commonly last traceback part is type of the error, and the second last part is the relevant file.
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

crawlee/crawlers/_parsel/_parsel_crawler.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto

     def __init__(
         self,
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

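Both crawlers now type their `**kwargs` with `HttpCrawlerOptions`, so existing call sites keep working unchanged. A minimal, hedged sketch of such a call site; the option names (`max_requests_per_crawl`, the router/handler pattern) come from the existing `BasicCrawler` API, not from this diff, and the URL is illustrative:

```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # These keyword arguments are what the `Unpack[HttpCrawlerOptions[...]]`
    # annotation above now types; they are forwarded to the underlying crawler.
    crawler = BeautifulSoupCrawler(parser='lxml', max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Page title: {context.soup.title}')

    await crawler.run(['https://example.com/'])


if __name__ == '__main__':
    asyncio.run(main())
```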
crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED
@@ -3,18 +3,25 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal

+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import (
+    BasicCrawlingContext,
+    ConcurrencySettings,
+)
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -28,6 +35,7 @@ from crawlee.statistics import StatisticsState
 from ._playwright_crawling_context import PlaywrightCrawlingContext
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+from ._types import GotoOptions
 from ._utils import block_requests, infinite_scroll

 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -43,7 +51,6 @@ if TYPE_CHECKING:

     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -102,9 +109,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         user_data_dir: str | Path | None = None,
         browser_launch_options: Mapping[str, Any] | None = None,
         browser_new_context_options: Mapping[str, Any] | None = None,
+        goto_options: GotoOptions | None = None,
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -113,7 +122,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -130,12 +142,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
+            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                not supported, use `navigation_timeout` instead.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)

+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -152,7 +170,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             ):
                 raise ValueError(
                     'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                     '`fingerprint_generator` arguments when `browser_pool` is provided.'
                 )

@@ -194,6 +212,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+        self._goto_options = goto_options or GotoOptions()
+
         super().__init__(**kwargs)

     async def _open_page(
@@ -218,12 +243,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             log=context.log,
             page=crawlee_page.page,
             block_requests=partial(block_requests, page=crawlee_page.page),
+            goto_options=GotoOptions(**self._goto_options),
         )

-
-
-
-
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+            yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     def _prepare_request_interceptor(
         self,
@@ -258,6 +292,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.

         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -289,7 +324,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
            # Set route_handler only for current request
            await context.page.route(context.request.url, route_handler)

-
+            try:
+                async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                    response = await context.page.goto(
+                        context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                    )
+            except playwright.async_api.TimeoutError as exc:
+                raise asyncio.TimeoutError from exc

            if response is None:
                raise SessionError(f'Failed to load the URL: {context.request.url}')
@@ -316,6 +357,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
                extract_links=extract_links,
                enqueue_links=self._create_enqueue_links_function(context, extract_links),
                block_requests=partial(block_requests, page=context.page),
+                goto_options=context.goto_options,
            )

            if context.session:
@@ -361,7 +403,12 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
            links_iterator: Iterator[str] = iter(
                [url for element in elements if (url := await element.get_attribute('href')) is not None]
            )
-
+
+            # Get base URL from <base> tag if present
+            extracted_base_url = await context.page.evaluate('document.baseURI')
+            base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

            if robots_txt_file:
                skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -489,7 +536,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""

     browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -509,9 +558,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):


 class PlaywrightCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _PlaywrightCrawlerAdditionalOptions,
     BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `AbstractHttpCrawler` constructor.

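Taken together, the new constructor knobs look roughly like this at a call site. This is a hedged sketch based only on the signatures in the diff above; the handler body and URL are illustrative, and the router/run pattern is the existing `BasicCrawler` API:

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Shared budget covering pre-navigation hooks plus the page.goto() call;
        # per the diff it defaults to one minute when not provided.
        navigation_timeout=timedelta(seconds=30),
        # If concurrency_settings is omitted, the crawler now defaults to
        # ConcurrencySettings(desired_concurrency=1).
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Loaded {context.request.url}')

    await crawler.run(['https://example.com/'])


if __name__ == '__main__':
    asyncio.run(main())
```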
crawlee/crawlers/_playwright/_playwright_http_client.py
CHANGED
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')

@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):

         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url,
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )

         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group

 if TYPE_CHECKING:
     from playwright.async_api import Page
-    from ._types import BlockRequestsFunction
+    from ._types import BlockRequestsFunction, GotoOptions


 @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
         html = None
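Because the context-level `goto_options` is a per-request copy (see `GotoOptions(**self._goto_options)` in the crawler diff), a pre-navigation hook can adjust it for individual requests without touching the crawler-wide defaults. A hedged sketch, assuming the decorator-style `pre_navigation_hook` registration already exposed by the crawler:

```python
from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

crawler = PlaywrightCrawler()


@crawler.pre_navigation_hook
async def tune_navigation(context: PlaywrightPreNavCrawlingContext) -> None:
    # Keys set here are forwarded to page.goto() for this request only;
    # the crawler-level defaults stay untouched.
    context.goto_options['wait_until'] = 'domcontentloaded'
```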
crawlee/crawlers/_playwright/_types.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict

 from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncGenerator

     from playwright.async_api import APIResponse, Response
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Self


 @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
         _content = await response.body()

         return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+class GotoOptions(TypedDict):
+    """Keyword arguments for Playwright's `Page.goto()` method."""
+
+    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+    """When to consider operation succeeded, defaults to 'load' event."""
+
+    referer: NotRequired[str]
+    """Referer header value."""
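Since `GotoOptions` is a `TypedDict`, a plain dict literal with these keys satisfies the new `goto_options` parameters. The values below are illustrative and assume the `PlaywrightCrawler` constructor shown earlier in this diff:

```python
from crawlee.crawlers import PlaywrightCrawler

# Only wait_until and referer are defined on GotoOptions; timeout is deliberately
# excluded, so use navigation_timeout on the crawler instead.
crawler = PlaywrightCrawler(
    goto_options={'wait_until': 'networkidle', 'referer': 'https://example.com/'},
)
```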
crawlee/events/_event_manager.py
CHANGED
@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False

     @overload
@@ -172,11 +174,9 @@ class EventManager:
            # to avoid blocking the event loop
            coro = (
                listener(*bound_args.args, **bound_args.kwargs)
-                if
+                if inspect.iscoroutinefunction(listener)
                else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
            )
-            # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-            # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.

            listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')
            self._listener_tasks.add(listener_task)
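The dispatch logic above boils down to: coroutine listeners are awaited on the event loop, plain callables are offloaded to a worker thread. A standalone sketch of that branch (not crawlee code, just the same pattern reduced to a runnable example):

```python
import asyncio
import inspect
from collections.abc import Callable
from typing import Any


async def dispatch(listener: Callable[..., Any], *args: Any) -> None:
    # Mirror of the branch above: coroutine functions are awaited directly,
    # synchronous listeners run in a thread so they do not block the loop.
    if inspect.iscoroutinefunction(listener):
        await listener(*args)
    else:
        await asyncio.to_thread(listener, *args)


async def main() -> None:
    await dispatch(print, 'sync listener ran in a worker thread')

    async def async_listener(msg: str) -> None:
        print(msg)

    await dispatch(async_listener, 'async listener awaited directly')


asyncio.run(main())
```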
crawlee/events/_types.py
CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
 class EventPersistStateData(BaseModel):
     """Data for the persist state event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     is_migrating: Annotated[bool, Field(alias='isMigrating')]

@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
 class EventSystemInfoData(BaseModel):
     """Data for the system info event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
     memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
 class EventMigratingData(BaseModel):
     """Data for the migrating event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     # The remaining time in seconds before the migration is forced and the process is killed
     # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
 class EventAbortingData(BaseModel):
     """Data for the aborting event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventExitData(BaseModel):
     """Data for the exit event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventCrawlerStatusData(BaseModel):
     """Data for the crawler status event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     message: str
     """A message describing the current status of the crawler."""
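The repeated `ConfigDict(validate_by_name=True, validate_by_alias=True)` lets these event models accept either the Python field name or the camelCase alias on input. A small illustration of that behavior (a standalone model, not the crawlee one; it assumes pydantic 2.11 or newer, which introduced these two config options):

```python
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class PersistState(BaseModel):
    # Same configuration the event models above switch to.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    is_migrating: Annotated[bool, Field(alias='isMigrating')]


# Both the Python field name and the camelCase alias are accepted on input.
assert PersistState(is_migrating=True).is_migrating
assert PersistState.model_validate({'isMigrating': True}).is_migrating
```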
crawlee/fingerprint_suite/_fingerprint_generator.py
CHANGED
@@ -3,10 +3,13 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING

+from crawlee._utils.docs import docs_group
+
 if TYPE_CHECKING:
     from browserforge.fingerprints import Fingerprint


+@docs_group('Other')
 class FingerprintGenerator(ABC):
     """A class for creating browser fingerprints that mimic browser fingerprints of real users."""

crawlee/fingerprint_suite/_header_generator.py
CHANGED
@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/fingerprint_suite/_types.py
CHANGED
@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


 class ScreenOptions(BaseModel):
-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     """Defines the screen constrains for the fingerprint generator."""

@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
 class HeaderGeneratorOptions(BaseModel):
     """Collection of header related attributes that can be used by the fingerprint generator."""

-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     browsers: list[SupportedBrowserType] | None = None
     """List of BrowserSpecifications to generate the headers for."""
crawlee/http_clients/_base.py
CHANGED
@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.

@@ -114,6 +115,7 @@ class HttpClient(ABC):
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@ class HttpClient(ABC):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.

@@ -144,6 +147,7 @@ class HttpClient(ABC):
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
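All `HttpClient` implementations now take an optional per-request `timeout` as a `timedelta`, and (per the client diffs below) translate their library-specific timeout exceptions into `asyncio.TimeoutError`. A hedged usage sketch against one of the concrete clients; the URL and timeout value are illustrative:

```python
import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    client = HttpxHttpClient()

    try:
        response = await client.send_request(
            'https://example.com/',
            method='GET',
            timeout=timedelta(seconds=5),
        )
        print(response.status_code)
    except asyncio.TimeoutError:
        # Per this diff, httpx.TimeoutException, curl_cffi's Timeout and Playwright's
        # TimeoutError are all re-raised as asyncio.TimeoutError.
        print('request timed out')


asyncio.run(main())
```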
crawlee/http_clients/_curl_impersonate.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING, Any

@@ -10,6 +11,7 @@ from curl_cffi.requests.cookies import Cookies as CurlCookies
 from curl_cffi.requests.cookies import CurlMorsel
 from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
 from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+from curl_cffi.requests.exceptions import Timeout
 from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 from typing_extensions import override

@@ -147,6 +149,7 @@ class CurlImpersonateHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -157,7 +160,10 @@ class CurlImpersonateHttpClient(HttpClient):
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -186,6 +192,7 @@ class CurlImpersonateHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -200,7 +207,10 @@ class CurlImpersonateHttpClient(HttpClient):
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -241,6 +251,8 @@ class CurlImpersonateHttpClient(HttpClient):
                 stream=True,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
crawlee/http_clients/_httpx.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@ class HttpxHttpClient(HttpClient):
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -185,6 +190,7 @@ class HttpxHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -195,10 +201,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -228,10 +237,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )

-
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _HttpxResponse(response)
@@ -246,7 +258,7 @@ class HttpxHttpClient(HttpClient):
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout:
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@ class HttpxHttpClient(HttpClient):

         headers = self._combine_headers(headers)

-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )

     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
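The two `httpx.Timeout` constructions above are deliberately different: full requests get a single budget, while streamed downloads only bound the connect phase. A small illustration of the underlying httpx API (plain httpx, not crawlee code; the ten-second value is illustrative):

```python
from datetime import timedelta

import httpx

limit = timedelta(seconds=10)

# Non-streaming requests: one value applies to connect, read, write and pool acquisition.
full_timeout = httpx.Timeout(limit.total_seconds())

# Streaming requests: only the connect phase is bounded; body reads are not,
# so a slow but steady download is not cut off mid-stream.
stream_timeout = httpx.Timeout(None, connect=limit.total_seconds())

print(full_timeout, stream_timeout)
```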