crawlee-1.0.5b18-py3-none-any.whl → crawlee-1.2.2b24-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recurring_task.py +12 -3
- crawlee/_utils/sitemap.py +12 -5
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/browsers/_browser_pool.py +1 -1
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +138 -124
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +1 -3
- crawlee/request_loaders/_sitemap_request_loader.py +18 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +2 -21
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
- crawlee/storage_clients/_redis/_client_mixin.py +1 -4
- crawlee/storage_clients/_redis/_dataset_client.py +6 -2
- crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
- crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
- crawlee/storage_clients/_redis/_storage_client.py +12 -9
- crawlee/storage_clients/_redis/_utils.py +1 -1
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
@@ -3,19 +3,22 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal

+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
-from crawlee._request import Request, RequestOptions
-from crawlee._types import ConcurrencySettings
+from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -29,6 +32,7 @@ from crawlee.statistics import StatisticsState
 from ._playwright_crawling_context import PlaywrightCrawlingContext
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+from ._types import GotoOptions
 from ._utils import block_requests, infinite_scroll

 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -44,7 +48,6 @@ if TYPE_CHECKING:

     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -103,9 +106,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         user_data_dir: str | Path | None = None,
         browser_launch_options: Mapping[str, Any] | None = None,
         browser_new_context_options: Mapping[str, Any] | None = None,
+        goto_options: GotoOptions | None = None,
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -134,12 +139,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
+            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                not supported, use `navigation_timeout` instead.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)

+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -163,10 +174,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
         else:
             if fingerprint_generator == 'default':
-
-
-
-                generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
+                generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
+                    [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
+                )

                 fingerprint_generator = DefaultFingerprintGenerator(
                     header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
@@ -202,6 +212,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+        self._goto_options = goto_options or GotoOptions()
+
         super().__init__(**kwargs)

     async def _open_page(
@@ -226,12 +239,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             log=context.log,
             page=crawlee_page.page,
             block_requests=partial(block_requests, page=crawlee_page.page),
+            goto_options=GotoOptions(**self._goto_options),
         )

-
-
-
-
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+                yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     def _prepare_request_interceptor(
         self,
@@ -266,6 +288,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.

         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -297,7 +320,14 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             # Set route_handler only for current request
             await context.page.route(context.request.url, route_handler)

-
+            try:
+                async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                    response = await context.page.goto(
+                        context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                    )
+                    context.request.state = RequestState.AFTER_NAV
+            except playwright.async_api.TimeoutError as exc:
+                raise asyncio.TimeoutError from exc

             if response is None:
                 raise SessionError(f'Failed to load the URL: {context.request.url}')
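
`SharedTimeout` comes from the new `crawlee/_utils/time.py` module listed at the top of this diff, but its implementation is not part of this section. Judging only from how it is used here (built from a `timedelta`, entered repeatedly as an async context manager, and yielding the remaining budget that the crawler forwards to `page.goto`), a helper of this kind could look roughly like the sketch below. The class name and attributes are assumptions, and the real helper presumably also enforces the deadline on the wrapped block rather than merely tracking it.

import time
from datetime import timedelta


class SharedTimeoutSketch:
    """Hypothetical stand-in for SharedTimeout: one countdown shared by several `async with` blocks."""

    def __init__(self, timeout: timedelta) -> None:
        self._remaining = timeout

    async def __aenter__(self) -> timedelta:
        # Hand the caller whatever is left of the budget (the crawler passes it to page.goto).
        self._entered_at = time.monotonic()
        return self._remaining

    async def __aexit__(self, *_: object) -> None:
        # Whatever the wrapped block consumed is subtracted from the shared budget.
        elapsed = timedelta(seconds=time.monotonic() - self._entered_at)
        self._remaining = max(timedelta(0), self._remaining - elapsed)
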
@@ -324,6 +354,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
                 extract_links=extract_links,
                 enqueue_links=self._create_enqueue_links_function(context, extract_links),
                 block_requests=partial(block_requests, page=context.page),
+                goto_options=context.goto_options,
             )

             if context.session:
@@ -364,14 +395,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

             kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')

             elements = await context.page.query_selector_all(selector)
             links_iterator: Iterator[str] = iter(
                 [url for element in elements if (url := await element.get_attribute('href')) is not None]
             )
-
-
-            )
+
+            # Get base URL from <base> tag if present
+            extracted_base_url = await context.page.evaluate('document.baseURI')
+            base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -379,17 +414,19 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
                 skipped = iter([])

             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )

                 if transform_request_function:
-
-                    if
+                    transform_request_options = transform_request_function(request_options)
+                    if transform_request_options == 'skip':
                         continue
-                    if
-
+                    if transform_request_options != 'unchanged':
+                        request_options = transform_request_options

                 try:
-                    request = Request.from_url(**
+                    request = Request.from_url(**request_options)
                 except ValidationError as exc:
                     context.log.debug(
                         f'Skipping URL "{url}" due to invalid format: {exc}. '
@@ -475,7 +512,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

     async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
         """Update the cookies in the page context."""
-
+        # False positive ty error, see https://github.com/astral-sh/ty/issues/1493.
+        await page.context.add_cookies([{**cookie} for cookie in cookies])  # ty: ignore[invalid-argument-type]

     async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
         """Find the robots.txt file for a given URL.
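
The rewritten enqueue loop above spells out the `transform_request_function` contract: it receives a `RequestOptions` mapping and returns either a modified `RequestOptions`, the string 'skip', or the string 'unchanged'. A small, hypothetical transformer illustrating that contract (the URL patterns and label are made up; the imports mirror the ones used in this file):

from crawlee import RequestTransformAction
from crawlee._request import RequestOptions


def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop links we never want to enqueue.
    if '/login' in request_options['url']:
        return 'skip'

    # Route product detail pages to a dedicated handler, keep everything else untouched.
    if '/product/' in request_options['url']:
        request_options['label'] = 'PRODUCT'
        return request_options

    return 'unchanged'

Such a function is typically passed from a request handler via `enqueue_links(transform_request_function=...)`.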

crawlee/crawlers/_playwright/_playwright_http_client.py
CHANGED

@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')

@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):

         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url,
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )

         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')

crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
CHANGED

@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
 if TYPE_CHECKING:
     from playwright.async_api import Page

-    from ._types import BlockRequestsFunction
+    from ._types import BlockRequestsFunction, GotoOptions


 @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
         html = None

crawlee/crawlers/_playwright/_types.py
CHANGED

@@ -1,7 +1,7 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict

 from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncGenerator

     from playwright.async_api import APIResponse, Response
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Self


 @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
         _content = await response.body()

         return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+class GotoOptions(TypedDict):
+    """Keyword arguments for Playwright's `Page.goto()` method."""
+
+    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+    """When to consider operation succeeded, defaults to 'load' event."""
+
+    referer: NotRequired[str]
+    """Referer header value."""
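
Because `GotoOptions` is a plain `TypedDict`, callers can build it by keyword construction or as a dict literal; the crawler copies it into each pre-navigation context via `GotoOptions(**self._goto_options)` above. A small illustration follows; the import path mirrors the module shown in this diff, though a public re-export may also exist elsewhere in the package, and the values are examples only.

from crawlee.crawlers._playwright._types import GotoOptions

# Wait only for DOMContentLoaded and send an explicit Referer header on every navigation.
goto_options = GotoOptions(wait_until='domcontentloaded', referer='https://example.com/')

# Equivalent dict literal, type-checkable against the same TypedDict.
goto_options_literal: GotoOptions = {'wait_until': 'domcontentloaded'}
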
crawlee/errors.py
CHANGED
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
     """Wraps an exception thrown from an user-defined error handler."""


+class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+    """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
 @docs_group('Errors')
 class SessionError(Exception):
     """Errors of `SessionError` type will trigger a session rotation.
crawlee/events/_event_manager.py
CHANGED
@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False

     @overload
@@ -172,13 +174,12 @@
         # to avoid blocking the event loop
         coro = (
             listener(*bound_args.args, **bound_args.kwargs)
-            if
+            if inspect.iscoroutinefunction(listener)
             else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
         )
-        # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-        # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.

-
+        listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
+        listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
         self._listener_tasks.add(listener_task)

         try:
@@ -189,7 +190,12 @@
             # We need to swallow the exception and just log it here, otherwise it could break the event emitter
             logger.exception(
                 'Exception in the event listener',
-                extra={
+                extra={
+                    'event_name': event.value,
+                    'listener_name': listener.__name__
+                    if hasattr(listener, '__name__')
+                    else listener.__class__.__name__,
+                },
             )
         finally:
             logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
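
The listener-wrapping change above keeps the existing dispatch rule: coroutine functions are awaited, while plain callables are offloaded with `asyncio.to_thread` so they cannot block the event loop. A stripped-down, standalone illustration of that rule (not the `EventManager` code itself):

import asyncio
import inspect
from collections.abc import Callable
from typing import Any


async def dispatch_listener(listener: Callable[..., Any], *args: Any) -> Any:
    """Await async listeners; run sync listeners in a worker thread so the event loop stays responsive."""
    if inspect.iscoroutinefunction(listener):
        return await listener(*args)
    return await asyncio.to_thread(listener, *args)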

crawlee/fingerprint_suite/_browserforge_adapter.py
CHANGED

@@ -154,7 +154,7 @@ class PatchedHeaderGenerator(bf_HeaderGenerator):
 class PatchedFingerprintGenerator(bf_FingerprintGenerator):
     """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""

-    def __init__(
+    def __init__(
         self,
         *,
         screen: Screen | None = None,
crawlee/http_clients/_base.py
CHANGED
@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.

@@ -114,6 +115,7 @@
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.

@@ -144,6 +147,7 @@
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
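
The abstract `timeout` parameter added above is a `datetime.timedelta`, and the concrete clients whose diffs follow (curl-impersonate and HTTPX) map their library-specific timeout exceptions to `asyncio.TimeoutError`. A hedged sketch of calling code relying on that shared contract; the client choice, URL, and timeout value are arbitrary examples:

import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpxHttpClient


async def fetch_with_budget(url: str) -> str | None:
    client = HttpxHttpClient()
    try:
        response = await client.send_request(url, timeout=timedelta(seconds=10))
    except asyncio.TimeoutError:
        # Uniform timeout signal, regardless of which HTTP client implementation is in use.
        return None
    return (await response.read()).decode()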

crawlee/http_clients/_curl_impersonate.py
CHANGED

@@ -1,7 +1,9 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
-from
+from http.cookiejar import Cookie
+from typing import TYPE_CHECKING, Any, cast

 from curl_cffi import CurlInfo
 from curl_cffi.const import CurlHttpVersion
@@ -10,10 +12,11 @@ from curl_cffi.requests.cookies import Cookies as CurlCookies
 from curl_cffi.requests.cookies import CurlMorsel
 from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
 from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+from curl_cffi.requests.exceptions import Timeout
 from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 from typing_extensions import override

-from crawlee._types import HttpHeaders, HttpPayload
+from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee._utils.docs import docs_group
 from crawlee.errors import ProxyError
@@ -22,11 +25,11 @@ from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
     from datetime import timedelta
-    from http.cookiejar import Cookie

     from curl_cffi import Curl
     from curl_cffi.requests import Request as CurlRequest
     from curl_cffi.requests import Response
+    from curl_cffi.requests.session import HttpMethod as CurlHttpMethod

     from crawlee import Request
     from crawlee._types import HttpMethod
@@ -88,15 +91,17 @@ class _CurlImpersonateResponse:
     async def read(self) -> bytes:
         if self._response.astream_task:
             raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
+
         return self._response.content

     async def read_stream(self) -> AsyncGenerator[bytes, None]:
-        if not self._response.astream_task
-            raise RuntimeError(
-                'Cannot read stream: either already consumed or Response not obtained from `stream` method'
-            )
+        if not self._response.astream_task:
+            raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.')

-
+        if isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done():
+            raise RuntimeError('Cannot read stream, it was already consumed.')
+
+        async for chunk in self._response.aiter_content():
             yield chunk


@@ -147,17 +152,21 @@ class CurlImpersonateHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)

         try:
             response = await client.request(
                 url=request.url,
-                method=request.method
+                method=self._convert_method(request.method),
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -186,6 +195,7 @@ class CurlImpersonateHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -196,11 +206,14 @@
         try:
             response = await client.request(
                 url=url,
-                method=
+                method=self._convert_method(method),
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -234,13 +247,15 @@
         try:
             response = await client.request(
                 url=url,
-                method=
+                method=self._convert_method(method),
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
                 stream=True,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -279,6 +294,40 @@

         return self._client_by_proxy_url[proxy_url]

+    def _convert_method(self, method: HttpMethod) -> CurlHttpMethod:
+        """Convert from Crawlee HTTP method to curl-cffi HTTP method.
+
+        Args:
+            method: Crawlee HTTP method.
+
+        Returns:
+            Corresponding curl-cffi HTTP method.
+
+        Raises:
+            ValueError: If the provided HTTP method is not supported.
+        """
+        method_upper = method.upper()  # curl-cffi requires uppercase methods
+
+        match method_upper:
+            case 'GET':
+                return 'GET'
+            case 'POST':
+                return 'POST'
+            case 'PUT':
+                return 'PUT'
+            case 'DELETE':
+                return 'DELETE'
+            case 'OPTIONS':
+                return 'OPTIONS'
+            case 'HEAD':
+                return 'HEAD'
+            case 'TRACE':
+                return 'TRACE'
+            case 'PATCH':
+                return 'PATCH'
+            case _:
+                raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.')
+
     @staticmethod
     def _is_proxy_error(error: CurlRequestError) -> bool:
         """Determine whether the given error is related to a proxy issue.
@@ -296,11 +345,16 @@

     @staticmethod
     def _get_cookies(curl: Curl) -> list[Cookie]:
-        cookies
-
-
+        cookies = list[Cookie]()
+
+        # Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST.
+        cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST))
+
+        for curl_cookie in cookie_list:
+            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)
             cookie = curl_morsel.to_cookiejar_cookie()
             cookies.append(cookie)
+
         return cookies

     async def cleanup(self) -> None:
crawlee/http_clients/_httpx.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -185,6 +190,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -195,10 +201,13 @@
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -228,10 +237,13 @@
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )

-
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _HttpxResponse(response)
@@ -246,7 +258,7 @@
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout:
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@

         headers = self._combine_headers(headers)

-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )

     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient: