crawlee 1.0.1b9__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +62 -32
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +52 -19
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +160 -134
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
- crawlee/storage_clients/_memory/_dataset_client.py +2 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_dataset_client.py +2 -2
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
- crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
- crawlee/storage_clients/_sql/_storage_client.py +1 -1
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +3 -0
- crawlee/storages/_key_value_store.py +8 -2
- crawlee/storages/_request_queue.py +3 -0
- crawlee/storages/_storage_instance_manager.py +109 -42
- crawlee/storages/_utils.py +11 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:

@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)

@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.
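The new branch in `_get_only_innermost_exception` relies on Python's standard exception chaining: `raise ... from ...` sets `__cause__`, and raising inside an `except` block sets `__context__`. A minimal, self-contained sketch of that mechanism; the `UserHandlerTimeoutError` below is a local stand-in for illustration, not the crawlee class:

```python
import asyncio


class UserHandlerTimeoutError(Exception):
    """Local stand-in for crawlee.errors.UserHandlerTimeoutError, used only in this sketch."""


def innermost(error: BaseException) -> BaseException:
    # Follow explicit chaining (`raise ... from ...`) first, then implicit chaining.
    if error.__cause__:
        return innermost(error.__cause__)
    if error.__context__:
        return innermost(error.__context__)
    return error


try:
    try:
        raise asyncio.TimeoutError('handler exceeded its budget')
    except asyncio.TimeoutError as exc:
        # `from exc` stores the original error on __cause__ of the wrapper.
        raise UserHandlerTimeoutError('user handler timed out') from exc
except UserHandlerTimeoutError as wrapped:
    assert isinstance(innermost(wrapped), asyncio.TimeoutError)
```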
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType

@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser

@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
     def __init__(
         self,
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
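Both crawler constructors now annotate `**kwargs` with `Unpack[HttpCrawlerOptions[...]]` rather than a bare `Unpack[...]`. A small sketch of the general `TypedDict` + `Unpack` pattern this relies on, with hypothetical option names (crawlee's real `HttpCrawlerOptions` is richer):

```python
from typing_extensions import NotRequired, TypedDict, Unpack


class FetchOptions(TypedDict):
    """Hypothetical option bag used only for this sketch."""

    max_retries: NotRequired[int]
    follow_redirects: NotRequired[bool]


def fetch(url: str, **kwargs: Unpack[FetchOptions]) -> None:
    # Type checkers can now validate keyword names and value types at call sites.
    retries = kwargs.get('max_retries', 3)
    print(f'fetching {url} with up to {retries} retries')


fetch('https://example.com', max_retries=5)
```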
@@ -22,7 +22,7 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):
     @override
     async def parse(self, response: HttpResponse) -> Selector:
         response_body = await response.read()
-        return await asyncio.to_thread(
+        return await asyncio.to_thread(Selector, body=response_body)
 
     @override
     async def parse_text(self, text: str) -> Selector:
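`ParselParser.parse` now constructs the `Selector` inside a single `asyncio.to_thread` call, so the CPU-bound parse runs in a worker thread instead of blocking the event loop. A tiny illustration of the same idea with a stand-in parse function:

```python
import asyncio


def parse_html(body: bytes) -> int:
    # Stand-in for a CPU-bound parse such as building a parsel Selector.
    return body.count(b'<a ')


async def main() -> None:
    body = b'<html><a href="/x"></a><a href="/y"></a></html>'
    # Runs parse_html in the default thread pool, keeping the event loop responsive.
    link_count = await asyncio.to_thread(parse_html, body)
    print(link_count)


asyncio.run(main())
```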
@@ -3,19 +3,22 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
 
+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar
 
 from crawlee import service_locator
-from crawlee._request import Request, RequestOptions
-from crawlee._types import ConcurrencySettings
+from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline

@@ -29,6 +32,7 @@ from crawlee.statistics import StatisticsState
 from ._playwright_crawling_context import PlaywrightCrawlingContext
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+from ._types import GotoOptions
 from ._utils import block_requests, infinite_scroll
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)

@@ -44,7 +48,6 @@ if TYPE_CHECKING:
 
     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,

@@ -103,9 +106,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         user_data_dir: str | Path | None = None,
         browser_launch_options: Mapping[str, Any] | None = None,
         browser_new_context_options: Mapping[str, Any] | None = None,
+        goto_options: GotoOptions | None = None,
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.

@@ -114,7 +119,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the

@@ -131,12 +139,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
+            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                not supported, use `navigation_timeout` instead.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)
 
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(

@@ -153,17 +167,16 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             ):
                 raise ValueError(
                     'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                     '`fingerprint_generator` arguments when `browser_pool` is provided.'
                 )
 
         # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
         else:
             if fingerprint_generator == 'default':
-
-
-
-                generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
+                generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
+                    [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
+                )
 
                 fingerprint_generator = DefaultFingerprintGenerator(
                     header_options=HeaderGeneratorOptions(browsers=generator_browser_type)

@@ -199,6 +212,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
 
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+        self._goto_options = goto_options or GotoOptions()
+
         super().__init__(**kwargs)
 
     async def _open_page(

@@ -223,12 +239,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             log=context.log,
             page=crawlee_page.page,
             block_requests=partial(block_requests, page=crawlee_page.page),
+            goto_options=GotoOptions(**self._goto_options),
         )
 
-
-
-
-
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+            yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     def _prepare_request_interceptor(
         self,

@@ -263,6 +288,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.
 
         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,

@@ -294,7 +320,14 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             # Set route_handler only for current request
             await context.page.route(context.request.url, route_handler)
 
-
+        try:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                )
+                context.request.state = RequestState.AFTER_NAV
+        except playwright.async_api.TimeoutError as exc:
+            raise asyncio.TimeoutError from exc
 
         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')

@@ -321,6 +354,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             extract_links=extract_links,
             enqueue_links=self._create_enqueue_links_function(context, extract_links),
             block_requests=partial(block_requests, page=context.page),
+            goto_options=context.goto_options,
         )
 
         if context.session:

@@ -361,12 +395,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
            kwargs.setdefault('strategy', 'same-hostname')
+           strategy = kwargs.get('strategy', 'same-hostname')
 
            elements = await context.page.query_selector_all(selector)
            links_iterator: Iterator[str] = iter(
                [url for element in elements if (url := await element.get_attribute('href')) is not None]
            )
-
+
+           # Get base URL from <base> tag if present
+           extracted_base_url = await context.page.evaluate('document.baseURI')
+           base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+           links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
            if robots_txt_file:
                skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)

@@ -374,17 +414,19 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
                skipped = iter([])
 
            for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-
+               request_options = RequestOptions(
+                   url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+               )
 
                if transform_request_function:
-
-                   if
+                   transform_request_options = transform_request_function(request_options)
+                   if transform_request_options == 'skip':
                        continue
-                   if
-
+                   if transform_request_options != 'unchanged':
+                       request_options = transform_request_options
 
                try:
-                   request = Request.from_url(**
+                   request = Request.from_url(**request_options)
                except ValidationError as exc:
                    context.log.debug(
                        f'Skipping URL "{url}" due to invalid format: {exc}. '

@@ -470,7 +512,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
     async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
         """Update the cookies in the page context."""
-
+        # False positive ty error, see https://github.com/astral-sh/ty/issues/1493.
+        await page.context.add_cookies([{**cookie} for cookie in cookies])  # ty: ignore[invalid-argument-type]
 
     async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
         """Find the robots.txt file for a given URL.

@@ -494,7 +537,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""
 
     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
    This option should not be used if `browser_pool` is provided."""
 
    browser_launch_options: NotRequired[Mapping[str, Any]]
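The crawler now keeps one `SharedTimeout` per pre-navigation context and re-enters it around each pre-navigation hook and the final `page.goto()`, so all of them draw from a single navigation budget. The sketch below only illustrates that pattern under assumed semantics; it is not crawlee's actual `SharedTimeout` from `crawlee._utils.time`:

```python
import asyncio
import time
from datetime import timedelta
from types import TracebackType


class SharedBudget:
    """Illustrative only: one time budget consumed across several `async with` blocks."""

    def __init__(self, budget: timedelta) -> None:
        self._remaining = budget.total_seconds()
        self._entered_at = 0.0

    async def __aenter__(self) -> timedelta:
        if self._remaining <= 0:
            raise asyncio.TimeoutError('navigation budget exhausted')
        self._entered_at = time.monotonic()
        return timedelta(seconds=self._remaining)

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        # Whatever this block used is subtracted from the shared budget.
        self._remaining -= time.monotonic() - self._entered_at


async def main() -> None:
    budget = SharedBudget(timedelta(seconds=30))
    async with budget:                      # e.g. a pre-navigation hook
        await asyncio.sleep(0.1)
    async with budget as remaining:         # e.g. page.goto(), receives what is left
        print(f'{remaining.total_seconds():.1f}s left for navigation')


asyncio.run(main())
```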
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
 

@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved

@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):
 
         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url,
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )
 
         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
 if TYPE_CHECKING:
     from playwright.async_api import Page
 
-    from ._types import BlockRequestsFunction
+    from ._types import BlockRequestsFunction, GotoOptions
 
 
 @dataclass(frozen=True)

@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
         html = None
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict
 
 from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group

@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
 
     from playwright.async_api import APIResponse, Response
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Self
 
 
 @docs_group('Functions')

@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
         _content = await response.body()
 
         return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+class GotoOptions(TypedDict):
+    """Keyword arguments for Playwright's `Page.goto()` method."""
+
+    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+    """When to consider operation succeeded, defaults to 'load' event."""
+
+    referer: NotRequired[str]
+    """Referer header value."""
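Taken together, `navigation_timeout` and the new `GotoOptions` TypedDict let callers tune navigation without touching `Page.goto()`'s own `timeout`. A hedged usage sketch; the `GotoOptions` import path is assumed from the diff and may differ in the released package:

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
# Assumption: GotoOptions lives where the diff defines it; check the release's exports.
from crawlee.crawlers._playwright._types import GotoOptions


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Budget shared by pre-navigation hooks and page.goto().
        navigation_timeout=timedelta(seconds=45),
        # `timeout` is deliberately not part of GotoOptions; use navigation_timeout instead.
        goto_options=GotoOptions(wait_until='domcontentloaded', referer='https://example.com'),
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```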
crawlee/errors.py CHANGED

@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
     """Wraps an exception thrown from an user-defined error handler."""
 
 
+class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+    """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
 @docs_group('Errors')
 class SessionError(Exception):
     """Errors of `SessionError` type will trigger a session rotation.
crawlee/events/_event_manager.py CHANGED

@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False
 
     @overload

@@ -172,13 +174,12 @@ class EventManager:
             # to avoid blocking the event loop
             coro = (
                 listener(*bound_args.args, **bound_args.kwargs)
-                if
+                if inspect.iscoroutinefunction(listener)
                 else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
             )
-            # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-            # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.
 
-
+            listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
+            listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
             self._listener_tasks.add(listener_task)
 
             try:

@@ -189,7 +190,12 @@ class EventManager:
                 # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                 logger.exception(
                     'Exception in the event listener',
-                    extra={
+                    extra={
+                        'event_name': event.value,
+                        'listener_name': listener.__name__
+                        if hasattr(listener, '__name__')
+                        else listener.__class__.__name__,
+                    },
                 )
             finally:
                 logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
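The listener wrapper now picks between awaiting a coroutine function and offloading a plain callable to a thread, and names the resulting task after the listener. A minimal sketch of that dispatch pattern outside of `EventManager`:

```python
import asyncio
import inspect
from typing import Any, Callable


async def dispatch(listener: Callable[..., Any], *args: Any) -> asyncio.Task:
    # Await coroutine listeners directly; push synchronous listeners to a worker thread.
    coro = (
        listener(*args)
        if inspect.iscoroutinefunction(listener)
        else asyncio.to_thread(listener, *args)
    )
    name = getattr(listener, '__name__', listener.__class__.__name__)
    return asyncio.create_task(coro, name=f'Task-event-{name}')


async def main() -> None:
    async def async_listener(payload: str) -> None:
        print('async listener got', payload)

    def sync_listener(payload: str) -> None:
        print('sync listener got', payload)

    tasks = [await dispatch(async_listener, 'x'), await dispatch(sync_listener, 'y')]
    await asyncio.gather(*tasks)


asyncio.run(main())
```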
@@ -154,7 +154,7 @@ class PatchedHeaderGenerator(bf_HeaderGenerator):
 class PatchedFingerprintGenerator(bf_FingerprintGenerator):
     """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""
 
-    def __init__(
+    def __init__(
         self,
         *,
         screen: Screen | None = None,

@@ -11,9 +11,9 @@ if TYPE_CHECKING:
 
 
 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/http_clients/_base.py CHANGED

@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.
 

@@ -114,6 +115,7 @@ class HttpClient(ABC):
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.
 
         Raises:
             ProxyError: Raised if a proxy-related error occurs.

@@ -132,6 +134,7 @@ class HttpClient(ABC):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.
 

@@ -144,6 +147,7 @@ class HttpClient(ABC):
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.
 
         Raises:
             ProxyError: Raised if a proxy-related error occurs.
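The abstract `HttpClient` methods now accept an optional `timedelta` timeout, which concrete clients translate into their backing library's own timeout option. A rough sketch of that translation using httpx (illustrative only; each crawlee client does its own mapping):

```python
from datetime import timedelta

import httpx


async def send_with_timeout(url: str, timeout: timedelta | None = None) -> httpx.Response:
    # Illustrative conversion: timedelta -> seconds for the underlying library.
    client_timeout = timeout.total_seconds() if timeout else None
    async with httpx.AsyncClient(timeout=client_timeout) as client:
        return await client.get(url)
```

At the crawlee level, callers can pass the same keyword argument shown in the diff, e.g. `await http_client.send_request(url, timeout=timedelta(seconds=30))`.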
|