crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee has been flagged as potentially problematic by the registry.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_context_utils.py
ADDED
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)
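The new `swapped_context` helper restores the original request onto a frozen context once a handler finishes. The restore-in-`finally` pattern is easiest to see in isolation; the sketch below uses a made-up `FakeContext` dataclass in place of Crawlee's real `BasicCrawlingContext`:

from contextlib import contextmanager
from dataclasses import dataclass
from collections.abc import Iterator


@dataclass(frozen=True)
class FakeContext:
    request: str


@contextmanager
def swapped(context: FakeContext, original_request: str) -> Iterator[None]:
    try:
        yield
    finally:
        # Frozen dataclasses reject normal attribute assignment, so the original
        # value is restored through object.__setattr__, as in the diff above.
        object.__setattr__(context, 'request', original_request)


ctx = FakeContext(request='original')
with swapped(ctx, ctx.request):
    object.__setattr__(ctx, 'request', 'isolated copy')  # the handler sees a copy
assert ctx.request == 'original'  # the original is back after the block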
crawlee/crawlers/_basic/_logging_utils.py
CHANGED
@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,24 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
+    elif 'playwright._impl._errors.Error' in str(error.__class__):
+        # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
+        # point to deep internals.
+        return ''
     else:
         traceback_parts = _get_traceback_parts_for_innermost_exception(error)
         # Commonly last traceback part is type of the error, and the second last part is the relevant file.
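These summary helpers rely on walking an exception chain down to its root. A self-contained illustration of the `__cause__`/`__context__` traversal — a simplified version of `_get_only_innermost_exception` without the `UserHandlerTimeoutError` special case:

def innermost(error: BaseException) -> BaseException:
    # Explicit chaining (`raise ... from ...`) sets __cause__; implicit chaining
    # inside an except block sets __context__.
    if error.__cause__:
        return innermost(error.__cause__)
    if error.__context__:
        return innermost(error.__context__)
    return error


try:
    try:
        raise ValueError('root cause')
    except ValueError as inner:
        raise RuntimeError('wrapper') from inner
except RuntimeError as outer:
    assert type(innermost(outer)) is ValueError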
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_crawler.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
     def __init__(
         self,
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
    ) -> None:
        """Initialize a new instance.
 
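Both crawler constructors above now type their keyword arguments as `Unpack[HttpCrawlerOptions[...]]`. The sketch below shows the general `Unpack` + `TypedDict` idiom with a made-up `GreeterOptions`; it is not Crawlee code, only the typing pattern:

from typing_extensions import NotRequired, TypedDict, Unpack


class GreeterOptions(TypedDict):
    greeting: NotRequired[str]
    punctuation: NotRequired[str]


def greet(name: str, **kwargs: Unpack[GreeterOptions]) -> str:
    # Type checkers validate the keyword arguments against GreeterOptions,
    # instead of accepting arbitrary **kwargs.
    return f'{kwargs.get("greeting", "Hello")}, {name}{kwargs.get("punctuation", "!")}'


print(greet('world', greeting='Hi'))  # -> Hi, world!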
crawlee/crawlers/_parsel/_parsel_parser.py
CHANGED
@@ -22,7 +22,7 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):
     @override
     async def parse(self, response: HttpResponse) -> Selector:
         response_body = await response.read()
-        return await asyncio.to_thread(
+        return await asyncio.to_thread(Selector, body=response_body)
 
     @override
     async def parse_text(self, text: str) -> Selector:
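`ParselParser.parse` now constructs the `Selector` inside `asyncio.to_thread`, keeping CPU-bound parsing off the event loop. A minimal stand-alone version of the same pattern (requires `parsel` to be installed):

import asyncio

from parsel import Selector


async def parse_off_loop(body: bytes) -> Selector:
    # Build the Selector in a worker thread; the event loop keeps running.
    return await asyncio.to_thread(Selector, body=body)


selector = asyncio.run(parse_off_loop(b'<html><body><p>hi</p></body></html>'))
print(selector.css('p::text').get())  # -> hi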
crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED
@@ -3,18 +3,22 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
 
+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar
 
 from crawlee import service_locator
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -28,6 +32,7 @@ from crawlee.statistics import StatisticsState
 from ._playwright_crawling_context import PlaywrightCrawlingContext
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+from ._types import GotoOptions
 from ._utils import block_requests, infinite_scroll
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -43,7 +48,6 @@ if TYPE_CHECKING:
 
     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -102,9 +106,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         user_data_dir: str | Path | None = None,
         browser_launch_options: Mapping[str, Any] | None = None,
         browser_new_context_options: Mapping[str, Any] | None = None,
+        goto_options: GotoOptions | None = None,
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -113,7 +119,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -130,12 +139,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
+            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                not supported, use `navigation_timeout` instead.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)
 
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -152,17 +167,16 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             ):
                 raise ValueError(
                     'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                     '`fingerprint_generator` arguments when `browser_pool` is provided.'
                 )
 
         # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
         else:
             if fingerprint_generator == 'default':
-
-
-
-                generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
+                generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
+                    [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
+                )
 
                 fingerprint_generator = DefaultFingerprintGenerator(
                     header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
@@ -194,6 +208,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+        self._goto_options = goto_options or GotoOptions()
+
         super().__init__(**kwargs)
 
     async def _open_page(
@@ -218,12 +239,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             log=context.log,
             page=crawlee_page.page,
             block_requests=partial(block_requests, page=crawlee_page.page),
+            goto_options=GotoOptions(**self._goto_options),
         )
 
-
-
-
-
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+            yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     def _prepare_request_interceptor(
         self,
@@ -258,6 +288,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.
 
         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -289,7 +320,14 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         # Set route_handler only for current request
         await context.page.route(context.request.url, route_handler)
 
-
+        try:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                )
+                context.request.state = RequestState.AFTER_NAV
+        except playwright.async_api.TimeoutError as exc:
+            raise asyncio.TimeoutError from exc
 
         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')
@@ -316,6 +354,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             extract_links=extract_links,
             enqueue_links=self._create_enqueue_links_function(context, extract_links),
             block_requests=partial(block_requests, page=context.page),
+            goto_options=context.goto_options,
         )
 
         if context.session:
@@ -356,12 +395,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')
 
         elements = await context.page.query_selector_all(selector)
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_url = await context.page.evaluate('document.baseURI')
+        base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -369,17 +414,19 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             skipped = iter([])
 
         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )
 
             if transform_request_function:
-
-                if
+                transform_request_options = transform_request_function(request_options)
+                if transform_request_options == 'skip':
                     continue
-                if
-
+                if transform_request_options != 'unchanged':
+                    request_options = transform_request_options
 
             try:
-                request = Request.from_url(**
+                request = Request.from_url(**request_options)
             except ValidationError as exc:
                 context.log.debug(
                     f'Skipping URL "{url}" due to invalid format: {exc}. '
@@ -465,7 +512,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
     async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
         """Update the cookies in the page context."""
-
+        # False positive ty error, see https://github.com/astral-sh/ty/issues/1493.
+        await page.context.add_cookies([{**cookie} for cookie in cookies])  # ty: ignore[invalid-argument-type]
 
     async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
         """Find the robots.txt file for a given URL.
@@ -489,7 +537,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""
 
     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
    This option should not be used if `browser_pool` is provided."""
 
     browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -509,9 +559,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
 
 
 class PlaywrightCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _PlaywrightCrawlerAdditionalOptions,
     BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `AbstractHttpCrawler` constructor.
 
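Based only on the constructor signature shown in this diff, the new navigation options would be used roughly as follows; treat this as a sketch rather than verified release behavior:

from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler

crawler = PlaywrightCrawler(
    # Shared budget for opening the page, running pre-navigation hooks and
    # Page.goto(); defaults to one minute when not given.
    navigation_timeout=timedelta(seconds=30),
    # Forwarded to Page.goto(); `timeout` is deliberately not accepted here.
    goto_options={'wait_until': 'domcontentloaded'},
)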
crawlee/crawlers/_playwright/_playwright_http_client.py
CHANGED
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
 
@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):
 
         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url,
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )
 
         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
 if TYPE_CHECKING:
     from playwright.async_api import Page
 
-    from ._types import BlockRequestsFunction
+    from ._types import BlockRequestsFunction, GotoOptions
 
 
 @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
         html = None
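Since each pre-navigation context receives its own copy of the crawler-level `GotoOptions` (see `GotoOptions(**self._goto_options)` in the crawler diff above), a hook can adjust them per request. A hedged sketch using Crawlee's `pre_navigation_hook` decorator; the host name is purely illustrative:

from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

crawler = PlaywrightCrawler()


@crawler.pre_navigation_hook
async def relax_wait_for_slow_hosts(context: PlaywrightPreNavCrawlingContext) -> None:
    # `goto_options` is a per-request copy, so mutating it here affects only this navigation.
    if 'slow.example.com' in context.request.url:
        context.goto_options['wait_until'] = 'domcontentloaded'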
crawlee/crawlers/_playwright/_types.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict
 
 from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
 
     from playwright.async_api import APIResponse, Response
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Self
 
 
 @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
         _content = await response.body()
 
         return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+class GotoOptions(TypedDict):
+    """Keyword arguments for Playwright's `Page.goto()` method."""
+
+    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+    """When to consider operation succeeded, defaults to 'load' event."""
+
+    referer: NotRequired[str]
+    """Referer header value."""
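`GotoOptions` is a plain `TypedDict`, so constructor-style calls and dict literals are interchangeable. The import path below is the private module shown in this diff and may not be a stable public API:

from crawlee.crawlers._playwright._types import GotoOptions

opts = GotoOptions(wait_until='networkidle', referer='https://example.com/')
assert opts == {'wait_until': 'networkidle', 'referer': 'https://example.com/'}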
crawlee/errors.py
CHANGED
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
     """Wraps an exception thrown from an user-defined error handler."""
 
 
+class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+    """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
 @docs_group('Errors')
 class SessionError(Exception):
     """Errors of `SessionError` type will trigger a session rotation.
crawlee/events/_event_manager.py
CHANGED
@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False
 
     @overload
@@ -172,13 +174,12 @@ class EventManager:
         # to avoid blocking the event loop
         coro = (
             listener(*bound_args.args, **bound_args.kwargs)
-            if
+            if inspect.iscoroutinefunction(listener)
             else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
         )
-        # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-        # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.
 
-
+        listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
+        listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
         self._listener_tasks.add(listener_task)
 
         try:
@@ -189,7 +190,12 @@ class EventManager:
             # We need to swallow the exception and just log it here, otherwise it could break the event emitter
             logger.exception(
                 'Exception in the event listener',
-                extra={
+                extra={
+                    'event_name': event.value,
+                    'listener_name': listener.__name__
+                    if hasattr(listener, '__name__')
+                    else listener.__class__.__name__,
+                },
             )
         finally:
             logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
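The listener wrapper now derives a readable name before scheduling the task. The naming pattern on its own, which makes listener tasks identifiable in `asyncio` debug output:

import asyncio


async def persist_state_listener() -> None:
    await asyncio.sleep(0)


async def main() -> None:
    listener = persist_state_listener
    # Callables without __name__ (e.g. class instances) fall back to their class name.
    listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
    task = asyncio.create_task(listener(), name=f'Task-persistState-{listener_name}')
    print(task.get_name())  # -> Task-persistState-persist_state_listener
    await task


asyncio.run(main())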
crawlee/events/_types.py
CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
 class EventPersistStateData(BaseModel):
     """Data for the persist state event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     is_migrating: Annotated[bool, Field(alias='isMigrating')]
 
@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
 class EventSystemInfoData(BaseModel):
     """Data for the system info event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
     memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
 class EventMigratingData(BaseModel):
     """Data for the migrating event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     # The remaining time in seconds before the migration is forced and the process is killed
     # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
 class EventAbortingData(BaseModel):
     """Data for the aborting event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
 
 @docs_group('Event data')
 class EventExitData(BaseModel):
     """Data for the exit event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
 
 @docs_group('Event data')
 class EventCrawlerStatusData(BaseModel):
     """Data for the crawler status event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     message: str
     """A message describing the current status of the crawler."""
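All event-data models now opt into `validate_by_name=True, validate_by_alias=True`. Assuming Pydantic v2.11+ (where these `ConfigDict` keys were introduced), that lets aliased fields be populated either way:

from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class PersistState(BaseModel):
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    is_migrating: Annotated[bool, Field(alias='isMigrating')]


assert PersistState(isMigrating=True).is_migrating   # populated by alias
assert PersistState(is_migrating=True).is_migrating  # populated by field name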
crawlee/fingerprint_suite/_browserforge_adapter.py
CHANGED
@@ -154,7 +154,7 @@ class PatchedHeaderGenerator(bf_HeaderGenerator):
 class PatchedFingerprintGenerator(bf_FingerprintGenerator):
     """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""
 
-    def __init__(
+    def __init__(
         self,
         *,
         screen: Screen | None = None,
crawlee/fingerprint_suite/_fingerprint_generator.py
CHANGED
@@ -3,10 +3,13 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
 
+from crawlee._utils.docs import docs_group
+
 if TYPE_CHECKING:
     from browserforge.fingerprints import Fingerprint
 
 
+@docs_group('Other')
 class FingerprintGenerator(ABC):
     """A class for creating browser fingerprints that mimic browser fingerprints of real users."""
 
crawlee/fingerprint_suite/_header_generator.py
CHANGED
@@ -11,9 +11,9 @@ if TYPE_CHECKING:
 
 
 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
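The mapping now folds both Chromium-engine identifiers onto the single 'chrome' fingerprint profile. A stand-alone sketch of the same logic; the 'webkit' → 'safari' fall-through is an assumption extrapolated from `SupportedBrowserType` and is not shown in this hunk:

def to_fingerprint_type(playwright_type: str) -> str:
    # 'chromium' (Playwright-managed) and 'chrome' (locally installed) share one profile.
    if playwright_type in {'chromium', 'chrome'}:
        return 'chrome'
    if playwright_type == 'firefox':
        return 'firefox'
    return 'safari'  # assumed fall-through for 'webkit'


assert to_fingerprint_type('chrome') == to_fingerprint_type('chromium') == 'chrome'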
crawlee/fingerprint_suite/_types.py
CHANGED
@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']
 
 
 class ScreenOptions(BaseModel):
-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)
 
     """Defines the screen constrains for the fingerprint generator."""
 
@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
 class HeaderGeneratorOptions(BaseModel):
     """Collection of header related attributes that can be used by the fingerprint generator."""
 
-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)
 
     browsers: list[SupportedBrowserType] | None = None
     """List of BrowserSpecifications to generate the headers for."""