crawlee 1.0.5b18__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recurring_task.py +12 -3
- crawlee/_utils/sitemap.py +12 -5
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/browsers/_browser_pool.py +1 -1
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +138 -124
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +1 -3
- crawlee/request_loaders/_sitemap_request_loader.py +18 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +2 -21
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
- crawlee/storage_clients/_redis/_client_mixin.py +1 -4
- crawlee/storage_clients/_redis/_dataset_client.py +6 -2
- crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
- crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
- crawlee/storage_clients/_redis/_storage_client.py +12 -9
- crawlee/storage_clients/_redis/_utils.py +1 -1
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic

 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar

-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],

@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(
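The two hunks above introduce `HttpCrawlerOptions` (a typed dict for forwarding constructor arguments in subclasses, per its docstring) and a `navigation_timeout` parameter that defaults to one minute. A minimal usage sketch, assuming the concrete HTTP crawlers (for example `ParselCrawler`, whose own +2/-2 change is listed above but not shown) forward the new option to `AbstractHttpCrawler`:

import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # navigation_timeout bounds the pre-navigation hooks plus the HTTP request itself;
    # when omitted, the abstract crawler falls back to timedelta(minutes=1).
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=30))

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Fetched {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())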
@@ -82,9 +100,7 @@ class AbstractHttpCrawler(
         this method simplifies cases where `TParseResult` is used for both generic parameters.
         """

-        class _ParsedHttpCrawler(
-            AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-        ):
+        class _ParsedHttpCrawler(AbstractHttpCrawler):
             def __init__(
                 self,
                 parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,

@@ -112,9 +128,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext
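`SharedTimeout` comes from `crawlee/_utils/time.py` (+41/-1 in the file list) and its implementation is not part of this excerpt. Judging by its use here and in `_make_http_request` further down, it tracks one navigation budget that is consumed across several `async with` blocks and yields the remaining time on entry. A rough illustrative sketch of that pattern, not the actual implementation:

import time
from datetime import timedelta


class SharedTimeoutSketch:
    """Illustrative only: one time budget shared across several `async with` blocks.

    The real SharedTimeout presumably also cancels the wrapped awaitable once the
    budget runs out; this sketch only does the bookkeeping.
    """

    def __init__(self, timeout: timedelta) -> None:
        self._remaining = timeout.total_seconds()
        self._entered_at: float | None = None

    async def __aenter__(self) -> timedelta:
        if self._remaining <= 0:
            raise TimeoutError('Shared navigation timeout exhausted')
        self._entered_at = time.monotonic()
        # The remaining budget can be forwarded, e.g. as the HTTP client timeout.
        return timedelta(seconds=self._remaining)

    async def __aexit__(self, *exc_info: object) -> None:
        if self._entered_at is not None:
            self._remaining -= time.monotonic() - self._entered_at
            self._entered_at = None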
@@ -165,11 +189,18 @@ class AbstractHttpCrawler(
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

             kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')

             links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
-
+
+            # Get base URL from <base> tag if present
+            extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+            base_url: str = (
+                str(extracted_base_urls[0])
+                if extracted_base_urls
+                else context.request.loaded_url or context.request.url
             )
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
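For context on the hunk above: relative links are now resolved against a `<base href>` element when the page declares one, falling back to the loaded URL (or the request URL) otherwise. A standard-library illustration of what that changes; the URLs are made up:

from urllib.parse import urljoin

loaded_url = 'https://example.com/articles/page-1.html'
base_href = 'https://cdn.example.com/mirror/'  # value of <base href="..."> when present
relative_link = 'next/page-2.html'

print(urljoin(loaded_url, relative_link))  # https://example.com/articles/next/page-2.html
print(urljoin(base_href, relative_link))   # https://cdn.example.com/mirror/next/page-2.html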
@@ -177,7 +208,9 @@ class AbstractHttpCrawler(
                 skipped = iter([])

             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_options = RequestOptions(
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )

                 if transform_request_function:
                     transform_request_options = transform_request_function(request_options)

@@ -216,13 +249,16 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

     async def _handle_status_code_response(
crawlee/crawlers/_adaptive_playwright/__init__.py

@@ -11,13 +11,16 @@ _install_import_hook(__name__)

 # The following imports are wrapped in try_import to handle optional dependencies,
 # ensuring the module can still function even if these dependencies are missing.
-with _try_import(__name__, '
+with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):
     from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
-with _try_import(__name__, '
+with _try_import(__name__, 'AdaptivePlaywrightCrawler'):
     from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):
+    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState

 __all__ = [
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'RenderingType',
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

@@ -27,23 +27,16 @@ from crawlee.crawlers import (
 )
 from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
 from crawlee.crawlers._parsel._parsel_parser import ParselParser
+from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 from crawlee.statistics import Statistics, StatisticsState

-from ._adaptive_playwright_crawler_statistics import
-    AdaptivePlaywrightCrawlerStatisticState,
-)
+from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
 from ._adaptive_playwright_crawling_context import (
     AdaptivePlaywrightCrawlingContext,
     AdaptivePlaywrightPreNavCrawlingContext,
 )
-from ._rendering_type_predictor import
-
-    RenderingType,
-    RenderingTypePredictor,
-)
-from ._result_comparator import (
-    create_default_comparator,
-)
+from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
+from ._result_comparator import create_default_comparator

 if TYPE_CHECKING:
     from types import TracebackType

@@ -51,7 +44,6 @@ if TYPE_CHECKING:
     from typing_extensions import Unpack

     from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
-    from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -157,10 +149,12 @@ class AdaptivePlaywrightCrawler(
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

-
+        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
+
+        super().__init__(statistics=adaptive_statistics, **kwargs)

         # Sub crawlers related.
-        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or
+        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()

         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
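The hunk above makes the crawler build its own `Statistics` with `AdaptivePlaywrightCrawlerStatisticState` when none is supplied. Passing a custom instance with the same state model keeps the adaptive counters (such as `rendering_type_mispredictions`, incremented further down) available. A short sketch, assuming the `with_*_static_parser` factory forwards the `statistics` argument, which is not shown in this excerpt:

from crawlee.crawlers import AdaptivePlaywrightCrawler
from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawlerStatisticState
from crawlee.statistics import Statistics

# Same construction the crawler now performs by default when statistics is None.
statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(statistics=statistics)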
@@ -290,11 +284,14 @@ class AdaptivePlaywrightCrawler(
             use_state_function = context.use_state

             # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-            result = RequestHandlerRunResult(
+            result = RequestHandlerRunResult(
+                key_value_store_getter=self.get_key_value_store,
+                request=context.request,
+            )
             context_linked_to_result = BasicCrawlingContext(
-                request=
-                session=
-                proxy_info=
+                request=result.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
                 send_request=context.send_request,
                 add_requests=result.add_requests,
                 push_data=result.push_data,

@@ -314,7 +311,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result
+            return SubCrawlerRun(result=result)
         except Exception as e:
             return SubCrawlerRun(exception=e)


@@ -332,7 +329,7 @@ class AdaptivePlaywrightCrawler(
                 )
                 await self.router(adaptive_crawling_context)

-            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
+            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

         if rendering_type == 'client only':


@@ -342,7 +339,7 @@ class AdaptivePlaywrightCrawler(
                 )
                 await self.router(adaptive_crawling_context)

-            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
+            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

         raise RuntimeError(
             f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'

@@ -370,8 +367,7 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()

             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and
-                self._update_context_from_copy(context, static_run.run_context)
+            if static_run.result and self.result_checker(static_run.result):
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:

@@ -402,7 +398,7 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception

-        if pw_run.result
+        if pw_run.result:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)

@@ -414,7 +410,6 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)

-            self._update_context_from_copy(context, pw_run.run_context)
             self._context_result_map[context] = pw_run.result

     def pre_navigation_hook(

@@ -451,32 +446,8 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1

-    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
-        """Update mutable fields of `context` from `context_copy`.
-
-        Uses object.__setattr__ to bypass frozen dataclass restrictions,
-        allowing state synchronization after isolated crawler execution.
-        """
-        updating_attributes = {
-            'request': ('headers', 'user_data'),
-            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
-        }
-
-        for attr, sub_attrs in updating_attributes.items():
-            original_sub_obj = getattr(context, attr)
-            copy_sub_obj = getattr(context_copy, attr)
-
-            # Check that both sub objects are not None
-            if original_sub_obj is None or copy_sub_obj is None:
-                continue
-
-            for sub_attr in sub_attrs:
-                new_value = getattr(copy_sub_obj, sub_attr)
-                object.__setattr__(original_sub_obj, sub_attr, new_value)
-

 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
-    run_context: BasicCrawlingContext | None = None
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self

-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,

@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.