crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee has been flagged as possibly problematic by the registry diff service.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
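The file list above introduces two entirely new storage backends, a Redis client (`crawlee/storage_clients/_redis/`) and a SQL client (`crawlee/storage_clients/_sql/`), alongside the existing file-system and memory clients. A hedged sketch of how such a backend might be plugged in, assuming the new module exports a `RedisStorageClient` and that its constructor accepts a connection string (neither detail is confirmed by this listing):

import asyncio

from crawlee import service_locator
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.storage_clients import RedisStorageClient  # assumed export of the new _redis module


async def main() -> None:
    # Route datasets, key-value stores and request queues to Redis instead of the default
    # file-system backend. The constructor argument is illustrative, not confirmed by this diff.
    service_locator.set_storage_client(RedisStorageClient(connection_string='redis://localhost:6379/0'))

    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())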
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic

 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar

 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,9 +34,24 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
-
+    BasicCrawler[TCrawlingContext, StatisticsState],
+    ABC,
+    Generic[TCrawlingContext, TParseResult, TSelectResult],
 ):
     """A web crawler for performing HTTP requests.

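`HttpCrawlerOptions` is a generic TypedDict: its purpose is to type the keyword arguments that subclasses forward to `AbstractHttpCrawler.__init__` via `Unpack`. A self-contained sketch of that pattern, deliberately using made-up names rather than crawlee's own classes:

from datetime import timedelta

from typing_extensions import NotRequired, TypedDict, Unpack


class FetcherOptions(TypedDict):
    """Keyword arguments accepted by `build_fetcher`, all optional."""

    navigation_timeout: NotRequired[timedelta | None]


def build_fetcher(**kwargs: Unpack[FetcherOptions]) -> timedelta:
    # Fall back to a default when the caller does not provide the key.
    return kwargs.get('navigation_timeout') or timedelta(minutes=1)


print(build_fetcher(navigation_timeout=timedelta(seconds=30)))  # 0:00:30
print(build_fetcher())                                          # 0:01:00
# A misspelled key such as build_fetcher(navigation_timout=...) is rejected by type checkers.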
@@ -54,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(
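Since the concrete HTTP crawlers forward their keyword arguments to this constructor, the new timeout should be settable at construction time. A hedged usage sketch, assuming `ParselCrawler` exposes the option (the `_parsel_crawler.py` entry in the file list suggests it was updated accordingly):

import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(
        # Caps the pre-navigation hooks plus the HTTP request at 30 seconds per request;
        # per the constructor above, the default budget is one minute.
        navigation_timeout=timedelta(seconds=30),
    )

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Title: {context.selector.css("title::text").get()}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())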
@@ -110,9 +130,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext
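Pre-navigation hooks now run inside the same per-request `SharedTimeout`, so a slow hook eats into the budget of the HTTP request that follows. Registering a hook itself looks unchanged; a short sketch, assuming the existing `pre_navigation_hook` registration method on the HTTP crawlers:

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import ParselCrawler

crawler = ParselCrawler()


async def log_outgoing_request(context: BasicCrawlingContext) -> None:
    # Runs before the HTTP request is sent; its runtime now counts against the shared budget.
    context.log.info(f'About to fetch {context.request.url}')


crawler.pre_navigation_hook(log_outgoing_request)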
@@ -165,7 +193,15 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
+        )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
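The effect of honoring `<base href>`: relative links found in the page now resolve against the declared base URL instead of the loaded URL. In plain `urllib.parse` terms:

from urllib.parse import urljoin

loaded_url = 'https://example.com/articles/page-1'
base_href = 'https://cdn.example.com/mirror/'

# Without a <base> tag the loaded URL is the reference point...
print(urljoin(loaded_url, 'next'))  # https://example.com/articles/next
# ...with <base href="https://cdn.example.com/mirror/"> the base takes precedence.
print(urljoin(base_href, 'next'))   # https://cdn.example.com/mirror/next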
@@ -212,12 +248,14 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

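`SharedTimeout` itself lives in the new `crawlee/_utils/time.py` and is not shown in this section. Judging only from the call sites above (entered once per hook and once around the HTTP request, yielding the remaining budget), it behaves like a draining time budget. A rough, non-authoritative sketch of that idea, not the library's implementation:

import time
from datetime import timedelta


class DrainingTimeout:
    """Toy stand-in for the real SharedTimeout: each `async with` draws down one shared budget."""

    def __init__(self, budget: timedelta) -> None:
        self._remaining = budget.total_seconds()
        self._entered_at = 0.0

    async def __aenter__(self) -> timedelta:
        if self._remaining <= 0:
            raise TimeoutError('Navigation time budget exhausted')
        self._entered_at = time.monotonic()
        return timedelta(seconds=self._remaining)

    async def __aexit__(self, *exc_info: object) -> None:
        self._remaining -= time.monotonic() - self._entered_at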
crawlee/crawlers/_abstract_http/_abstract_http_parser.py

@@ -16,7 +16,7 @@ if TYPE_CHECKING:


 @docs_group('HTTP parsers')
-class AbstractHttpParser(Generic[TParseResult, TSelectResult]
+class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
     """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""

     @abstractmethod
crawlee/crawlers/_abstract_http/_http_crawling_context.py

@@ -31,7 +31,7 @@ class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):

 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
-class ParsedHttpCrawlingContext(Generic[TParseResult]
+class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
     """The crawling context used by `AbstractHttpCrawler`.

     It provides access to key objects as well as utility functions for handling crawling tasks.
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override

-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (

@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self

     async def __aexit__(

@@ -85,8 +84,8 @@

 @docs_group('Crawlers')
 class AdaptivePlaywrightCrawler(
-    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
     BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
+    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
 ):
     """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.

@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)

         # Sub crawlers related.
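With this change, an adaptive (browser-backed) crawler defaults to a single concurrent request unless the caller provides concurrency settings; an explicit value still wins. A short sketch, typically placed inside an async entry point:

from crawlee import ConcurrencySettings
from crawlee.crawlers import AdaptivePlaywrightCrawler

# Overrides the new desired_concurrency=1 default applied in the constructor above.
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
    concurrency_settings=ConcurrencySettings(desired_concurrency=4),
)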
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)

@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)

@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()

             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and self.result_checker(static_run.result):
+            if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+                self._update_context_from_copy(context, static_run.run_context)
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:

@@ -402,13 +402,10 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception

-        if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
+        if pw_run.result and pw_run.run_context:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:

@@ -417,6 +414,9 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)

+            self._update_context_from_copy(context, pw_run.run_context)
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,

@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1

+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+

 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
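The new `_update_context_from_copy` helper relies on `object.__setattr__` to write through the frozen-dataclass guard, since the crawling contexts are frozen and plain assignment would raise. A minimal standalone demonstration of that mechanism:

from dataclasses import dataclass, field


@dataclass(frozen=True)
class Ctx:
    headers: dict[str, str] = field(default_factory=dict)


original = Ctx(headers={'accept': 'text/html'})
copy_after_run = Ctx(headers={'accept': 'text/html', 'x-sub-crawler': 'static'})

# original.headers = copy_after_run.headers  # would raise dataclasses.FrozenInstanceError
object.__setattr__(original, 'headers', copy_after_run.headers)
print(original.headers)  # {'accept': 'text/html', 'x-sub-crawler': 'static'}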
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py

@@ -12,7 +12,7 @@ from crawlee.statistics import StatisticsState
 class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
     """Statistic data about a crawler run with additional information related to adaptive crawling."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')

     http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
     """Number representing how many times static http based crawling was used."""
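The new `ConfigDict` turns on pydantic's `validate_by_name` and `validate_by_alias` switches (and keeps `ser_json_inf_nan='constants'` so infinities survive JSON serialization). With both flags on, an aliased field accepts either the field name or the alias during validation; these options require pydantic 2.11 or newer:

from pydantic import BaseModel, ConfigDict, Field


class RunStats(BaseModel):
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    http_runs: int = Field(default=0, alias='http_only_request_handler_runs')


print(RunStats(http_runs=3).http_runs)                        # populated by field name
print(RunStats(http_only_request_handler_runs=5).http_runs)   # populated by alias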
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self

-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -31,7 +31,8 @@ class AdaptiveContextError(RuntimeError):
 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
 class AdaptivePlaywrightCrawlingContext(
-
+    ParsedHttpCrawlingContext[TStaticParseResult],
+    Generic[TStaticParseResult, TStaticSelectResult],
 ):
     _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]
     """The crawling context used by `AdaptivePlaywrightCrawler`.

@@ -189,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,

@@ -211,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
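The new `goto_options` field lets pre-navigation code influence the Playwright `Page.goto()` call. How the dict is initialized is not visible in this section, so the guard and the chosen key below are assumptions; `wait_until` is a standard Playwright goto option:

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightPreNavCrawlingContext

crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()


async def tune_navigation(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
    # Hypothetical: only touch the options if the browser-based sub-crawler has provided a dict.
    if context.goto_options is not None:
        context.goto_options['wait_until'] = 'networkidle'


crawler.pre_navigation_hook(tune_navigation)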
crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py

@@ -32,7 +32,7 @@ FeatureVector = tuple[float, float]


 class RenderingTypePredictorState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     model: Annotated[
         LogisticRegression,