crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +44 -5
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +156 -131
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
  61. crawlee/storage_clients/_redis/__init__.py +6 -0
  62. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  63. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  64. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  65. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  66. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  67. crawlee/storage_clients/_redis/_utils.py +23 -0
  68. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  69. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  70. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  71. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  72. crawlee/storage_clients/_redis/py.typed +0 -0
  73. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  74. crawlee/storage_clients/_sql/_db_models.py +1 -2
  75. crawlee/storage_clients/models.py +8 -3
  76. crawlee/storages/_key_value_store.py +5 -2
  77. crawlee/storages/_storage_instance_manager.py +103 -44
  78. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
  79. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
  80. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  81. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  82. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/_utils/time.py CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations

  import time
  from contextlib import contextmanager
  from dataclasses import dataclass
+ from datetime import timedelta
  from typing import TYPE_CHECKING

+ from async_timeout import Timeout, timeout
+
  if TYPE_CHECKING:
      from collections.abc import Iterator
-     from datetime import timedelta
+     from types import TracebackType

  _SECONDS_PER_MINUTE = 60
  _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
          result.cpu = after_cpu - before_cpu


+ class SharedTimeout:
+     """Keeps track of a time budget shared by multiple independent async operations.
+
+     Provides a reusable, non-reentrant context manager interface.
+     """
+
+     def __init__(self, timeout: timedelta) -> None:
+         self._remaining_timeout = timeout
+         self._active_timeout: Timeout | None = None
+         self._activation_timestamp: float | None = None
+
+     async def __aenter__(self) -> timedelta:
+         if self._active_timeout is not None or self._activation_timestamp is not None:
+             raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+         self._activation_timestamp = time.monotonic()
+         self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+         await new_timeout.__aenter__()
+         return self._remaining_timeout
+
+     async def __aexit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_value: BaseException | None,
+         exc_traceback: TracebackType | None,
+     ) -> None:
+         if self._active_timeout is None or self._activation_timestamp is None:
+             raise RuntimeError('Logic error')
+
+         await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+         elapsed = time.monotonic() - self._activation_timestamp
+         self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+         self._active_timeout = None
+         self._activation_timestamp = None
+
+
  def format_duration(duration: timedelta | None) -> str:
      """Format a timedelta into a human-readable string with appropriate units."""
      if duration is None:
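
The new `SharedTimeout` helper lets several sequential `async with` blocks draw from a single time budget: each exit subtracts the elapsed time, so the next entry only gets what is left, and overrunning the budget raises `asyncio.TimeoutError` via the underlying `async_timeout`. A minimal usage sketch (the `asyncio.sleep()` calls merely stand in for real async work):

```python
import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout


async def main() -> None:
    # One budget of 5 seconds shared by two independent operations.
    shared = SharedTimeout(timedelta(seconds=5))

    async with shared as remaining:
        # `remaining` is the budget left before entering; here the full 5 seconds.
        print(f'Budget before first step: {remaining}')
        await asyncio.sleep(1)

    async with shared as remaining:
        # The second block only gets whatever the first one did not consume (roughly 4 seconds).
        print(f'Budget before second step: {remaining}')
        await asyncio.sleep(1)


asyncio.run(main())
```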
crawlee/_utils/urls.py CHANGED
@@ -7,6 +7,7 @@ from yarl import URL

  if TYPE_CHECKING:
      from collections.abc import Iterator
+     from logging import Logger


  def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
      return str(URL(base_url).join(URL(relative_url)))


- def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+ def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
      """Convert an iterator of relative URLs to absolute URLs using a base URL."""
      for url in urls:
          if is_url_absolute(url):
              yield url
          else:
-             yield convert_to_absolute_url(base_url, url)
+             converted_url = convert_to_absolute_url(base_url, url)
+             # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+             if not is_url_absolute(converted_url):
+                 if logger:
+                     logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                 continue
+             yield converted_url


  _http_url_adapter = TypeAdapter(AnyHttpUrl)
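
With the new optional `logger` argument, links that cannot be turned into absolute URLs (the added comment gives `mailto:` links as an example) are now skipped with a debug message instead of being yielded. A small illustrative sketch of the intended behaviour:

```python
import logging

from crawlee._utils.urls import to_absolute_url_iterator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('link_extraction')

links = iter(['/about', 'https://example.com/docs', 'mailto:hello@example.com'])

# Relative links are joined with the base URL; the mailto: link cannot become
# an absolute http(s) URL, so it is expected to be skipped and logged at DEBUG level.
for url in to_absolute_url_iterator('https://example.com', links, logger=logger):
    print(url)
```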
crawlee/browsers/_browser_pool.py CHANGED
@@ -118,7 +118,10 @@ class BrowserPool:
          """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.

          Args:
-             browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+             browser_type: The type of browser to launch:
+                 - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                 - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed
+                     on the system.
              user_data_dir: Path to a user data directory, which stores browser session data like cookies
                  and local storage.
              browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -135,7 +138,7 @@
              kwargs: Additional arguments for default constructor.
          """
          plugin_options: dict = defaultdict(dict)
-         plugin_options['browser_launch_options'] = browser_launch_options or {}
+         plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}
          plugin_options['browser_new_context_options'] = browser_new_context_options or {}

          if headless is not None:
crawlee/browsers/_playwright_browser.py CHANGED
@@ -78,7 +78,8 @@ class PlaywrightPersistentBrowser(Browser):

      async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
          if self._temp_dir and self._temp_dir.exists():
-             await asyncio.to_thread(shutil.rmtree, self._temp_dir, ignore_errors=True)
+             temp_dir = self._temp_dir
+             await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)

      @override
      async def close(self, **kwargs: Any) -> None:
crawlee/browsers/_playwright_browser_controller.py CHANGED
@@ -216,7 +216,7 @@ class PlaywrightBrowserController(BrowserController):
          browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
          if proxy_info:
              if browser_new_context_options.get('proxy'):
-                 logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
+                 logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

              browser_new_context_options['proxy'] = ProxySettings(
                  server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
crawlee/browsers/_playwright_browser_plugin.py CHANGED
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):

      It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
      for creating new browser instances and provides a unified interface for interacting with different browser types
-     (chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode,
-     executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+     (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+     mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
      browser instance, ensuring that resource limits are respected.
      """

@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
          """Initialize a new instance.

          Args:
-             browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+             browser_type: The type of browser to launch:
+                 - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                 - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed
+                     on the system.
              user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                  storage.
              browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
              'chromium_sandbox': not config.disable_browser_sandbox,
          }

+         if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+             raise ValueError(
+                 'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+             )
+
+         # Map 'chrome' to 'chromium' with the 'chrome' channel.
+         if browser_type == 'chrome':
+             browser_type = 'chromium'
+             # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+             default_launch_browser_options['channel'] = 'chrome'
+
          self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
          self._browser_new_context_options = browser_new_context_options or {}
crawlee/browsers/_types.py CHANGED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Literal

  if TYPE_CHECKING:
      from playwright.async_api import Page

- BrowserType = Literal['chromium', 'firefox', 'webkit']
+ BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome']


  @dataclass
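
Together with the widened `BrowserType` literal above, this makes `'chrome'` a valid browser type: the plugin rejects it when an explicit executable path is configured and otherwise launches Chromium with `channel='chrome'`, i.e. the system-installed Google Chrome. A minimal sketch, assuming Google Chrome is installed and that `PlaywrightCrawler` forwards `browser_type` to the plugin:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # 'chrome' is mapped to Playwright's Chromium launcher with channel='chrome',
    # so the locally installed Google Chrome is used instead of the bundled Chromium.
    crawler = PlaywrightCrawler(browser_type='chrome', headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```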
crawlee/configuration.py CHANGED
@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
      Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
      """

-     model_config = SettingsConfigDict(validate_by_name=True, validate_by_alias=True)
+     # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+     # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
+     model_config = SettingsConfigDict(populate_by_name=True)

      internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
      """Timeout for the internal asynchronous operations."""
crawlee/crawlers/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from crawlee._utils.try_import import install_import_hook as _install_import_hook
  from crawlee._utils.try_import import try_import as _try_import

- from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+ from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
  from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
  from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -23,12 +23,14 @@ with _try_import(
      'AdaptivePlaywrightCrawler',
      'AdaptivePlaywrightCrawlingContext',
      'AdaptivePlaywrightPreNavCrawlingContext',
+     'AdaptivePlaywrightCrawlerStatisticState',
      'RenderingType',
      'RenderingTypePrediction',
      'RenderingTypePredictor',
  ):
      from ._adaptive_playwright import (
          AdaptivePlaywrightCrawler,
+         AdaptivePlaywrightCrawlerStatisticState,
          AdaptivePlaywrightCrawlingContext,
          AdaptivePlaywrightPreNavCrawlingContext,
          RenderingType,
@@ -41,6 +43,7 @@ __all__ = [
      'AbstractHttpCrawler',
      'AbstractHttpParser',
      'AdaptivePlaywrightCrawler',
+     'AdaptivePlaywrightCrawlerStatisticState',
      'AdaptivePlaywrightCrawlingContext',
      'AdaptivePlaywrightPreNavCrawlingContext',
      'BasicCrawler',
@@ -51,6 +54,7 @@ __all__ = [
      'BeautifulSoupParserType',
      'ContextPipeline',
      'HttpCrawler',
+     'HttpCrawlerOptions',
      'HttpCrawlingContext',
      'HttpCrawlingResult',
      'ParsedHttpCrawlingContext',
crawlee/crawlers/_abstract_http/__init__.py CHANGED
@@ -1,9 +1,10 @@
- from ._abstract_http_crawler import AbstractHttpCrawler
+ from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
  from ._abstract_http_parser import AbstractHttpParser
  from ._http_crawling_context import ParsedHttpCrawlingContext

  __all__ = [
      'AbstractHttpCrawler',
      'AbstractHttpParser',
+     'HttpCrawlerOptions',
      'ParsedHttpCrawlingContext',
  ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations

  import asyncio
  import logging
  from abc import ABC
+ from datetime import timedelta
  from typing import TYPE_CHECKING, Any, Generic

  from more_itertools import partition
  from pydantic import ValidationError
- from typing_extensions import TypeVar
+ from typing_extensions import NotRequired, TypeVar

- from crawlee._request import Request, RequestOptions
+ from crawlee._request import Request, RequestOptions, RequestState
  from crawlee._utils.docs import docs_group
+ from crawlee._utils.time import SharedTimeout
  from crawlee._utils.urls import to_absolute_url_iterator
  from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
  from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
  TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+ class HttpCrawlerOptions(
+     BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+     Generic[TCrawlingContext, TStatisticsState],
+ ):
+     """Arguments for the `AbstractHttpCrawler` constructor.
+
+     It is intended for typing forwarded `__init__` arguments in the subclasses.
+     """
+
+     navigation_timeout: NotRequired[timedelta | None]
+     """Timeout for the HTTP request."""
+
+
  @docs_group('Crawlers')
  class AbstractHttpCrawler(
      BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
          self,
          *,
          parser: AbstractHttpParser[TParseResult, TSelectResult],
+         navigation_timeout: timedelta | None = None,
          **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
      ) -> None:
          self._parser = parser
+         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
          self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+         self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

          if '_context_pipeline' not in kwargs:
              raise ValueError(
@@ -82,9 +100,7 @@
          this method simplifies cases where `TParseResult` is used for both generic parameters.
          """

-         class _ParsedHttpCrawler(
-             AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-         ):
+         class _ParsedHttpCrawler(AbstractHttpCrawler):
              def __init__(
                  self,
                  parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
@@ -112,9 +128,17 @@
      async def _execute_pre_navigation_hooks(
          self, context: BasicCrawlingContext
      ) -> AsyncGenerator[BasicCrawlingContext, None]:
-         for hook in self._pre_navigation_hooks:
-             await hook(context)
-         yield context
+         context_id = id(context)
+         self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+         try:
+             for hook in self._pre_navigation_hooks:
+                 async with self._shared_navigation_timeouts[context_id]:
+                     await hook(context)
+
+             yield context
+         finally:
+             self._shared_navigation_timeouts.pop(context_id, None)

      async def _parse_http_response(
          self, context: HttpCrawlingContext
@@ -165,9 +189,18 @@
              robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

              kwargs.setdefault('strategy', 'same-hostname')
+             strategy = kwargs.get('strategy', 'same-hostname')

              links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-             links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+
+             # Get base URL from <base> tag if present
+             extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+             base_url: str = (
+                 str(extracted_base_urls[0])
+                 if extracted_base_urls
+                 else context.request.loaded_url or context.request.url
+             )
+             links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

              if robots_txt_file:
                  skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -175,7 +208,9 @@
                  skipped = iter([])

              for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                 request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+                 request_options = RequestOptions(
+                     url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                 )

                  if transform_request_function:
                      transform_request_options = transform_request_function(request_options)
@@ -214,13 +249,16 @@
          Yields:
              The original crawling context enhanced by HTTP response.
          """
-         result = await self._http_client.crawl(
-             request=context.request,
-             session=context.session,
-             proxy_info=context.proxy_info,
-             statistics=self._statistics,
-         )
+         async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+             result = await self._http_client.crawl(
+                 request=context.request,
+                 session=context.session,
+                 proxy_info=context.proxy_info,
+                 statistics=self._statistics,
+                 timeout=remaining_timeout,
+             )

+         context.request.state = RequestState.AFTER_NAV
          yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

      async def _handle_status_code_response(
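
The new `navigation_timeout` option (defaulting to one minute) is a single budget, managed through `SharedTimeout`, that covers both the pre-navigation hooks and the HTTP request itself; `HttpCrawlerOptions` exposes it for subclasses that forward their `__init__` kwargs. A short sketch, assuming a concrete subclass such as `ParselCrawler` passes the option through to `AbstractHttpCrawler`:

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # Pre-navigation hooks plus the HTTP request together must finish within 30 seconds.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=30))

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data(
            {'url': context.request.url, 'title': context.selector.css('title::text').get()}
        )

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```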
crawlee/crawlers/_adaptive_playwright/__init__.py CHANGED
@@ -11,13 +11,16 @@ _install_import_hook(__name__)

  # The following imports are wrapped in try_import to handle optional dependencies,
  # ensuring the module can still function even if these dependencies are missing.
- with _try_import(__name__, 'BeautifulSoupCrawler'):
+ with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):
      from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
- with _try_import(__name__, 'BeautifulSoupCrawlingContext'):
+ with _try_import(__name__, 'AdaptivePlaywrightCrawler'):
      from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+ with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):
+     from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState

  __all__ = [
      'AdaptivePlaywrightCrawler',
+     'AdaptivePlaywrightCrawlerStatisticState',
      'AdaptivePlaywrightCrawlingContext',
      'AdaptivePlaywrightPreNavCrawlingContext',
      'RenderingType',
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py CHANGED
@@ -27,23 +27,16 @@ from crawlee.crawlers import (
  )
  from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
  from crawlee.crawlers._parsel._parsel_parser import ParselParser
+ from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
  from crawlee.statistics import Statistics, StatisticsState

- from ._adaptive_playwright_crawler_statistics import (
-     AdaptivePlaywrightCrawlerStatisticState,
- )
+ from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
  from ._adaptive_playwright_crawling_context import (
      AdaptivePlaywrightCrawlingContext,
      AdaptivePlaywrightPreNavCrawlingContext,
  )
- from ._rendering_type_predictor import (
-     DefaultRenderingTypePredictor,
-     RenderingType,
-     RenderingTypePredictor,
- )
- from ._result_comparator import (
-     create_default_comparator,
- )
+ from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
+ from ._result_comparator import create_default_comparator

  if TYPE_CHECKING:
      from types import TracebackType
@@ -51,7 +44,6 @@ if TYPE_CHECKING:
      from typing_extensions import Unpack

      from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
-     from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions


  TStaticParseResult = TypeVar('TStaticParseResult')
@@ -71,7 +63,6 @@ class _NonPersistentStatistics(Statistics):
      async def __aenter__(self) -> Self:
          self._active = True
          await self._state.initialize()
-         self._after_initialize()
          return self

      async def __aexit__(
@@ -149,10 +140,6 @@ class AdaptivePlaywrightCrawler(
                  non-default configuration.
              kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
          """
-         # Some sub crawler kwargs are internally modified. Prepare copies.
-         basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-         basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
          # Adaptive crawling related.
          self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
          self.result_checker = result_checker or (lambda _: True)
@@ -162,19 +149,21 @@ class AdaptivePlaywrightCrawler(
          if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
              kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

-         super().__init__(statistics=statistics, **kwargs)
+         adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
+
+         super().__init__(statistics=adaptive_statistics, **kwargs)

          # Sub crawlers related.
-         playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {}
+         playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()

          # Each sub crawler will use custom logger .
          static_logger = getLogger('Subcrawler_static')
          static_logger.setLevel(logging.ERROR)
-         basic_crawler_kwargs_for_static_crawler['_logger'] = static_logger
+         basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

          pw_logger = getLogger('Subcrawler_playwright')
          pw_logger.setLevel(logging.ERROR)
-         basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger
+         basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

          # Initialize sub crawlers to create their pipelines.
          static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -295,11 +284,14 @@ class AdaptivePlaywrightCrawler(
          use_state_function = context.use_state

          # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-         result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+         result = RequestHandlerRunResult(
+             key_value_store_getter=self.get_key_value_store,
+             request=context.request,
+         )
          context_linked_to_result = BasicCrawlingContext(
-             request=deepcopy(context.request),
-             session=deepcopy(context.session),
-             proxy_info=deepcopy(context.proxy_info),
+             request=result.request,
+             session=context.session,
+             proxy_info=context.proxy_info,
              send_request=context.send_request,
              add_requests=result.add_requests,
              push_data=result.push_data,
@@ -337,7 +329,7 @@ class AdaptivePlaywrightCrawler(
                  )
                  await self.router(adaptive_crawling_context)

-             return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
+             return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

          if rendering_type == 'client only':
@@ -347,7 +339,7 @@ class AdaptivePlaywrightCrawler(
                  )
                  await self.router(adaptive_crawling_context)

-             return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
+             return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

          raise RuntimeError(
              f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'
@@ -407,12 +399,9 @@ class AdaptivePlaywrightCrawler(
              raise pw_run.exception

          if pw_run.result:
-             self._context_result_map[context] = pw_run.result
-
              if should_detect_rendering_type:
                  detection_result: RenderingType
                  static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                  if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                      detection_result = 'static'
                  else:
@@ -421,6 +410,8 @@ class AdaptivePlaywrightCrawler(
                  context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                  self.rendering_type_predictor.store_result(context.request, detection_result)

+             self._context_result_map[context] = pw_run.result
+
      def pre_navigation_hook(
          self,
          hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
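
One consequence of the constructor change above: when no `statistics` object is supplied, the adaptive crawler now builds one with `AdaptivePlaywrightCrawlerStatisticState` (which is also newly re-exported from `crawlee.crawlers`). A brief sketch of what that means for callers, assuming the usual factory method:

```python
import asyncio

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlerStatisticState,
    AdaptivePlaywrightCrawlingContext,
)


async def main() -> None:
    # No explicit statistics: the crawler creates Statistics with the adaptive state model.
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

    @crawler.router.default_handler
    async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # The state object should be the adaptive variant rather than the plain StatisticsState.
    assert isinstance(crawler.statistics.state, AdaptivePlaywrightCrawlerStatisticState)


asyncio.run(main())
```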
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py CHANGED
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
      from playwright.async_api import Page, Response
      from typing_extensions import Self

-     from crawlee.crawlers._playwright._types import BlockRequestsFunction
+     from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


  TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
          http_response = await PlaywrightHttpResponse.from_playwright_response(
              response=context.response, protocol=protocol_guess or ''
          )
-         # block_requests is useful only on pre-navigation contexts. It is useless here.
+         # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
          context_kwargs.pop('block_requests')
+         context_kwargs.pop('goto_options')
          return cls(
              parsed_content=await parser.parse(http_response),
              http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
      block_requests: BlockRequestsFunction | None = None
      """Blocks network requests matching specified URL patterns."""

+     goto_options: GotoOptions | None = None
+     """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
      @property
      def page(self) -> Page:
          """The Playwright `Page` object for the current page.