crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in that registry.

Potentially problematic release.


This version of crawlee might be problematic.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_context_utils.py ADDED
@@ -0,0 +1,24 @@
+ from __future__ import annotations
+
+ from contextlib import contextmanager
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterator
+
+     from crawlee._request import Request
+
+     from ._basic_crawling_context import BasicCrawlingContext
+
+
+ @contextmanager
+ def swapped_context(
+     context: BasicCrawlingContext,
+     request: Request,
+ ) -> Iterator[None]:
+     """Replace context's isolated copies with originals after handler execution."""
+     try:
+         yield
+     finally:
+         # Restore original context state to avoid side effects between different handlers.
+         object.__setattr__(context, 'request', request)
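Note: the following is a minimal, self-contained sketch of the restore-on-exit pattern that `swapped_context` applies to the crawling context. `FakeContext` and `swapped` are illustrative names, not crawlee APIs.

    from collections.abc import Iterator
    from contextlib import contextmanager
    from dataclasses import dataclass


    @dataclass
    class FakeContext:
        request: str


    @contextmanager
    def swapped(context: FakeContext, original_request: str) -> Iterator[None]:
        # Same pattern as swapped_context above: whatever a handler does to the
        # context while the block is open, the original request is put back on exit.
        try:
            yield
        finally:
            object.__setattr__(context, 'request', original_request)


    ctx = FakeContext(request='original')
    with swapped(ctx, ctx.request):
        ctx.request = 'mutated by a handler'
    print(ctx.request)  # -> 'original'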
crawlee/crawlers/_basic/_logging_utils.py CHANGED
@@ -2,9 +2,21 @@ import asyncio
  import re
  import traceback
 
+ import crawlee.errors
+
 
  def _get_only_innermost_exception(error: BaseException) -> BaseException:
-     """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+     """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+     If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+     """
+     if type(error) is crawlee.errors.UserHandlerTimeoutError:
+         if error.__cause__:
+             return error.__cause__
+         if error.__context__:
+             return error.__context__
+         return error
+
      if error.__cause__:
          return _get_only_innermost_exception(error.__cause__)
      if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
  def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-     timeout_error: asyncio.exceptions.TimeoutError,
+     timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
  ) -> list[str]:
      innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
      return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,24 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
  def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
      innermost_error = _get_only_innermost_exception(error)
      return traceback.format_exception(
-         type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+         type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
      )
 
 
  def get_one_line_error_summary_if_possible(error: Exception) -> str:
      if isinstance(error, asyncio.exceptions.TimeoutError):
-         most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+         relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+         most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+     elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+         # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+         # code and third line the topmost user error
+         traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+         relevant_index_from_start = 3
+         most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
+     elif 'playwright._impl._errors.Error' in str(error.__class__):
+         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
+         # point to deep internals.
+         return ''
      else:
          traceback_parts = _get_traceback_parts_for_innermost_exception(error)
          # Commonly last traceback part is type of the error, and the second last part is the relevant file.
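Note: a small, self-contained illustration (Python 3.10+) of the `__cause__`/`__context__` walk and the `chain=False` formatting used above. `innermost` is a simplified stand-in, not crawlee code.

    import traceback


    def innermost(error: BaseException) -> BaseException:
        # Follow explicit causes first, then implicit context, down to the deepest exception.
        if error.__cause__:
            return innermost(error.__cause__)
        if error.__context__:
            return innermost(error.__context__)
        return error


    try:
        try:
            1 / 0
        except ZeroDivisionError as exc:
            raise RuntimeError('request handler failed') from exc
    except RuntimeError as outer:
        deepest = innermost(outer)
        print(type(deepest).__name__)  # -> ZeroDivisionError
        # chain=False limits the formatted output to the innermost exception only.
        print(''.join(traceback.format_exception(deepest, chain=False)))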
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
  from bs4 import BeautifulSoup, Tag
 
  from crawlee._utils.docs import docs_group
- from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+ from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
  from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
  from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
          self,
          *,
          parser: BeautifulSoupParserType = 'lxml',
-         **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+         **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
      ) -> None:
          """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_crawler.py CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
  from parsel import Selector
 
  from crawlee._utils.docs import docs_group
- from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+ from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
  from ._parsel_crawling_context import ParselCrawlingContext
  from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
      def __init__(
          self,
-         **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+         **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
      ) -> None:
          """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_parser.py CHANGED
@@ -22,7 +22,7 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):
      @override
      async def parse(self, response: HttpResponse) -> Selector:
          response_body = await response.read()
-         return await asyncio.to_thread(lambda: Selector(body=response_body))
+         return await asyncio.to_thread(Selector, body=response_body)
 
      @override
      async def parse_text(self, text: str) -> Selector:
crawlee/crawlers/_playwright/_playwright_crawler.py CHANGED
@@ -3,18 +3,22 @@ from __future__ import annotations
  import asyncio
  import logging
  import warnings
+ from datetime import timedelta
  from functools import partial
  from typing import TYPE_CHECKING, Any, Generic, Literal
 
+ import playwright.async_api
  from more_itertools import partition
  from pydantic import ValidationError
  from typing_extensions import NotRequired, TypedDict, TypeVar
 
  from crawlee import service_locator
- from crawlee._request import Request, RequestOptions
+ from crawlee._request import Request, RequestOptions, RequestState
+ from crawlee._types import BasicCrawlingContext, ConcurrencySettings
  from crawlee._utils.blocked import RETRY_CSS_SELECTORS
  from crawlee._utils.docs import docs_group
  from crawlee._utils.robots import RobotsTxtFile
+ from crawlee._utils.time import SharedTimeout
  from crawlee._utils.urls import to_absolute_url_iterator
  from crawlee.browsers import BrowserPool
  from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -28,6 +32,7 @@ from crawlee.statistics import StatisticsState
  from ._playwright_crawling_context import PlaywrightCrawlingContext
  from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
  from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+ from ._types import GotoOptions
  from ._utils import block_requests, infinite_scroll
 
  TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -43,7 +48,6 @@ if TYPE_CHECKING:
 
      from crawlee import RequestTransformAction
      from crawlee._types import (
-         BasicCrawlingContext,
          EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -102,9 +106,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
          user_data_dir: str | Path | None = None,
          browser_launch_options: Mapping[str, Any] | None = None,
          browser_new_context_options: Mapping[str, Any] | None = None,
+         goto_options: GotoOptions | None = None,
          fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
          headless: bool | None = None,
          use_incognito_pages: bool | None = None,
+         navigation_timeout: timedelta | None = None,
          **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
      ) -> None:
          """Initialize a new instance.
@@ -113,7 +119,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
              user_data_dir: Path to a user data directory, which stores browser session data like cookies
                  and local storage.
-             browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+             browser_type: The type of browser to launch:
+                 - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                 - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                     the system.
              This option should not be used if `browser_pool` is provided.
              browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                  directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -130,12 +139,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                  own context that is destroyed once the page is closed or crashes.
                  This option should not be used if `browser_pool` is provided.
+             navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                 the request handler)
+             goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                 not supported, use `navigation_timeout` instead.
              kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
          """
          configuration = kwargs.pop('configuration', None)
          if configuration is not None:
              service_locator.set_configuration(configuration)
 
+         self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
          if browser_pool:
              # Raise an exception if browser_pool is provided together with other browser-related arguments.
              if any(
@@ -152,17 +167,16 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              ):
                  raise ValueError(
                      'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                     '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or'
+                     '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                      '`fingerprint_generator` arguments when `browser_pool` is provided.'
                  )
 
          # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
          else:
              if fingerprint_generator == 'default':
-                 if not browser_type:
-                     generator_browser_type = None
-                 else:
-                     generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
+                 generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
+                     [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
+                 )
 
                  fingerprint_generator = DefaultFingerprintGenerator(
                      header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
@@ -194,6 +208,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
          kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
 
+         # Set default concurrency settings for browser crawlers if not provided
+         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
+         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+         self._goto_options = goto_options or GotoOptions()
+
          super().__init__(**kwargs)
 
      async def _open_page(
@@ -218,12 +239,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              log=context.log,
              page=crawlee_page.page,
              block_requests=partial(block_requests, page=crawlee_page.page),
+             goto_options=GotoOptions(**self._goto_options),
          )
 
-         async with browser_page_context(crawlee_page.page):
-             for hook in self._pre_navigation_hooks:
-                 await hook(pre_navigation_context)
-             yield pre_navigation_context
+         context_id = id(pre_navigation_context)
+         self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+         try:
+             async with browser_page_context(crawlee_page.page):
+                 for hook in self._pre_navigation_hooks:
+                     async with self._shared_navigation_timeouts[context_id]:
+                         await hook(pre_navigation_context)
+
+                 yield pre_navigation_context
+         finally:
+             self._shared_navigation_timeouts.pop(context_id, None)
 
      def _prepare_request_interceptor(
          self,
@@ -258,6 +288,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
          Raises:
              ValueError: If the browser pool is not initialized.
              SessionError: If the URL cannot be loaded by the browser.
+             TimeoutError: If navigation does not succeed within the navigation timeout.
 
          Yields:
              The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -289,7 +320,14 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
          # Set route_handler only for current request
          await context.page.route(context.request.url, route_handler)
 
-         response = await context.page.goto(context.request.url)
+         try:
+             async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                 response = await context.page.goto(
+                     context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                 )
+                 context.request.state = RequestState.AFTER_NAV
+         except playwright.async_api.TimeoutError as exc:
+             raise asyncio.TimeoutError from exc
 
          if response is None:
              raise SessionError(f'Failed to load the URL: {context.request.url}')
@@ -316,6 +354,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              extract_links=extract_links,
              enqueue_links=self._create_enqueue_links_function(context, extract_links),
              block_requests=partial(block_requests, page=context.page),
+             goto_options=context.goto_options,
          )
 
          if context.session:
@@ -356,12 +395,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
              kwargs.setdefault('strategy', 'same-hostname')
+             strategy = kwargs.get('strategy', 'same-hostname')
 
              elements = await context.page.query_selector_all(selector)
              links_iterator: Iterator[str] = iter(
                  [url for element in elements if (url := await element.get_attribute('href')) is not None]
              )
-             links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+
+             # Get base URL from <base> tag if present
+             extracted_base_url = await context.page.evaluate('document.baseURI')
+             base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+             links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
              if robots_txt_file:
                  skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -369,17 +414,19 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
                  skipped = iter([])
 
              for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                 request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
+                 request_options = RequestOptions(
+                     url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                 )
 
                  if transform_request_function:
-                     transform_request_option = transform_request_function(request_option)
-                     if transform_request_option == 'skip':
+                     transform_request_options = transform_request_function(request_options)
+                     if transform_request_options == 'skip':
                          continue
-                     if transform_request_option != 'unchanged':
-                         request_option = transform_request_option
+                     if transform_request_options != 'unchanged':
+                         request_options = transform_request_options
 
                  try:
-                     request = Request.from_url(**request_option)
+                     request = Request.from_url(**request_options)
                  except ValidationError as exc:
                      context.log.debug(
                          f'Skipping URL "{url}" due to invalid format: {exc}. '
@@ -465,7 +512,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
      async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
          """Update the cookies in the page context."""
-         await page.context.add_cookies([{**cookie} for cookie in cookies])
+         # False positive ty error, see https://github.com/astral-sh/ty/issues/1493.
+         await page.context.add_cookies([{**cookie} for cookie in cookies])  # ty: ignore[invalid-argument-type]
 
      async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
          """Find the robots.txt file for a given URL.
@@ -489,7 +537,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
      """A `BrowserPool` instance to be used for launching the browsers and getting pages."""
 
      browser_type: NotRequired[BrowserType]
-     """The type of browser to launch ('chromium', 'firefox', or 'webkit').
+     """The type of browser to launch:
+     - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+     - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
      This option should not be used if `browser_pool` is provided."""
 
      browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -509,9 +559,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
 
 
  class PlaywrightCrawlerOptions(
-     Generic[TCrawlingContext, TStatisticsState],
      _PlaywrightCrawlerAdditionalOptions,
      BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+     Generic[TCrawlingContext, TStatisticsState],
  ):
      """Arguments for the `AbstractHttpCrawler` constructor.
 
crawlee/crawlers/_playwright/_playwright_http_client.py CHANGED
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
          statistics: Statistics | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpCrawlingResult:
          raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
 
@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
          payload: HttpPayload | None = None,
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpResponse:
          # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
          # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):
 
          # Proxies appropriate to the browser context are used
          response = await browser_context.request.fetch(
-             url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+             url_or_request=url,
+             method=method.lower(),
+             headers=dict(headers) if headers else None,
+             data=payload,
+             timeout=timeout.total_seconds() if timeout else None,
          )
 
          return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
  if TYPE_CHECKING:
      from playwright.async_api import Page
 
-     from ._types import BlockRequestsFunction
+     from ._types import BlockRequestsFunction, GotoOptions
 
 
  @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
      block_requests: BlockRequestsFunction
      """Blocks network requests matching specified URL patterns."""
 
+     goto_options: GotoOptions
+     """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
      async def get_snapshot(self) -> PageSnapshot:
          """Get snapshot of crawled page."""
          html = None
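Note: based on the wiring in these hunks, each pre-navigation context receives its own mutable `goto_options` copy, so a hook can presumably adjust it per request before `page.goto()` runs. A hedged sketch; the hook body and values are illustrative, not crawlee code.

    from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

    crawler = PlaywrightCrawler()


    @crawler.pre_navigation_hook
    async def tweak_navigation(context: PlaywrightPreNavCrawlingContext) -> None:
        # GotoOptions is a plain TypedDict, so its keys can be set like a dict;
        # these values feed the subsequent page.goto() call for this request.
        context.goto_options['wait_until'] = 'commit'
        context.goto_options['referer'] = 'https://example.com'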
crawlee/crawlers/_playwright/_types.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations
 
  from dataclasses import dataclass
- from typing import TYPE_CHECKING, Protocol
+ from typing import TYPE_CHECKING, Literal, Protocol, TypedDict
 
  from crawlee import HttpHeaders
  from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
      from collections.abc import AsyncGenerator
 
      from playwright.async_api import APIResponse, Response
-     from typing_extensions import Self
+     from typing_extensions import NotRequired, Self
 
 
  @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
          _content = await response.body()
 
          return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+ class GotoOptions(TypedDict):
+     """Keyword arguments for Playwright's `Page.goto()` method."""
+
+     wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+     """When to consider operation succeeded, defaults to 'load' event."""
+
+     referer: NotRequired[str]
+     """Referer header value."""
crawlee/errors.py CHANGED
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
      """Wraps an exception thrown from an user-defined error handler."""
 
 
+ class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+     """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
  @docs_group('Errors')
  class SessionError(Exception):
      """Errors of `SessionError` type will trigger a session rotation.
crawlee/events/_event_manager.py CHANGED
@@ -130,11 +130,13 @@
          if not self._active:
              raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
+         # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+         await self._emit_persist_state_event_rec_task.stop()
+         await self._emit_persist_state_event()
          await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
          self._event_emitter.remove_all_listeners()
          self._listener_tasks.clear()
          self._listeners_to_wrappers.clear()
-         await self._emit_persist_state_event_rec_task.stop()
          self._active = False
 
      @overload
@@ -172,13 +174,12 @@ class EventManager:
              # to avoid blocking the event loop
              coro = (
                  listener(*bound_args.args, **bound_args.kwargs)
-                 if asyncio.iscoroutinefunction(listener)
+                 if inspect.iscoroutinefunction(listener)
                  else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
              )
-             # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-             # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.
 
-             listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')
+             listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
+             listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
              self._listener_tasks.add(listener_task)
 
              try:
@@ -189,7 +190,12 @@ class EventManager:
                  # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                  logger.exception(
                      'Exception in the event listener',
-                     extra={'event_name': event.value, 'listener_name': listener.__name__},
+                     extra={
+                         'event_name': event.value,
+                         'listener_name': listener.__name__
+                         if hasattr(listener, '__name__')
+                         else listener.__class__.__name__,
+                     },
                  )
              finally:
                  logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
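Note: a standalone illustration (not crawlee code) of the listener-name fallback and coroutine check introduced above, using a callable object that has no `__name__` of its own.

    import inspect


    class StatsCollector:
        """A callable listener with no __name__ attribute of its own."""

        def __call__(self, event_data: object) -> None:
            print('event received:', event_data)


    listener = StatsCollector()

    # Plain functions keep their __name__; callable instances fall back to the class name.
    listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
    print(listener_name)  # -> 'StatsCollector'

    # Not a coroutine function, so the EventManager would dispatch it via asyncio.to_thread().
    print(inspect.iscoroutinefunction(listener))  # -> False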
crawlee/events/_types.py CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
  class EventPersistStateData(BaseModel):
      """Data for the persist state event."""
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
      is_migrating: Annotated[bool, Field(alias='isMigrating')]
 
@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
  class EventSystemInfoData(BaseModel):
      """Data for the system info event."""
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
      cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
      memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
  class EventMigratingData(BaseModel):
      """Data for the migrating event."""
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
      # The remaining time in seconds before the migration is forced and the process is killed
      # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
  class EventAbortingData(BaseModel):
      """Data for the aborting event."""
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
 
  @docs_group('Event data')
  class EventExitData(BaseModel):
      """Data for the exit event."""
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
 
  @docs_group('Event data')
  class EventCrawlerStatusData(BaseModel):
      """Data for the crawler status event."""
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
      message: str
      """A message describing the current status of the crawler."""
crawlee/fingerprint_suite/_browserforge_adapter.py CHANGED
@@ -154,7 +154,7 @@ class PatchedHeaderGenerator(bf_HeaderGenerator):
  class PatchedFingerprintGenerator(bf_FingerprintGenerator):
      """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""
 
-     def __init__(  # type:ignore[no-untyped-def] # Upstream repo types missing.
+     def __init__(
          self,
          *,
          screen: Screen | None = None,
crawlee/fingerprint_suite/_fingerprint_generator.py CHANGED
@@ -3,10 +3,13 @@ from __future__ import annotations
  from abc import ABC, abstractmethod
  from typing import TYPE_CHECKING
 
+ from crawlee._utils.docs import docs_group
+
  if TYPE_CHECKING:
      from browserforge.fingerprints import Fingerprint
 
 
+ @docs_group('Other')
  class FingerprintGenerator(ABC):
      """A class for creating browser fingerprints that mimic browser fingerprints of real users."""
 
crawlee/fingerprint_suite/_header_generator.py CHANGED
@@ -11,9 +11,9 @@ if TYPE_CHECKING:
 
 
  def fingerprint_browser_type_from_playwright_browser_type(
-     playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+     playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
  ) -> SupportedBrowserType:
-     if playwright_browser_type == 'chromium':
+     if playwright_browser_type in {'chromium', 'chrome'}:
          return 'chrome'
      if playwright_browser_type == 'firefox':
          return 'firefox'
crawlee/fingerprint_suite/_types.py CHANGED
@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']
 
 
  class ScreenOptions(BaseModel):
-     model_config = ConfigDict(extra='forbid', populate_by_name=True)
+     model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)
 
      """Defines the screen constrains for the fingerprint generator."""
 
@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
  class HeaderGeneratorOptions(BaseModel):
      """Collection of header related attributes that can be used by the fingerprint generator."""
 
-     model_config = ConfigDict(extra='forbid', populate_by_name=True)
+     model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)
 
      browsers: list[SupportedBrowserType] | None = None
      """List of BrowserSpecifications to generate the headers for."""