crawlee 1.0.1b9__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee has been flagged as possibly problematic.

Files changed (93)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +62 -32
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +52 -19
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +160 -134
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
  61. crawlee/storage_clients/_memory/_dataset_client.py +2 -2
  62. crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
  63. crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
  64. crawlee/storage_clients/_redis/__init__.py +6 -0
  65. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  66. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  67. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  68. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  69. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  70. crawlee/storage_clients/_redis/_utils.py +23 -0
  71. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  72. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  73. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  74. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  75. crawlee/storage_clients/_redis/py.typed +0 -0
  76. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  77. crawlee/storage_clients/_sql/_dataset_client.py +2 -2
  78. crawlee/storage_clients/_sql/_db_models.py +1 -2
  79. crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
  80. crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
  81. crawlee/storage_clients/_sql/_storage_client.py +1 -1
  82. crawlee/storage_clients/models.py +8 -3
  83. crawlee/storages/_base.py +3 -1
  84. crawlee/storages/_dataset.py +3 -0
  85. crawlee/storages/_key_value_store.py +8 -2
  86. crawlee/storages/_request_queue.py +3 -0
  87. crawlee/storages/_storage_instance_manager.py +109 -42
  88. crawlee/storages/_utils.py +11 -0
  89. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
  90. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
  91. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  92. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  93. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_logging_utils.py CHANGED
@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.
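
These helpers lean on Python's standard exception chaining: `raise ... from ...` records the original error in `__cause__`, while an exception raised inside an `except` block records it in `__context__`. A minimal standalone sketch of the traversal idea (illustrative only, not part of the diff):

def innermost(error: BaseException) -> BaseException:
    # Same traversal idea as _get_only_innermost_exception(): prefer the explicit
    # cause, fall back to the implicit context, stop when neither is set.
    if error.__cause__:
        return innermost(error.__cause__)
    if error.__context__:
        return innermost(error.__context__)
    return error


try:
    try:
        raise ValueError('user handler failed')  # the "real" error
    except ValueError as exc:
        raise TimeoutError('handler timed out') from exc  # wrapper sets __cause__
except TimeoutError as wrapped:
    assert isinstance(innermost(wrapped), ValueError)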
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_crawler.py CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
     def __init__(
         self,
-        **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_parser.py CHANGED
@@ -22,7 +22,7 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):
     @override
     async def parse(self, response: HttpResponse) -> Selector:
         response_body = await response.read()
-        return await asyncio.to_thread(lambda: Selector(body=response_body))
+        return await asyncio.to_thread(Selector, body=response_body)
 
     @override
     async def parse_text(self, text: str) -> Selector:
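
The parser change works because `asyncio.to_thread(func, *args, **kwargs)` forwards the arguments to `func` in the worker thread itself, so the wrapping lambda adds nothing. A small illustrative sketch, assuming `parsel` is installed:

import asyncio

from parsel import Selector


async def parse(body: bytes) -> Selector:
    # Equivalent to `asyncio.to_thread(lambda: Selector(body=body))`, minus the extra closure.
    return await asyncio.to_thread(Selector, body=body)


selector = asyncio.run(parse(b'<html><body><p>hi</p></body></html>'))
print(selector.css('p::text').get())  # -> 'hi'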
crawlee/crawlers/_playwright/_playwright_crawler.py CHANGED
@@ -3,19 +3,22 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
 
+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar
 
 from crawlee import service_locator
-from crawlee._request import Request, RequestOptions
-from crawlee._types import ConcurrencySettings
+from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -29,6 +32,7 @@ from crawlee.statistics import StatisticsState
 from ._playwright_crawling_context import PlaywrightCrawlingContext
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+from ._types import GotoOptions
 from ._utils import block_requests, infinite_scroll
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -44,7 +48,6 @@ if TYPE_CHECKING:
 
     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -103,9 +106,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         user_data_dir: str | Path | None = None,
         browser_launch_options: Mapping[str, Any] | None = None,
         browser_new_context_options: Mapping[str, Any] | None = None,
+        goto_options: GotoOptions | None = None,
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -114,7 +119,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -131,12 +139,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
+            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                not supported, use `navigation_timeout` instead.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)
 
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -153,17 +167,16 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             ):
                 raise ValueError(
                     'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or'
+                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                     '`fingerprint_generator` arguments when `browser_pool` is provided.'
                 )
 
         # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
         else:
             if fingerprint_generator == 'default':
-                if not browser_type:
-                    generator_browser_type = None
-                else:
-                    generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
+                generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
+                    [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
+                )
 
                 fingerprint_generator = DefaultFingerprintGenerator(
                     header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
@@ -199,6 +212,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
 
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+        self._goto_options = goto_options or GotoOptions()
+
         super().__init__(**kwargs)
 
     async def _open_page(
@@ -223,12 +239,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             log=context.log,
             page=crawlee_page.page,
             block_requests=partial(block_requests, page=crawlee_page.page),
+            goto_options=GotoOptions(**self._goto_options),
         )
 
-        async with browser_page_context(crawlee_page.page):
-            for hook in self._pre_navigation_hooks:
-                await hook(pre_navigation_context)
-            yield pre_navigation_context
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+                yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     def _prepare_request_interceptor(
         self,
@@ -263,6 +288,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.
 
         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -294,7 +320,14 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         # Set route_handler only for current request
        await context.page.route(context.request.url, route_handler)
 
-        response = await context.page.goto(context.request.url)
+        try:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                )
+                context.request.state = RequestState.AFTER_NAV
+        except playwright.async_api.TimeoutError as exc:
+            raise asyncio.TimeoutError from exc
 
         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')
@@ -321,6 +354,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             extract_links=extract_links,
             enqueue_links=self._create_enqueue_links_function(context, extract_links),
             block_requests=partial(block_requests, page=context.page),
+            goto_options=context.goto_options,
         )
 
         if context.session:
@@ -361,12 +395,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
            kwargs.setdefault('strategy', 'same-hostname')
+           strategy = kwargs.get('strategy', 'same-hostname')
 
            elements = await context.page.query_selector_all(selector)
            links_iterator: Iterator[str] = iter(
                [url for element in elements if (url := await element.get_attribute('href')) is not None]
            )
-           links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+
+           # Get base URL from <base> tag if present
+           extracted_base_url = await context.page.evaluate('document.baseURI')
+           base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+           links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
            if robots_txt_file:
                skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -374,17 +414,19 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
                skipped = iter([])
 
            for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-               request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
+               request_options = RequestOptions(
+                   url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+               )
 
                if transform_request_function:
-                   transform_request_option = transform_request_function(request_option)
-                   if transform_request_option == 'skip':
+                   transform_request_options = transform_request_function(request_options)
+                   if transform_request_options == 'skip':
                        continue
-                   if transform_request_option != 'unchanged':
-                       request_option = transform_request_option
+                   if transform_request_options != 'unchanged':
+                       request_options = transform_request_options
 
                try:
-                   request = Request.from_url(**request_option)
+                   request = Request.from_url(**request_options)
                except ValidationError as exc:
                    context.log.debug(
                        f'Skipping URL "{url}" due to invalid format: {exc}. '
@@ -470,7 +512,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
     async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
         """Update the cookies in the page context."""
-        await page.context.add_cookies([{**cookie} for cookie in cookies])
+        # False positive ty error, see https://github.com/astral-sh/ty/issues/1493.
+        await page.context.add_cookies([{**cookie} for cookie in cookies])  # ty: ignore[invalid-argument-type]
 
     async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
         """Find the robots.txt file for a given URL.
@@ -494,7 +537,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""
 
     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch ('chromium', 'firefox', or 'webkit').
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""
 
     browser_launch_options: NotRequired[Mapping[str, Any]]
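
For orientation, a hedged usage sketch of the new `navigation_timeout` and `goto_options` parameters; it assumes the public `crawlee.crawlers` exports and the crawler's `router`/`run` API are unchanged from earlier releases:

import asyncio
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # `navigation_timeout` caps the whole navigation phase; `goto_options` is forwarded
    # to Page.goto() (its `timeout` key is intentionally unsupported; use `navigation_timeout`).
    crawler = PlaywrightCrawler(
        navigation_timeout=timedelta(seconds=30),
        goto_options={'wait_until': 'domcontentloaded', 'referer': 'https://example.com'},
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())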
crawlee/crawlers/_playwright/_playwright_http_client.py CHANGED
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
 
@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@
 
         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )
 
         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
 if TYPE_CHECKING:
     from playwright.async_api import Page
 
-    from ._types import BlockRequestsFunction
+    from ._types import BlockRequestsFunction, GotoOptions
 
 
 @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
         html = None
crawlee/crawlers/_playwright/_types.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict
 
 from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
 
     from playwright.async_api import APIResponse, Response
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Self
 
 
 @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
         _content = await response.body()
 
         return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+class GotoOptions(TypedDict):
+    """Keyword arguments for Playwright's `Page.goto()` method."""
+
+    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+    """When to consider operation succeeded, defaults to 'load' event."""
+
+    referer: NotRequired[str]
+    """Referer header value."""
crawlee/errors.py CHANGED
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
     """Wraps an exception thrown from an user-defined error handler."""
 
 
+class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+    """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
 @docs_group('Errors')
 class SessionError(Exception):
     """Errors of `SessionError` type will trigger a session rotation.
crawlee/events/_event_manager.py CHANGED
@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False
 
     @overload
@@ -172,13 +174,12 @@ class EventManager:
             # to avoid blocking the event loop
             coro = (
                 listener(*bound_args.args, **bound_args.kwargs)
-                if asyncio.iscoroutinefunction(listener)
+                if inspect.iscoroutinefunction(listener)
                 else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
             )
-            # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-            # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.
 
-            listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')
+            listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
+            listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
             self._listener_tasks.add(listener_task)
 
             try:
@@ -189,7 +190,12 @@
                 # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                 logger.exception(
                     'Exception in the event listener',
-                    extra={'event_name': event.value, 'listener_name': listener.__name__},
+                    extra={
+                        'event_name': event.value,
+                        'listener_name': listener.__name__
+                        if hasattr(listener, '__name__')
+                        else listener.__class__.__name__,
+                    },
                 )
             finally:
                 logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
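
The `__name__` fallback matters because listeners are not always plain functions: callable objects such as class instances or `functools.partial` wrappers carry no `__name__` attribute. A short sketch of the same fallback logic:

import functools


class StateSaver:
    """A listener implemented as a callable object; it has no __name__ attribute."""

    def __call__(self, event_data: object) -> None:
        pass


for candidate in (StateSaver(), functools.partial(print, 'persist state:')):
    # Same fallback as the patched EventManager code above: prefer __name__, otherwise the class name.
    name = candidate.__name__ if hasattr(candidate, '__name__') else candidate.__class__.__name__
    print(name)  # -> 'StateSaver', then 'partial'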
crawlee/fingerprint_suite/_browserforge_adapter.py CHANGED
@@ -154,7 +154,7 @@ class PatchedHeaderGenerator(bf_HeaderGenerator):
 
 class PatchedFingerprintGenerator(bf_FingerprintGenerator):
     """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""
-    def __init__(  # type:ignore[no-untyped-def] # Upstream repo types missing.
+    def __init__(
         self,
         *,
         screen: Screen | None = None,
crawlee/fingerprint_suite/_header_generator.py CHANGED
@@ -11,9 +11,9 @@ if TYPE_CHECKING:
 
 
 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type == 'chromium':
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/http_clients/_base.py CHANGED
@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.
 
@@ -114,6 +115,7 @@
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.
 
         Raises:
             ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.
 
@@ -144,6 +147,7 @@
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.
 
         Raises:
             ProxyError: Raised if a proxy-related error occurs.
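
The abstract `HttpClient` now accepts an optional per-request `timeout` on both `crawl` and `send_request`. A hedged sketch of direct client usage; it assumes the concrete `HttpxHttpClient` export and the `send_request` defaults match earlier releases:

import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    client = HttpxHttpClient()
    # `timeout` bounds this single request; passing None keeps the client's default behaviour.
    response = await client.send_request(
        'https://crawlee.dev',
        method='GET',
        timeout=timedelta(seconds=10),
    )
    print(response.status_code)


if __name__ == '__main__':
    asyncio.run(main())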