crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (102)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_logging_utils.py CHANGED
@@ -49,7 +49,11 @@ def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:

 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+    elif 'playwright._impl._errors.Error' in str(error.__class__):
+        # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
+        # point to deep internals.
+        return ''
     else:
         traceback_parts = _get_traceback_parts_for_innermost_exception(error)
         # Commonly last traceback part is type of the error, and the second last part is the relevant file.
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

crawlee/crawlers/_parsel/_parsel_crawler.py CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto

     def __init__(
         self,
-        **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

crawlee/crawlers/_playwright/_playwright_crawler.py CHANGED
@@ -3,18 +3,25 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal

+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import (
+    BasicCrawlingContext,
+    ConcurrencySettings,
+)
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -28,6 +35,7 @@ from crawlee.statistics import StatisticsState
 from ._playwright_crawling_context import PlaywrightCrawlingContext
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+from ._types import GotoOptions
 from ._utils import block_requests, infinite_scroll

 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -43,7 +51,6 @@ if TYPE_CHECKING:

     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -102,9 +109,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         user_data_dir: str | Path | None = None,
         browser_launch_options: Mapping[str, Any] | None = None,
         browser_new_context_options: Mapping[str, Any] | None = None,
+        goto_options: GotoOptions | None = None,
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -113,7 +122,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -130,12 +142,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
+            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                not supported, use `navigation_timeout` instead.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)

+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -152,7 +170,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             ):
                 raise ValueError(
                     'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or'
+                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                     '`fingerprint_generator` arguments when `browser_pool` is provided.'
                 )

@@ -194,6 +212,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+        self._goto_options = goto_options or GotoOptions()
+
         super().__init__(**kwargs)

     async def _open_page(
@@ -218,12 +243,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             log=context.log,
             page=crawlee_page.page,
             block_requests=partial(block_requests, page=crawlee_page.page),
+            goto_options=GotoOptions(**self._goto_options),
         )

-        async with browser_page_context(crawlee_page.page):
-            for hook in self._pre_navigation_hooks:
-                await hook(pre_navigation_context)
-            yield pre_navigation_context
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+                yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     def _prepare_request_interceptor(
         self,
@@ -258,6 +292,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.

         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -289,7 +324,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         # Set route_handler only for current request
         await context.page.route(context.request.url, route_handler)

-        response = await context.page.goto(context.request.url)
+        try:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                )
+        except playwright.async_api.TimeoutError as exc:
+            raise asyncio.TimeoutError from exc

         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')
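The hunks above share a single navigation budget between the pre-navigation hooks and `Page.goto()` through `SharedTimeout`. The snippet below is only a rough, hypothetical sketch of that idea (crawlee's actual `SharedTimeout` in `crawlee/_utils/time.py` is not shown in this diff): an async context manager that carries the remaining budget across consecutive `async with` blocks.

```python
import asyncio
import time
from datetime import timedelta


class SharedDeadline:
    """Illustrative only: one time budget shared by several consecutive `async with` blocks."""

    def __init__(self, budget: timedelta) -> None:
        self._remaining = budget.total_seconds()
        self._entered_at = 0.0

    async def __aenter__(self) -> timedelta:
        if self._remaining <= 0:
            raise asyncio.TimeoutError('shared time budget exhausted')
        self._entered_at = time.monotonic()
        # The caller can forward the remaining budget, e.g. page.goto(..., timeout=remaining.total_seconds() * 1000).
        return timedelta(seconds=self._remaining)

    async def __aexit__(self, *exc_info: object) -> None:
        # Whatever this block consumed is subtracted from the budget available to the next block.
        self._remaining -= time.monotonic() - self._entered_at
```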
@@ -316,6 +357,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             extract_links=extract_links,
             enqueue_links=self._create_enqueue_links_function(context, extract_links),
             block_requests=partial(block_requests, page=context.page),
+            goto_options=context.goto_options,
         )

         if context.session:
@@ -361,7 +403,12 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             links_iterator: Iterator[str] = iter(
                 [url for element in elements if (url := await element.get_attribute('href')) is not None]
             )
-            links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+
+            # Get base URL from <base> tag if present
+            extracted_base_url = await context.page.evaluate('document.baseURI')
+            base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
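The replacement above resolves extracted links against `document.baseURI`, so a `<base>` tag is honoured instead of always using the loaded URL. The resolution step itself behaves like the standard library's `urljoin`; a small sketch with made-up URLs:

```python
from urllib.parse import urljoin

# If https://example.com/docs/page.html contains <base href="https://cdn.example.com/assets/">,
# document.baseURI is the <base> href and relative links resolve against it, not the page URL.
print(urljoin('https://cdn.example.com/assets/', 'style.css'))
# -> https://cdn.example.com/assets/style.css
print(urljoin('https://example.com/docs/page.html', 'style.css'))
# -> https://example.com/docs/style.css
```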
@@ -489,7 +536,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch ('chromium', 'firefox', or 'webkit').
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""

     browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -509,9 +558,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):


 class PlaywrightCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _PlaywrightCrawlerAdditionalOptions,
     BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `AbstractHttpCrawler` constructor.

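Taken together, the new constructor parameters can be exercised roughly as follows; a minimal usage sketch (URL and handler are placeholders), with `goto_options` passed as a plain dict since `GotoOptions` is a `TypedDict`:

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

# One navigation budget covers pre-navigation hooks plus page.goto(); wait only for 'domcontentloaded'.
crawler = PlaywrightCrawler(
    navigation_timeout=timedelta(seconds=30),
    goto_options={'wait_until': 'domcontentloaded'},
)


@crawler.router.default_handler
async def handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'Visited {context.request.url}')


if __name__ == '__main__':
    asyncio.run(crawler.run(['https://example.com']))
```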
crawlee/crawlers/_playwright/_playwright_http_client.py CHANGED
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')

@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):

         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )

         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
 if TYPE_CHECKING:
     from playwright.async_api import Page

-    from ._types import BlockRequestsFunction
+    from ._types import BlockRequestsFunction, GotoOptions


 @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
         html = None
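Because the pre-navigation context now carries `goto_options`, a hook can tweak navigation behaviour per request; a hypothetical sketch (the hook body and chosen value are illustrative only):

```python
from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

crawler = PlaywrightCrawler()


@crawler.pre_navigation_hook
async def tune_navigation(context: PlaywrightPreNavCrawlingContext) -> None:
    # Mutate the per-request goto options before page.goto() runs; `timeout` is deliberately not a key here.
    context.goto_options['wait_until'] = 'commit'
```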
crawlee/crawlers/_playwright/_types.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict

 from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncGenerator

     from playwright.async_api import APIResponse, Response
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Self


 @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
         _content = await response.body()

         return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+class GotoOptions(TypedDict):
+    """Keyword arguments for Playwright's `Page.goto()` method."""
+
+    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+    """When to consider operation succeeded, defaults to 'load' event."""
+
+    referer: NotRequired[str]
+    """Referer header value."""
crawlee/events/_event_manager.py CHANGED
@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False

     @overload
@@ -172,11 +174,9 @@ class EventManager:
             # to avoid blocking the event loop
             coro = (
                 listener(*bound_args.args, **bound_args.kwargs)
-                if asyncio.iscoroutinefunction(listener)
+                if inspect.iscoroutinefunction(listener)
                 else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
             )
-            # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-            # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.

             listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')
             self._listener_tasks.add(listener_task)
crawlee/events/_types.py CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
 class EventPersistStateData(BaseModel):
     """Data for the persist state event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     is_migrating: Annotated[bool, Field(alias='isMigrating')]

@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
 class EventSystemInfoData(BaseModel):
     """Data for the system info event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
     memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
 class EventMigratingData(BaseModel):
     """Data for the migrating event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     # The remaining time in seconds before the migration is forced and the process is killed
     # Optional because it's not present when the event handler is called manually
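The `populate_by_name` → `validate_by_name` / `validate_by_alias` switch, repeated in the hunks above and below, follows the newer Pydantic v2 configuration naming (available in recent Pydantic 2.x releases). A minimal sketch of the behaviour these event models rely on:

```python
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class ExampleEventData(BaseModel):
    # Accept input both by the Python field name and by the camelCase alias.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    is_migrating: Annotated[bool, Field(alias='isMigrating')]


print(ExampleEventData(isMigrating=True).is_migrating)    # populated via the alias
print(ExampleEventData(is_migrating=False).is_migrating)  # populated via the field name
```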
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
 class EventAbortingData(BaseModel):
     """Data for the aborting event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventExitData(BaseModel):
     """Data for the exit event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventCrawlerStatusData(BaseModel):
     """Data for the crawler status event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     message: str
     """A message describing the current status of the crawler."""
crawlee/fingerprint_suite/_fingerprint_generator.py CHANGED
@@ -3,10 +3,13 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING

+from crawlee._utils.docs import docs_group
+
 if TYPE_CHECKING:
     from browserforge.fingerprints import Fingerprint


+@docs_group('Other')
 class FingerprintGenerator(ABC):
     """A class for creating browser fingerprints that mimic browser fingerprints of real users."""

crawlee/fingerprint_suite/_header_generator.py CHANGED
@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type == 'chromium':
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/fingerprint_suite/_types.py CHANGED
@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


 class ScreenOptions(BaseModel):
-    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     """Defines the screen constrains for the fingerprint generator."""

@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
 class HeaderGeneratorOptions(BaseModel):
     """Collection of header related attributes that can be used by the fingerprint generator."""

-    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     browsers: list[SupportedBrowserType] | None = None
     """List of BrowserSpecifications to generate the headers for."""
crawlee/http_clients/_base.py CHANGED
@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.

@@ -114,6 +115,7 @@ class HttpClient(ABC):
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@ class HttpClient(ABC):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.

@@ -144,6 +147,7 @@ class HttpClient(ABC):
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
crawlee/http_clients/_curl_impersonate.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING, Any

@@ -10,6 +11,7 @@ from curl_cffi.requests.cookies import Cookies as CurlCookies
 from curl_cffi.requests.cookies import CurlMorsel
 from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
 from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+from curl_cffi.requests.exceptions import Timeout
 from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 from typing_extensions import override

@@ -147,6 +149,7 @@ class CurlImpersonateHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -157,7 +160,10 @@ class CurlImpersonateHttpClient(HttpClient):
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -186,6 +192,7 @@ class CurlImpersonateHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -200,7 +207,10 @@ class CurlImpersonateHttpClient(HttpClient):
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -241,6 +251,8 @@ class CurlImpersonateHttpClient(HttpClient):
                 stream=True,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
crawlee/http_clients/_httpx.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@ class HttpxHttpClient(HttpClient):
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -185,6 +190,7 @@ class HttpxHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -195,10 +201,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -228,10 +237,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )

-        response = await client.send(http_request, stream=True)
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _HttpxResponse(response)
@@ -246,7 +258,7 @@ class HttpxHttpClient(HttpClient):
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout: timedelta | None = None,
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@ class HttpxHttpClient(HttpClient):

         headers = self._combine_headers(headers)

-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=httpx_timeout,
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )

     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient: