crawlee 1.0.5b18__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_types.py +44 -5
  5. crawlee/_utils/context.py +3 -3
  6. crawlee/_utils/file.py +8 -1
  7. crawlee/_utils/globs.py +4 -4
  8. crawlee/_utils/recurring_task.py +12 -3
  9. crawlee/_utils/sitemap.py +12 -5
  10. crawlee/_utils/system.py +27 -11
  11. crawlee/_utils/time.py +41 -1
  12. crawlee/browsers/_browser_pool.py +1 -1
  13. crawlee/browsers/_playwright_browser.py +2 -1
  14. crawlee/crawlers/__init__.py +5 -1
  15. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  16. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
  17. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  18. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
  19. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  20. crawlee/crawlers/_basic/_basic_crawler.py +138 -124
  21. crawlee/crawlers/_basic/_context_utils.py +24 -0
  22. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  26. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
  27. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  28. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  29. crawlee/crawlers/_playwright/_types.py +12 -2
  30. crawlee/errors.py +4 -0
  31. crawlee/events/_event_manager.py +12 -6
  32. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  33. crawlee/http_clients/_base.py +4 -0
  34. crawlee/http_clients/_curl_impersonate.py +68 -14
  35. crawlee/http_clients/_httpx.py +16 -6
  36. crawlee/http_clients/_impit.py +25 -10
  37. crawlee/otel/crawler_instrumentor.py +1 -3
  38. crawlee/request_loaders/_sitemap_request_loader.py +18 -5
  39. crawlee/router.py +13 -3
  40. crawlee/sessions/_cookies.py +13 -8
  41. crawlee/sessions/_models.py +3 -3
  42. crawlee/statistics/_models.py +51 -9
  43. crawlee/statistics/_statistics.py +2 -21
  44. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  45. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  46. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  47. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  48. crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
  49. crawlee/storage_clients/_redis/_client_mixin.py +1 -4
  50. crawlee/storage_clients/_redis/_dataset_client.py +6 -2
  51. crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
  52. crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
  53. crawlee/storage_clients/_redis/_storage_client.py +12 -9
  54. crawlee/storage_clients/_redis/_utils.py +1 -1
  55. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  56. crawlee/storage_clients/_sql/_storage_client.py +0 -9
  57. crawlee/storage_clients/models.py +8 -3
  58. crawlee/storages/_storage_instance_manager.py +103 -44
  59. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
  60. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
  61. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  62. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  63. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_playwright/_playwright_crawler.py CHANGED
@@ -3,19 +3,22 @@ from __future__ import annotations
  import asyncio
  import logging
  import warnings
+ from datetime import timedelta
  from functools import partial
  from typing import TYPE_CHECKING, Any, Generic, Literal

+ import playwright.async_api
  from more_itertools import partition
  from pydantic import ValidationError
  from typing_extensions import NotRequired, TypedDict, TypeVar

  from crawlee import service_locator
- from crawlee._request import Request, RequestOptions
- from crawlee._types import ConcurrencySettings
+ from crawlee._request import Request, RequestOptions, RequestState
+ from crawlee._types import BasicCrawlingContext, ConcurrencySettings
  from crawlee._utils.blocked import RETRY_CSS_SELECTORS
  from crawlee._utils.docs import docs_group
  from crawlee._utils.robots import RobotsTxtFile
+ from crawlee._utils.time import SharedTimeout
  from crawlee._utils.urls import to_absolute_url_iterator
  from crawlee.browsers import BrowserPool
  from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -29,6 +32,7 @@ from crawlee.statistics import StatisticsState
  from ._playwright_crawling_context import PlaywrightCrawlingContext
  from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
  from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+ from ._types import GotoOptions
  from ._utils import block_requests, infinite_scroll

  TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -44,7 +48,6 @@ if TYPE_CHECKING:

      from crawlee import RequestTransformAction
      from crawlee._types import (
-         BasicCrawlingContext,
          EnqueueLinksKwargs,
          ExtractLinksFunction,
          HttpHeaders,
@@ -103,9 +106,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
          user_data_dir: str | Path | None = None,
          browser_launch_options: Mapping[str, Any] | None = None,
          browser_new_context_options: Mapping[str, Any] | None = None,
+         goto_options: GotoOptions | None = None,
          fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
          headless: bool | None = None,
          use_incognito_pages: bool | None = None,
+         navigation_timeout: timedelta | None = None,
          **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
      ) -> None:
          """Initialize a new instance.
@@ -134,12 +139,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                  own context that is destroyed once the page is closed or crashes.
                  This option should not be used if `browser_pool` is provided.
+             navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                 the request handler)
+             goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                 not supported, use `navigation_timeout` instead.
              kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
          """
          configuration = kwargs.pop('configuration', None)
          if configuration is not None:
              service_locator.set_configuration(configuration)

+         self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
          if browser_pool:
              # Raise an exception if browser_pool is provided together with other browser-related arguments.
              if any(
@@ -163,10 +174,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
          # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
          else:
              if fingerprint_generator == 'default':
-                 if not browser_type:
-                     generator_browser_type = None
-                 else:
-                     generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
+                 generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
+                     [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
+                 )

                  fingerprint_generator = DefaultFingerprintGenerator(
                      header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
@@ -202,6 +212,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
          if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
              kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

+         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+         self._goto_options = goto_options or GotoOptions()
+
          super().__init__(**kwargs)

      async def _open_page(
@@ -226,12 +239,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              log=context.log,
              page=crawlee_page.page,
              block_requests=partial(block_requests, page=crawlee_page.page),
+             goto_options=GotoOptions(**self._goto_options),
          )

-         async with browser_page_context(crawlee_page.page):
-             for hook in self._pre_navigation_hooks:
-                 await hook(pre_navigation_context)
-             yield pre_navigation_context
+         context_id = id(pre_navigation_context)
+         self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+         try:
+             async with browser_page_context(crawlee_page.page):
+                 for hook in self._pre_navigation_hooks:
+                     async with self._shared_navigation_timeouts[context_id]:
+                         await hook(pre_navigation_context)
+
+                 yield pre_navigation_context
+         finally:
+             self._shared_navigation_timeouts.pop(context_id, None)

      def _prepare_request_interceptor(
          self,
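The pre-navigation hooks above and the later `page.goto()` call draw on one shared, depleting time budget per request context. The `SharedTimeout` helper itself lives in `crawlee/_utils/time.py` (changed in this release, see the file list above) and its body is not part of this diff; the following is only a rough sketch of how such a budget could behave, with all names and details assumed for illustration.

```python
# Hypothetical sketch only: the real SharedTimeout is defined in crawlee/_utils/time.py,
# which is not shown in this diff. Assumed behaviour: each `async with` block consumes
# part of one shared budget and receives the time that is still left.
import asyncio
import time
from datetime import timedelta


class SharedTimeoutSketch:
    def __init__(self, budget: timedelta) -> None:
        self._remaining = budget
        self._started_at = 0.0

    async def __aenter__(self) -> timedelta:
        if self._remaining <= timedelta(0):
            raise asyncio.TimeoutError('Shared time budget exhausted.')
        self._started_at = time.monotonic()
        # The caller can forward the remaining time, e.g. as page.goto(timeout=...).
        return self._remaining

    async def __aexit__(self, *exc_info: object) -> None:
        elapsed = timedelta(seconds=time.monotonic() - self._started_at)
        self._remaining = max(timedelta(0), self._remaining - elapsed)
```

The real helper presumably also enforces the deadline on the wrapped block itself; this sketch only tracks the remaining time.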
@@ -266,6 +288,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
          Raises:
              ValueError: If the browser pool is not initialized.
              SessionError: If the URL cannot be loaded by the browser.
+             TimeoutError: If navigation does not succeed within the navigation timeout.

          Yields:
              The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -297,7 +320,14 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
          # Set route_handler only for current request
          await context.page.route(context.request.url, route_handler)

-         response = await context.page.goto(context.request.url)
+         try:
+             async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                 response = await context.page.goto(
+                     context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                 )
+             context.request.state = RequestState.AFTER_NAV
+         except playwright.async_api.TimeoutError as exc:
+             raise asyncio.TimeoutError from exc

          if response is None:
              raise SessionError(f'Failed to load the URL: {context.request.url}')
@@ -324,6 +354,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              extract_links=extract_links,
              enqueue_links=self._create_enqueue_links_function(context, extract_links),
              block_requests=partial(block_requests, page=context.page),
+             goto_options=context.goto_options,
          )

          if context.session:
@@ -364,14 +395,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
              robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

              kwargs.setdefault('strategy', 'same-hostname')
+             strategy = kwargs.get('strategy', 'same-hostname')

              elements = await context.page.query_selector_all(selector)
              links_iterator: Iterator[str] = iter(
                  [url for element in elements if (url := await element.get_attribute('href')) is not None]
              )
-             links_iterator = to_absolute_url_iterator(
-                 context.request.loaded_url or context.request.url, links_iterator, logger=context.log
-             )
+
+             # Get base URL from <base> tag if present
+             extracted_base_url = await context.page.evaluate('document.baseURI')
+             base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+             links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

              if robots_txt_file:
                  skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
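The new base-URL lookup means relative hrefs are resolved against a `<base href>` declared by the page, falling back to the loaded URL only when no base is set. The effect on resolution can be seen with just the standard library (URLs here are made up for illustration):

```python
from urllib.parse import urljoin

# Page loaded from https://example.com/listing, but declaring
# <base href="https://cdn.example.com/assets/"> in its <head>.
print(urljoin('https://cdn.example.com/assets/', 'item.html'))  # https://cdn.example.com/assets/item.html

# Without the <base> tag the loaded URL is used instead.
print(urljoin('https://example.com/listing', 'item.html'))  # https://example.com/item.html
```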
@@ -379,17 +414,19 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
                  skipped = iter([])

              for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                 request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
+                 request_options = RequestOptions(
+                     url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                 )

                  if transform_request_function:
-                     transform_request_option = transform_request_function(request_option)
-                     if transform_request_option == 'skip':
+                     transform_request_options = transform_request_function(request_options)
+                     if transform_request_options == 'skip':
                          continue
-                     if transform_request_option != 'unchanged':
-                         request_option = transform_request_option
+                     if transform_request_options != 'unchanged':
+                         request_options = transform_request_options

                  try:
-                     request = Request.from_url(**request_option)
+                     request = Request.from_url(**request_options)
                  except ValidationError as exc:
                      context.log.debug(
                          f'Skipping URL "{url}" due to invalid format: {exc}. '
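A `transform_request_function` receives the prepared `RequestOptions` (which now also carry the enqueue strategy) and returns either `'skip'`, `'unchanged'`, or a modified options object, exactly as handled above. A hedged example of such a callback; the URL rules and label are invented for illustration:

```python
from crawlee import RequestTransformAction
from crawlee._request import RequestOptions


def transform_request(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop binary downloads entirely (illustrative rule).
    if options['url'].endswith('.pdf'):
        return 'skip'
    # Route documentation pages to a dedicated handler via a label.
    if '/docs/' in options['url']:
        options['label'] = 'DOCS'
        return options
    return 'unchanged'
```

Such a function is typically passed as `transform_request_function=transform_request` when calling `enqueue_links` or `extract_links`.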
@@ -475,7 +512,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

      async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
          """Update the cookies in the page context."""
-         await page.context.add_cookies([{**cookie} for cookie in cookies])
+         # False positive ty error, see https://github.com/astral-sh/ty/issues/1493.
+         await page.context.add_cookies([{**cookie} for cookie in cookies])  # ty: ignore[invalid-argument-type]

      async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
          """Find the robots.txt file for a given URL.
crawlee/crawlers/_playwright/_playwright_http_client.py CHANGED
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
          statistics: Statistics | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpCrawlingResult:
          raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')

@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
          payload: HttpPayload | None = None,
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpResponse:
          # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
          # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@

          # Proxies appropriate to the browser context are used
          response = await browser_context.request.fetch(
-             url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+             url_or_request=url,
+             method=method.lower(),
+             headers=dict(headers) if headers else None,
+             data=payload,
+             timeout=timeout.total_seconds() if timeout else None,
          )

          return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
  if TYPE_CHECKING:
      from playwright.async_api import Page

-     from ._types import BlockRequestsFunction
+     from ._types import BlockRequestsFunction, GotoOptions


  @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
      block_requests: BlockRequestsFunction
      """Blocks network requests matching specified URL patterns."""

+     goto_options: GotoOptions
+     """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
      async def get_snapshot(self) -> PageSnapshot:
          """Get snapshot of crawled page."""
          html = None
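Because `goto_options` on the pre-navigation context is a per-request copy of the crawler-level defaults, a pre-navigation hook can adjust it before the crawler performs the navigation. A hedged sketch, reusing the `crawler` instance from the earlier example (hook logic is illustrative):

```python
from crawlee.crawlers import PlaywrightPreNavCrawlingContext


@crawler.pre_navigation_hook
async def tweak_navigation(context: PlaywrightPreNavCrawlingContext) -> None:
    # Wait only for DOMContentLoaded on this request and send an explicit Referer.
    context.goto_options['wait_until'] = 'domcontentloaded'
    context.goto_options['referer'] = 'https://example.com/'
```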
crawlee/crawlers/_playwright/_types.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations

  from dataclasses import dataclass
- from typing import TYPE_CHECKING, Protocol
+ from typing import TYPE_CHECKING, Literal, Protocol, TypedDict

  from crawlee import HttpHeaders
  from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
      from collections.abc import AsyncGenerator

      from playwright.async_api import APIResponse, Response
-     from typing_extensions import Self
+     from typing_extensions import NotRequired, Self


  @docs_group('Functions')
@@ -58,3 +58,13 @@
          _content = await response.body()

          return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+ class GotoOptions(TypedDict):
+     """Keyword arguments for Playwright's `Page.goto()` method."""
+
+     wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+     """When to consider operation succeeded, defaults to 'load' event."""
+
+     referer: NotRequired[str]
+     """Referer header value."""
crawlee/errors.py CHANGED
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
      """Wraps an exception thrown from an user-defined error handler."""


+ class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+     """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
  @docs_group('Errors')
  class SessionError(Exception):
      """Errors of `SessionError` type will trigger a session rotation.
crawlee/events/_event_manager.py CHANGED
@@ -130,11 +130,13 @@ class EventManager:
          if not self._active:
              raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+         # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+         await self._emit_persist_state_event_rec_task.stop()
+         await self._emit_persist_state_event()
          await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
          self._event_emitter.remove_all_listeners()
          self._listener_tasks.clear()
          self._listeners_to_wrappers.clear()
-         await self._emit_persist_state_event_rec_task.stop()
          self._active = False

      @overload
@@ -172,13 +174,12 @@
              # to avoid blocking the event loop
              coro = (
                  listener(*bound_args.args, **bound_args.kwargs)
-                 if asyncio.iscoroutinefunction(listener)
+                 if inspect.iscoroutinefunction(listener)
                  else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
              )
-             # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-             # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.

-             listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')
+             listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
+             listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
              self._listener_tasks.add(listener_task)

              try:
@@ -189,7 +190,12 @@
                  # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                  logger.exception(
                      'Exception in the event listener',
-                     extra={'event_name': event.value, 'listener_name': listener.__name__},
+                     extra={
+                         'event_name': event.value,
+                         'listener_name': listener.__name__
+                         if hasattr(listener, '__name__')
+                         else listener.__class__.__name__,
+                     },
                  )
              finally:
                  logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
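The `__name__` fallback matters because listeners are not always plain functions: a `functools.partial` or an instance with `__call__` has no `__name__`, and the old f-string would have failed with `AttributeError` while naming the task or logging the error. The same pattern in isolation (standard library only):

```python
from functools import partial


def on_event(payload: dict, *, prefix: str) -> None:
    print(prefix, payload)


listener = partial(on_event, prefix='[sys-info]')  # partial objects have no __name__
name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
print(name)  # -> partial
```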
crawlee/fingerprint_suite/_browserforge_adapter.py CHANGED
@@ -154,7 +154,7 @@ class PatchedHeaderGenerator(bf_HeaderGenerator):
  class PatchedFingerprintGenerator(bf_FingerprintGenerator):
      """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""

-     def __init__(  # type:ignore[no-untyped-def] # Upstream repo types missing.
+     def __init__(
          self,
          *,
          screen: Screen | None = None,
crawlee/http_clients/_base.py CHANGED
@@ -104,6 +104,7 @@ class HttpClient(ABC):
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
          statistics: Statistics | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpCrawlingResult:
          """Perform the crawling for a given request.

@@ -114,6 +115,7 @@
              session: The session associated with the request.
              proxy_info: The information about the proxy to be used.
              statistics: The statistics object to register status codes.
+             timeout: Maximum time allowed to process the request.

          Raises:
              ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@
          payload: HttpPayload | None = None,
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpResponse:
          """Send an HTTP request via the client.

@@ -144,6 +147,7 @@
              payload: The data to be sent as the request body.
              session: The session associated with the request.
              proxy_info: The information about the proxy to be used.
+             timeout: Maximum time allowed to process the request.

          Raises:
              ProxyError: Raised if a proxy-related error occurs.
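Callers can now bound a single request instead of relying only on client-level defaults; the concrete clients below translate the `timedelta` into their backend's own timeout type and surface expiry as `asyncio.TimeoutError`. A hedged usage sketch against one of the built-in clients (URL and limits are illustrative):

```python
import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpxHttpClient


async def fetch_with_deadline(url: str) -> None:
    client = HttpxHttpClient()
    try:
        response = await client.send_request(url, timeout=timedelta(seconds=10))
        print(url, response.status_code)
    except asyncio.TimeoutError:
        print(f'{url} did not respond within 10 seconds')
```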
crawlee/http_clients/_curl_impersonate.py CHANGED
@@ -1,7 +1,9 @@
  from __future__ import annotations

+ import asyncio
  from contextlib import asynccontextmanager
- from typing import TYPE_CHECKING, Any
+ from http.cookiejar import Cookie
+ from typing import TYPE_CHECKING, Any, cast

  from curl_cffi import CurlInfo
  from curl_cffi.const import CurlHttpVersion
@@ -10,10 +12,11 @@ from curl_cffi.requests.cookies import Cookies as CurlCookies
  from curl_cffi.requests.cookies import CurlMorsel
  from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
  from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+ from curl_cffi.requests.exceptions import Timeout
  from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
  from typing_extensions import override

- from crawlee._types import HttpHeaders, HttpPayload
+ from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
  from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
  from crawlee._utils.docs import docs_group
  from crawlee.errors import ProxyError
@@ -22,11 +25,11 @@ from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse
  if TYPE_CHECKING:
      from collections.abc import AsyncGenerator
      from datetime import timedelta
-     from http.cookiejar import Cookie

      from curl_cffi import Curl
      from curl_cffi.requests import Request as CurlRequest
      from curl_cffi.requests import Response
+     from curl_cffi.requests.session import HttpMethod as CurlHttpMethod

      from crawlee import Request
      from crawlee._types import HttpMethod
@@ -88,15 +91,17 @@ class _CurlImpersonateResponse:
      async def read(self) -> bytes:
          if self._response.astream_task:
              raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
+
          return self._response.content

      async def read_stream(self) -> AsyncGenerator[bytes, None]:
-         if not self._response.astream_task or self._response.astream_task.done():  # type: ignore[attr-defined]
-             raise RuntimeError(
-                 'Cannot read stream: either already consumed or Response not obtained from `stream` method'
-             )
+         if not self._response.astream_task:
+             raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.')

-         async for chunk in self._response.aiter_content():  # type: ignore[no-untyped-call]
+         if isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done():
+             raise RuntimeError('Cannot read stream, it was already consumed.')
+
+         async for chunk in self._response.aiter_content():
              yield chunk


@@ -147,17 +152,21 @@ class CurlImpersonateHttpClient(HttpClient):
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
          statistics: Statistics | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpCrawlingResult:
          client = self._get_client(proxy_info.url if proxy_info else None)

          try:
              response = await client.request(
                  url=request.url,
-                 method=request.method.upper(),  # type: ignore[arg-type] # curl-cffi requires uppercase method
+                 method=self._convert_method(request.method),
                  headers=request.headers,
                  data=request.payload,
                  cookies=session.cookies.jar if session else None,
+                 timeout=timeout.total_seconds() if timeout else None,
              )
+         except Timeout as exc:
+             raise asyncio.TimeoutError from exc
          except CurlRequestError as exc:
              if self._is_proxy_error(exc):
                  raise ProxyError from exc
@@ -186,6 +195,7 @@
          payload: HttpPayload | None = None,
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpResponse:
          if isinstance(headers, dict) or headers is None:
              headers = HttpHeaders(headers or {})
@@ -196,11 +206,14 @@
          try:
              response = await client.request(
                  url=url,
-                 method=method.upper(),  # type: ignore[arg-type] # curl-cffi requires uppercase method
+                 method=self._convert_method(method),
                  headers=dict(headers) if headers else None,
                  data=payload,
                  cookies=session.cookies.jar if session else None,
+                 timeout=timeout.total_seconds() if timeout else None,
              )
+         except Timeout as exc:
+             raise asyncio.TimeoutError from exc
          except CurlRequestError as exc:
              if self._is_proxy_error(exc):
                  raise ProxyError from exc
@@ -234,13 +247,15 @@
          try:
              response = await client.request(
                  url=url,
-                 method=method.upper(),  # type: ignore[arg-type] # curl-cffi requires uppercase method
+                 method=self._convert_method(method),
                  headers=dict(headers) if headers else None,
                  data=payload,
                  cookies=session.cookies.jar if session else None,
                  stream=True,
                  timeout=timeout.total_seconds() if timeout else None,
              )
+         except Timeout as exc:
+             raise asyncio.TimeoutError from exc
          except CurlRequestError as exc:
              if self._is_proxy_error(exc):
                  raise ProxyError from exc
@@ -279,6 +294,40 @@

          return self._client_by_proxy_url[proxy_url]

+     def _convert_method(self, method: HttpMethod) -> CurlHttpMethod:
+         """Convert from Crawlee HTTP method to curl-cffi HTTP method.
+
+         Args:
+             method: Crawlee HTTP method.
+
+         Returns:
+             Corresponding curl-cffi HTTP method.
+
+         Raises:
+             ValueError: If the provided HTTP method is not supported.
+         """
+         method_upper = method.upper()  # curl-cffi requires uppercase methods
+
+         match method_upper:
+             case 'GET':
+                 return 'GET'
+             case 'POST':
+                 return 'POST'
+             case 'PUT':
+                 return 'PUT'
+             case 'DELETE':
+                 return 'DELETE'
+             case 'OPTIONS':
+                 return 'OPTIONS'
+             case 'HEAD':
+                 return 'HEAD'
+             case 'TRACE':
+                 return 'TRACE'
+             case 'PATCH':
+                 return 'PATCH'
+             case _:
+                 raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.')
+
      @staticmethod
      def _is_proxy_error(error: CurlRequestError) -> bool:
          """Determine whether the given error is related to a proxy issue.
@@ -296,11 +345,16 @@

      @staticmethod
      def _get_cookies(curl: Curl) -> list[Cookie]:
-         cookies: list[Cookie] = []
-         for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # type: ignore[union-attr]
-             curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # type: ignore[arg-type]
+         cookies = list[Cookie]()
+
+         # Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST.
+         cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST))
+
+         for curl_cookie in cookie_list:
+             curl_morsel = CurlMorsel.from_curl_format(curl_cookie)
              cookie = curl_morsel.to_cookiejar_cookie()
              cookies.append(cookie)
+
          return cookies

      async def cleanup(self) -> None:
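curl-cffi's typed API accepts only a fixed set of upper-case verbs, so `_convert_method` whitelists them explicitly instead of blindly upper-casing and silencing the type checker. Roughly, and only as an illustration of the internal helper (it assumes the curl-impersonate extra is installed):

```python
from crawlee.http_clients import CurlImpersonateHttpClient

client = CurlImpersonateHttpClient()

print(client._convert_method('get'))    # 'GET'  (case is normalised first)
print(client._convert_method('PATCH'))  # 'PATCH'
# An unknown verb such as 'PURGE' raises:
#     ValueError: HTTP method PURGE is not supported in CurlImpersonateHttpClient.
```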
crawlee/http_clients/_httpx.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ import asyncio
  from contextlib import asynccontextmanager
  from logging import getLogger
  from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
          statistics: Statistics | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpCrawlingResult:
          client = self._get_client(proxy_info.url if proxy_info else None)
          headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@
              content=request.payload,
              cookies=session.cookies.jar if session else None,
              extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+             timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
          )

          try:
              response = await client.send(http_request)
+         except httpx.TimeoutException as exc:
+             raise asyncio.TimeoutError from exc
          except httpx.TransportError as exc:
              if self._is_proxy_error(exc):
                  raise ProxyError from exc
@@ -185,6 +190,7 @@
          payload: HttpPayload | None = None,
          session: Session | None = None,
          proxy_info: ProxyInfo | None = None,
+         timeout: timedelta | None = None,
      ) -> HttpResponse:
          client = self._get_client(proxy_info.url if proxy_info else None)

@@ -195,10 +201,13 @@
              headers=headers,
              payload=payload,
              session=session,
+             timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
          )

          try:
              response = await client.send(http_request)
+         except httpx.TimeoutException as exc:
+             raise asyncio.TimeoutError from exc
          except httpx.TransportError as exc:
              if self._is_proxy_error(exc):
                  raise ProxyError from exc
@@ -228,10 +237,13 @@
              headers=headers,
              payload=payload,
              session=session,
-             timeout=timeout,
+             timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
          )

-         response = await client.send(http_request, stream=True)
+         try:
+             response = await client.send(http_request, stream=True)
+         except httpx.TimeoutException as exc:
+             raise asyncio.TimeoutError from exc

          try:
              yield _HttpxResponse(response)
@@ -246,7 +258,7 @@
          headers: HttpHeaders | dict[str, str] | None,
          payload: HttpPayload | None,
          session: Session | None = None,
-         timeout: timedelta | None = None,
+         timeout: httpx.Timeout | None = None,
      ) -> httpx.Request:
          """Build an `httpx.Request` using the provided parameters."""
          if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@

          headers = self._combine_headers(headers)

-         httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
          return client.build_request(
              url=url,
              method=method,
              headers=dict(headers) if headers else None,
              content=payload,
              extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-             timeout=httpx_timeout,
+             timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
          )

      def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
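Note the two different shapes the `timedelta` takes here: `send_request` wraps it in a full `httpx.Timeout`, so the entire exchange must finish within the budget, while `stream` applies it to the connect phase only (`httpx.Timeout(None, connect=...)`), presumably so that reading a long response body is not cut off mid-stream. The two constructions side by side:

```python
import httpx

# send_request: the whole exchange is bounded by the budget.
full_budget = httpx.Timeout(30.0)

# stream: only establishing the connection is bounded; reading the body is not.
connect_only = httpx.Timeout(None, connect=30.0)

print(full_budget)
print(connect_only)
```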