crawlee 0.6.13b43__py3-none-any.whl → 1.1.2b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (80)
  1. crawlee/_request.py +32 -21
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +87 -25
  4. crawlee/_utils/file.py +7 -0
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/time.py +41 -1
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +3 -1
  17. crawlee/crawlers/__init__.py +2 -1
  18. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  19. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
  21. crawlee/crawlers/_basic/_basic_crawler.py +139 -96
  22. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  23. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  24. crawlee/crawlers/_playwright/_playwright_crawler.py +52 -10
  25. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/fingerprint_suite/_header_generator.py +2 -2
  28. crawlee/http_clients/_base.py +4 -0
  29. crawlee/http_clients/_curl_impersonate.py +12 -0
  30. crawlee/http_clients/_httpx.py +16 -6
  31. crawlee/http_clients/_impit.py +25 -10
  32. crawlee/otel/crawler_instrumentor.py +3 -3
  33. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  34. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  35. crawlee/request_loaders/_sitemap_request_loader.py +22 -4
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +32 -1
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +5 -4
  42. crawlee/storage_clients/_file_system/_dataset_client.py +6 -7
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -8
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +31 -15
  45. crawlee/storage_clients/_file_system/_storage_client.py +2 -2
  46. crawlee/storage_clients/_memory/_dataset_client.py +4 -5
  47. crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
  48. crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
  49. crawlee/storage_clients/_redis/__init__.py +6 -0
  50. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  51. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  52. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  53. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  54. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  55. crawlee/storage_clients/_redis/_utils.py +23 -0
  56. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  57. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  59. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  60. crawlee/storage_clients/_redis/py.typed +0 -0
  61. crawlee/storage_clients/_sql/__init__.py +6 -0
  62. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  63. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  64. crawlee/storage_clients/_sql/_db_models.py +268 -0
  65. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  66. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  67. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  68. crawlee/storage_clients/_sql/py.typed +0 -0
  69. crawlee/storage_clients/models.py +10 -10
  70. crawlee/storages/_base.py +3 -1
  71. crawlee/storages/_dataset.py +5 -3
  72. crawlee/storages/_key_value_store.py +11 -6
  73. crawlee/storages/_request_queue.py +5 -3
  74. crawlee/storages/_storage_instance_manager.py +54 -68
  75. crawlee/storages/_utils.py +11 -0
  76. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +17 -5
  77. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +80 -58
  78. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
  79. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
  80. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0
--- a/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/crawlee/crawlers/_basic/_basic_crawler.py
@@ -2,6 +2,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary

@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -96,6 +100,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -437,14 +444,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format

         # Statistics
-        self._statistics = statistics or cast(
-            'Statistics[TStatisticsState]',
-            Statistics.with_default_state(
-                periodic_message_logger=self._logger,
-                statistics_log_format=self._statistics_log_format,
-                log_message='Current request statistics:',
-            ),
-        )
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )

         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
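When no `statistics` object is supplied, the crawler now enables persistence and points it at its own key-value store through a factory coroutine. A minimal sketch of passing a custom `Statistics` instance instead, using only keyword arguments visible in this hunk (the store name is illustrative):

    from crawlee.crawlers import BasicCrawler
    from crawlee.statistics import Statistics
    from crawlee.storages import KeyValueStore

    async def build_crawler() -> BasicCrawler:
        # Hypothetical factory: persist crawl statistics into a named key-value store.
        async def kvs_factory() -> KeyValueStore:
            return await KeyValueStore.open(name='crawl-metrics')

        statistics = Statistics.with_default_state(
            persistence_enabled=True,
            persist_state_kvs_factory=kvs_factory,
            log_message='Current request statistics:',
        )
        return BasicCrawler(statistics=statistics)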
@@ -511,6 +527,24 @@
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True

+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -609,7 +643,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def failed_request_handler(
@@ -619,7 +653,7 @@

         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
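Because both decorators now route the user handler through `_wrap_handler_with_error_context`, context helpers write directly to the storages inside error callbacks, where the per-request result is never committed. A hedged sketch of a failed-request handler taking advantage of that (field names are illustrative):

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()

    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # push_data goes straight to the dataset here, because the result of a
        # failed request is never committed through the normal pipeline.
        await context.push_data({'url': context.request.url, 'error': repr(error)})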
@@ -659,7 +693,10 @@
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
             await request_manager.drop()
-            self._request_manager = await RequestQueue.open()
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

         if requests is not None:
             await self.add_requests(requests)
@@ -686,7 +723,6 @@
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -718,8 +754,6 @@
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()

-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -730,6 +764,7 @@
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -836,6 +871,9 @@
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.

@@ -848,6 +884,7 @@
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -857,13 +894,18 @@
             configuration=self._service_locator.get_configuration(),
         )

-        path = path if isinstance(path, Path) else Path(path)
-        dst = path.open('w', newline='')
+        path = Path(path)

         if path.suffix == '.csv':
-            await export_csv_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            await export_json_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')

@@ -944,6 +986,9 @@
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             requests: Sequence[str | Request] | None = None,
+            rq_id: str | None = None,
+            rq_name: str | None = None,
+            rq_alias: str | None = None,
             **kwargs: Unpack[EnqueueLinksKwargs],
         ) -> None:
             kwargs.setdefault('strategy', 'same-hostname')
@@ -955,7 +1000,9 @@
                         '`transform_request_function` arguments when `requests` is provided.'
                     )
                 # Add directly passed requests.
-                await context.add_requests(requests or list[str | Request](), **kwargs)
+                await context.add_requests(
+                    requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+                )
             else:
                 # Add requests from extracted links.
                 await context.add_requests(
@@ -964,7 +1011,11 @@
                         label=label,
                         user_data=user_data,
                         transform_request_function=transform_request_function,
+                        **kwargs,
                     ),
+                    rq_id=rq_id,
+                    rq_name=rq_name,
+                    rq_alias=rq_alias,
                     **kwargs,
                 )

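The new `rq_id` / `rq_name` / `rq_alias` parameters let `enqueue_links` target a request queue other than the crawler's default. A sketch of a request handler using them (selectors and the queue name are illustrative):

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def handle(context: ParselCrawlingContext) -> None:
        # Category pages keep feeding the current crawl...
        await context.enqueue_links(selector='a.category')
        # ...while product detail links are parked in a separate named queue.
        await context.enqueue_links(selector='a.product', rq_name='product-detail-urls')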
 
@@ -1024,8 +1075,8 @@
             return target_url.hostname == origin_url.hostname

         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain

         if strategy == 'same-origin':
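Switching from `.domain` to `.top_domain_under_public_suffix` makes the `same-domain` strategy compare registered domains rather than the bare middle label, so unrelated sites that merely share that label no longer match. Roughly, using `tldextract` directly (a sketch, assuming a tldextract version that exposes this attribute):

    import tldextract

    extract = tldextract.TLDExtract()
    a = extract.extract_str('shop.example.co.uk')
    b = extract.extract_str('example.com')

    print(a.domain, b.domain)                # 'example' 'example'  -> old check: same domain
    print(a.top_domain_under_public_suffix)  # 'example.co.uk'
    print(b.top_domain_under_public_suffix)  # 'example.com'        -> new check: different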
@@ -1094,19 +1145,14 @@
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-                        request = new_request
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return

             await request_manager.reclaim_request(request)
         else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)

@@ -1155,16 +1201,7 @@
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             request.state = RequestState.SKIPPED

         url = request.url if isinstance(request, Request) else request
@@ -1237,34 +1274,46 @@
             else:
                 yield Request.from_url(url)

-    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-        result = self._context_result_map[context]
-
-        request_manager = await self.get_request_manager()
-        origin = context.request.loaded_url or context.request.url
-
-        for add_requests_call in result.add_requests_calls:
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()

-            enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1

-            filter_requests_iterator = self._enqueue_links_filter_iterator(
-                requests_iterator, context.request.url, **enqueue_links_kwargs
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)

-            for dst_request in filter_requests_iterator:
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)

-                if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                    requests.append(dst_request)
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]

-            await request_manager.add_requests(requests)
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)

         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
@@ -1364,14 +1413,8 @@
                 raise RequestHandlerError(e, context) from e

             await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+
+            await self._mark_request_as_handled(request)

             request.state = RequestState.DONE

@@ -1414,14 +1457,7 @@
                 await request_manager.reclaim_request(request)
                 await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
-                await wait_for(
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)

                 await self._handle_failed_request(context, session_error)
                 self._statistics.record_request_processing_failure(request.unique_key)
@@ -1429,14 +1465,7 @@
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)

         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1454,12 +1483,15 @@
             raise

     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await wait_for(
-            lambda: self._context_pipeline(context, self.router),
-            timeout=self._request_handler_timeout,
-            timeout_message=f'{self._request_handler_timeout_text}'
-            f' {self._request_handler_timeout.total_seconds()} seconds',
-            logger=self._logger,
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )

     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1607,3 +1639,14 @@
         )

         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
--- a/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
+++ b/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

--- a/crawlee/crawlers/_parsel/_parsel_crawler.py
+++ b/crawlee/crawlers/_parsel/_parsel_crawler.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto

     def __init__(
         self,
-        **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

--- a/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -3,18 +3,25 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal

+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import (
+    BasicCrawlingContext,
+    ConcurrencySettings,
+)
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -43,7 +50,6 @@ if TYPE_CHECKING:

     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -105,6 +111,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -113,7 +120,10 @@
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -130,12 +140,16 @@
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)

+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -152,7 +166,7 @@
             ):
                 raise ValueError(
                     'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or'
+                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                     '`fingerprint_generator` arguments when `browser_pool` is provided.'
                 )

@@ -194,6 +208,12 @@

         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+
         super().__init__(**kwargs)

     async def _open_page(
199
219
  async def _open_page(
@@ -220,10 +240,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
220
240
  block_requests=partial(block_requests, page=crawlee_page.page),
221
241
  )
222
242
 
223
- async with browser_page_context(crawlee_page.page):
224
- for hook in self._pre_navigation_hooks:
225
- await hook(pre_navigation_context)
226
- yield pre_navigation_context
243
+ context_id = id(pre_navigation_context)
244
+ self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
245
+
246
+ try:
247
+ async with browser_page_context(crawlee_page.page):
248
+ for hook in self._pre_navigation_hooks:
249
+ async with self._shared_navigation_timeouts[context_id]:
250
+ await hook(pre_navigation_context)
251
+
252
+ yield pre_navigation_context
253
+ finally:
254
+ self._shared_navigation_timeouts.pop(context_id, None)
227
255
 
228
256
  def _prepare_request_interceptor(
229
257
  self,
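Pre-navigation hooks and the later `page.goto` call (see the `@@ -289,7 +318,13 @@` hunk below) now draw on a single `SharedTimeout` keyed by the context id, so the whole navigation phase shares one time budget. The real helper lives in `crawlee/_utils/time.py`, which this diff does not show; the class below is only an illustrative stand-in for the idea of a shared, self-depleting deadline:

    import asyncio
    import time
    from datetime import timedelta

    class SharedDeadline:
        """Illustrative stand-in, not crawlee's SharedTimeout: successive `async with`
        blocks consume one shared budget; the remaining budget is returned on entry."""

        def __init__(self, budget: timedelta) -> None:
            self._remaining = budget.total_seconds()

        async def __aenter__(self) -> timedelta:
            self._started = time.monotonic()
            self._guard = asyncio.timeout(self._remaining)  # Python 3.11+
            await self._guard.__aenter__()
            return timedelta(seconds=self._remaining)

        async def __aexit__(self, *exc_info: object) -> bool | None:
            self._remaining = max(0.0, self._remaining - (time.monotonic() - self._started))
            return await self._guard.__aexit__(*exc_info)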
@@ -258,6 +286,7 @@
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.

         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -289,7 +318,13 @@
             # Set route_handler only for current request
             await context.page.route(context.request.url, route_handler)

-        response = await context.page.goto(context.request.url)
+        try:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000
+                )
+        except playwright.async_api.TimeoutError as exc:
+            raise asyncio.TimeoutError from exc

         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')
@@ -361,7 +396,12 @@
             links_iterator: Iterator[str] = iter(
                 [url for element in elements if (url := await element.get_attribute('href')) is not None]
             )
-            links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+
+            # Get base URL from <base> tag if present
+            extracted_base_url = await context.page.evaluate('document.baseURI')
+            base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
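Resolving hrefs against `document.baseURI` means pages that declare a `<base>` element now enqueue the URLs the browser itself would resolve. The difference, illustrated with `urllib` rather than crawlee's internal helper:

    from urllib.parse import urljoin

    loaded_url = 'https://example.com/catalog/page-2'
    base_uri = 'https://cdn.example.com/catalog/'  # document.baseURI when a <base> tag is present
    href = 'item-17'

    print(urljoin(loaded_url, href))  # https://example.com/catalog/item-17      (previous behaviour)
    print(urljoin(base_uri, href))    # https://cdn.example.com/catalog/item-17  (new behaviour)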
@@ -489,7 +529,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch ('chromium', 'firefox', or 'webkit').
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""

     browser_launch_options: NotRequired[Mapping[str, Any]]
--- a/crawlee/crawlers/_playwright/_playwright_http_client.py
+++ b/crawlee/crawlers/_playwright/_playwright_http_client.py
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')

@@ -72,6 +73,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@

         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )

         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')