crawlee 1.1.1b1__py3-none-any.whl → 1.1.2b4__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of crawlee might be problematic.

crawlee/_types.py CHANGED
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -643,6 +643,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+
 
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/file.py CHANGED
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
     write_header = True
 
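The double-ending problem described in the comment comes from Python's text-mode newline translation. A small standalone sketch of the same fix (not crawlee code):

    import csv
    from pathlib import Path

    rows = [['url', 'status'], ['https://crawlee.dev', '200']]

    # csv.writer's default lineterminator is '\r\n'; on Windows, text mode then
    # translates the trailing '\n' again, producing '\r\r\n'. Passing '\n' leaves
    # the translation to the platform: '\n' on Unix, '\r\n' on Windows.
    with Path('out.csv').open('w') as dst:
        csv.writer(dst, lineterminator='\n').writerows(rows)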
crawlee/_utils/time.py CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING
 
+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from datetime import timedelta
+    from types import TracebackType
 
 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu
 
 
+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
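A hedged usage sketch of the new `SharedTimeout` (an internal helper, so the import path and behaviour may change): two sequential operations draw on the same budget, and re-entering the context hands back whatever time the previous block left over.

    import asyncio
    from datetime import timedelta

    from crawlee._utils.time import SharedTimeout  # internal helper added in this release


    async def main() -> None:
        budget = SharedTimeout(timedelta(seconds=1))

        async with budget:                  # first block runs under the full 1 s budget
            await asyncio.sleep(0.3)

        async with budget as remaining:     # roughly 0.7 s remain for the second block
            print(remaining)
            await asyncio.sleep(0.2)        # exceeding the remainder raises asyncio.TimeoutError


    asyncio.run(main())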
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -51,6 +51,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -112,9 +130,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        for hook in self._pre_navigation_hooks:
-            await hook(context)
-        yield context
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -167,9 +193,15 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
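The effect of the new base-URL handling in the link-extraction hunk above, sketched with the standard library only (hypothetical URLs, not crawlee code): relative links now resolve against the document's `<base href>` when one is present, otherwise against the loaded URL as before.

    from urllib.parse import urljoin

    loaded_url = 'https://example.com/articles/index.html'
    base_href = 'https://cdn.example.com/mirror/'   # value of <base href="..."> on the page

    # Without a <base> tag the relative link resolves against the loaded URL;
    # with one, it resolves against the <base> href instead.
    print(urljoin(loaded_url, 'page.html'))  # https://example.com/articles/page.html
    print(urljoin(base_href, 'page.html'))   # https://cdn.example.com/mirror/page.html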
@@ -216,12 +248,14 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-        )
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
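A hedged configuration sketch for the new navigation timeout: because the subclasses now type their forwarded kwargs as `HttpCrawlerOptions`, the option should be passable straight through a concrete crawler's constructor (shown here with `ParselCrawler`; the exact plumbing is inferred from the hunks above). Pre-navigation hooks and the HTTP request then share one per-request budget.

    from datetime import timedelta

    from crawlee.crawlers import ParselCrawler

    # One shared 30-second budget per request, covering pre-navigation hooks
    # and the HTTP request itself; when omitted, it defaults to one minute.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=30))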
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -96,6 +100,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -520,6 +527,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -618,7 +643,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -628,7 +653,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
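With both handlers wrapped, context helpers keep working inside error and failed-request handlers even though the failed request's `RequestHandlerRunResult` is never committed. A hedged sketch of a user-side handler relying on that (the handler body is illustrative, not from the package):

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()


    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # push_data on the wrapped context writes straight to the dataset,
        # bypassing the discarded request handler result.
        await context.push_data({'url': context.request.url, 'error': repr(error)})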
@@ -846,6 +871,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -858,6 +884,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -867,13 +894,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )
 
-        path = path if isinstance(path, Path) else Path(path)
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-            await export_csv_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            await export_json_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
        else:
            raise ValueError(f'Unsupported file extension: {path.suffix}')
 
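A hedged usage sketch of the forwarded exporter options; the exact keys accepted by `ExportDataCsvKwargs` and `ExportDataJsonKwargs` are assumed to mirror `csv.writer` and `json.dump` arguments:

    import asyncio

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


    async def main() -> None:
        crawler = BasicCrawler()

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            await context.push_data({'url': context.request.url})

        await crawler.run(['https://crawlee.dev'])

        # Extra kwargs go to whichever exporter matches the file extension.
        await crawler.export_data('results.csv', delimiter=';')  # assumed csv.writer-style option
        await crawler.export_data('results.json', indent=2)      # assumed json.dump-style option


    asyncio.run(main())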
@@ -1043,8 +1075,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
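The switch from `.domain` to `.top_domain_under_public_suffix` matters for multi-label public suffixes such as `co.uk`. A hedged illustration with tldextract (assuming a version recent enough to expose this attribute):

    import tldextract

    extractor = tldextract.TLDExtract()

    a = extractor.extract_str('shop.example.co.uk')
    b = extractor.extract_str('example.com')

    print(a.domain, b.domain)
    # 'example' 'example' -> bare .domain would wrongly treat these as the same domain
    print(a.top_domain_under_public_suffix, b.top_domain_under_public_suffix)
    # 'example.co.uk' 'example.com' -> correctly treated as different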
@@ -1113,19 +1145,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-                        request = new_request
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return
 
             await request_manager.reclaim_request(request)
         else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
 
@@ -1174,16 +1201,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             request.state = RequestState.SKIPPED
 
         url = request.url if isinstance(request, Request) else request
@@ -1256,52 +1274,46 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             else:
                 yield Request.from_url(url)
 
-    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-        result = self._context_result_map[context]
-
-        base_request_manager = await self.get_request_manager()
-
-        origin = context.request.loaded_url or context.request.url
-
-        for add_requests_call in result.add_requests_calls:
-            rq_id = add_requests_call.get('rq_id')
-            rq_name = add_requests_call.get('rq_name')
-            rq_alias = add_requests_call.get('rq_alias')
-            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
-            if specified_params > 1:
-                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
-            if rq_id or rq_name or rq_alias:
-                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
-                    id=rq_id,
-                    name=rq_name,
-                    alias=rq_alias,
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-            enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-            filter_requests_iterator = self._enqueue_links_filter_iterator(
-                requests_iterator, context.request.url, **enqueue_links_kwargs
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-            for dst_request in filter_requests_iterator:
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-                if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                    requests.append(dst_request)
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-            await request_manager.add_requests(requests)
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
@@ -1401,14 +1413,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 raise RequestHandlerError(e, context) from e
 
             await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+
+            await self._mark_request_as_handled(request)
 
             request.state = RequestState.DONE
 
@@ -1451,14 +1457,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 await request_manager.reclaim_request(request)
                 await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
-                await wait_for(
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)
 
                 await self._handle_failed_request(context, session_error)
                 self._statistics.record_request_processing_failure(request.unique_key)
@@ -1466,14 +1465,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1491,12 +1483,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await wait_for(
-            lambda: self._context_pipeline(context, self.router),
-            timeout=self._request_handler_timeout,
-            timeout_message=f'{self._request_handler_timeout_text}'
-            f' {self._request_handler_timeout.total_seconds()} seconds',
-            logger=self._logger,
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1644,3 +1639,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             )
 
         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
     def __init__(
         self,
-        **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 