crawlee 1.0.2b3__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)
  1. crawlee/_request.py +32 -21
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +30 -17
  4. crawlee/_utils/context.py +2 -2
  5. crawlee/_utils/file.py +7 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +17 -1
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/time.py +41 -1
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +1 -1
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +3 -1
  17. crawlee/crawlers/__init__.py +2 -1
  18. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  19. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  22. crawlee/crawlers/_basic/_basic_crawler.py +126 -112
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +55 -11
  26. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  27. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  28. crawlee/crawlers/_playwright/_types.py +12 -2
  29. crawlee/events/_event_manager.py +4 -4
  30. crawlee/fingerprint_suite/_header_generator.py +2 -2
  31. crawlee/http_clients/_base.py +4 -0
  32. crawlee/http_clients/_curl_impersonate.py +12 -0
  33. crawlee/http_clients/_httpx.py +16 -6
  34. crawlee/http_clients/_impit.py +25 -10
  35. crawlee/otel/crawler_instrumentor.py +3 -3
  36. crawlee/request_loaders/_sitemap_request_loader.py +22 -4
  37. crawlee/sessions/_session_pool.py +1 -1
  38. crawlee/statistics/_error_snapshotter.py +1 -1
  39. crawlee/statistics/_models.py +32 -1
  40. crawlee/statistics/_statistics.py +24 -33
  41. crawlee/storage_clients/__init__.py +4 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +27 -9
  45. crawlee/storage_clients/_redis/__init__.py +6 -0
  46. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  47. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  48. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  49. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  50. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  51. crawlee/storage_clients/_redis/_utils.py +23 -0
  52. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  53. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  54. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  55. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  56. crawlee/storage_clients/_redis/py.typed +0 -0
  57. crawlee/storage_clients/_sql/_db_models.py +1 -2
  58. crawlee/storage_clients/_sql/_key_value_store_client.py +3 -2
  59. crawlee/storage_clients/_sql/_request_queue_client.py +18 -4
  60. crawlee/storage_clients/_sql/_storage_client.py +1 -1
  61. crawlee/storages/_key_value_store.py +5 -2
  62. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +8 -3
  63. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +66 -54
  64. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  65. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  66. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
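
Items 45-56 introduce a new Redis-backed storage client (dataset, key-value store, and request queue clients plus Lua scripts for atomic queue operations). Its public API is not part of this diff, so the following is only a hedged sketch of how such a client would plug into a crawler, assuming it is exported as RedisStorageClient from crawlee.storage_clients and that the constructor accepts a connection string (both assumptions):

    from crawlee.crawlers import ParselCrawler
    from crawlee.storage_clients import RedisStorageClient  # export name assumed

    async def main() -> None:
        # Constructor parameter name is an assumption; consult the package for the real signature.
        storage_client = RedisStorageClient(connection_string='redis://localhost:6379')
        crawler = ParselCrawler(storage_client=storage_client)
        await crawler.run(['https://example.com'])
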
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

@@ -3,14 +3,16 @@ from __future__ import annotations
  import asyncio
  import logging
  from abc import ABC
+ from datetime import timedelta
  from typing import TYPE_CHECKING, Any, Generic

  from more_itertools import partition
  from pydantic import ValidationError
- from typing_extensions import TypeVar
+ from typing_extensions import NotRequired, TypeVar

  from crawlee._request import Request, RequestOptions
  from crawlee._utils.docs import docs_group
+ from crawlee._utils.time import SharedTimeout
  from crawlee._utils.urls import to_absolute_url_iterator
  from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
  from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
  TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+ class HttpCrawlerOptions(
+     BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+     Generic[TCrawlingContext, TStatisticsState],
+ ):
+     """Arguments for the `AbstractHttpCrawler` constructor.
+
+     It is intended for typing forwarded `__init__` arguments in the subclasses.
+     """
+
+     navigation_timeout: NotRequired[timedelta | None]
+     """Timeout for the HTTP request."""
+
+
  @docs_group('Crawlers')
  class AbstractHttpCrawler(
      BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
          self,
          *,
          parser: AbstractHttpParser[TParseResult, TSelectResult],
+         navigation_timeout: timedelta | None = None,
          **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
      ) -> None:
          self._parser = parser
+         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
          self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+         self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

          if '_context_pipeline' not in kwargs:
              raise ValueError(
@@ -112,9 +130,17 @@ class AbstractHttpCrawler(
      async def _execute_pre_navigation_hooks(
          self, context: BasicCrawlingContext
      ) -> AsyncGenerator[BasicCrawlingContext, None]:
-         for hook in self._pre_navigation_hooks:
-             await hook(context)
-         yield context
+         context_id = id(context)
+         self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+         try:
+             for hook in self._pre_navigation_hooks:
+                 async with self._shared_navigation_timeouts[context_id]:
+                     await hook(context)
+
+             yield context
+         finally:
+             self._shared_navigation_timeouts.pop(context_id, None)

      async def _parse_http_response(
          self, context: HttpCrawlingContext
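
SharedTimeout is implemented in crawlee._utils.time, which this diff does not show; the code below is only an illustrative sketch of the idea under assumptions: a single time budget that several `async with` sections draw down, each running under asyncio.timeout() with whatever is left (Python 3.11+):

    import asyncio
    import time
    from datetime import timedelta

    class SharedTimeoutSketch:
        """One budget shared by several `async with` sections (illustrative only)."""

        def __init__(self, timeout: timedelta) -> None:
            self._remaining = timeout.total_seconds()

        async def __aenter__(self) -> timedelta:
            self._started = time.monotonic()
            self._timeout_cm = asyncio.timeout(self._remaining)
            await self._timeout_cm.__aenter__()
            return timedelta(seconds=self._remaining)

        async def __aexit__(self, exc_type, exc, tb) -> bool | None:
            try:
                return await self._timeout_cm.__aexit__(exc_type, exc, tb)
            finally:
                # Whatever this section consumed is gone for the next one.
                self._remaining = max(self._remaining - (time.monotonic() - self._started), 0.0)
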
@@ -167,7 +193,15 @@ class AbstractHttpCrawler(
              kwargs.setdefault('strategy', 'same-hostname')

              links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-             links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+
+             # Get base URL from <base> tag if present
+             extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+             base_url: str = (
+                 str(extracted_base_urls[0])
+                 if extracted_base_urls
+                 else context.request.loaded_url or context.request.url
+             )
+             links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

              if robots_txt_file:
                  skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
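
For context, this is why a <base href> has to win over the page URL when making links absolute, shown with the standard library rather than crawlee code:

    from urllib.parse import urljoin

    page_url = 'https://example.com/articles/2024/post.html'
    base_href = 'https://example.com/'  # value of <base href="..."> on the page

    print(urljoin(page_url, 'next.html'))   # https://example.com/articles/2024/next.html
    print(urljoin(base_href, 'next.html'))  # https://example.com/next.html (what a browser resolves)
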
@@ -214,12 +248,14 @@ class AbstractHttpCrawler(
          Yields:
              The original crawling context enhanced by HTTP response.
          """
-         result = await self._http_client.crawl(
-             request=context.request,
-             session=context.session,
-             proxy_info=context.proxy_info,
-             statistics=self._statistics,
-         )
+         async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+             result = await self._http_client.crawl(
+                 request=context.request,
+                 session=context.session,
+                 proxy_info=context.proxy_info,
+                 statistics=self._statistics,
+                 timeout=remaining_timeout,
+             )

          yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
      async def __aenter__(self) -> Self:
          self._active = True
          await self._state.initialize()
-         self._after_initialize()
          return self

      async def __aexit__(
@@ -149,10 +148,6 @@ class AdaptivePlaywrightCrawler(
                  non-default configuration.
              kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
          """
-         # Some sub crawler kwargs are internally modified. Prepare copies.
-         basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-         basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
          # Adaptive crawling related.
          self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
          self.result_checker = result_checker or (lambda _: True)
@@ -170,11 +165,11 @@ class AdaptivePlaywrightCrawler(
          # Each sub crawler will use custom logger .
          static_logger = getLogger('Subcrawler_static')
          static_logger.setLevel(logging.ERROR)
-         basic_crawler_kwargs_for_static_crawler['_logger'] = static_logger
+         basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

          pw_logger = getLogger('Subcrawler_playwright')
          pw_logger.setLevel(logging.ERROR)
-         basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger
+         basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

          # Initialize sub crawlers to create their pipelines.
          static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -319,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                  ),
                  logger=self._logger,
              )
-             return SubCrawlerRun(result=result)
+             return SubCrawlerRun(result=result, run_context=context_linked_to_result)
          except Exception as e:
              return SubCrawlerRun(exception=e)

@@ -375,7 +370,8 @@ class AdaptivePlaywrightCrawler(
              self.track_http_only_request_handler_runs()

              static_run = await self._crawl_one(rendering_type='static', context=context)
-             if static_run.result and self.result_checker(static_run.result):
+             if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+                 self._update_context_from_copy(context, static_run.run_context)
                  self._context_result_map[context] = static_run.result
                  return
              if static_run.exception:
@@ -406,13 +402,10 @@ class AdaptivePlaywrightCrawler(
          if pw_run.exception is not None:
              raise pw_run.exception

-         if pw_run.result:
-             self._context_result_map[context] = pw_run.result
-
+         if pw_run.result and pw_run.run_context:
              if should_detect_rendering_type:
                  detection_result: RenderingType
                  static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                  if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                      detection_result = 'static'
                  else:
@@ -421,6 +414,9 @@ class AdaptivePlaywrightCrawler(
                  context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                  self.rendering_type_predictor.store_result(context.request, detection_result)

+             self._update_context_from_copy(context, pw_run.run_context)
+             self._context_result_map[context] = pw_run.result
+
      def pre_navigation_hook(
          self,
          hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -455,8 +451,32 @@ class AdaptivePlaywrightCrawler(
      def track_rendering_type_mispredictions(self) -> None:
          self.statistics.state.rendering_type_mispredictions += 1

+     def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+         """Update mutable fields of `context` from `context_copy`.
+
+         Uses object.__setattr__ to bypass frozen dataclass restrictions,
+         allowing state synchronization after isolated crawler execution.
+         """
+         updating_attributes = {
+             'request': ('headers', 'user_data'),
+             'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+         }
+
+         for attr, sub_attrs in updating_attributes.items():
+             original_sub_obj = getattr(context, attr)
+             copy_sub_obj = getattr(context_copy, attr)
+
+             # Check that both sub objects are not None
+             if original_sub_obj is None or copy_sub_obj is None:
+                 continue
+
+             for sub_attr in sub_attrs:
+                 new_value = getattr(copy_sub_obj, sub_attr)
+                 object.__setattr__(original_sub_obj, sub_attr, new_value)
+

  @dataclass(frozen=True)
  class SubCrawlerRun:
      result: RequestHandlerRunResult | None = None
      exception: Exception | None = None
+     run_context: BasicCrawlingContext | None = None
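
_update_context_from_copy relies on object.__setattr__ bypassing the write protection of frozen dataclasses. The technique in isolation (toy example, not crawlee code):

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class Box:
        value: int

    box = Box(value=1)
    # box.value = 2 would raise FrozenInstanceError; this does not:
    object.__setattr__(box, 'value', 2)
    print(box.value)  # 2
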
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
      from playwright.async_api import Page, Response
      from typing_extensions import Self

-     from crawlee.crawlers._playwright._types import BlockRequestsFunction
+     from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


  TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
          http_response = await PlaywrightHttpResponse.from_playwright_response(
              response=context.response, protocol=protocol_guess or ''
          )
-         # block_requests is useful only on pre-navigation contexts. It is useless here.
+         # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
          context_kwargs.pop('block_requests')
+         context_kwargs.pop('goto_options')
          return cls(
              parsed_content=await parser.parse(http_response),
              http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
      block_requests: BlockRequestsFunction | None = None
      """Blocks network requests matching specified URL patterns."""

+     goto_options: GotoOptions | None = None
+     """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
      @property
      def page(self) -> Page:
          """The Playwright `Page` object for the current page.
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
  from __future__ import annotations

  import asyncio
+ import functools
  import logging
  import signal
  import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
  from contextlib import AsyncExitStack, suppress
  from datetime import timedelta
  from functools import partial
+ from io import StringIO
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+ from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
  from urllib.parse import ParseResult, urlparse
  from weakref import WeakKeyDictionary

@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
  from crawlee._types import (
      BasicCrawlingContext,
      EnqueueLinksKwargs,
+     ExportDataCsvKwargs,
+     ExportDataJsonKwargs,
      GetKeyValueStoreFromRequestHandlerFunction,
      HttpHeaders,
      HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
      SkippedReason,
  )
  from crawlee._utils.docs import docs_group
- from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+ from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
  from crawlee._utils.recurring_task import RecurringTask
  from crawlee._utils.robots import RobotsTxtFile
  from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -96,6 +100,9 @@ if TYPE_CHECKING:
  TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
  TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
  TRequestIterator = TypeVar('TRequestIterator', str, Request)
+ TParams = ParamSpec('TParams')
+ T = TypeVar('T')
+
  ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
  FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
  SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -437,14 +444,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self._statistics_log_format = statistics_log_format

          # Statistics
-         self._statistics = statistics or cast(
-             'Statistics[TStatisticsState]',
-             Statistics.with_default_state(
-                 periodic_message_logger=self._logger,
-                 statistics_log_format=self._statistics_log_format,
-                 log_message='Current request statistics:',
-             ),
-         )
+         if statistics:
+             self._statistics = statistics
+         else:
+
+             async def persist_state_factory() -> KeyValueStore:
+                 return await self.get_key_value_store()
+
+             self._statistics = cast(
+                 'Statistics[TStatisticsState]',
+                 Statistics.with_default_state(
+                     persistence_enabled=True,
+                     periodic_message_logger=self._logger,
+                     statistics_log_format=self._statistics_log_format,
+                     log_message='Current request statistics:',
+                     persist_state_kvs_factory=persist_state_factory,
+                 ),
+             )

          # Additional context managers to enter and exit
          self._additional_context_managers = _additional_context_managers or []
@@ -511,6 +527,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
          self._unexpected_stop = True

+     def _wrap_handler_with_error_context(
+         self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+     ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+         """Decorate error handlers to make their context helpers usable."""
+
+         @functools.wraps(handler)
+         async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+             # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+             # failed. Modified context provides context helpers with direct access to the storages.
+             error_context = context.create_modified_copy(
+                 push_data=self._push_data,
+                 get_key_value_store=self.get_key_value_store,
+                 add_requests=functools.partial(self._add_requests, context),
+             )
+             return await handler(error_context, exception)
+
+         return wrapped_handler
+
      def _stop_if_max_requests_count_exceeded(self) -> None:
          """Call `stop` when the maximum number of requests to crawl has been reached."""
          if self._max_requests_per_crawl is None:
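
Seen from user code, the wrapper means error and failed-request handlers can call the context helpers and have them hit storage directly. Illustrative handler (the body is made up; any BasicCrawler subclass behaves the same way):

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import ParselCrawler

    crawler = ParselCrawler()

    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # push_data writes straight to the dataset even though the request failed
        # and its handler result is never committed.
        await context.push_data({'url': context.request.url, 'error': repr(error)})
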
@@ -609,7 +643,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          The error handler is invoked after a request handler error occurs and before a retry attempt.
          """
-         self._error_handler = handler
+         self._error_handler = self._wrap_handler_with_error_context(handler)
          return handler

      def failed_request_handler(
@@ -619,7 +653,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          The failed request handler is invoked when a request has failed all retry attempts.
          """
-         self._failed_request_handler = handler
+         self._failed_request_handler = self._wrap_handler_with_error_context(handler)
          return handler

      def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
@@ -689,7 +723,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          except CancelledError:
              pass
          finally:
-             await self._crawler_state_rec_task.stop()
              if threading.current_thread() is threading.main_thread():
                  with suppress(NotImplementedError):
                      asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -721,8 +754,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
      async def _run_crawler(self) -> None:
          event_manager = self._service_locator.get_event_manager()

-         self._crawler_state_rec_task.start()
-
          # Collect the context managers to be entered. Context managers that are already active are excluded,
          # as they were likely entered by the caller, who will also be responsible for exiting them.
          contexts_to_enter = [
@@ -733,6 +764,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  self._statistics,
                  self._session_pool if self._use_session_pool else None,
                  self._http_client,
+                 self._crawler_state_rec_task,
                  *self._additional_context_managers,
              )
              if cm and getattr(cm, 'active', False) is False
@@ -839,6 +871,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          dataset_id: str | None = None,
          dataset_name: str | None = None,
          dataset_alias: str | None = None,
+         **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
      ) -> None:
          """Export all items from a Dataset to a JSON or CSV file.

@@ -851,6 +884,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              dataset_id: The ID of the Dataset to export from.
              dataset_name: The name of the Dataset to export from (global scope, named storage).
              dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+             additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
          """
          dataset = await Dataset.open(
              id=dataset_id,
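
Usage-wise, the extra keyword arguments are forwarded to the underlying exporter. Assuming ExportDataJsonKwargs and ExportDataCsvKwargs mirror json.dump and csv.writer options, as their names suggest, a call could look like:

    from crawlee.crawlers import BasicCrawler

    async def export_results(crawler: BasicCrawler) -> None:
        await crawler.export_data('results.json', indent=2)      # JSON exporter option
        await crawler.export_data('results.csv', delimiter=';')  # CSV exporter option
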
@@ -860,13 +894,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              configuration=self._service_locator.get_configuration(),
          )

-         path = path if isinstance(path, Path) else Path(path)
-         dst = path.open('w', newline='')
+         path = Path(path)

          if path.suffix == '.csv':
-             await export_csv_to_stream(dataset.iterate_items(), dst)
+             dst = StringIO()
+             csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+             await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+             await atomic_write(path, dst.getvalue())
          elif path.suffix == '.json':
-             await export_json_to_stream(dataset.iterate_items(), dst)
+             dst = StringIO()
+             json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+             await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+             await atomic_write(path, dst.getvalue())
          else:
              raise ValueError(f'Unsupported file extension: {path.suffix}')

@@ -972,6 +1011,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  label=label,
                  user_data=user_data,
                  transform_request_function=transform_request_function,
+                 **kwargs,
              ),
              rq_id=rq_id,
              rq_name=rq_name,
@@ -1035,8 +1075,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              return target_url.hostname == origin_url.hostname

          if strategy == 'same-domain':
-             origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-             target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+             origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+             target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
              return origin_domain == target_domain

          if strategy == 'same-origin':
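
The switch from .domain to .top_domain_under_public_suffix matters because .domain ignores the public suffix, so unrelated sites could pass the same-domain check. Directly with tldextract (the attribute is available in recent tldextract releases):

    import tldextract

    a = tldextract.extract('shop.example.co.uk')
    b = tldextract.extract('example.com')

    print(a.domain, b.domain)                # 'example' 'example'  -> old check: false positive
    print(a.top_domain_under_public_suffix)  # 'example.co.uk'
    print(b.top_domain_under_public_suffix)  # 'example.com'  -> new check compares these
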
@@ -1105,19 +1145,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  except Exception as e:
                      raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                  else:
-                     if new_request is not None:
-                         request = new_request
+                     if new_request is not None and new_request != request:
+                         await request_manager.add_request(new_request)
+                         await self._mark_request_as_handled(request)
+                         return

              await request_manager.reclaim_request(request)
          else:
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
+             await self._mark_request_as_handled(request)
              await self._handle_failed_request(context, error)
              self._statistics.record_request_processing_failure(request.unique_key)

@@ -1166,16 +1201,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
      ) -> None:
          if need_mark and isinstance(request, Request):
-             request_manager = await self.get_request_manager()
-
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
+             await self._mark_request_as_handled(request)
              request.state = RequestState.SKIPPED

          url = request.url if isinstance(request, Request) else request
@@ -1248,52 +1274,46 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              else:
                  yield Request.from_url(url)

-     async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-         result = self._context_result_map[context]
-
-         base_request_manager = await self.get_request_manager()
-
-         origin = context.request.loaded_url or context.request.url
-
-         for add_requests_call in result.add_requests_calls:
-             rq_id = add_requests_call.get('rq_id')
-             rq_name = add_requests_call.get('rq_name')
-             rq_alias = add_requests_call.get('rq_alias')
-             specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
-             if specified_params > 1:
-                 raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
-             if rq_id or rq_name or rq_alias:
-                 request_manager: RequestManager | RequestQueue = await RequestQueue.open(
-                     id=rq_id,
-                     name=rq_name,
-                     alias=rq_alias,
-                     storage_client=self._service_locator.get_storage_client(),
-                     configuration=self._service_locator.get_configuration(),
-                 )
-             else:
-                 request_manager = base_request_manager
-
-             requests = list[Request]()
-
-             base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-             requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+     async def _add_requests(
+         self,
+         context: BasicCrawlingContext,
+         requests: Sequence[str | Request],
+         rq_id: str | None = None,
+         rq_name: str | None = None,
+         rq_alias: str | None = None,
+         **kwargs: Unpack[EnqueueLinksKwargs],
+     ) -> None:
+         """Add requests method aware of the crawling context."""
+         if rq_id or rq_name or rq_alias:
+             request_manager: RequestManager = await RequestQueue.open(
+                 id=rq_id,
+                 name=rq_name,
+                 alias=rq_alias,
+                 storage_client=self._service_locator.get_storage_client(),
+                 configuration=self._service_locator.get_configuration(),
+             )
+         else:
+             request_manager = await self.get_request_manager()

-             enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+         context_aware_requests = list[Request]()
+         base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+         requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+         filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+         for dst_request in filter_requests_iterator:
+             # Update the crawl depth of the request.
+             dst_request.crawl_depth = context.request.crawl_depth + 1

-             filter_requests_iterator = self._enqueue_links_filter_iterator(
-                 requests_iterator, context.request.url, **enqueue_links_kwargs
-             )
+             if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                 context_aware_requests.append(dst_request)

-             for dst_request in filter_requests_iterator:
-                 # Update the crawl depth of the request.
-                 dst_request.crawl_depth = context.request.crawl_depth + 1
+         return await request_manager.add_requests(context_aware_requests)

-                 if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                     requests.append(dst_request)
+     async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+         result = self._context_result_map[context]

-             await request_manager.add_requests(requests)
+         for add_requests_call in result.add_requests_calls:
+             await self._add_requests(context, **add_requests_call)

          for push_data_call in result.push_data_calls:
              await self._push_data(**push_data_call)
@@ -1393,14 +1413,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  raise RequestHandlerError(e, context) from e

              await self._commit_request_handler_result(context)
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
+
+             await self._mark_request_as_handled(request)

              request.state = RequestState.DONE

@@ -1443,14 +1457,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  await request_manager.reclaim_request(request)
                  await self._statistics.error_tracker_retry.add(error=session_error, context=context)
              else:
-                 await wait_for(
-                     lambda: request_manager.mark_request_as_handled(context.request),
-                     timeout=self._internal_timeout,
-                     timeout_message='Marking request as handled timed out after '
-                     f'{self._internal_timeout.total_seconds()} seconds',
-                     logger=self._logger,
-                     max_retries=3,
-                 )
+                 await self._mark_request_as_handled(request)

                  await self._handle_failed_request(context, session_error)
                  self._statistics.record_request_processing_failure(request.unique_key)
@@ -1458,14 +1465,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          except ContextPipelineInterruptedError as interrupted_error:
              self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
+             await self._mark_request_as_handled(request)

          except ContextPipelineInitializationError as initialization_error:
              self._logger.debug(
@@ -1483,12 +1483,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              raise

      async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-         await wait_for(
-             lambda: self._context_pipeline(context, self.router),
-             timeout=self._request_handler_timeout,
-             timeout_message=f'{self._request_handler_timeout_text}'
-             f' {self._request_handler_timeout.total_seconds()} seconds',
-             logger=self._logger,
+         await self._context_pipeline(
+             context,
+             lambda final_context: wait_for(
+                 lambda: self.router(final_context),
+                 timeout=self._request_handler_timeout,
+                 timeout_message=f'{self._request_handler_timeout_text}'
+                 f' {self._request_handler_timeout.total_seconds()} seconds',
+                 logger=self._logger,
+             ),
          )

      def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1636,3 +1639,14 @@
          )

          self._previous_crawler_state = current_state
+
+     async def _mark_request_as_handled(self, request: Request) -> None:
+         request_manager = await self.get_request_manager()
+         await wait_for(
+             lambda: request_manager.mark_request_as_handled(request),
+             timeout=self._internal_timeout,
+             timeout_message='Marking request as handled timed out after '
+             f'{self._internal_timeout.total_seconds()} seconds',
+             logger=self._logger,
+             max_retries=3,
+         )