crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release — this version of crawlee might be problematic.

Files changed (69)
  1. crawlee/_request.py +32 -21
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +67 -24
  4. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  5. crawlee/_utils/recoverable_state.py +32 -8
  6. crawlee/_utils/recurring_task.py +15 -0
  7. crawlee/_utils/robots.py +17 -5
  8. crawlee/_utils/sitemap.py +1 -1
  9. crawlee/_utils/urls.py +9 -2
  10. crawlee/browsers/_browser_pool.py +4 -1
  11. crawlee/browsers/_playwright_browser_controller.py +21 -15
  12. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  13. crawlee/browsers/_types.py +1 -1
  14. crawlee/configuration.py +3 -1
  15. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  16. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
  17. crawlee/crawlers/_basic/_basic_crawler.py +51 -14
  18. crawlee/crawlers/_playwright/_playwright_crawler.py +16 -4
  19. crawlee/events/_event_manager.py +3 -1
  20. crawlee/fingerprint_suite/_header_generator.py +2 -2
  21. crawlee/otel/crawler_instrumentor.py +3 -3
  22. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  23. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  24. crawlee/request_loaders/_sitemap_request_loader.py +22 -4
  25. crawlee/sessions/_session_pool.py +1 -1
  26. crawlee/statistics/_error_snapshotter.py +1 -1
  27. crawlee/statistics/_models.py +32 -1
  28. crawlee/statistics/_statistics.py +24 -33
  29. crawlee/storage_clients/__init__.py +16 -0
  30. crawlee/storage_clients/_base/_storage_client.py +5 -4
  31. crawlee/storage_clients/_file_system/_dataset_client.py +4 -5
  32. crawlee/storage_clients/_file_system/_key_value_store_client.py +4 -5
  33. crawlee/storage_clients/_file_system/_request_queue_client.py +28 -12
  34. crawlee/storage_clients/_file_system/_storage_client.py +2 -2
  35. crawlee/storage_clients/_memory/_dataset_client.py +4 -5
  36. crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
  37. crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
  38. crawlee/storage_clients/_redis/__init__.py +6 -0
  39. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  40. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  41. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  42. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  43. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  44. crawlee/storage_clients/_redis/_utils.py +23 -0
  45. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  46. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  47. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  48. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  49. crawlee/storage_clients/_redis/py.typed +0 -0
  50. crawlee/storage_clients/_sql/__init__.py +6 -0
  51. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  52. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  53. crawlee/storage_clients/_sql/_db_models.py +268 -0
  54. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  55. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  56. crawlee/storage_clients/_sql/_storage_client.py +291 -0
  57. crawlee/storage_clients/_sql/py.typed +0 -0
  58. crawlee/storage_clients/models.py +10 -10
  59. crawlee/storages/_base.py +3 -1
  60. crawlee/storages/_dataset.py +5 -3
  61. crawlee/storages/_key_value_store.py +11 -6
  62. crawlee/storages/_request_queue.py +5 -3
  63. crawlee/storages/_storage_instance_manager.py +54 -68
  64. crawlee/storages/_utils.py +11 -0
  65. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +16 -5
  66. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +69 -47
  67. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
  68. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
  69. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
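
The largest additions in the list above are the new Redis- and SQL-backed storage client packages (crawlee/storage_clients/_redis/ and _sql/, including Lua scripts for atomic Redis queue operations). As a hedged sketch of how a pluggable storage client is wired in, using the in-memory client whose constructor is known — the exact class names and constructor arguments exported by the new Redis/SQL packages are assumptions not confirmed by this file list:

    # Hedged sketch: registering a storage client globally via the service locator.
    # MemoryStorageClient is used because its no-argument constructor is known; the new
    # Redis/SQL clients presumably plug into the same slot, but their names and
    # parameters are not confirmed by this diff.
    from crawlee import service_locator
    from crawlee.storage_clients import MemoryStorageClient

    service_locator.set_storage_client(MemoryStorageClient())
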
@@ -167,7 +167,9 @@ class AbstractHttpCrawler(
  kwargs.setdefault('strategy', 'same-hostname')

  links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
- links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+ links_iterator = to_absolute_url_iterator(
+ context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+ )

  if robots_txt_file:
  skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
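
Both the HTTP and Playwright crawlers now pass the context logger into to_absolute_url_iterator (the helper in crawlee/_utils/urls.py, changed +9 -2 above). A minimal sketch of what such a helper can look like — an illustration of the call shape only, not the library's actual implementation:

    # Illustrative only: resolve extracted hrefs against the page URL and report
    # ones that cannot be resolved through the supplied logger instead of raising.
    from __future__ import annotations

    import logging
    from collections.abc import Iterator
    from urllib.parse import urljoin


    def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: logging.Logger) -> Iterator[str]:
        for url in urls:
            try:
                yield urljoin(base_url, url)
            except ValueError:
                # Malformed hrefs are logged rather than failing the whole batch.
                logger.warning(f'Skipping malformed URL: {url!r}')
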
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
  from parsel import Selector
  from typing_extensions import Self, TypeVar, override

- from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+ from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
  from crawlee._utils.docs import docs_group
  from crawlee._utils.wait import wait_for
  from crawlee.crawlers import (
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
  async def __aenter__(self) -> Self:
  self._active = True
  await self._state.initialize()
- self._after_initialize()
  return self

  async def __aexit__(
@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
  non-default configuration.
  kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
  """
- # Some sub crawler kwargs are internally modified. Prepare copies.
- basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
- basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
  # Adaptive crawling related.
  self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
  self.result_checker = result_checker or (lambda _: True)
  self.result_comparator = result_comparator or create_default_comparator(result_checker)

+ # Set default concurrency settings for browser crawlers if not provided
+ if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+ kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
  super().__init__(statistics=statistics, **kwargs)

  # Sub crawlers related.
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
  # Each sub crawler will use custom logger .
  static_logger = getLogger('Subcrawler_static')
  static_logger.setLevel(logging.ERROR)
- basic_crawler_kwargs_for_static_crawler['_logger'] = static_logger
+ basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

  pw_logger = getLogger('Subcrawler_playwright')
  pw_logger.setLevel(logging.ERROR)
- basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger
+ basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

  # Initialize sub crawlers to create their pipelines.
  static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
  ),
  logger=self._logger,
  )
- return SubCrawlerRun(result=result)
+ return SubCrawlerRun(result=result, run_context=context_linked_to_result)
  except Exception as e:
  return SubCrawlerRun(exception=e)

@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
  self.track_http_only_request_handler_runs()

  static_run = await self._crawl_one(rendering_type='static', context=context)
- if static_run.result and self.result_checker(static_run.result):
+ if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+ self._update_context_from_copy(context, static_run.run_context)
  self._context_result_map[context] = static_run.result
  return
  if static_run.exception:
@@ -402,13 +402,10 @@ class AdaptivePlaywrightCrawler(
  if pw_run.exception is not None:
  raise pw_run.exception

- if pw_run.result:
- self._context_result_map[context] = pw_run.result
-
+ if pw_run.result and pw_run.run_context:
  if should_detect_rendering_type:
  detection_result: RenderingType
  static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
  if static_run.result and self.result_comparator(static_run.result, pw_run.result):
  detection_result = 'static'
  else:
@@ -417,6 +414,9 @@ class AdaptivePlaywrightCrawler(
  context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
  self.rendering_type_predictor.store_result(context.request, detection_result)

+ self._update_context_from_copy(context, pw_run.run_context)
+ self._context_result_map[context] = pw_run.result
+
  def pre_navigation_hook(
  self,
  hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
  def track_rendering_type_mispredictions(self) -> None:
  self.statistics.state.rendering_type_mispredictions += 1

+ def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+ """Update mutable fields of `context` from `context_copy`.
+
+ Uses object.__setattr__ to bypass frozen dataclass restrictions,
+ allowing state synchronization after isolated crawler execution.
+ """
+ updating_attributes = {
+ 'request': ('headers', 'user_data'),
+ 'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+ }
+
+ for attr, sub_attrs in updating_attributes.items():
+ original_sub_obj = getattr(context, attr)
+ copy_sub_obj = getattr(context_copy, attr)
+
+ # Check that both sub objects are not None
+ if original_sub_obj is None or copy_sub_obj is None:
+ continue
+
+ for sub_attr in sub_attrs:
+ new_value = getattr(copy_sub_obj, sub_attr)
+ object.__setattr__(original_sub_obj, sub_attr, new_value)
+

  @dataclass(frozen=True)
  class SubCrawlerRun:
  result: RequestHandlerRunResult | None = None
  exception: Exception | None = None
+ run_context: BasicCrawlingContext | None = None
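
The new _update_context_from_copy helper relies on object.__setattr__ to write back into frozen dataclass instances. A self-contained illustration of that technique with toy types (not crawlee's own classes):

    # Toy demonstration of writing to a frozen dataclass via object.__setattr__,
    # the same mechanism _update_context_from_copy uses above.
    from dataclasses import dataclass, field


    @dataclass(frozen=True)
    class Ctx:
        user_data: dict = field(default_factory=dict)


    original = Ctx()
    updated = Ctx(user_data={'visited': True})

    # original.user_data = ... would raise dataclasses.FrozenInstanceError.
    object.__setattr__(original, 'user_data', updated.user_data)
    assert original.user_data == {'visited': True}
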
@@ -437,14 +437,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  self._statistics_log_format = statistics_log_format

  # Statistics
- self._statistics = statistics or cast(
- 'Statistics[TStatisticsState]',
- Statistics.with_default_state(
- periodic_message_logger=self._logger,
- statistics_log_format=self._statistics_log_format,
- log_message='Current request statistics:',
- ),
- )
+ if statistics:
+ self._statistics = statistics
+ else:
+
+ async def persist_state_factory() -> KeyValueStore:
+ return await self.get_key_value_store()
+
+ self._statistics = cast(
+ 'Statistics[TStatisticsState]',
+ Statistics.with_default_state(
+ persistence_enabled=True,
+ periodic_message_logger=self._logger,
+ statistics_log_format=self._statistics_log_format,
+ log_message='Current request statistics:',
+ persist_state_kvs_factory=persist_state_factory,
+ ),
+ )

  # Additional context managers to enter and exit
  self._additional_context_managers = _additional_context_managers or []
@@ -659,7 +668,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  request_manager = await self.get_request_manager()
  if purge_request_queue and isinstance(request_manager, RequestQueue):
  await request_manager.drop()
- self._request_manager = await RequestQueue.open()
+ self._request_manager = await RequestQueue.open(
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  if requests is not None:
  await self.add_requests(requests)
@@ -686,7 +698,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  except CancelledError:
  pass
  finally:
- await self._crawler_state_rec_task.stop()
  if threading.current_thread() is threading.main_thread():
  with suppress(NotImplementedError):
  asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -718,8 +729,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  async def _run_crawler(self) -> None:
  event_manager = self._service_locator.get_event_manager()

- self._crawler_state_rec_task.start()
-
  # Collect the context managers to be entered. Context managers that are already active are excluded,
  # as they were likely entered by the caller, who will also be responsible for exiting them.
  contexts_to_enter = [
@@ -730,6 +739,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  self._statistics,
  self._session_pool if self._use_session_pool else None,
  self._http_client,
+ self._crawler_state_rec_task,
  *self._additional_context_managers,
  )
  if cm and getattr(cm, 'active', False) is False
@@ -944,6 +954,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
  | None = None,
  requests: Sequence[str | Request] | None = None,
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
  **kwargs: Unpack[EnqueueLinksKwargs],
  ) -> None:
  kwargs.setdefault('strategy', 'same-hostname')
@@ -955,7 +968,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  '`transform_request_function` arguments when `requests` is provided.'
  )
  # Add directly passed requests.
- await context.add_requests(requests or list[str | Request](), **kwargs)
+ await context.add_requests(
+ requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+ )
  else:
  # Add requests from extracted links.
  await context.add_requests(
@@ -964,7 +979,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  label=label,
  user_data=user_data,
  transform_request_function=transform_request_function,
+ **kwargs,
  ),
+ rq_id=rq_id,
+ rq_name=rq_name,
+ rq_alias=rq_alias,
  **kwargs,
  )

@@ -1241,10 +1260,28 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
  result = self._context_result_map[context]

- request_manager = await self.get_request_manager()
+ base_request_manager = await self.get_request_manager()
+
  origin = context.request.loaded_url or context.request.url

  for add_requests_call in result.add_requests_calls:
+ rq_id = add_requests_call.get('rq_id')
+ rq_name = add_requests_call.get('rq_name')
+ rq_alias = add_requests_call.get('rq_alias')
+ specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+ if specified_params > 1:
+ raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+ if rq_id or rq_name or rq_alias:
+ request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+ id=rq_id,
+ name=rq_name,
+ alias=rq_alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )
+ else:
+ request_manager = base_request_manager
+
  requests = list[Request]()

  base_url = url if (url := add_requests_call.get('base_url')) else origin
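
With the new rq_id/rq_name/rq_alias parameters, extracted links can be routed into a request queue other than the crawler's own, and only one of the three may be given per call. A hedged usage sketch — the selector, queue name and crawler class are illustrative, not taken from this diff:

    # Illustrative routing of extracted links into a separate named request queue.
    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()


    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Detail-page links land in a dedicated queue that another crawler can consume later.
        await context.enqueue_links(selector='a.detail', rq_name='detail-pages')
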
@@ -12,6 +12,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar

  from crawlee import service_locator
  from crawlee._request import Request, RequestOptions
+ from crawlee._types import ConcurrencySettings
  from crawlee._utils.blocked import RETRY_CSS_SELECTORS
  from crawlee._utils.docs import docs_group
  from crawlee._utils.robots import RobotsTxtFile
@@ -113,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
  browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
  user_data_dir: Path to a user data directory, which stores browser session data like cookies
  and local storage.
- browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+ browser_type: The type of browser to launch:
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+ the system.
  This option should not be used if `browser_pool` is provided.
  browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
  directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -152,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
  ):
  raise ValueError(
  'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
- '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or'
+ '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
  '`fingerprint_generator` arguments when `browser_pool` is provided.'
  )

@@ -194,6 +198,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

  kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+ # Set default concurrency settings for browser crawlers if not provided
+ if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+ kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
  super().__init__(**kwargs)

  async def _open_page(
@@ -361,7 +369,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
  links_iterator: Iterator[str] = iter(
  [url for element in elements if (url := await element.get_attribute('href')) is not None]
  )
- links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+ links_iterator = to_absolute_url_iterator(
+ context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+ )

  if robots_txt_file:
  skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -489,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
  """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

  browser_type: NotRequired[BrowserType]
- """The type of browser to launch ('chromium', 'firefox', or 'webkit').
+ """The type of browser to launch:
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
  This option should not be used if `browser_pool` is provided."""

  browser_launch_options: NotRequired[Mapping[str, Any]]
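
Taken together, the Playwright-side changes mean a crawler can now target a locally installed Chrome and, unless told otherwise, starts with desired_concurrency=1. A sketch of overriding that default — the values are illustrative:

    # Illustrative: the new 'chrome' browser_type plus an explicit override of the
    # desired_concurrency=1 default that browser-based crawlers now fall back to.
    from crawlee import ConcurrencySettings
    from crawlee.crawlers import PlaywrightCrawler

    crawler = PlaywrightCrawler(
        browser_type='chrome',  # requires Google Chrome to be installed locally
        concurrency_settings=ConcurrencySettings(desired_concurrency=4),
    )
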
@@ -130,11 +130,13 @@ class EventManager:
  if not self._active:
  raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+ # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+ await self._emit_persist_state_event_rec_task.stop()
+ await self._emit_persist_state_event()
  await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
  self._event_emitter.remove_all_listeners()
  self._listener_tasks.clear()
  self._listeners_to_wrappers.clear()
- await self._emit_persist_state_event_rec_task.stop()
  self._active = False

  @overload
@@ -11,9 +11,9 @@ if TYPE_CHECKING:


  def fingerprint_browser_type_from_playwright_browser_type(
- playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+ playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
  ) -> SupportedBrowserType:
- if playwright_browser_type == 'chromium':
+ if playwright_browser_type in {'chromium', 'chrome'}:
  return 'chrome'
  if playwright_browser_type == 'firefox':
  return 'firefox'
@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

  if request_handling_instrumentation:

- async def middlware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
+ async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
  with self._tracer.start_as_current_span(
  name=f'{instance.generator.__name__}, {wrapped.__name__}', # type:ignore[attr-defined] # valid in our context
  attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
  # Handpicked interesting methods to instrument
  self._instrumented.extend(
  [
- (_Middleware, 'action', middlware_wrapper),
- (_Middleware, 'cleanup', middlware_wrapper),
+ (_Middleware, 'action', middleware_wrapper),
+ (_Middleware, 'cleanup', middleware_wrapper),
  (ContextPipeline, '__call__', context_pipeline_wrapper),
  (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
  (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
@@ -5,8 +5,8 @@
  # % endif
  # % if cookiecutter.http_client == 'curl-impersonate'
  # % do extras.append('curl-impersonate')
- # % elif cookiecutter.http_client == 'impit'
- # % do extras.append('impit')
+ # % elif cookiecutter.http_client == 'httpx'
+ # % do extras.append('httpx')
  # % endif

  [project]
@@ -10,4 +10,7 @@ apify
  # % if cookiecutter.http_client == 'curl-impersonate'
  # % do extras.append('curl-impersonate')
  # % endif
+ # % if cookiecutter.http_client == 'httpx'
+ # % do extras.append('httpx')
+ # % endif
  crawlee[{{ extras | join(',') }}]
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
  from pydantic import BaseModel, ConfigDict, Field
  from typing_extensions import override

- from crawlee import Request
+ from crawlee import Request, RequestOptions
  from crawlee._utils.docs import docs_group
  from crawlee._utils.globs import Glob
  from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

  if TYPE_CHECKING:
  import re
- from collections.abc import Sequence
+ from collections.abc import Callable, Sequence
  from types import TracebackType

+ from crawlee import RequestTransformAction
  from crawlee.http_clients import HttpClient
  from crawlee.proxy_configuration import ProxyInfo
  from crawlee.storage_clients.models import ProcessedRequest
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
  class SitemapRequestLoader(RequestLoader):
  """A request loader that reads URLs from sitemap(s).

+ The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+ (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+ Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+ and the `enqueue_links` functionality.
+
  The loader fetches and parses sitemaps in the background, allowing crawling to start
  before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
  exclude: list[re.Pattern[Any] | Glob] | None = None,
  max_buffer_size: int = 200,
  persist_state_key: str | None = None,
+ transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
  ) -> None:
  """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@
  persist_state_key: A key for persisting the loader's state in the KeyValueStore.
  When provided, allows resuming from where it left off after interruption.
  If None, no state persistence occurs.
+ transform_request_function: An optional function to transform requests
+ generated by the loader. It receives `RequestOptions` with `url` and should return either
+ modified `RequestOptions` or a `RequestTransformAction`.
  """
  self._http_client = http_client
  self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@
  self._exclude = exclude
  self._proxy_info = proxy_info
  self._max_buffer_size = max_buffer_size
+ self._transform_request_function = transform_request_function

  # Synchronization for queue operations
  self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@

  async with self._queue_lock:
  url = state.url_queue.popleft()
-
- request = Request.from_url(url)
+ request_option = RequestOptions(url=url)
+ if self._transform_request_function:
+ transform_request_option = self._transform_request_function(request_option)
+ if transform_request_option == 'skip':
+ state.total_count -= 1
+ continue
+ if transform_request_option != 'unchanged':
+ request_option = transform_request_option
+ request = Request.from_url(**request_option)
  state.in_progress.add(request.url)
  if len(state.url_queue) < self._max_buffer_size:
  self._queue_has_capacity.set()
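
Based on the handling above ('skip' drops a URL, 'unchanged' keeps it, anything else replaces the options), a hedged example of the new transform_request_function; the sitemap URL and the HTTP client choice are placeholders:

    # Illustrative transform hook for SitemapRequestLoader.
    from crawlee import RequestOptions, RequestTransformAction
    from crawlee.http_clients import ImpitHttpClient
    from crawlee.request_loaders import SitemapRequestLoader


    def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
        if '/blog/' in options['url']:
            return 'skip'  # drop blog URLs entirely
        options['user_data'] = {'source': 'sitemap'}
        return options


    loader = SitemapRequestLoader(
        sitemap_urls=['https://example.com/sitemap.xml'],  # placeholder
        http_client=ImpitHttpClient(),
        transform_request_function=transform,
    )
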
@@ -163,7 +163,7 @@ class SessionPool:
  def add_session(self, session: Session) -> None:
  """Add an externally created session to the pool.

- This is intened only for the cases when you want to add a session that was created outside of the pool.
+ This is intended only for the cases when you want to add a session that was created outside of the pool.
  Otherwise, the pool will create new sessions automatically.

  Args:
@@ -32,7 +32,7 @@ class ErrorSnapshotter:
  """Capture error snapshot and save it to key value store.

  It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
- it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+ it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
  returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
  an exception.

@@ -1,6 +1,7 @@
  from __future__ import annotations

  import json
+ import warnings
  from dataclasses import asdict, dataclass
  from datetime import datetime, timedelta, timezone
  from typing import Annotated, Any
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
  crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
  crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
  crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
- crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
  errors: dict[str, Any] = Field(default_factory=dict)
  retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
  requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
  ),
  ] = {}

+ # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+ _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+ def model_post_init(self, /, __context: Any) -> None:
+ self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+ @property
+ def crawler_runtime(self) -> timedelta:
+ if self.crawler_last_started_at:
+ finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+ return self._runtime_offset + finished_at - self.crawler_last_started_at
+ return self._runtime_offset
+
+ @crawler_runtime.setter
+ def crawler_runtime(self, value: timedelta) -> None:
+ # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+ # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+ warnings.warn(
+ f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+ f' Value {value} will not be used.',
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ @computed_field(alias='crawlerRuntimeMillis')
+ def crawler_runtime_for_serialization(self) -> timedelta:
+ if self.crawler_last_started_at:
+ finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+ return self._runtime_offset + finished_at - self.crawler_last_started_at
+ return self._runtime_offset
+

  @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms) # type: ignore[prop-decorator]
  def request_total_duration(self) -> timedelta:
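
The runtime bookkeeping above folds previously persisted runtime into _runtime_offset and adds the span of the current run on top. In plain terms (a toy rendering, not library code):

    # Toy rendering of the computation: offset from previous runs + elapsed time of the current run.
    from datetime import datetime, timedelta, timezone

    runtime_offset = timedelta(minutes=10)                                    # persisted by earlier runs
    crawler_last_started_at = datetime(2024, 1, 1, 12, 0, tzinfo=timezone.utc)
    crawler_finished_at = None                                                # crawler still running

    finished_at = crawler_finished_at or datetime.now(timezone.utc)
    crawler_runtime = runtime_offset + (finished_at - crawler_last_started_at)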