crawlee-0.6.13b31-py3-none-any.whl → crawlee-1.1.1b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (82)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +34 -22
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +86 -33
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/system.py +3 -3
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +2 -0
  17. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
  18. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  19. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  22. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  23. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  24. crawlee/crawlers/_basic/_basic_crawler.py +124 -37
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/events/_types.py +6 -6
  28. crawlee/fingerprint_suite/_header_generator.py +2 -2
  29. crawlee/fingerprint_suite/_types.py +2 -2
  30. crawlee/otel/crawler_instrumentor.py +3 -3
  31. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  32. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  33. crawlee/request_loaders/_request_list.py +1 -1
  34. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  35. crawlee/sessions/_models.py +2 -2
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +33 -2
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +13 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
  45. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  46. crawlee/storage_clients/_file_system/_utils.py +0 -0
  47. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  48. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  49. crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
  50. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  51. crawlee/storage_clients/_redis/__init__.py +6 -0
  52. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  53. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  54. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  55. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  56. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  57. crawlee/storage_clients/_redis/_utils.py +23 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  59. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  60. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  61. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  62. crawlee/storage_clients/_redis/py.typed +0 -0
  63. crawlee/storage_clients/_sql/__init__.py +6 -0
  64. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  65. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  66. crawlee/storage_clients/_sql/_db_models.py +268 -0
  67. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  68. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  69. crawlee/storage_clients/_sql/_storage_client.py +291 -0
  70. crawlee/storage_clients/_sql/py.typed +0 -0
  71. crawlee/storage_clients/models.py +10 -10
  72. crawlee/storages/_base.py +5 -1
  73. crawlee/storages/_dataset.py +12 -2
  74. crawlee/storages/_key_value_store.py +17 -4
  75. crawlee/storages/_request_queue.py +10 -2
  76. crawlee/storages/_storage_instance_manager.py +133 -71
  77. crawlee/storages/_utils.py +11 -0
  78. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
  79. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
  80. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
  81. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
  82. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/_autoscaling/snapshotter.py CHANGED
@@ -113,7 +113,7 @@ class Snapshotter:
         Args:
             config: The `Configuration` instance. Uses the global (default) one if not provided.
         """
-        config = service_locator.get_configuration()
+        config = config or service_locator.get_configuration()
 
         # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
         # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
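This one-line fix makes an explicitly passed `Configuration` take precedence instead of being silently replaced by the global one. A minimal sketch of the resulting fallback idiom (the standalone `resolve_config` helper below is illustrative, not part of crawlee; `service_locator` is the module-level instance defined in crawlee/_service_locator.py):

from crawlee import service_locator
from crawlee.configuration import Configuration


def resolve_config(config: Configuration | None = None) -> Configuration:
    # Prefer the caller-supplied Configuration; fall back to the global
    # default held by the service locator, mirroring the fixed line above.
    return config or service_locator.get_configuration()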
crawlee/_request.py CHANGED
@@ -117,6 +117,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
 user_data_adapter = TypeAdapter(UserData)
 
 
+@docs_group('Other')
 class RequestOptions(TypedDict):
     """Options that can be used to customize request creation.
 
@@ -163,7 +164,7 @@ class Request(BaseModel):
     ```
     """
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     unique_key: Annotated[str, Field(alias='uniqueKey')]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
@@ -184,9 +185,6 @@ class Request(BaseModel):
     method: HttpMethod = 'GET'
     """HTTP request method."""
 
-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
-    """HTTP request headers."""
-
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
@@ -194,23 +192,37 @@ class Request(BaseModel):
     ] = None
     """HTTP request payload."""
 
-    user_data: Annotated[
-        dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
-        Field(alias='userData', default_factory=lambda: UserData()),
-        PlainValidator(user_data_adapter.validate_python),
-        PlainSerializer(
-            lambda instance: user_data_adapter.dump_python(
-                instance,
-                by_alias=True,
-                exclude_none=True,
-                exclude_unset=True,
-                exclude_defaults=True,
-            )
-        ),
-    ] = {}
-    """Custom user data assigned to the request. Use this to save any request related data to the
-    request's scope, keeping them accessible on retries, failures etc.
-    """
+    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    if TYPE_CHECKING:
+        headers: HttpHeaders = HttpHeaders()
+        """HTTP request headers."""
+
+        user_data: dict[str, JsonSerializable] = {}
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
+
+    else:
+        headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+        """HTTP request headers."""
+
+        user_data: Annotated[
+            dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
+            Field(alias='userData', default_factory=lambda: UserData()),
+            PlainValidator(user_data_adapter.validate_python),
+            PlainSerializer(
+                lambda instance: user_data_adapter.dump_python(
+                    instance,
+                    by_alias=True,
+                    exclude_none=True,
+                    exclude_unset=True,
+                    exclude_defaults=True,
+                )
+            ),
+        ]
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
 
     retry_count: Annotated[int, Field(alias='retryCount')] = 0
     """Number of times the request has been retried."""
@@ -287,7 +299,7 @@ class Request(BaseModel):
         )
 
         if always_enqueue:
-            unique_key = f'{unique_key}_{crypto_random_object_id()}'
+            unique_key = f'{crypto_random_object_id()}|{unique_key}'
 
         request = cls(
             url=url,
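The `always_enqueue` change moves the random salt from a suffix to a prefix of `unique_key`; the observable behavior stays the same in that every call yields a distinct key, bypassing deduplication. A hedged illustration (assuming the public `Request.from_url` factory, whose tail is shown in the hunk above):

from crawlee import Request

# always_enqueue=True salts the deduplication key with a random ID,
# so two requests for the same URL are both enqueued.
a = Request.from_url('https://example.com', always_enqueue=True)
b = Request.from_url('https://example.com', always_enqueue=True)
assert a.unique_key != b.unique_key  # e.g. '<random-id>|https://example.com'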
crawlee/_service_locator.py CHANGED
@@ -11,6 +11,10 @@ from crawlee.storage_clients import FileSystemStorageClient, StorageClient
 if TYPE_CHECKING:
     from crawlee.storages._storage_instance_manager import StorageInstanceManager
 
+from logging import getLogger
+
+logger = getLogger(__name__)
+
 
 @docs_group('Configuration')
 class ServiceLocator:
@@ -19,23 +23,24 @@ class ServiceLocator:
     All services are initialized to its default value lazily.
     """
 
-    def __init__(self) -> None:
-        self._configuration: Configuration | None = None
-        self._event_manager: EventManager | None = None
-        self._storage_client: StorageClient | None = None
-        self._storage_instance_manager: StorageInstanceManager | None = None
+    global_storage_instance_manager: StorageInstanceManager | None = None
 
-        # Flags to check if the services were already set.
-        self._configuration_was_retrieved = False
-        self._event_manager_was_retrieved = False
-        self._storage_client_was_retrieved = False
+    def __init__(
+        self,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
+        storage_client: StorageClient | None = None,
+    ) -> None:
+        self._configuration = configuration
+        self._event_manager = event_manager
+        self._storage_client = storage_client
 
     def get_configuration(self) -> Configuration:
         """Get the configuration."""
         if self._configuration is None:
+            logger.debug('No configuration set, implicitly creating and using default Configuration.')
            self._configuration = Configuration()
 
-        self._configuration_was_retrieved = True
         return self._configuration
 
     def set_configuration(self, configuration: Configuration) -> None:
@@ -47,7 +52,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the configuration has already been retrieved before.
         """
-        if self._configuration_was_retrieved:
+        if self._configuration is configuration:
+            # Same instance, no need to do anything
+            return
+        if self._configuration:
             raise ServiceConflictError(Configuration, configuration, self._configuration)
 
         self._configuration = configuration
@@ -55,13 +63,14 @@ class ServiceLocator:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-            self._event_manager = (
-                LocalEventManager().from_config(config=self._configuration)
-                if self._configuration
-                else LocalEventManager.from_config()
-            )
+            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
+            if self._configuration is None:
+                logger.debug(
+                    'Implicit creation of event manager will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
+            self._event_manager = LocalEventManager().from_config(config=self._configuration)
 
-        self._event_manager_was_retrieved = True
         return self._event_manager
 
     def set_event_manager(self, event_manager: EventManager) -> None:
@@ -73,7 +82,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the event manager has already been retrieved before.
         """
-        if self._event_manager_was_retrieved:
+        if self._event_manager is event_manager:
+            # Same instance, no need to do anything
+            return
+        if self._event_manager:
             raise ServiceConflictError(EventManager, event_manager, self._event_manager)
 
         self._event_manager = event_manager
@@ -81,9 +93,14 @@ class ServiceLocator:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
+            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+            if self._configuration is None:
+                logger.warning(
+                    'Implicit creation of storage client will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
             self._storage_client = FileSystemStorageClient()
 
-        self._storage_client_was_retrieved = True
         return self._storage_client
 
     def set_storage_client(self, storage_client: StorageClient) -> None:
@@ -95,21 +112,24 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the storage client has already been retrieved before.
         """
-        if self._storage_client_was_retrieved:
+        if self._storage_client is storage_client:
+            # Same instance, no need to do anything
+            return
+        if self._storage_client:
             raise ServiceConflictError(StorageClient, storage_client, self._storage_client)
 
         self._storage_client = storage_client
 
     @property
     def storage_instance_manager(self) -> StorageInstanceManager:
-        """Get the storage instance manager."""
-        if self._storage_instance_manager is None:
+        """Get the storage instance manager. It is a global manager shared by all instances of ServiceLocator."""
+        if ServiceLocator.global_storage_instance_manager is None:
             # Import here to avoid circular imports.
             from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415
 
-            self._storage_instance_manager = StorageInstanceManager()
+            ServiceLocator.global_storage_instance_manager = StorageInstanceManager()
 
-        return self._storage_instance_manager
+        return ServiceLocator.global_storage_instance_manager
 
 
 service_locator = ServiceLocator()
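Taken together, the rewrite changes `ServiceLocator` semantics in three ways: services can be injected via the constructor, `set_*` is now a no-op for the very same instance (raising `ServiceConflictError` only when a different service is already set), and the `StorageInstanceManager` becomes a class-level singleton shared by all locators. A short sketch of the new behavior, based only on the code above:

from crawlee._service_locator import ServiceLocator
from crawlee.configuration import Configuration

config = Configuration()
locator = ServiceLocator(configuration=config)  # injection at construction

locator.set_configuration(config)  # same instance: silently accepted
assert locator.get_configuration() is config

# Setting a *different* Configuration now raises ServiceConflictError,
# because a configuration is already set:
# locator.set_configuration(Configuration())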
crawlee/_types.py CHANGED
@@ -3,17 +3,7 @@ from __future__ import annotations
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
 from dataclasses import dataclass
-from typing import (
-    TYPE_CHECKING,
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    TypedDict,
-    TypeVar,
-    cast,
-    overload,
-)
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
 
@@ -69,13 +59,17 @@ def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
 class HttpHeaders(RootModel, Mapping[str, str]):
     """A dictionary-like object representing HTTP headers."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-    root: Annotated[
-        dict[str, str],
-        PlainValidator(lambda value: _normalize_headers(value)),
-        Field(default_factory=dict),
-    ] = {}
+    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    if TYPE_CHECKING:
+        root: dict[str, str] = {}
+    else:
+        root: Annotated[
+            dict[str, str],
+            PlainValidator(lambda value: _normalize_headers(value)),
+            Field(default_factory=dict),
+        ]
 
     def __getitem__(self, key: str) -> str:
         return self.root[key.lower()]
@@ -110,9 +104,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.
 
@@ -125,21 +119,24 @@ class ConcurrencySettings:
             desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                 if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')
 
         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')
 
+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
         if max_tasks_per_minute <= 0:
            raise ValueError('max_tasks_per_minute must be positive')
 
         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute
 
 
@@ -180,6 +177,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
     requests: Sequence[str | Request]
     """Requests to be added to the `RequestManager`."""
 
+    rq_id: str | None
+    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+
+    rq_name: str | None
+    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
+    rq_alias: str | None
+    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
 
 class PushDataKwargs(TypedDict):
     """Keyword arguments for dataset's `push_data` method."""
@@ -189,6 +197,7 @@ class PushDataFunctionCall(PushDataKwargs):
     data: list[dict[str, Any]] | dict[str, Any]
     dataset_id: str | None
     dataset_name: str | None
+    dataset_alias: str | None
 
 
 class KeyValueStoreInterface(Protocol):
@@ -255,21 +264,30 @@ class RequestHandlerRunResult:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
-        self.key_value_store_changes = dict[tuple[str | None, str | None], KeyValueStoreChangeRecords]()
+        self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
 
     async def add_requests(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         """Track a call to the `add_requests` context helper."""
-        self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs))
+        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+        if specified_params > 1:
+            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+        self.add_requests_calls.append(
+            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+        )
 
     async def push_data(
         self,
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Track a call to the `push_data` context helper."""
@@ -278,6 +296,7 @@ class RequestHandlerRunResult:
                 data=data,
                 dataset_id=dataset_id,
                 dataset_name=dataset_name,
+                dataset_alias=dataset_alias,
                 **kwargs,
             )
         )
@@ -287,13 +306,14 @@ class RequestHandlerRunResult:
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStoreInterface:
-        if (id, name) not in self.key_value_store_changes:
-            self.key_value_store_changes[id, name] = KeyValueStoreChangeRecords(
-                await self._key_value_store_getter(id=id, name=name)
+        if (id, name, alias) not in self.key_value_store_changes:
+            self.key_value_store_changes[id, name, alias] = KeyValueStoreChangeRecords(
+                await self._key_value_store_getter(id=id, name=name, alias=alias)
             )
 
-        return self.key_value_store_changes[id, name]
+        return self.key_value_store_changes[id, name, alias]
 
 
 @docs_group('Functions')
@@ -307,12 +327,21 @@ class AddRequestsFunction(Protocol):
     def __call__(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
 
         Args:
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """
 
@@ -340,12 +369,21 @@ class EnqueueLinksFunction(Protocol):
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
 
     @overload
     def __call__(
-        self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
+        self,
+        *,
+        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
 
     def __call__(
@@ -356,6 +394,9 @@ class EnqueueLinksFunction(Protocol):
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call enqueue links function.
@@ -373,6 +414,12 @@ class EnqueueLinksFunction(Protocol):
                 - `'skip'` to exclude the request from being enqueued,
                 - `'unchanged'` to use the original request options without modification.
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """
 
@@ -424,12 +471,14 @@ class GetKeyValueStoreFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStore]:
         """Call dunder method.
 
         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """
 
 
@@ -444,12 +493,14 @@ class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStoreInterface]:
         """Call dunder method.
 
         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """
 
 
@@ -466,6 +517,7 @@ class PushDataFunction(Protocol):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
@@ -473,7 +525,8 @@ class PushDataFunction(Protocol):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset` to push the data to.
-            dataset_name: The name of the `Dataset` to push the data to.
+            dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).
             **kwargs: Additional keyword arguments.
         """
 
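Two of the changes above are user-visible: `desired_concurrency` now defaults to 10 instead of `min_concurrency`, and it is validated to lie between `min_concurrency` and `max_concurrency` (whose default drops from 200 to 100). A quick illustration of the new checks; the values are arbitrary:

from crawlee import ConcurrencySettings

# Valid: 2 <= 8 <= 50.
settings = ConcurrencySettings(min_concurrency=2, desired_concurrency=8, max_concurrency=50)

# Invalid: desired (10, the new default) exceeds max_concurrency.
try:
    ConcurrencySettings(max_concurrency=5)
except ValueError as error:
    print(error)  # desired_concurrency cannot be greater than max_concurrency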
crawlee/_utils/raise_if_too_many_kwargs.py ADDED
@@ -0,0 +1,12 @@
+from typing import Any
+
+
+def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
+    """Raise ValueError if there are more non-None kwargs than max_kwargs."""
+    none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
+    if len(none_kwargs_names) > max_kwargs:
+        all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
+        raise ValueError(
+            f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
+            f'specified: {", ".join(none_kwargs_names)}.'
+        )
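This new helper centralizes the "at most one of these" validation that the `id`/`name`/`alias` storage selectors need throughout this release. Expected usage, matching the call site in crawlee/_utils/recoverable_state.py below:

from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs

raise_if_too_many_kwargs(dataset_id='abc123', dataset_name=None, dataset_alias=None)  # passes

# Two non-None values with the default max_kwargs=1 raise ValueError:
# 'Only one of "dataset_id", "dataset_name", "dataset_alias" can be specified, ...'
raise_if_too_many_kwargs(dataset_id='abc123', dataset_name='my-results', dataset_alias=None)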
crawlee/_utils/recoverable_state.py CHANGED
@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar
 
 from pydantic import BaseModel
 
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.events._types import Event, EventPersistStateData
 
 if TYPE_CHECKING:
     import logging
+    from collections.abc import Callable, Coroutine
 
-    from crawlee.storages._key_value_store import KeyValueStore
+    from crawlee.storages import KeyValueStore
 
 TStateModel = TypeVar('TStateModel', bound=BaseModel)
 
@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
         persistence_enabled: Literal[True, False, 'explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_kvs_id: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         logger: logging.Logger,
     ) -> None:
         """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
                 If neither a name nor and id are supplied, the default store will be used.
             persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
                 If neither a name nor and id are supplied, the default store will be used.
+            persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+                not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
             logger: A logger instance for logging operations related to state persistence
         """
+        raise_if_too_many_kwargs(
+            persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_id=persist_state_kvs_id,
+            persist_state_kvs_factory=persist_state_kvs_factory,
+        )
+        if not persist_state_kvs_factory:
+            logger.debug(
+                'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+                'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+                'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+                'global side effects.'
+            )
+
         self._default_state = default_state
         self._state_type: type[TStateModel] = self._default_state.__class__
         self._state: TStateModel | None = None
         self._persistence_enabled = persistence_enabled
         self._persist_state_key = persist_state_key
-        self._persist_state_kvs_name = persist_state_kvs_name
-        self._persist_state_kvs_id = persist_state_kvs_id
-        self._key_value_store: 'KeyValueStore | None' = None  # noqa: UP037
+        if persist_state_kvs_factory is None:
+
+            async def kvs_factory() -> KeyValueStore:
+                from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+                return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+
+            self._persist_state_kvs_factory = kvs_factory
+        else:
+            self._persist_state_kvs_factory = persist_state_kvs_factory
+
+        self._key_value_store: KeyValueStore | None = None
         self._log = logger
 
     async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@ class RecoverableState(Generic[TStateModel]):
             return self.current_value
 
         # Import here to avoid circular imports.
-        from crawlee.storages._key_value_store import KeyValueStore  # noqa: PLC0415
 
-        self._key_value_store = await KeyValueStore.open(
-            name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
-        )
+        self._key_value_store = await self._persist_state_kvs_factory()
 
         await self._load_saved_state()
 
crawlee/_utils/recurring_task.py CHANGED
@@ -7,6 +7,9 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from collections.abc import Callable
     from datetime import timedelta
+    from types import TracebackType
+
+    from typing_extensions import Self
 
 logger = getLogger(__name__)
 
@@ -26,6 +29,18 @@ class RecurringTask:
         self.delay = delay
         self.task: asyncio.Task | None = None
 
+    async def __aenter__(self) -> Self:
+        self.start()
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        await self.stop()
+
     async def _wrapper(self) -> None:
         """Continuously execute the provided function with the specified delay.
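With `__aenter__`/`__aexit__` added, a `RecurringTask` can now be scoped with `async with`, which starts the task on entry and awaits `stop()` on exit even if the body raises. A minimal sketch, assuming the pre-existing `start()`/`stop()` methods the new dunders delegate to and a `(function, delay)` constructor:

import asyncio
from datetime import timedelta

from crawlee._utils.recurring_task import RecurringTask


def heartbeat() -> None:
    print('still alive')


async def main() -> None:
    # Entering calls start(); leaving awaits stop(), even on error.
    async with RecurringTask(heartbeat, timedelta(seconds=5)):
        await asyncio.sleep(12)  # heartbeat() fires roughly twice


asyncio.run(main())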