crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the registry.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (102)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/_autoscaling/snapshotter.py CHANGED
@@ -113,7 +113,7 @@ class Snapshotter:
         Args:
             config: The `Configuration` instance. Uses the global (default) one if not provided.
         """
-        config = service_locator.get_configuration()
+        config = config or service_locator.get_configuration()

         # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
         # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
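
The one-line fix above makes the `config` argument effective: previously an explicitly passed `Configuration` was discarded in favor of the global one. A minimal sketch of the now-working call, assuming `Snapshotter(config=...)` as documented in the hunk and the `memory_mbytes` field from Crawlee's `Configuration`:

```python
from crawlee._autoscaling import Snapshotter
from crawlee.configuration import Configuration

# Before the fix, this custom configuration was silently ignored and the
# global (default) configuration was used instead; now it is respected.
custom_config = Configuration(memory_mbytes=4096)
snapshotter = Snapshotter(config=custom_config)
```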
crawlee/_request.py CHANGED
@@ -11,7 +11,7 @@ from yarl import URL

 from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.docs import docs_group
-from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
+from crawlee._utils.requests import compute_unique_key
 from crawlee._utils.urls import validate_http_url

 if TYPE_CHECKING:
@@ -117,6 +117,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):

 user_data_adapter = TypeAdapter(UserData)


+@docs_group('Other')
 class RequestOptions(TypedDict):
     """Options that can be used to customize request creation.

@@ -163,11 +164,7 @@ class Request(BaseModel):
     ```
     """

-    model_config = ConfigDict(populate_by_name=True)
-
-    id: str
-    """A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
-    with `unique_key`."""
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     unique_key: Annotated[str, Field(alias='uniqueKey')]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
@@ -188,9 +185,6 @@ class Request(BaseModel):
     method: HttpMethod = 'GET'
     """HTTP request method."""

-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
-    """HTTP request headers."""
-
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
@@ -198,23 +192,37 @@ class Request(BaseModel):
     ] = None
     """HTTP request payload."""

-    user_data: Annotated[
-        dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
-        Field(alias='userData', default_factory=lambda: UserData()),
-        PlainValidator(user_data_adapter.validate_python),
-        PlainSerializer(
-            lambda instance: user_data_adapter.dump_python(
-                instance,
-                by_alias=True,
-                exclude_none=True,
-                exclude_unset=True,
-                exclude_defaults=True,
-            )
-        ),
-    ] = {}
-    """Custom user data assigned to the request. Use this to save any request related data to the
-    request's scope, keeping them accessible on retries, failures etc.
-    """
+    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    if TYPE_CHECKING:
+        headers: HttpHeaders = HttpHeaders()
+        """HTTP request headers."""
+
+        user_data: dict[str, JsonSerializable] = {}
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
+
+    else:
+        headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+        """HTTP request headers."""
+
+        user_data: Annotated[
+            dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
+            Field(alias='userData', default_factory=lambda: UserData()),
+            PlainValidator(user_data_adapter.validate_python),
+            PlainSerializer(
+                lambda instance: user_data_adapter.dump_python(
+                    instance,
+                    by_alias=True,
+                    exclude_none=True,
+                    exclude_unset=True,
+                    exclude_defaults=True,
+                )
+            ),
+        ]
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """

     retry_count: Annotated[int, Field(alias='retryCount')] = 0
     """Number of times the request has been retried."""
@@ -239,7 +247,6 @@ class Request(BaseModel):
         label: str | None = None,
         session_id: str | None = None,
         unique_key: str | None = None,
-        id: str | None = None,
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
@@ -264,8 +271,6 @@ class Request(BaseModel):
                 raised.
             unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
                 the URL and other parameters. Requests with the same `unique_key` are treated as identical.
-            id: A unique identifier for the request. If not provided, it is automatically generated from the
-                `unique_key`.
             keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
                 the `unique_key` computation. This is only relevant when `unique_key` is not provided.
             use_extended_unique_key: Determines whether to include the HTTP method, ID Session and payload in the
@@ -294,14 +299,11 @@ class Request(BaseModel):
         )

         if always_enqueue:
-            unique_key = f'{unique_key}_{crypto_random_object_id()}'
-
-        id = id or unique_key_to_request_id(unique_key)
+            unique_key = f'{crypto_random_object_id()}|{unique_key}'

         request = cls(
             url=url,
             unique_key=unique_key,
-            id=id,
             method=method,
             headers=headers,
             payload=payload,
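
Two behavioral changes land in `Request.from_url` here: the `id` parameter and field are gone (deduplication now keys solely on `unique_key`), and `always_enqueue` prepends a random prefix to the `unique_key` instead of appending a suffix. A hedged sketch of the new behavior:

```python
from crawlee import Request

# Deduplicated as usual: the same URL yields the same unique_key.
req = Request.from_url('https://example.com')

# Salted with a random prefix (roughly '<random-id>|https://example.com'),
# so repeated calls produce requests the queue treats as distinct.
forced = Request.from_url('https://example.com', always_enqueue=True)
```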
crawlee/_service_locator.py CHANGED
@@ -11,6 +11,10 @@ from crawlee.storage_clients import FileSystemStorageClient, StorageClient

 if TYPE_CHECKING:
     from crawlee.storages._storage_instance_manager import StorageInstanceManager

+from logging import getLogger
+
+logger = getLogger(__name__)
+

 @docs_group('Configuration')
 class ServiceLocator:
@@ -19,23 +23,24 @@ class ServiceLocator:
     All services are initialized to its default value lazily.
     """

-    def __init__(self) -> None:
-        self._configuration: Configuration | None = None
-        self._event_manager: EventManager | None = None
-        self._storage_client: StorageClient | None = None
-        self._storage_instance_manager: StorageInstanceManager | None = None
+    global_storage_instance_manager: StorageInstanceManager | None = None

-        # Flags to check if the services were already set.
-        self._configuration_was_retrieved = False
-        self._event_manager_was_retrieved = False
-        self._storage_client_was_retrieved = False
+    def __init__(
+        self,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
+        storage_client: StorageClient | None = None,
+    ) -> None:
+        self._configuration = configuration
+        self._event_manager = event_manager
+        self._storage_client = storage_client

     def get_configuration(self) -> Configuration:
         """Get the configuration."""
         if self._configuration is None:
+            logger.debug('No configuration set, implicitly creating and using default Configuration.')
             self._configuration = Configuration()

-        self._configuration_was_retrieved = True
         return self._configuration

     def set_configuration(self, configuration: Configuration) -> None:
@@ -47,7 +52,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the configuration has already been retrieved before.
         """
-        if self._configuration_was_retrieved:
+        if self._configuration is configuration:
+            # Same instance, no need to do anything
+            return
+        if self._configuration:
             raise ServiceConflictError(Configuration, configuration, self._configuration)

         self._configuration = configuration
@@ -55,13 +63,14 @@ class ServiceLocator:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-            self._event_manager = (
-                LocalEventManager().from_config(config=self._configuration)
-                if self._configuration
-                else LocalEventManager.from_config()
-            )
+            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
+            if self._configuration is None:
+                logger.debug(
+                    'Implicit creation of event manager will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
+            self._event_manager = LocalEventManager().from_config(config=self._configuration)

-        self._event_manager_was_retrieved = True
         return self._event_manager

     def set_event_manager(self, event_manager: EventManager) -> None:
@@ -73,7 +82,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the event manager has already been retrieved before.
         """
-        if self._event_manager_was_retrieved:
+        if self._event_manager is event_manager:
+            # Same instance, no need to do anything
+            return
+        if self._event_manager:
             raise ServiceConflictError(EventManager, event_manager, self._event_manager)

         self._event_manager = event_manager
@@ -81,9 +93,14 @@ class ServiceLocator:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
+            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+            if self._configuration is None:
+                logger.warning(
+                    'Implicit creation of storage client will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
             self._storage_client = FileSystemStorageClient()

-        self._storage_client_was_retrieved = True
         return self._storage_client

     def set_storage_client(self, storage_client: StorageClient) -> None:
@@ -95,21 +112,24 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the storage client has already been retrieved before.
         """
-        if self._storage_client_was_retrieved:
+        if self._storage_client is storage_client:
+            # Same instance, no need to do anything
+            return
+        if self._storage_client:
             raise ServiceConflictError(StorageClient, storage_client, self._storage_client)

         self._storage_client = storage_client

     @property
     def storage_instance_manager(self) -> StorageInstanceManager:
-        """Get the storage instance manager."""
-        if self._storage_instance_manager is None:
+        """Get the storage instance manager. It is a global manager shared by all instances of ServiceLocator."""
+        if ServiceLocator.global_storage_instance_manager is None:
             # Import here to avoid circular imports.
             from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415

-            self._storage_instance_manager = StorageInstanceManager()
+            ServiceLocator.global_storage_instance_manager = StorageInstanceManager()

-        return self._storage_instance_manager
+        return ServiceLocator.global_storage_instance_manager


 service_locator = ServiceLocator()
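
The retrieval flags are gone: services may now be set even after they have been read, as long as an already-set service is not replaced by a different instance. Re-setting the same object is a no-op, while a conflicting instance still raises. A short sketch of the new semantics:

```python
from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.errors import ServiceConflictError

config = Configuration()
service_locator.set_configuration(config)
service_locator.set_configuration(config)  # Same instance: accepted silently.

try:
    service_locator.set_configuration(Configuration())  # Different instance.
except ServiceConflictError as exc:
    print(f'Conflict: {exc}')
```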
crawlee/_types.py CHANGED
@@ -3,17 +3,7 @@ from __future__ import annotations

 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
 from dataclasses import dataclass
-from typing import (
-    TYPE_CHECKING,
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    TypedDict,
-    TypeVar,
-    cast,
-    overload,
-)
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload

 from pydantic import ConfigDict, Field, PlainValidator, RootModel

@@ -25,7 +15,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence

-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack

     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -69,13 +59,17 @@ def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
 class HttpHeaders(RootModel, Mapping[str, str]):
     """A dictionary-like object representing HTTP headers."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

-    root: Annotated[
-        dict[str, str],
-        PlainValidator(lambda value: _normalize_headers(value)),
-        Field(default_factory=dict),
-    ] = {}
+    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    if TYPE_CHECKING:
+        root: dict[str, str] = {}
+    else:
+        root: Annotated[
+            dict[str, str],
+            PlainValidator(lambda value: _normalize_headers(value)),
+            Field(default_factory=dict),
+        ]

     def __getitem__(self, key: str) -> str:
         return self.root[key.lower()]
@@ -110,9 +104,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.

@@ -125,21 +119,24 @@ class ConcurrencySettings:
             desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                 if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')

         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')

+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
         if max_tasks_per_minute <= 0:
             raise ValueError('max_tasks_per_minute must be positive')

         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute

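Besides the new defaults (`max_concurrency` 200 -> 100, `desired_concurrency` a concrete 10 instead of falling back to `min_concurrency`), `desired_concurrency` is now validated against both bounds. A quick sketch:

```python
from crawlee import ConcurrencySettings

# Valid: min_concurrency <= desired_concurrency <= max_concurrency.
settings = ConcurrencySettings(min_concurrency=2, desired_concurrency=10, max_concurrency=50)

# Newly rejected: desired_concurrency above max_concurrency.
try:
    ConcurrencySettings(desired_concurrency=10, max_concurrency=5)
except ValueError as exc:
    print(exc)  # desired_concurrency cannot be greater than max_concurrency
```

Note that with the new default `desired_concurrency=10`, passing only `max_concurrency=5` would raise as well, so lowering `max_concurrency` below 10 now requires lowering `desired_concurrency` too.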
@@ -180,6 +177,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
     requests: Sequence[str | Request]
     """Requests to be added to the `RequestManager`."""

+    rq_id: str | None
+    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+
+    rq_name: str | None
+    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
+    rq_alias: str | None
+    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+

 class PushDataKwargs(TypedDict):
     """Keyword arguments for dataset's `push_data` method."""
@@ -189,6 +197,7 @@ class PushDataFunctionCall(PushDataKwargs):
     data: list[dict[str, Any]] | dict[str, Any]
     dataset_id: str | None
     dataset_name: str | None
+    dataset_alias: str | None


 class KeyValueStoreInterface(Protocol):
@@ -255,21 +264,30 @@ class RequestHandlerRunResult:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
-        self.key_value_store_changes = dict[tuple[str | None, str | None], KeyValueStoreChangeRecords]()
+        self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()

     async def add_requests(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         """Track a call to the `add_requests` context helper."""
-        self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs))
+        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+        if specified_params > 1:
+            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+        self.add_requests_calls.append(
+            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+        )

     async def push_data(
         self,
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Track a call to the `push_data` context helper."""
@@ -278,6 +296,7 @@ class RequestHandlerRunResult:
                 data=data,
                 dataset_id=dataset_id,
                 dataset_name=dataset_name,
+                dataset_alias=dataset_alias,
                 **kwargs,
             )
         )
@@ -287,13 +306,14 @@ class RequestHandlerRunResult:
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStoreInterface:
-        if (id, name) not in self.key_value_store_changes:
-            self.key_value_store_changes[id, name] = KeyValueStoreChangeRecords(
-                await self._key_value_store_getter(id=id, name=name)
+        if (id, name, alias) not in self.key_value_store_changes:
+            self.key_value_store_changes[id, name, alias] = KeyValueStoreChangeRecords(
+                await self._key_value_store_getter(id=id, name=name, alias=alias)
             )

-        return self.key_value_store_changes[id, name]
+        return self.key_value_store_changes[id, name, alias]


 @docs_group('Functions')
@@ -307,12 +327,21 @@ class AddRequestsFunction(Protocol):
     def __call__(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.

         Args:
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """

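The new `rq_id` / `rq_name` / `rq_alias` parameters, threaded through `add_requests` and `enqueue_links` alike, let a handler route requests to a queue other than the crawler's default; at most one of the three may be given. A hedged sketch inside a request handler (the queue name is illustrative):

```python
# Inside a request handler: send discovered product links to a dedicated
# queue instead of the crawler's default one.
async def request_handler(context) -> None:
    await context.add_requests(
        ['https://example.com/product/1'],
        rq_name='products',  # or rq_id=..., or rq_alias=...; mutually exclusive
    )
```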
@@ -340,12 +369,21 @@ class EnqueueLinksFunction(Protocol):
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...

     @overload
     def __call__(
-        self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
+        self,
+        *,
+        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...

     def __call__(
356
394
  user_data: dict[str, Any] | None = None,
357
395
  transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
358
396
  requests: Sequence[str | Request] | None = None,
397
+ rq_id: str | None = None,
398
+ rq_name: str | None = None,
399
+ rq_alias: str | None = None,
359
400
  **kwargs: Unpack[EnqueueLinksKwargs],
360
401
  ) -> Coroutine[None, None, None]:
361
402
  """Call enqueue links function.
@@ -373,6 +414,12 @@ class EnqueueLinksFunction(Protocol):
                 - `'skip'` to exclude the request from being enqueued,
                 - `'unchanged'` to use the original request options without modification.
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """

@@ -424,12 +471,14 @@ class GetKeyValueStoreFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
    ) -> Coroutine[None, None, KeyValueStore]:
         """Call dunder method.

         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """

@@ -444,12 +493,14 @@ class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStoreInterface]:
         """Call dunder method.

         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """

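The `alias` parameter added to both protocols distinguishes two addressing modes: `name` refers to a named storage shared globally across runs, while `alias` refers to an unnamed storage scoped to the current run. A hedged sketch from inside a request handler (store names are illustrative):

```python
async def request_handler(context) -> None:
    # Named store: persistent, shared globally across runs.
    shared = await context.get_key_value_store(name='site-config')
    rate_limit = await shared.get_value('rate-limit')

    # Aliased store: unnamed, scoped to the current run only.
    scratch = await context.get_key_value_store(alias='scratch')
    await scratch.set_value('last-url', context.request.url)
```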
@@ -466,6 +517,7 @@ class PushDataFunction(Protocol):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
@@ -473,7 +525,8 @@ class PushDataFunction(Protocol):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset` to push the data to.
-            dataset_name: The name of the `Dataset` to push the data to.
+            dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).
             **kwargs: Additional keyword arguments.
         """

@@ -590,6 +643,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)

+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+


 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/context.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations

-import asyncio
+import inspect
 from collections.abc import Callable
 from functools import wraps
 from typing import Any, TypeVar
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:

         return await method(self, *args, **kwargs)

-    return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
+    return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
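
The swap from `asyncio.iscoroutinefunction` to `inspect.iscoroutinefunction` avoids the former's deprecation (it is slated for removal from `asyncio`) and, on Python 3.12+, also recognizes wrappers flagged via `inspect.markcoroutinefunction`. A small illustration:

```python
import inspect
from functools import wraps


def passthrough(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)  # Returns a coroutine when func is async.
    # Mark the sync wrapper so introspection treats it as a coroutine function.
    return inspect.markcoroutinefunction(wrapper)


@passthrough
async def fetch() -> str:
    return 'ok'


print(inspect.iscoroutinefunction(fetch))  # True on Python 3.12+
```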
crawlee/_utils/file.py CHANGED
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
     write_header = True

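The comment in the hunk explains the root cause; the same guard can be reproduced with the standard library alone:

```python
import csv

# On Windows, a file opened in text mode without newline='' translates every
# '\n' written to it into '\r\n'. With csv.writer's default
# lineterminator='\r\n', rows would therefore end in '\r\r\n'; forcing
# lineterminator='\n' leaves the translation to the platform.
with open('out.csv', 'w') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['url', 'status'])
    writer.writerow(['https://example.com', 200])
```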
crawlee/_utils/raise_if_too_many_kwargs.py ADDED
@@ -0,0 +1,12 @@
+from typing import Any
+
+
+def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
+    """Raise ValueError if there are more non-None kwargs than max_kwargs."""
+    none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
+    if len(none_kwargs_names) > max_kwargs:
+        all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
+        raise ValueError(
+            f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
+            f'specified: {", ".join(none_kwargs_names)}.'
+        )
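
A usage sketch for the new helper (argument names are illustrative; the queue-targeting parameters elsewhere in this release are validated exactly this way):

```python
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs

# Passes: only one of the three values is non-None.
raise_if_too_many_kwargs(rq_id='abc123', rq_name=None, rq_alias=None)

# Raises ValueError: two non-None values exceed the default max_kwargs=1.
raise_if_too_many_kwargs(rq_id='abc123', rq_name='products')
```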