crawlee 1.0.0rc1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +2 -1
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +76 -17
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/sitemap.py +3 -1
  7. crawlee/_utils/system.py +3 -3
  8. crawlee/browsers/_playwright_browser_controller.py +20 -14
  9. crawlee/configuration.py +1 -1
  10. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  11. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  12. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  13. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
  14. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  15. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  16. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  17. crawlee/crawlers/_basic/_basic_crawler.py +107 -27
  18. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  19. crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
  20. crawlee/events/_types.py +6 -6
  21. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  22. crawlee/fingerprint_suite/_types.py +2 -2
  23. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  24. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  25. crawlee/request_loaders/_request_list.py +1 -1
  26. crawlee/request_loaders/_request_loader.py +5 -1
  27. crawlee/request_loaders/_sitemap_request_loader.py +228 -48
  28. crawlee/sessions/_models.py +2 -2
  29. crawlee/statistics/_models.py +1 -1
  30. crawlee/storage_clients/__init__.py +12 -0
  31. crawlee/storage_clients/_base/_storage_client.py +13 -0
  32. crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
  33. crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
  34. crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
  35. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  36. crawlee/storage_clients/_file_system/_utils.py +0 -0
  37. crawlee/storage_clients/_memory/_dataset_client.py +14 -2
  38. crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
  39. crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
  40. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  41. crawlee/storage_clients/_sql/__init__.py +6 -0
  42. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  43. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  44. crawlee/storage_clients/_sql/_db_models.py +269 -0
  45. crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
  46. crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
  47. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  48. crawlee/storage_clients/_sql/py.typed +0 -0
  49. crawlee/storage_clients/models.py +10 -10
  50. crawlee/storages/_base.py +3 -1
  51. crawlee/storages/_dataset.py +9 -2
  52. crawlee/storages/_key_value_store.py +9 -2
  53. crawlee/storages/_request_queue.py +7 -2
  54. crawlee/storages/_storage_instance_manager.py +126 -72
  55. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/METADATA +12 -5
  56. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/RECORD +59 -49
  57. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/WHEEL +0 -0
  58. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/entry_points.txt +0 -0
  59. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -113,7 +113,7 @@ class Snapshotter:
113
113
  Args:
114
114
  config: The `Configuration` instance. Uses the global (default) one if not provided.
115
115
  """
116
- config = service_locator.get_configuration()
116
+ config = config or service_locator.get_configuration()
117
117
 
118
118
  # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
119
119
  # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
crawlee/_request.py CHANGED
@@ -117,6 +117,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
117
117
  user_data_adapter = TypeAdapter(UserData)
118
118
 
119
119
 
120
+ @docs_group('Other')
120
121
  class RequestOptions(TypedDict):
121
122
  """Options that can be used to customize request creation.
122
123
 
@@ -163,7 +164,7 @@ class Request(BaseModel):
163
164
  ```
164
165
  """
165
166
 
166
- model_config = ConfigDict(populate_by_name=True)
167
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
167
168
 
168
169
  unique_key: Annotated[str, Field(alias='uniqueKey')]
169
170
  """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
@@ -11,6 +11,10 @@ from crawlee.storage_clients import FileSystemStorageClient, StorageClient
11
11
  if TYPE_CHECKING:
12
12
  from crawlee.storages._storage_instance_manager import StorageInstanceManager
13
13
 
14
+ from logging import getLogger
15
+
16
+ logger = getLogger(__name__)
17
+
14
18
 
15
19
  @docs_group('Configuration')
16
20
  class ServiceLocator:
@@ -19,23 +23,24 @@ class ServiceLocator:
19
23
  All services are initialized to its default value lazily.
20
24
  """
21
25
 
22
- def __init__(self) -> None:
23
- self._configuration: Configuration | None = None
24
- self._event_manager: EventManager | None = None
25
- self._storage_client: StorageClient | None = None
26
- self._storage_instance_manager: StorageInstanceManager | None = None
26
+ global_storage_instance_manager: StorageInstanceManager | None = None
27
27
 
28
- # Flags to check if the services were already set.
29
- self._configuration_was_retrieved = False
30
- self._event_manager_was_retrieved = False
31
- self._storage_client_was_retrieved = False
28
+ def __init__(
29
+ self,
30
+ configuration: Configuration | None = None,
31
+ event_manager: EventManager | None = None,
32
+ storage_client: StorageClient | None = None,
33
+ ) -> None:
34
+ self._configuration = configuration
35
+ self._event_manager = event_manager
36
+ self._storage_client = storage_client
32
37
 
33
38
  def get_configuration(self) -> Configuration:
34
39
  """Get the configuration."""
35
40
  if self._configuration is None:
41
+ logger.warning('No configuration set, implicitly creating and using default Configuration.')
36
42
  self._configuration = Configuration()
37
43
 
38
- self._configuration_was_retrieved = True
39
44
  return self._configuration
40
45
 
41
46
  def set_configuration(self, configuration: Configuration) -> None:
@@ -47,7 +52,10 @@ class ServiceLocator:
47
52
  Raises:
48
53
  ServiceConflictError: If the configuration has already been retrieved before.
49
54
  """
50
- if self._configuration_was_retrieved:
55
+ if self._configuration is configuration:
56
+ # Same instance, no need to do anything
57
+ return
58
+ if self._configuration:
51
59
  raise ServiceConflictError(Configuration, configuration, self._configuration)
52
60
 
53
61
  self._configuration = configuration
@@ -55,13 +63,14 @@ class ServiceLocator:
55
63
  def get_event_manager(self) -> EventManager:
56
64
  """Get the event manager."""
57
65
  if self._event_manager is None:
58
- self._event_manager = (
59
- LocalEventManager().from_config(config=self._configuration)
60
- if self._configuration
61
- else LocalEventManager.from_config()
62
- )
66
+ logger.warning('No event manager set, implicitly creating and using default LocalEventManager.')
67
+ if self._configuration is None:
68
+ logger.warning(
69
+ 'Implicit creation of event manager will implicitly set configuration as side effect. '
70
+ 'It is advised to explicitly first set the configuration instead.'
71
+ )
72
+ self._event_manager = LocalEventManager().from_config(config=self._configuration)
63
73
 
64
- self._event_manager_was_retrieved = True
65
74
  return self._event_manager
66
75
 
67
76
  def set_event_manager(self, event_manager: EventManager) -> None:
@@ -73,7 +82,10 @@ class ServiceLocator:
73
82
  Raises:
74
83
  ServiceConflictError: If the event manager has already been retrieved before.
75
84
  """
76
- if self._event_manager_was_retrieved:
85
+ if self._event_manager is event_manager:
86
+ # Same instance, no need to do anything
87
+ return
88
+ if self._event_manager:
77
89
  raise ServiceConflictError(EventManager, event_manager, self._event_manager)
78
90
 
79
91
  self._event_manager = event_manager
@@ -81,9 +93,14 @@ class ServiceLocator:
81
93
  def get_storage_client(self) -> StorageClient:
82
94
  """Get the storage client."""
83
95
  if self._storage_client is None:
96
+ logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.')
97
+ if self._configuration is None:
98
+ logger.warning(
99
+ 'Implicit creation of storage client will implicitly set configuration as side effect. '
100
+ 'It is advised to explicitly first set the configuration instead.'
101
+ )
84
102
  self._storage_client = FileSystemStorageClient()
85
103
 
86
- self._storage_client_was_retrieved = True
87
104
  return self._storage_client
88
105
 
89
106
  def set_storage_client(self, storage_client: StorageClient) -> None:
@@ -95,21 +112,24 @@ class ServiceLocator:
95
112
  Raises:
96
113
  ServiceConflictError: If the storage client has already been retrieved before.
97
114
  """
98
- if self._storage_client_was_retrieved:
115
+ if self._storage_client is storage_client:
116
+ # Same instance, no need to do anything
117
+ return
118
+ if self._storage_client:
99
119
  raise ServiceConflictError(StorageClient, storage_client, self._storage_client)
100
120
 
101
121
  self._storage_client = storage_client
102
122
 
103
123
  @property
104
124
  def storage_instance_manager(self) -> StorageInstanceManager:
105
- """Get the storage instance manager."""
106
- if self._storage_instance_manager is None:
125
+ """Get the storage instance manager. It is a global manager shared by all instances of ServiceLocator."""
126
+ if ServiceLocator.global_storage_instance_manager is None:
107
127
  # Import here to avoid circular imports.
108
128
  from crawlee.storages._storage_instance_manager import StorageInstanceManager # noqa: PLC0415
109
129
 
110
- self._storage_instance_manager = StorageInstanceManager()
130
+ ServiceLocator.global_storage_instance_manager = StorageInstanceManager()
111
131
 
112
- return self._storage_instance_manager
132
+ return ServiceLocator.global_storage_instance_manager
113
133
 
114
134
 
115
135
  service_locator = ServiceLocator()
crawlee/_types.py CHANGED
@@ -69,7 +69,7 @@ def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
69
69
  class HttpHeaders(RootModel, Mapping[str, str]):
70
70
  """A dictionary-like object representing HTTP headers."""
71
71
 
72
- model_config = ConfigDict(populate_by_name=True)
72
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
73
73
 
74
74
  root: Annotated[
75
75
  dict[str, str],
@@ -110,9 +110,9 @@ class ConcurrencySettings:
110
110
  def __init__(
111
111
  self,
112
112
  min_concurrency: int = 1,
113
- max_concurrency: int = 200,
113
+ max_concurrency: int = 100,
114
114
  max_tasks_per_minute: float = float('inf'),
115
- desired_concurrency: int | None = None,
115
+ desired_concurrency: int = 10,
116
116
  ) -> None:
117
117
  """Initialize a new instance.
118
118
 
@@ -125,21 +125,24 @@ class ConcurrencySettings:
125
125
  desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
126
126
  if there is a large enough supply of them. By default, it is `min_concurrency`.
127
127
  """
128
- if desired_concurrency is not None and desired_concurrency < 1:
129
- raise ValueError('desired_concurrency must be 1 or larger')
130
-
131
128
  if min_concurrency < 1:
132
129
  raise ValueError('min_concurrency must be 1 or larger')
133
130
 
134
131
  if max_concurrency < min_concurrency:
135
132
  raise ValueError('max_concurrency cannot be less than min_concurrency')
136
133
 
134
+ if desired_concurrency < min_concurrency:
135
+ raise ValueError('desired_concurrency cannot be less than min_concurrency')
136
+
137
+ if desired_concurrency > max_concurrency:
138
+ raise ValueError('desired_concurrency cannot be greater than max_concurrency')
139
+
137
140
  if max_tasks_per_minute <= 0:
138
141
  raise ValueError('max_tasks_per_minute must be positive')
139
142
 
140
143
  self.min_concurrency = min_concurrency
141
144
  self.max_concurrency = max_concurrency
142
- self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
145
+ self.desired_concurrency = desired_concurrency
143
146
  self.max_tasks_per_minute = max_tasks_per_minute
144
147
 
145
148
 
@@ -180,6 +183,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
180
183
  requests: Sequence[str | Request]
181
184
  """Requests to be added to the `RequestManager`."""
182
185
 
186
+ rq_id: str | None
187
+ """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
188
+
189
+ rq_name: str | None
190
+ """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
191
+ """
192
+
193
+ rq_alias: str | None
194
+ """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
195
+ """
196
+
183
197
 
184
198
  class PushDataKwargs(TypedDict):
185
199
  """Keyword arguments for dataset's `push_data` method."""
@@ -189,6 +203,7 @@ class PushDataFunctionCall(PushDataKwargs):
189
203
  data: list[dict[str, Any]] | dict[str, Any]
190
204
  dataset_id: str | None
191
205
  dataset_name: str | None
206
+ dataset_alias: str | None
192
207
 
193
208
 
194
209
  class KeyValueStoreInterface(Protocol):
@@ -255,21 +270,30 @@ class RequestHandlerRunResult:
255
270
  self._key_value_store_getter = key_value_store_getter
256
271
  self.add_requests_calls = list[AddRequestsKwargs]()
257
272
  self.push_data_calls = list[PushDataFunctionCall]()
258
- self.key_value_store_changes = dict[tuple[str | None, str | None], KeyValueStoreChangeRecords]()
273
+ self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
259
274
 
260
275
  async def add_requests(
261
276
  self,
262
277
  requests: Sequence[str | Request],
278
+ rq_id: str | None = None,
279
+ rq_name: str | None = None,
280
+ rq_alias: str | None = None,
263
281
  **kwargs: Unpack[EnqueueLinksKwargs],
264
282
  ) -> None:
265
283
  """Track a call to the `add_requests` context helper."""
266
- self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs))
284
+ specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
285
+ if specified_params > 1:
286
+ raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
287
+ self.add_requests_calls.append(
288
+ AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
289
+ )
267
290
 
268
291
  async def push_data(
269
292
  self,
270
293
  data: list[dict[str, Any]] | dict[str, Any],
271
294
  dataset_id: str | None = None,
272
295
  dataset_name: str | None = None,
296
+ dataset_alias: str | None = None,
273
297
  **kwargs: Unpack[PushDataKwargs],
274
298
  ) -> None:
275
299
  """Track a call to the `push_data` context helper."""
@@ -278,6 +302,7 @@ class RequestHandlerRunResult:
278
302
  data=data,
279
303
  dataset_id=dataset_id,
280
304
  dataset_name=dataset_name,
305
+ dataset_alias=dataset_alias,
281
306
  **kwargs,
282
307
  )
283
308
  )
@@ -287,13 +312,14 @@ class RequestHandlerRunResult:
287
312
  *,
288
313
  id: str | None = None,
289
314
  name: str | None = None,
315
+ alias: str | None = None,
290
316
  ) -> KeyValueStoreInterface:
291
- if (id, name) not in self.key_value_store_changes:
292
- self.key_value_store_changes[id, name] = KeyValueStoreChangeRecords(
293
- await self._key_value_store_getter(id=id, name=name)
317
+ if (id, name, alias) not in self.key_value_store_changes:
318
+ self.key_value_store_changes[id, name, alias] = KeyValueStoreChangeRecords(
319
+ await self._key_value_store_getter(id=id, name=name, alias=alias)
294
320
  )
295
321
 
296
- return self.key_value_store_changes[id, name]
322
+ return self.key_value_store_changes[id, name, alias]
297
323
 
298
324
 
299
325
  @docs_group('Functions')
@@ -307,12 +333,21 @@ class AddRequestsFunction(Protocol):
307
333
  def __call__(
308
334
  self,
309
335
  requests: Sequence[str | Request],
336
+ rq_id: str | None = None,
337
+ rq_name: str | None = None,
338
+ rq_alias: str | None = None,
310
339
  **kwargs: Unpack[EnqueueLinksKwargs],
311
340
  ) -> Coroutine[None, None, None]:
312
341
  """Call dunder method.
313
342
 
314
343
  Args:
315
344
  requests: Requests to be added to the `RequestManager`.
345
+ rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
346
+ provided.
347
+ rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
348
+ can be provided.
349
+ rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
350
+ can be provided.
316
351
  **kwargs: Additional keyword arguments.
317
352
  """
318
353
 
@@ -340,12 +375,21 @@ class EnqueueLinksFunction(Protocol):
340
375
  label: str | None = None,
341
376
  user_data: dict[str, Any] | None = None,
342
377
  transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
378
+ rq_id: str | None = None,
379
+ rq_name: str | None = None,
380
+ rq_alias: str | None = None,
343
381
  **kwargs: Unpack[EnqueueLinksKwargs],
344
382
  ) -> Coroutine[None, None, None]: ...
345
383
 
346
384
  @overload
347
385
  def __call__(
348
- self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
386
+ self,
387
+ *,
388
+ requests: Sequence[str | Request] | None = None,
389
+ rq_id: str | None = None,
390
+ rq_name: str | None = None,
391
+ rq_alias: str | None = None,
392
+ **kwargs: Unpack[EnqueueLinksKwargs],
349
393
  ) -> Coroutine[None, None, None]: ...
350
394
 
351
395
  def __call__(
@@ -356,6 +400,9 @@ class EnqueueLinksFunction(Protocol):
356
400
  user_data: dict[str, Any] | None = None,
357
401
  transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
358
402
  requests: Sequence[str | Request] | None = None,
403
+ rq_id: str | None = None,
404
+ rq_name: str | None = None,
405
+ rq_alias: str | None = None,
359
406
  **kwargs: Unpack[EnqueueLinksKwargs],
360
407
  ) -> Coroutine[None, None, None]:
361
408
  """Call enqueue links function.
@@ -373,6 +420,12 @@ class EnqueueLinksFunction(Protocol):
373
420
  - `'skip'` to exclude the request from being enqueued,
374
421
  - `'unchanged'` to use the original request options without modification.
375
422
  requests: Requests to be added to the `RequestManager`.
423
+ rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
424
+ provided.
425
+ rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
426
+ can be provided.
427
+ rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
428
+ can be provided.
376
429
  **kwargs: Additional keyword arguments.
377
430
  """
378
431
 
@@ -424,12 +477,14 @@ class GetKeyValueStoreFunction(Protocol):
424
477
  *,
425
478
  id: str | None = None,
426
479
  name: str | None = None,
480
+ alias: str | None = None,
427
481
  ) -> Coroutine[None, None, KeyValueStore]:
428
482
  """Call dunder method.
429
483
 
430
484
  Args:
431
485
  id: The ID of the `KeyValueStore` to get.
432
- name: The name of the `KeyValueStore` to get.
486
+ name: The name of the `KeyValueStore` to get (global scope, named storage).
487
+ alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
433
488
  """
434
489
 
435
490
 
@@ -444,12 +499,14 @@ class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
444
499
  *,
445
500
  id: str | None = None,
446
501
  name: str | None = None,
502
+ alias: str | None = None,
447
503
  ) -> Coroutine[None, None, KeyValueStoreInterface]:
448
504
  """Call dunder method.
449
505
 
450
506
  Args:
451
507
  id: The ID of the `KeyValueStore` to get.
452
- name: The name of the `KeyValueStore` to get.
508
+ name: The name of the `KeyValueStore` to get (global scope, named storage).
509
+ alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
453
510
  """
454
511
 
455
512
 
@@ -466,6 +523,7 @@ class PushDataFunction(Protocol):
466
523
  data: list[dict[str, Any]] | dict[str, Any],
467
524
  dataset_id: str | None = None,
468
525
  dataset_name: str | None = None,
526
+ dataset_alias: str | None = None,
469
527
  **kwargs: Unpack[PushDataKwargs],
470
528
  ) -> Coroutine[None, None, None]:
471
529
  """Call dunder method.
@@ -473,7 +531,8 @@ class PushDataFunction(Protocol):
473
531
  Args:
474
532
  data: The data to push to the `Dataset`.
475
533
  dataset_id: The ID of the `Dataset` to push the data to.
476
- dataset_name: The name of the `Dataset` to push the data to.
534
+ dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).
535
+ dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).
477
536
  **kwargs: Additional keyword arguments.
478
537
  """
479
538
 
@@ -0,0 +1,12 @@
1
+ from typing import Any
2
+
3
+
4
+ def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
5
+ """Raise ValueError if there are more non-None kwargs than max_kwargs."""
6
+ none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
7
+ if len(none_kwargs_names) > max_kwargs:
8
+ all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
9
+ raise ValueError(
10
+ f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
11
+ f'specified: {", ".join(none_kwargs_names)}.'
12
+ )
crawlee/_utils/sitemap.py CHANGED
@@ -9,6 +9,7 @@ from datetime import datetime, timedelta
9
9
  from hashlib import sha256
10
10
  from logging import getLogger
11
11
  from typing import TYPE_CHECKING, Literal, TypedDict
12
+ from xml.sax import SAXParseException
12
13
  from xml.sax.expatreader import ExpatParser
13
14
  from xml.sax.handler import ContentHandler
14
15
 
@@ -192,7 +193,8 @@ class _XmlSitemapParser:
192
193
 
193
194
  def close(self) -> None:
194
195
  """Clean up resources."""
195
- self._parser.close()
196
+ with suppress(SAXParseException):
197
+ self._parser.close()
196
198
 
197
199
 
198
200
  def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser:
crawlee/_utils/system.py CHANGED
@@ -36,7 +36,7 @@ else:
36
36
  class CpuInfo(BaseModel):
37
37
  """Information about the CPU usage."""
38
38
 
39
- model_config = ConfigDict(populate_by_name=True)
39
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
40
40
 
41
41
  used_ratio: Annotated[float, Field(alias='usedRatio')]
42
42
  """The ratio of CPU currently in use, represented as a float between 0 and 1."""
@@ -51,7 +51,7 @@ class CpuInfo(BaseModel):
51
51
  class MemoryUsageInfo(BaseModel):
52
52
  """Information about the memory usage."""
53
53
 
54
- model_config = ConfigDict(populate_by_name=True)
54
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
55
55
 
56
56
  current_size: Annotated[
57
57
  ByteSize,
@@ -71,7 +71,7 @@ class MemoryUsageInfo(BaseModel):
71
71
  class MemoryInfo(MemoryUsageInfo):
72
72
  """Information about system memory."""
73
73
 
74
- model_config = ConfigDict(populate_by_name=True)
74
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
75
75
 
76
76
  total_size: Annotated[
77
77
  ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from asyncio import Lock
5
6
  from datetime import datetime, timedelta, timezone
6
7
  from typing import TYPE_CHECKING, Any, cast
7
8
 
@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):
77
78
 
78
79
  self._total_opened_pages = 0
79
80
 
81
+ self._context_creation_lock: Lock | None = None
82
+
83
+ async def _get_context_creation_lock(self) -> Lock:
84
+ """Get context checking and creation lock.
85
+
86
+ It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
87
+ memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
88
+ """
89
+ if self._context_creation_lock:
90
+ return self._context_creation_lock
91
+ self._context_creation_lock = Lock()
92
+ return self._context_creation_lock
93
+
80
94
  @property
81
95
  @override
82
96
  def pages(self) -> list[Page]:
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
137
151
  Raises:
138
152
  ValueError: If the browser has reached the maximum number of open pages.
139
153
  """
140
- if not self._browser_context:
141
- self._browser_context = await self._create_browser_context(
142
- browser_new_context_options=browser_new_context_options,
143
- proxy_info=proxy_info,
144
- )
145
-
146
154
  if not self.has_free_capacity:
147
155
  raise ValueError('Cannot open more pages in this browser.')
148
156
 
@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
154
162
  )
155
163
  page = await new_context.new_page()
156
164
  else:
157
- if not self._browser_context:
158
- self._browser_context = await self._create_browser_context(
159
- browser_new_context_options=browser_new_context_options,
160
- proxy_info=proxy_info,
161
- )
165
+ async with await self._get_context_creation_lock():
166
+ if not self._browser_context:
167
+ self._browser_context = await self._create_browser_context(
168
+ browser_new_context_options=browser_new_context_options,
169
+ proxy_info=proxy_info,
170
+ )
162
171
  page = await self._browser_context.new_page()
163
172
 
164
173
  # Handle page close event
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
169
178
  self._last_page_opened_at = datetime.now(timezone.utc)
170
179
 
171
180
  self._total_opened_pages += 1
172
-
173
181
  return page
174
182
 
175
183
  @override
@@ -206,7 +214,6 @@ class PlaywrightBrowserController(BrowserController):
206
214
  `self._fingerprint_generator` is available.
207
215
  """
208
216
  browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
209
-
210
217
  if proxy_info:
211
218
  if browser_new_context_options.get('proxy'):
212
219
  logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
244
251
  browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
245
252
  'extra_http_headers', extra_http_headers
246
253
  )
247
-
248
254
  return await self._browser.new_context(**browser_new_context_options)
crawlee/configuration.py CHANGED
@@ -28,7 +28,7 @@ class Configuration(BaseSettings):
28
28
  Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
29
29
  """
30
30
 
31
- model_config = SettingsConfigDict(populate_by_name=True)
31
+ model_config = SettingsConfigDict(validate_by_name=True, validate_by_alias=True)
32
32
 
33
33
  internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
34
34
  """Timeout for the internal asynchronous operations."""
@@ -34,7 +34,9 @@ TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=St
34
34
 
35
35
  @docs_group('Crawlers')
36
36
  class AbstractHttpCrawler(
37
- Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
37
+ BasicCrawler[TCrawlingContext, StatisticsState],
38
+ ABC,
39
+ Generic[TCrawlingContext, TParseResult, TSelectResult],
38
40
  ):
39
41
  """A web crawler for performing HTTP requests.
40
42
 
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
16
16
 
17
17
 
18
18
  @docs_group('HTTP parsers')
19
- class AbstractHttpParser(Generic[TParseResult, TSelectResult], ABC):
19
+ class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
20
20
  """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""
21
21
 
22
22
  @abstractmethod
@@ -31,7 +31,7 @@ class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
31
31
 
32
32
  @dataclass(frozen=True)
33
33
  @docs_group('Crawling contexts')
34
- class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext):
34
+ class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
35
35
  """The crawling context used by `AbstractHttpCrawler`.
36
36
 
37
37
  It provides access to key objects as well as utility functions for handling crawling tasks.
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
12
12
  from parsel import Selector
13
13
  from typing_extensions import Self, TypeVar, override
14
14
 
15
- from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
15
+ from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
16
16
  from crawlee._utils.docs import docs_group
17
17
  from crawlee._utils.wait import wait_for
18
18
  from crawlee.crawlers import (
@@ -85,8 +85,8 @@ class _NonPersistentStatistics(Statistics):
85
85
 
86
86
  @docs_group('Crawlers')
87
87
  class AdaptivePlaywrightCrawler(
88
- Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
89
88
  BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
89
+ Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
90
90
  ):
91
91
  """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.
92
92
 
@@ -158,6 +158,10 @@ class AdaptivePlaywrightCrawler(
158
158
  self.result_checker = result_checker or (lambda _: True)
159
159
  self.result_comparator = result_comparator or create_default_comparator(result_checker)
160
160
 
161
+ # Set default concurrency settings for browser crawlers if not provided
162
+ if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
163
+ kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
164
+
161
165
  super().__init__(statistics=statistics, **kwargs)
162
166
 
163
167
  # Sub crawlers related.
@@ -12,7 +12,7 @@ from crawlee.statistics import StatisticsState
12
12
  class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
13
13
  """Statistic data about a crawler run with additional information related to adaptive crawling."""
14
14
 
15
- model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants')
15
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
16
16
 
17
17
  http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
18
18
  """Number representing how many times static http based crawling was used."""
@@ -31,7 +31,8 @@ class AdaptiveContextError(RuntimeError):
31
31
  @dataclass(frozen=True)
32
32
  @docs_group('Crawling contexts')
33
33
  class AdaptivePlaywrightCrawlingContext(
34
- Generic[TStaticParseResult, TStaticSelectResult], ParsedHttpCrawlingContext[TStaticParseResult]
34
+ ParsedHttpCrawlingContext[TStaticParseResult],
35
+ Generic[TStaticParseResult, TStaticSelectResult],
35
36
  ):
36
37
  _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]
37
38
  """The crawling context used by `AdaptivePlaywrightCrawler`.
@@ -32,7 +32,7 @@ FeatureVector = tuple[float, float]
32
32
 
33
33
 
34
34
  class RenderingTypePredictorState(BaseModel):
35
- model_config = ConfigDict(populate_by_name=True)
35
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
36
36
 
37
37
  model: Annotated[
38
38
  LogisticRegression,