crawlee-0.6.13b15-py3-none-any.whl → crawlee-1.3.1b3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from importlib import metadata
 
- from ._request import Request, RequestOptions
+ from ._request import Request, RequestOptions, RequestState
  from ._service_locator import service_locator
  from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
  from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
      'HttpHeaders',
      'Request',
      'RequestOptions',
+     'RequestState',
      'RequestTransformAction',
      'SkippedReason',
      'service_locator',
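
With `RequestState` re-exported from the package root, calling code no longer needs to import it from the private `crawlee._request` module. A minimal sketch of what this enables (it relies on the new `UNPROCESSED` default visible in the `crawlee/_request.py` hunks further below):

    from crawlee import Request, RequestState

    request = Request.from_url('https://crawlee.dev')

    # Newly created requests start in the UNPROCESSED lifecycle state; the enum
    # is now part of the public API, so the check needs no private imports.
    assert request.state == RequestState.UNPROCESSED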
crawlee/_autoscaling/snapshotter.py CHANGED
@@ -113,7 +113,7 @@ class Snapshotter:
          Args:
              config: The `Configuration` instance. Uses the global (default) one if not provided.
          """
-         config = service_locator.get_configuration()
+         config = config or service_locator.get_configuration()
 
          # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
          # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
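
The snapshotter change is a one-line fix: an explicitly passed `config` used to be discarded in favour of the global configuration, and now it takes precedence, with the service locator only as a fallback. The pattern in isolation (the `resolve_config` helper below is purely illustrative, not part of the Snapshotter API):

    from crawlee import service_locator
    from crawlee.configuration import Configuration

    def resolve_config(config: Configuration | None = None) -> Configuration:
        # Mirrors the corrected line: a provided Configuration wins; only a
        # missing one falls back to service_locator.get_configuration().
        return config or service_locator.get_configuration()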
crawlee/_browserforge_workaround.py CHANGED
@@ -1,4 +1,8 @@
  # ruff: noqa: N802, PLC0415
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
 
 
  def patch_browserforge() -> None:
@@ -12,7 +16,7 @@ def patch_browserforge() -> None:
      import apify_fingerprint_datapoints
      from browserforge import download
 
-     download.DATA_DIRS: dict[str, Path] = {  # type:ignore[misc]
+     download.DATA_DIRS = {
          'headers': apify_fingerprint_datapoints.get_header_network().parent,
          'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
      }
@@ -20,7 +24,7 @@ def patch_browserforge() -> None:
      def DownloadIfNotExists(**flags: bool) -> None:
          pass
 
-     download.DownloadIfNotExists = DownloadIfNotExists
+     download.DownloadIfNotExists: Callable[..., None] = DownloadIfNotExists
 
      import browserforge.bayesian_network
 
@@ -33,7 +37,7 @@ def patch_browserforge() -> None:
              path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name]
              super().__init__(path)
 
-     browserforge.bayesian_network.BayesianNetwork = BayesianNetwork  # type:ignore[misc]
+     browserforge.bayesian_network.BayesianNetwork: BayesianNetwork = BayesianNetwork
      import browserforge.headers.generator
 
      browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']
crawlee/_request.py CHANGED
@@ -11,7 +11,7 @@ from yarl import URL
  from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
  from crawlee._utils.crypto import crypto_random_object_id
  from crawlee._utils.docs import docs_group
- from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
+ from crawlee._utils.requests import compute_unique_key
  from crawlee._utils.urls import validate_http_url
 
  if TYPE_CHECKING:
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
  class CrawleeRequestData(BaseModel):
      """Crawlee-specific configuration stored in the `user_data`."""
 
-     max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+     max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
      """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
      `BasicCrawler`."""
 
      enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
      """The strategy that was used for enqueuing the request."""
 
-     state: RequestState | None = None
+     state: RequestState = RequestState.UNPROCESSED
      """Describes the request's current lifecycle state."""
 
      session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -93,7 +93,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
      def __delitem__(self, key: str) -> None:
          del self.__pydantic_extra__[key]
 
-     def __iter__(self) -> Iterator[str]:  # type: ignore[override]
+     def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
          yield from self.__pydantic_extra__
 
      def __len__(self) -> int:
@@ -117,6 +117,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
  user_data_adapter = TypeAdapter(UserData)
 
 
+ @docs_group('Other')
  class RequestOptions(TypedDict):
      """Options that can be used to customize request creation.
@@ -136,6 +137,8 @@ class RequestOptions(TypedDict):
      always_enqueue: NotRequired[bool]
      user_data: NotRequired[dict[str, JsonSerializable]]
      no_retry: NotRequired[bool]
+     enqueue_strategy: NotRequired[EnqueueStrategy]
+     max_retries: NotRequired[int | None]
 
 
  @docs_group('Storage data')
@@ -163,13 +166,9 @@ class Request(BaseModel):
      ```
      """
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-     id: str
-     """A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
-     with `unique_key`."""
-
-     unique_key: Annotated[str, Field(alias='uniqueKey')]
+     unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
      """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
      to the same URL.
@@ -181,40 +180,52 @@ class Request(BaseModel):
      and specify which URLs shall be considered equal.
      """
 
-     url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+     url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
      """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
      and fragments."""
 
-     method: HttpMethod = 'GET'
+     method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
      """HTTP request method."""
 
-     headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
-     """HTTP request headers."""
-
      payload: Annotated[
          HttpPayload | None,
          BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
          PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+         Field(frozen=True),
      ] = None
      """HTTP request payload."""
 
-     user_data: Annotated[
-         dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
-         Field(alias='userData', default_factory=lambda: UserData()),
-         PlainValidator(user_data_adapter.validate_python),
-         PlainSerializer(
-             lambda instance: user_data_adapter.dump_python(
-                 instance,
-                 by_alias=True,
-                 exclude_none=True,
-                 exclude_unset=True,
-                 exclude_defaults=True,
-             )
-         ),
-     ] = {}
-     """Custom user data assigned to the request. Use this to save any request related data to the
-     request's scope, keeping them accessible on retries, failures etc.
-     """
+     # Workaround for Pydantic and type checkers when using Annotated with default_factory
+     if TYPE_CHECKING:
+         headers: HttpHeaders = HttpHeaders()
+         """HTTP request headers."""
+
+         user_data: dict[str, JsonSerializable] = {}
+         """Custom user data assigned to the request. Use this to save any request related data to the
+         request's scope, keeping them accessible on retries, failures etc.
+         """
+
+     else:
+         headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+         """HTTP request headers."""
+
+         user_data: Annotated[
+             dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
+             Field(alias='userData', default_factory=lambda: UserData()),
+             PlainValidator(user_data_adapter.validate_python),
+             PlainSerializer(
+                 lambda instance: user_data_adapter.dump_python(
+                     instance,
+                     by_alias=True,
+                     exclude_none=True,
+                     exclude_unset=True,
+                     exclude_defaults=True,
+                 )
+             ),
+         ]
+         """Custom user data assigned to the request. Use this to save any request related data to the
+         request's scope, keeping them accessible on retries, failures etc.
+         """
 
      retry_count: Annotated[int, Field(alias='retryCount')] = 0
      """Number of times the request has been retried."""
@@ -239,10 +250,11 @@ class Request(BaseModel):
          label: str | None = None,
          session_id: str | None = None,
          unique_key: str | None = None,
-         id: str | None = None,
          keep_url_fragment: bool = False,
          use_extended_unique_key: bool = False,
          always_enqueue: bool = False,
+         enqueue_strategy: EnqueueStrategy | None = None,
+         max_retries: int | None = None,
          **kwargs: Any,
      ) -> Self:
          """Create a new `Request` instance from a URL.
@@ -264,14 +276,15 @@ class Request(BaseModel):
                  raised.
              unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
                  the URL and other parameters. Requests with the same `unique_key` are treated as identical.
-             id: A unique identifier for the request. If not provided, it is automatically generated from the
-                 `unique_key`.
              keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
                  the `unique_key` computation. This is only relevant when `unique_key` is not provided.
              use_extended_unique_key: Determines whether to include the HTTP method, ID Session and payload in the
                  `unique_key` computation. This is only relevant when `unique_key` is not provided.
              always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                  Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+             enqueue_strategy: The strategy that will be used for enqueuing the request.
+             max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                 option of `BasicCrawler`.
              **kwargs: Additional request properties.
          """
          if unique_key is not None and always_enqueue:
@@ -294,17 +307,29 @@ class Request(BaseModel):
          )
 
          if always_enqueue:
-             unique_key = f'{unique_key}_{crypto_random_object_id()}'
+             unique_key = f'{crypto_random_object_id()}|{unique_key}'
+
+         user_data_dict = kwargs.pop('user_data', {}) or {}
+         crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+         if max_retries is not None:
+             crawlee_data_dict['maxRetries'] = max_retries
 
-         id = id or unique_key_to_request_id(unique_key)
+         if enqueue_strategy is not None:
+             crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+         crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+         if crawlee_data:
+             user_data_dict['__crawlee'] = crawlee_data
 
          request = cls(
              url=url,
              unique_key=unique_key,
-             id=id,
              method=method,
              headers=headers,
              payload=payload,
+             user_data=user_data_dict,
              **kwargs,
          )
@@ -350,7 +375,7 @@ class Request(BaseModel):
          self.crawlee_data.crawl_depth = new_value
 
      @property
-     def state(self) -> RequestState | None:
+     def state(self) -> RequestState:
          """Crawlee-specific request handling state."""
          return self.crawlee_data.state
@@ -363,10 +388,6 @@ class Request(BaseModel):
          """Crawlee-specific limit on the number of retries of the request."""
          return self.crawlee_data.max_retries
 
-     @max_retries.setter
-     def max_retries(self, new_max_retries: int) -> None:
-         self.crawlee_data.max_retries = new_max_retries
-
      @property
      def session_rotation_count(self) -> int | None:
          """Crawlee-specific number of finished session rotations for the request."""
crawlee/_service_locator.py CHANGED
@@ -11,6 +11,10 @@ from crawlee.storage_clients import FileSystemStorageClient, StorageClient
  if TYPE_CHECKING:
      from crawlee.storages._storage_instance_manager import StorageInstanceManager
 
+ from logging import getLogger
+
+ logger = getLogger(__name__)
+
 
  @docs_group('Configuration')
  class ServiceLocator:
@@ -19,23 +23,24 @@ class ServiceLocator:
      All services are initialized to its default value lazily.
      """
 
-     def __init__(self) -> None:
-         self._configuration: Configuration | None = None
-         self._event_manager: EventManager | None = None
-         self._storage_client: StorageClient | None = None
-         self._storage_instance_manager: StorageInstanceManager | None = None
+     global_storage_instance_manager: StorageInstanceManager | None = None
 
-         # Flags to check if the services were already set.
-         self._configuration_was_retrieved = False
-         self._event_manager_was_retrieved = False
-         self._storage_client_was_retrieved = False
+     def __init__(
+         self,
+         configuration: Configuration | None = None,
+         event_manager: EventManager | None = None,
+         storage_client: StorageClient | None = None,
+     ) -> None:
+         self._configuration = configuration
+         self._event_manager = event_manager
+         self._storage_client = storage_client
 
      def get_configuration(self) -> Configuration:
          """Get the configuration."""
          if self._configuration is None:
+             logger.debug('No configuration set, implicitly creating and using default Configuration.')
              self._configuration = Configuration()
 
-         self._configuration_was_retrieved = True
          return self._configuration
 
      def set_configuration(self, configuration: Configuration) -> None:
@@ -47,7 +52,10 @@ class ServiceLocator:
          Raises:
              ServiceConflictError: If the configuration has already been retrieved before.
          """
-         if self._configuration_was_retrieved:
+         if self._configuration is configuration:
+             # Same instance, no need to anything
+             return
+         if self._configuration:
              raise ServiceConflictError(Configuration, configuration, self._configuration)
 
          self._configuration = configuration
@@ -55,13 +63,14 @@ class ServiceLocator:
      def get_event_manager(self) -> EventManager:
          """Get the event manager."""
          if self._event_manager is None:
-             self._event_manager = (
-                 LocalEventManager().from_config(config=self._configuration)
-                 if self._configuration
-                 else LocalEventManager.from_config()
-             )
+             logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
+             if self._configuration is None:
+                 logger.debug(
+                     'Implicit creation of event manager will implicitly set configuration as side effect. '
+                     'It is advised to explicitly first set the configuration instead.'
+                 )
+             self._event_manager = LocalEventManager().from_config(config=self._configuration)
 
-         self._event_manager_was_retrieved = True
          return self._event_manager
 
      def set_event_manager(self, event_manager: EventManager) -> None:
@@ -73,7 +82,10 @@ class ServiceLocator:
          Raises:
              ServiceConflictError: If the event manager has already been retrieved before.
          """
-         if self._event_manager_was_retrieved:
+         if self._event_manager is event_manager:
+             # Same instance, no need to anything
+             return
+         if self._event_manager:
              raise ServiceConflictError(EventManager, event_manager, self._event_manager)
 
          self._event_manager = event_manager
@@ -81,9 +93,14 @@ class ServiceLocator:
      def get_storage_client(self) -> StorageClient:
          """Get the storage client."""
          if self._storage_client is None:
+             logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+             if self._configuration is None:
+                 logger.warning(
+                     'Implicit creation of storage client will implicitly set configuration as side effect. '
+                     'It is advised to explicitly first set the configuration instead.'
+                 )
              self._storage_client = FileSystemStorageClient()
 
-         self._storage_client_was_retrieved = True
          return self._storage_client
 
      def set_storage_client(self, storage_client: StorageClient) -> None:
@@ -95,21 +112,24 @@ class ServiceLocator:
          Raises:
              ServiceConflictError: If the storage client has already been retrieved before.
          """
-         if self._storage_client_was_retrieved:
+         if self._storage_client is storage_client:
+             # Same instance, no need to anything
+             return
+         if self._storage_client:
              raise ServiceConflictError(StorageClient, storage_client, self._storage_client)
 
          self._storage_client = storage_client
 
      @property
      def storage_instance_manager(self) -> StorageInstanceManager:
-         """Get the storage instance manager."""
-         if self._storage_instance_manager is None:
+         """Get the storage instance manager. It is global manager shared by all instances of ServiceLocator."""
+         if ServiceLocator.global_storage_instance_manager is None:
              # Import here to avoid circular imports.
              from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415
 
-             self._storage_instance_manager = StorageInstanceManager()
+             ServiceLocator.global_storage_instance_manager = StorageInstanceManager()
 
-         return self._storage_instance_manager
+         return ServiceLocator.global_storage_instance_manager
 
 
  service_locator = ServiceLocator()
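
The rewritten `ServiceLocator` replaces the `*_was_retrieved` flags with identity checks: re-setting the same instance is a no-op, setting a different instance once one exists raises `ServiceConflictError`, and services can now be injected directly through the constructor. A hedged sketch of the new behaviour using the global `service_locator` (values are illustrative):

    from crawlee import service_locator
    from crawlee.configuration import Configuration

    config = Configuration()
    service_locator.set_configuration(config)

    # Setting the very same instance again is silently accepted...
    service_locator.set_configuration(config)

    # ...while a different Configuration instance would now raise
    # ServiceConflictError, since one is already registered.
    assert service_locator.get_configuration() is config

For code that builds its own locator, the new `configuration`, `event_manager`, and `storage_client` constructor parameters serve the same purpose as calling the setters up front.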