apify 2.7.1b17__py3-none-any.whl → 2.7.1b18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/_actor.py +132 -79
- apify/_configuration.py +36 -5
- apify/storage_clients/_apify/_dataset_client.py +14 -13
- apify/storage_clients/_apify/_key_value_store_client.py +15 -13
- apify/storage_clients/_apify/_request_queue_client.py +14 -13
- apify/storage_clients/_apify/_storage_client.py +26 -25
- apify/storage_clients/_apify/_utils.py +142 -92
- {apify-2.7.1b17.dist-info → apify-2.7.1b18.dist-info}/METADATA +2 -2
- {apify-2.7.1b17.dist-info → apify-2.7.1b18.dist-info}/RECORD +11 -11
- {apify-2.7.1b17.dist-info → apify-2.7.1b18.dist-info}/WHEEL +0 -0
- {apify-2.7.1b17.dist-info → apify-2.7.1b18.dist-info}/licenses/LICENSE +0 -0
apify/_actor.py
CHANGED
|
@@ -5,6 +5,7 @@ import os
|
|
|
5
5
|
import sys
|
|
6
6
|
from contextlib import suppress
|
|
7
7
|
from datetime import datetime, timedelta, timezone
|
|
8
|
+
from functools import cached_property
|
|
8
9
|
from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload
|
|
9
10
|
|
|
10
11
|
from lazy_object_proxy import Proxy
|
|
@@ -14,6 +15,7 @@ from pydantic import AliasChoices
|
|
|
14
15
|
from apify_client import ApifyClientAsync
|
|
15
16
|
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
|
|
16
17
|
from crawlee import service_locator
|
|
18
|
+
from crawlee.errors import ServiceConflictError
|
|
17
19
|
from crawlee.events import (
|
|
18
20
|
Event,
|
|
19
21
|
EventAbortingData,
|
|
@@ -23,6 +25,7 @@ from crawlee.events import (
|
|
|
23
25
|
EventPersistStateData,
|
|
24
26
|
EventSystemInfoData,
|
|
25
27
|
)
|
|
28
|
+
from crawlee.storage_clients import FileSystemStorageClient
|
|
26
29
|
|
|
27
30
|
from apify._charging import ChargeResult, ChargingManager, ChargingManagerImplementation
|
|
28
31
|
from apify._configuration import Configuration
|
|
@@ -34,6 +37,7 @@ from apify._utils import docs_group, docs_name, get_system_info, is_running_in_i
|
|
|
34
37
|
from apify.events import ApifyEventManager, EventManager, LocalEventManager
|
|
35
38
|
from apify.log import _configure_logging, logger
|
|
36
39
|
from apify.storage_clients import ApifyStorageClient
|
|
40
|
+
from apify.storage_clients._file_system import ApifyFileSystemStorageClient
|
|
37
41
|
from apify.storages import Dataset, KeyValueStore, RequestQueue
|
|
38
42
|
|
|
39
43
|
if TYPE_CHECKING:
|
|
@@ -119,28 +123,15 @@ class _ActorType:
|
|
|
119
123
|
self._exit_process = self._get_default_exit_process() if exit_process is None else exit_process
|
|
120
124
|
self._is_exiting = False
|
|
121
125
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
126
|
+
# Actor state when this method is being executed is unpredictable.
|
|
127
|
+
# Actor can be initialized by lazy object proxy or by user directly, or by both.
|
|
128
|
+
# Until `init` method is run, this state of uncertainty remains. This is the reason why any setting done here in
|
|
129
|
+
# `__init__` method should not be considered final.
|
|
125
130
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
self.
|
|
129
|
-
|
|
130
|
-
# Set the event manager based on whether the Actor is running on the platform or locally.
|
|
131
|
-
self._event_manager = (
|
|
132
|
-
ApifyEventManager(
|
|
133
|
-
configuration=self._configuration,
|
|
134
|
-
persist_state_interval=self._configuration.persist_state_interval,
|
|
135
|
-
)
|
|
136
|
-
if self.is_at_home()
|
|
137
|
-
else LocalEventManager(
|
|
138
|
-
system_info_interval=self._configuration.system_info_interval,
|
|
139
|
-
persist_state_interval=self._configuration.persist_state_interval,
|
|
140
|
-
)
|
|
141
|
-
)
|
|
142
|
-
|
|
143
|
-
self._charging_manager = ChargingManagerImplementation(self._configuration, self._apify_client)
|
|
131
|
+
self._configuration = configuration
|
|
132
|
+
self._configure_logging = configure_logging
|
|
133
|
+
self._apify_client: ApifyClientAsync | None = None
|
|
134
|
+
self._local_storage_client: StorageClient | None = None
|
|
144
135
|
|
|
145
136
|
self._is_initialized = False
|
|
146
137
|
|
|
@@ -198,32 +189,76 @@ class _ActorType:
|
|
|
198
189
|
@property
|
|
199
190
|
def apify_client(self) -> ApifyClientAsync:
|
|
200
191
|
"""The ApifyClientAsync instance the Actor instance uses."""
|
|
192
|
+
if not self._apify_client:
|
|
193
|
+
self._apify_client = self.new_client()
|
|
201
194
|
return self._apify_client
|
|
202
195
|
|
|
203
|
-
@
|
|
196
|
+
@cached_property
|
|
204
197
|
def configuration(self) -> Configuration:
|
|
205
198
|
"""The Configuration instance the Actor instance uses."""
|
|
206
|
-
|
|
199
|
+
if self._configuration:
|
|
200
|
+
return self._configuration
|
|
201
|
+
|
|
202
|
+
try:
|
|
203
|
+
# Set implicit default Apify configuration, unless configuration was already set.
|
|
204
|
+
implicit_configuration = Configuration()
|
|
205
|
+
service_locator.set_configuration(implicit_configuration)
|
|
206
|
+
self._configuration = implicit_configuration
|
|
207
|
+
except ServiceConflictError:
|
|
208
|
+
self.log.debug(
|
|
209
|
+
'Configuration in service locator was set explicitly before Actor.init was called.'
|
|
210
|
+
'Using the existing configuration as implicit configuration for the Actor.'
|
|
211
|
+
)
|
|
207
212
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
"""The Configuration instance the Actor instance uses."""
|
|
213
|
+
# Use the configuration from the service locator
|
|
214
|
+
self._configuration = Configuration.get_global_configuration()
|
|
211
215
|
return self._configuration
|
|
212
216
|
|
|
213
|
-
@
|
|
217
|
+
@cached_property
|
|
214
218
|
def event_manager(self) -> EventManager:
|
|
215
219
|
"""The EventManager instance the Actor instance uses."""
|
|
216
|
-
return
|
|
220
|
+
return (
|
|
221
|
+
ApifyEventManager(
|
|
222
|
+
configuration=self.configuration,
|
|
223
|
+
persist_state_interval=self.configuration.persist_state_interval,
|
|
224
|
+
)
|
|
225
|
+
if self.is_at_home()
|
|
226
|
+
else LocalEventManager(
|
|
227
|
+
system_info_interval=self.configuration.system_info_interval,
|
|
228
|
+
persist_state_interval=self.configuration.persist_state_interval,
|
|
229
|
+
)
|
|
230
|
+
)
|
|
217
231
|
|
|
218
232
|
@property
|
|
219
233
|
def log(self) -> logging.Logger:
|
|
220
234
|
"""The logging.Logger instance the Actor uses."""
|
|
221
235
|
return logger
|
|
222
236
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
237
|
+
def _get_local_storage_client(self) -> StorageClient:
|
|
238
|
+
"""Get the local storage client the Actor instance uses."""
|
|
239
|
+
if self._local_storage_client:
|
|
240
|
+
return self._local_storage_client
|
|
241
|
+
|
|
242
|
+
try:
|
|
243
|
+
# Set implicit default local storage client, unless local storage client was already set.
|
|
244
|
+
implicit_storage_client = ApifyFileSystemStorageClient()
|
|
245
|
+
service_locator.set_storage_client(implicit_storage_client)
|
|
246
|
+
self._local_storage_client = implicit_storage_client
|
|
247
|
+
except ServiceConflictError:
|
|
248
|
+
self.log.debug(
|
|
249
|
+
'Storage client in service locator was set explicitly before Actor.init was called.'
|
|
250
|
+
'Using the existing storage client as implicit storage client for the Actor.'
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
self._local_storage_client = service_locator.get_storage_client()
|
|
254
|
+
if type(self._local_storage_client) is FileSystemStorageClient:
|
|
255
|
+
self.log.warning(
|
|
256
|
+
f'Using {FileSystemStorageClient.__module__}.{FileSystemStorageClient.__name__} in Actor context is not'
|
|
257
|
+
f' recommended and can lead to problems with reading the input file. Use '
|
|
258
|
+
f'`apify.storage_clients.FileSystemStorageClient` instead.'
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
return self._local_storage_client
|
|
227
262
|
|
|
228
263
|
def _raise_if_not_initialized(self) -> None:
|
|
229
264
|
if not self._is_initialized:
|
|
@@ -233,7 +268,7 @@ class _ActorType:
|
|
|
233
268
|
if not force_cloud:
|
|
234
269
|
return
|
|
235
270
|
|
|
236
|
-
if not self.is_at_home() and self.
|
|
271
|
+
if not self.is_at_home() and self.configuration.token is None:
|
|
237
272
|
raise RuntimeError(
|
|
238
273
|
'In order to use the Apify cloud storage from your computer, '
|
|
239
274
|
'you need to provide an Apify token using the APIFY_TOKEN environment variable.'
|
|
@@ -250,12 +285,23 @@ class _ActorType:
|
|
|
250
285
|
This method should be called immediately before performing any additional Actor actions, and it should be
|
|
251
286
|
called only once.
|
|
252
287
|
"""
|
|
288
|
+
if self._configuration:
|
|
289
|
+
# Set explicitly the configuration in the service locator
|
|
290
|
+
service_locator.set_configuration(self.configuration)
|
|
291
|
+
else:
|
|
292
|
+
# Ensure that the configuration (cached property) is set
|
|
293
|
+
_ = self.configuration
|
|
294
|
+
|
|
253
295
|
if self._is_initialized:
|
|
254
296
|
raise RuntimeError('The Actor was already initialized!')
|
|
255
297
|
|
|
256
298
|
if _ActorType._is_any_instance_initialized:
|
|
257
299
|
self.log.warning('Repeated Actor initialization detected - this is non-standard usage, proceed with care')
|
|
258
300
|
|
|
301
|
+
# Create an instance of the cloud storage client, the local storage client is obtained
|
|
302
|
+
# from the service locator
|
|
303
|
+
self._cloud_storage_client = ApifyStorageClient()
|
|
304
|
+
|
|
259
305
|
# Make sure that the currently initialized instance is also available through the global `Actor` proxy
|
|
260
306
|
cast('Proxy', Actor).__wrapped__ = self
|
|
261
307
|
|
|
@@ -265,9 +311,11 @@ class _ActorType:
|
|
|
265
311
|
# If the Actor is running on the Apify platform, we set the cloud storage client.
|
|
266
312
|
if self.is_at_home():
|
|
267
313
|
service_locator.set_storage_client(self._cloud_storage_client)
|
|
314
|
+
self._local_storage_client = self._cloud_storage_client
|
|
315
|
+
else:
|
|
316
|
+
self._get_local_storage_client()
|
|
268
317
|
|
|
269
318
|
service_locator.set_event_manager(self.event_manager)
|
|
270
|
-
service_locator.set_configuration(self.configuration)
|
|
271
319
|
|
|
272
320
|
# The logging configuration has to be called after all service_locator set methods.
|
|
273
321
|
if self._configure_logging:
|
|
@@ -279,10 +327,10 @@ class _ActorType:
|
|
|
279
327
|
# TODO: Print outdated SDK version warning (we need a new env var for this)
|
|
280
328
|
# https://github.com/apify/apify-sdk-python/issues/146
|
|
281
329
|
|
|
282
|
-
await self.
|
|
330
|
+
await self.event_manager.__aenter__()
|
|
283
331
|
self.log.debug('Event manager initialized')
|
|
284
332
|
|
|
285
|
-
await self.
|
|
333
|
+
await self._charging_manager_implementation.__aenter__()
|
|
286
334
|
self.log.debug('Charging manager initialized')
|
|
287
335
|
|
|
288
336
|
self._is_initialized = True
|
|
@@ -323,10 +371,10 @@ class _ActorType:
|
|
|
323
371
|
await asyncio.sleep(0.1)
|
|
324
372
|
|
|
325
373
|
if event_listeners_timeout:
|
|
326
|
-
await self.
|
|
374
|
+
await self.event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)
|
|
327
375
|
|
|
328
|
-
await self.
|
|
329
|
-
await self.
|
|
376
|
+
await self.event_manager.__aexit__(None, None, None)
|
|
377
|
+
await self._charging_manager_implementation.__aexit__(None, None, None)
|
|
330
378
|
|
|
331
379
|
await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
|
|
332
380
|
self._is_initialized = False
|
|
@@ -385,8 +433,8 @@ class _ActorType:
|
|
|
385
433
|
(increases exponentially from this value).
|
|
386
434
|
timeout: The socket timeout of the HTTP requests sent to the Apify API.
|
|
387
435
|
"""
|
|
388
|
-
token = token or self.
|
|
389
|
-
api_url = api_url or self.
|
|
436
|
+
token = token or self.configuration.token
|
|
437
|
+
api_url = api_url or self.configuration.api_base_url
|
|
390
438
|
return ApifyClientAsync(
|
|
391
439
|
token=token,
|
|
392
440
|
api_url=api_url,
|
|
@@ -427,13 +475,13 @@ class _ActorType:
|
|
|
427
475
|
self._raise_if_not_initialized()
|
|
428
476
|
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
|
|
429
477
|
|
|
430
|
-
storage_client = self._cloud_storage_client if force_cloud else self.
|
|
478
|
+
storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client()
|
|
431
479
|
|
|
432
480
|
return await Dataset.open(
|
|
433
481
|
id=id,
|
|
434
482
|
alias=alias,
|
|
435
483
|
name=name,
|
|
436
|
-
configuration=self.
|
|
484
|
+
configuration=self.configuration,
|
|
437
485
|
storage_client=storage_client,
|
|
438
486
|
)
|
|
439
487
|
|
|
@@ -465,13 +513,14 @@ class _ActorType:
|
|
|
465
513
|
"""
|
|
466
514
|
self._raise_if_not_initialized()
|
|
467
515
|
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
|
|
468
|
-
|
|
516
|
+
|
|
517
|
+
storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client()
|
|
469
518
|
|
|
470
519
|
return await KeyValueStore.open(
|
|
471
520
|
id=id,
|
|
472
521
|
alias=alias,
|
|
473
522
|
name=name,
|
|
474
|
-
configuration=self.
|
|
523
|
+
configuration=self.configuration,
|
|
475
524
|
storage_client=storage_client,
|
|
476
525
|
)
|
|
477
526
|
|
|
@@ -506,13 +555,13 @@ class _ActorType:
|
|
|
506
555
|
self._raise_if_not_initialized()
|
|
507
556
|
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
|
|
508
557
|
|
|
509
|
-
storage_client = self._cloud_storage_client if force_cloud else self.
|
|
558
|
+
storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client()
|
|
510
559
|
|
|
511
560
|
return await RequestQueue.open(
|
|
512
561
|
id=id,
|
|
513
562
|
alias=alias,
|
|
514
563
|
name=name,
|
|
515
|
-
configuration=self.
|
|
564
|
+
configuration=self.configuration,
|
|
516
565
|
storage_client=storage_client,
|
|
517
566
|
)
|
|
518
567
|
|
|
@@ -536,7 +585,7 @@ class _ActorType:
|
|
|
536
585
|
data = data if isinstance(data, list) else [data]
|
|
537
586
|
|
|
538
587
|
max_charged_count = (
|
|
539
|
-
self.
|
|
588
|
+
self.get_charging_manager().calculate_max_event_charge_count_within_limit(charged_event_name)
|
|
540
589
|
if charged_event_name is not None
|
|
541
590
|
else None
|
|
542
591
|
)
|
|
@@ -550,7 +599,7 @@ class _ActorType:
|
|
|
550
599
|
await dataset.push_data(data)
|
|
551
600
|
|
|
552
601
|
if charged_event_name:
|
|
553
|
-
return await self.
|
|
602
|
+
return await self.get_charging_manager().charge(
|
|
554
603
|
event_name=charged_event_name,
|
|
555
604
|
count=min(max_charged_count, len(data)) if max_charged_count is not None else len(data),
|
|
556
605
|
)
|
|
@@ -561,9 +610,9 @@ class _ActorType:
|
|
|
561
610
|
"""Get the Actor input value from the default key-value store associated with the current Actor run."""
|
|
562
611
|
self._raise_if_not_initialized()
|
|
563
612
|
|
|
564
|
-
input_value = await self.get_value(self.
|
|
565
|
-
input_secrets_private_key = self.
|
|
566
|
-
input_secrets_key_passphrase = self.
|
|
613
|
+
input_value = await self.get_value(self.configuration.input_key)
|
|
614
|
+
input_secrets_private_key = self.configuration.input_secrets_private_key_file
|
|
615
|
+
input_secrets_key_passphrase = self.configuration.input_secrets_private_key_passphrase
|
|
567
616
|
if input_secrets_private_key and input_secrets_key_passphrase:
|
|
568
617
|
private_key = load_private_key(
|
|
569
618
|
input_secrets_private_key,
|
|
@@ -607,7 +656,11 @@ class _ActorType:
|
|
|
607
656
|
def get_charging_manager(self) -> ChargingManager:
|
|
608
657
|
"""Retrieve the charging manager to access granular pricing information."""
|
|
609
658
|
self._raise_if_not_initialized()
|
|
610
|
-
return self.
|
|
659
|
+
return self._charging_manager_implementation
|
|
660
|
+
|
|
661
|
+
@cached_property
|
|
662
|
+
def _charging_manager_implementation(self) -> ChargingManagerImplementation:
|
|
663
|
+
return ChargingManagerImplementation(self.configuration, self.apify_client)
|
|
611
664
|
|
|
612
665
|
async def charge(self, event_name: str, count: int = 1) -> ChargeResult:
|
|
613
666
|
"""Charge for a specified number of events - sub-operations of the Actor.
|
|
@@ -619,7 +672,7 @@ class _ActorType:
|
|
|
619
672
|
count: Number of events to charge for.
|
|
620
673
|
"""
|
|
621
674
|
self._raise_if_not_initialized()
|
|
622
|
-
return await self.
|
|
675
|
+
return await self.get_charging_manager().charge(event_name, count)
|
|
623
676
|
|
|
624
677
|
@overload
|
|
625
678
|
def on(
|
|
@@ -670,7 +723,7 @@ class _ActorType:
|
|
|
670
723
|
"""
|
|
671
724
|
self._raise_if_not_initialized()
|
|
672
725
|
|
|
673
|
-
self.
|
|
726
|
+
self.event_manager.on(event=event_name, listener=listener)
|
|
674
727
|
return listener
|
|
675
728
|
|
|
676
729
|
@overload
|
|
@@ -696,11 +749,11 @@ class _ActorType:
|
|
|
696
749
|
"""
|
|
697
750
|
self._raise_if_not_initialized()
|
|
698
751
|
|
|
699
|
-
self.
|
|
752
|
+
self.event_manager.off(event=event_name, listener=listener)
|
|
700
753
|
|
|
701
754
|
def is_at_home(self) -> bool:
|
|
702
755
|
"""Return `True` when the Actor is running on the Apify platform, and `False` otherwise (e.g. local run)."""
|
|
703
|
-
return self.
|
|
756
|
+
return self.configuration.is_at_home
|
|
704
757
|
|
|
705
758
|
def get_env(self) -> dict:
|
|
706
759
|
"""Return a dictionary with information parsed from all the `APIFY_XXX` environment variables.
|
|
@@ -726,7 +779,7 @@ class _ActorType:
|
|
|
726
779
|
aliases = [field_name]
|
|
727
780
|
|
|
728
781
|
for alias in aliases:
|
|
729
|
-
config[alias] = getattr(self.
|
|
782
|
+
config[alias] = getattr(self.configuration, field_name)
|
|
730
783
|
|
|
731
784
|
env_vars = {env_var.value.lower(): env_var.name.lower() for env_var in [*ActorEnvVars, *ApifyEnvVars]}
|
|
732
785
|
return {option_name: config[env_var] for env_var, option_name in env_vars.items() if env_var in config}
|
|
@@ -771,7 +824,7 @@ class _ActorType:
|
|
|
771
824
|
"""
|
|
772
825
|
self._raise_if_not_initialized()
|
|
773
826
|
|
|
774
|
-
client = self.new_client(token=token) if token else self.
|
|
827
|
+
client = self.new_client(token=token) if token else self.apify_client
|
|
775
828
|
|
|
776
829
|
if webhooks:
|
|
777
830
|
serialized_webhooks = [
|
|
@@ -802,7 +855,7 @@ class _ActorType:
|
|
|
802
855
|
return ActorRun.model_validate(api_result)
|
|
803
856
|
|
|
804
857
|
def _get_remaining_time(self) -> timedelta | None:
|
|
805
|
-
"""Get time remaining from the
|
|
858
|
+
"""Get time remaining from the Actor timeout. Returns `None` if not on an Apify platform."""
|
|
806
859
|
if self.is_at_home() and self.configuration.timeout_at:
|
|
807
860
|
return self.configuration.timeout_at - datetime.now(tz=timezone.utc)
|
|
808
861
|
|
|
@@ -838,7 +891,7 @@ class _ActorType:
|
|
|
838
891
|
"""
|
|
839
892
|
self._raise_if_not_initialized()
|
|
840
893
|
|
|
841
|
-
client = self.new_client(token=token) if token else self.
|
|
894
|
+
client = self.new_client(token=token) if token else self.apify_client
|
|
842
895
|
|
|
843
896
|
if status_message:
|
|
844
897
|
await client.run(run_id).update(status_message=status_message)
|
|
@@ -891,7 +944,7 @@ class _ActorType:
|
|
|
891
944
|
"""
|
|
892
945
|
self._raise_if_not_initialized()
|
|
893
946
|
|
|
894
|
-
client = self.new_client(token=token) if token else self.
|
|
947
|
+
client = self.new_client(token=token) if token else self.apify_client
|
|
895
948
|
|
|
896
949
|
if webhooks:
|
|
897
950
|
serialized_webhooks = [
|
|
@@ -963,7 +1016,7 @@ class _ActorType:
|
|
|
963
1016
|
"""
|
|
964
1017
|
self._raise_if_not_initialized()
|
|
965
1018
|
|
|
966
|
-
client = self.new_client(token=token) if token else self.
|
|
1019
|
+
client = self.new_client(token=token) if token else self.apify_client
|
|
967
1020
|
|
|
968
1021
|
if webhooks:
|
|
969
1022
|
serialized_webhooks = [
|
|
@@ -1014,13 +1067,13 @@ class _ActorType:
|
|
|
1014
1067
|
return
|
|
1015
1068
|
|
|
1016
1069
|
if not custom_after_sleep:
|
|
1017
|
-
custom_after_sleep = self.
|
|
1070
|
+
custom_after_sleep = self.configuration.metamorph_after_sleep
|
|
1018
1071
|
|
|
1019
|
-
# If is_at_home() is True,
|
|
1020
|
-
if not self.
|
|
1072
|
+
# If is_at_home() is True, configuration.actor_run_id is always set
|
|
1073
|
+
if not self.configuration.actor_run_id:
|
|
1021
1074
|
raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
|
|
1022
1075
|
|
|
1023
|
-
await self.
|
|
1076
|
+
await self.apify_client.run(self.configuration.actor_run_id).metamorph(
|
|
1024
1077
|
target_actor_id=target_actor_id,
|
|
1025
1078
|
run_input=run_input,
|
|
1026
1079
|
target_actor_build=target_actor_build,
|
|
@@ -1057,7 +1110,7 @@ class _ActorType:
|
|
|
1057
1110
|
_ActorType._is_rebooting = True
|
|
1058
1111
|
|
|
1059
1112
|
if not custom_after_sleep:
|
|
1060
|
-
custom_after_sleep = self.
|
|
1113
|
+
custom_after_sleep = self.configuration.metamorph_after_sleep
|
|
1061
1114
|
|
|
1062
1115
|
# Call all the listeners for the PERSIST_STATE and MIGRATING events, and wait for them to finish.
|
|
1063
1116
|
# PERSIST_STATE listeners are called to allow the Actor to persist its state before the reboot.
|
|
@@ -1066,10 +1119,10 @@ class _ActorType:
|
|
|
1066
1119
|
# We can't just emit the events and wait for all listeners to finish,
|
|
1067
1120
|
# because this method might be called from an event listener itself, and we would deadlock.
|
|
1068
1121
|
persist_state_listeners = flatten(
|
|
1069
|
-
(self.
|
|
1122
|
+
(self.event_manager._listeners_to_wrappers[Event.PERSIST_STATE] or {}).values() # noqa: SLF001
|
|
1070
1123
|
)
|
|
1071
1124
|
migrating_listeners = flatten(
|
|
1072
|
-
(self.
|
|
1125
|
+
(self.event_manager._listeners_to_wrappers[Event.MIGRATING] or {}).values() # noqa: SLF001
|
|
1073
1126
|
)
|
|
1074
1127
|
|
|
1075
1128
|
await asyncio.gather(
|
|
@@ -1077,10 +1130,10 @@ class _ActorType:
|
|
|
1077
1130
|
*[listener(EventMigratingData()) for listener in migrating_listeners],
|
|
1078
1131
|
)
|
|
1079
1132
|
|
|
1080
|
-
if not self.
|
|
1133
|
+
if not self.configuration.actor_run_id:
|
|
1081
1134
|
raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
|
|
1082
1135
|
|
|
1083
|
-
await self.
|
|
1136
|
+
await self.apify_client.run(self.configuration.actor_run_id).reboot()
|
|
1084
1137
|
|
|
1085
1138
|
if custom_after_sleep:
|
|
1086
1139
|
await asyncio.sleep(custom_after_sleep.total_seconds())
|
|
@@ -1119,11 +1172,11 @@ class _ActorType:
|
|
|
1119
1172
|
return
|
|
1120
1173
|
|
|
1121
1174
|
# If is_at_home() is True, config.actor_run_id is always set
|
|
1122
|
-
if not self.
|
|
1175
|
+
if not self.configuration.actor_run_id:
|
|
1123
1176
|
raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
|
|
1124
1177
|
|
|
1125
|
-
await self.
|
|
1126
|
-
actor_run_id=self.
|
|
1178
|
+
await self.apify_client.webhooks().create(
|
|
1179
|
+
actor_run_id=self.configuration.actor_run_id,
|
|
1127
1180
|
event_types=webhook.event_types,
|
|
1128
1181
|
request_url=webhook.request_url,
|
|
1129
1182
|
payload_template=webhook.payload_template,
|
|
@@ -1155,10 +1208,10 @@ class _ActorType:
|
|
|
1155
1208
|
return None
|
|
1156
1209
|
|
|
1157
1210
|
# If is_at_home() is True, config.actor_run_id is always set
|
|
1158
|
-
if not self.
|
|
1211
|
+
if not self.configuration.actor_run_id:
|
|
1159
1212
|
raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
|
|
1160
1213
|
|
|
1161
|
-
api_result = await self.
|
|
1214
|
+
api_result = await self.apify_client.run(self.configuration.actor_run_id).update(
|
|
1162
1215
|
status_message=status_message, is_status_message_terminal=is_terminal
|
|
1163
1216
|
)
|
|
1164
1217
|
|
|
@@ -1168,7 +1221,7 @@ class _ActorType:
|
|
|
1168
1221
|
self,
|
|
1169
1222
|
*,
|
|
1170
1223
|
actor_proxy_input: dict
|
|
1171
|
-
| None = None, # this is the raw proxy input from the
|
|
1224
|
+
| None = None, # this is the raw proxy input from the Actor run input, it is not spread or snake_cased in here
|
|
1172
1225
|
password: str | None = None,
|
|
1173
1226
|
groups: list[str] | None = None,
|
|
1174
1227
|
country_code: str | None = None,
|
|
@@ -1213,7 +1266,7 @@ class _ActorType:
|
|
|
1213
1266
|
country_code=country_code,
|
|
1214
1267
|
proxy_urls=proxy_urls,
|
|
1215
1268
|
new_url_function=new_url_function,
|
|
1216
|
-
_actor_config=self.
|
|
1269
|
+
_actor_config=self.configuration,
|
|
1217
1270
|
_apify_client=self._apify_client,
|
|
1218
1271
|
)
|
|
1219
1272
|
|
apify/_configuration.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import Annotated, Any
|
|
|
8
8
|
from pydantic import AliasChoices, BeforeValidator, Field, model_validator
|
|
9
9
|
from typing_extensions import Self, deprecated
|
|
10
10
|
|
|
11
|
+
from crawlee import service_locator
|
|
11
12
|
from crawlee._utils.models import timedelta_ms
|
|
12
13
|
from crawlee._utils.urls import validate_http_url
|
|
13
14
|
from crawlee.configuration import Configuration as CrawleeConfiguration
|
|
@@ -424,11 +425,41 @@ class Configuration(CrawleeConfiguration):
|
|
|
424
425
|
def get_global_configuration(cls) -> Configuration:
|
|
425
426
|
"""Retrieve the global instance of the configuration.
|
|
426
427
|
|
|
427
|
-
|
|
428
|
-
|
|
428
|
+
This method ensures that ApifyConfiguration is returned, even if CrawleeConfiguration was set in the
|
|
429
|
+
service locator.
|
|
429
430
|
"""
|
|
430
|
-
|
|
431
|
+
global_configuration = service_locator.get_configuration()
|
|
431
432
|
|
|
433
|
+
if isinstance(global_configuration, Configuration):
|
|
434
|
+
# If Apify configuration was already stored in service locator, return it.
|
|
435
|
+
return global_configuration
|
|
432
436
|
|
|
433
|
-
|
|
434
|
-
|
|
437
|
+
logger.warning(
|
|
438
|
+
'Non Apify Configration is set in the `service_locator` in the SDK context. '
|
|
439
|
+
'It is recommended to set `apify.Configuration` explicitly as early as possible by using '
|
|
440
|
+
'service_locator.set_configuration'
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
return cls.from_configuration(global_configuration)
|
|
444
|
+
|
|
445
|
+
@classmethod
|
|
446
|
+
def from_configuration(cls, configuration: CrawleeConfiguration) -> Configuration:
|
|
447
|
+
"""Create Apify Configuration from existing Crawlee Configuration.
|
|
448
|
+
|
|
449
|
+
Args:
|
|
450
|
+
configuration: The existing Crawlee Configuration.
|
|
451
|
+
|
|
452
|
+
Returns:
|
|
453
|
+
The created Apify Configuration.
|
|
454
|
+
"""
|
|
455
|
+
apify_configuration = cls()
|
|
456
|
+
|
|
457
|
+
# Ensure the returned configuration is of type Apify Configuration.
|
|
458
|
+
# Most likely crawlee configuration was already set. Create Apify configuration from it.
|
|
459
|
+
# Due to known Pydantic issue https://github.com/pydantic/pydantic/issues/9516, creating new instance of
|
|
460
|
+
# Configuration from existing one in situation where environment can have some fields set by alias is very
|
|
461
|
+
# unpredictable. Use the stable workaround.
|
|
462
|
+
for name in configuration.model_fields:
|
|
463
|
+
setattr(apify_configuration, name, getattr(configuration, name))
|
|
464
|
+
|
|
465
|
+
return apify_configuration
|
|
@@ -11,8 +11,9 @@ from crawlee._utils.byte_size import ByteSize
|
|
|
11
11
|
from crawlee._utils.file import json_dumps
|
|
12
12
|
from crawlee.storage_clients._base import DatasetClient
|
|
13
13
|
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
|
|
14
|
+
from crawlee.storages import Dataset
|
|
14
15
|
|
|
15
|
-
from ._utils import
|
|
16
|
+
from ._utils import AliasResolver
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
18
19
|
from collections.abc import AsyncIterator
|
|
@@ -126,19 +127,19 @@ class ApifyDatasetClient(DatasetClient):
|
|
|
126
127
|
# Normalize 'default' alias to None
|
|
127
128
|
alias = None if alias == 'default' else alias
|
|
128
129
|
|
|
129
|
-
# Handle alias resolution
|
|
130
130
|
if alias:
|
|
131
|
-
#
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
# Create a new storage and store the
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
131
|
+
# Check if there is pre-existing alias mapping in the default KVS.
|
|
132
|
+
async with AliasResolver(storage_type=Dataset, alias=alias, configuration=configuration) as _alias:
|
|
133
|
+
id = await _alias.resolve_id()
|
|
134
|
+
|
|
135
|
+
# There was no pre-existing alias in the mapping.
|
|
136
|
+
# Create a new unnamed storage and store the mapping.
|
|
137
|
+
if id is None:
|
|
138
|
+
new_storage_metadata = DatasetMetadata.model_validate(
|
|
139
|
+
await apify_datasets_client.get_or_create(),
|
|
140
|
+
)
|
|
141
|
+
id = new_storage_metadata.id
|
|
142
|
+
await _alias.store_mapping(storage_id=id)
|
|
142
143
|
|
|
143
144
|
# If name is provided, get or create the storage by name.
|
|
144
145
|
elif name:
|
|
@@ -10,9 +10,10 @@ from yarl import URL
|
|
|
10
10
|
from apify_client import ApifyClientAsync
|
|
11
11
|
from crawlee.storage_clients._base import KeyValueStoreClient
|
|
12
12
|
from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
|
|
13
|
+
from crawlee.storages import KeyValueStore
|
|
13
14
|
|
|
14
15
|
from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage
|
|
15
|
-
from ._utils import
|
|
16
|
+
from ._utils import AliasResolver
|
|
16
17
|
from apify._crypto import create_hmac_signature
|
|
17
18
|
|
|
18
19
|
if TYPE_CHECKING:
|
|
@@ -117,19 +118,20 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
|
|
|
117
118
|
# Normalize 'default' alias to None
|
|
118
119
|
alias = None if alias == 'default' else alias
|
|
119
120
|
|
|
120
|
-
# Handle alias resolution
|
|
121
121
|
if alias:
|
|
122
|
-
#
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
# Create a new storage and store the
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
122
|
+
# Check if there is pre-existing alias mapping in the default KVS.
|
|
123
|
+
async with AliasResolver(storage_type=KeyValueStore, alias=alias, configuration=configuration) as _alias:
|
|
124
|
+
id = await _alias.resolve_id()
|
|
125
|
+
|
|
126
|
+
# There was no pre-existing alias in the mapping.
|
|
127
|
+
# Create a new unnamed storage and store the mapping.
|
|
128
|
+
if id is None:
|
|
129
|
+
# Create a new storage and store the alias mapping
|
|
130
|
+
new_storage_metadata = ApifyKeyValueStoreMetadata.model_validate(
|
|
131
|
+
await apify_kvss_client.get_or_create(),
|
|
132
|
+
)
|
|
133
|
+
id = new_storage_metadata.id
|
|
134
|
+
await _alias.store_mapping(storage_id=id)
|
|
133
135
|
|
|
134
136
|
# If name is provided, get or create the storage by name.
|
|
135
137
|
elif name:
|
|
@@ -16,9 +16,10 @@ from apify_client import ApifyClientAsync
|
|
|
16
16
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
17
17
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
18
18
|
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
19
|
+
from crawlee.storages import RequestQueue
|
|
19
20
|
|
|
20
21
|
from ._models import CachedRequest, ProlongRequestLockResponse, RequestQueueHead
|
|
21
|
-
from ._utils import
|
|
22
|
+
from ._utils import AliasResolver
|
|
22
23
|
from apify import Request
|
|
23
24
|
|
|
24
25
|
if TYPE_CHECKING:
|
|
@@ -195,19 +196,19 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
195
196
|
# Normalize 'default' alias to None
|
|
196
197
|
alias = None if alias == 'default' else alias
|
|
197
198
|
|
|
198
|
-
# Handle alias resolution
|
|
199
199
|
if alias:
|
|
200
|
-
#
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
# Create a new storage and store the
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
200
|
+
# Check if there is a pre-existing alias mapping in the default KVS.
|
|
201
|
+
async with AliasResolver(storage_type=RequestQueue, alias=alias, configuration=configuration) as _alias:
|
|
202
|
+
id = await _alias.resolve_id()
|
|
203
|
+
|
|
204
|
+
# There was no pre-existing alias in the mapping.
|
|
205
|
+
# Create a new unnamed storage and store the mapping.
|
|
206
|
+
if id is None:
|
|
207
|
+
new_storage_metadata = RequestQueueMetadata.model_validate(
|
|
208
|
+
await apify_rqs_client.get_or_create(),
|
|
209
|
+
)
|
|
210
|
+
id = new_storage_metadata.id
|
|
211
|
+
await _alias.store_mapping(storage_id=id)
|
|
211
212
|
|
|
212
213
|
# If name is provided, get or create the storage by name.
|
|
213
214
|
elif name:
|
|
@@ -9,16 +9,35 @@ from crawlee.storage_clients._base import StorageClient
|
|
|
9
9
|
from ._dataset_client import ApifyDatasetClient
|
|
10
10
|
from ._key_value_store_client import ApifyKeyValueStoreClient
|
|
11
11
|
from ._request_queue_client import ApifyRequestQueueClient
|
|
12
|
+
from ._utils import hash_api_base_url_and_token
|
|
13
|
+
from apify._configuration import Configuration as ApifyConfiguration
|
|
12
14
|
from apify._utils import docs_group
|
|
13
15
|
|
|
14
16
|
if TYPE_CHECKING:
|
|
15
|
-
from
|
|
17
|
+
from collections.abc import Hashable
|
|
18
|
+
|
|
19
|
+
from crawlee.configuration import Configuration as CrawleeConfiguration
|
|
16
20
|
|
|
17
21
|
|
|
18
22
|
@docs_group('Storage clients')
|
|
19
23
|
class ApifyStorageClient(StorageClient):
|
|
20
24
|
"""Apify storage client."""
|
|
21
25
|
|
|
26
|
+
# This class breaches the Liskov Substitution Principle: it requires a more specialized Configuration than its parent.
|
|
27
|
+
_lsp_violation_error_message_template = (
|
|
28
|
+
'Expected "configuration" to be an instance of "apify.Configuration", but got {} instead.'
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
@override
|
|
32
|
+
def get_additional_cache_key(self, configuration: CrawleeConfiguration) -> Hashable:
|
|
33
|
+
if isinstance(configuration, ApifyConfiguration):
|
|
34
|
+
return hash_api_base_url_and_token(configuration)
|
|
35
|
+
|
|
36
|
+
config_class = type(configuration)
|
|
37
|
+
raise TypeError(
|
|
38
|
+
self._lsp_violation_error_message_template.format(f'{config_class.__module__}.{config_class.__name__}')
|
|
39
|
+
)
|
|
40
|
+
|
|
22
41
|
@override
|
|
23
42
|
async def create_dataset_client(
|
|
24
43
|
self,
|
|
@@ -26,19 +45,13 @@ class ApifyStorageClient(StorageClient):
|
|
|
26
45
|
id: str | None = None,
|
|
27
46
|
name: str | None = None,
|
|
28
47
|
alias: str | None = None,
|
|
29
|
-
configuration:
|
|
48
|
+
configuration: CrawleeConfiguration | None = None,
|
|
30
49
|
) -> ApifyDatasetClient:
|
|
31
|
-
# Import here to avoid circular imports.
|
|
32
|
-
from apify import Configuration as ApifyConfiguration # noqa: PLC0415
|
|
33
|
-
|
|
34
50
|
configuration = configuration or ApifyConfiguration.get_global_configuration()
|
|
35
51
|
if isinstance(configuration, ApifyConfiguration):
|
|
36
52
|
return await ApifyDatasetClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
37
53
|
|
|
38
|
-
raise TypeError(
|
|
39
|
-
f'Expected "configuration" to be an instance of "apify.Configuration", '
|
|
40
|
-
f'but got {type(configuration).__name__} instead.'
|
|
41
|
-
)
|
|
54
|
+
raise TypeError(self._lsp_violation_error_message_template.format(type(configuration).__name__))
|
|
42
55
|
|
|
43
56
|
@override
|
|
44
57
|
async def create_kvs_client(
|
|
@@ -47,19 +60,13 @@ class ApifyStorageClient(StorageClient):
|
|
|
47
60
|
id: str | None = None,
|
|
48
61
|
name: str | None = None,
|
|
49
62
|
alias: str | None = None,
|
|
50
|
-
configuration:
|
|
63
|
+
configuration: CrawleeConfiguration | None = None,
|
|
51
64
|
) -> ApifyKeyValueStoreClient:
|
|
52
|
-
# Import here to avoid circular imports.
|
|
53
|
-
from apify import Configuration as ApifyConfiguration # noqa: PLC0415
|
|
54
|
-
|
|
55
65
|
configuration = configuration or ApifyConfiguration.get_global_configuration()
|
|
56
66
|
if isinstance(configuration, ApifyConfiguration):
|
|
57
67
|
return await ApifyKeyValueStoreClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
58
68
|
|
|
59
|
-
raise TypeError(
|
|
60
|
-
f'Expected "configuration" to be an instance of "apify.Configuration", '
|
|
61
|
-
f'but got {type(configuration).__name__} instead.'
|
|
62
|
-
)
|
|
69
|
+
raise TypeError(self._lsp_violation_error_message_template.format(type(configuration).__name__))
|
|
63
70
|
|
|
64
71
|
@override
|
|
65
72
|
async def create_rq_client(
|
|
@@ -68,16 +75,10 @@ class ApifyStorageClient(StorageClient):
|
|
|
68
75
|
id: str | None = None,
|
|
69
76
|
name: str | None = None,
|
|
70
77
|
alias: str | None = None,
|
|
71
|
-
configuration:
|
|
78
|
+
configuration: CrawleeConfiguration | None = None,
|
|
72
79
|
) -> ApifyRequestQueueClient:
|
|
73
|
-
# Import here to avoid circular imports.
|
|
74
|
-
from apify import Configuration as ApifyConfiguration # noqa: PLC0415
|
|
75
|
-
|
|
76
80
|
configuration = configuration or ApifyConfiguration.get_global_configuration()
|
|
77
81
|
if isinstance(configuration, ApifyConfiguration):
|
|
78
82
|
return await ApifyRequestQueueClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
79
83
|
|
|
80
|
-
raise TypeError(
|
|
81
|
-
f'Expected "configuration" to be an instance of "apify.Configuration", '
|
|
82
|
-
f'but got {type(configuration).__name__} instead.'
|
|
83
|
-
)
|
|
84
|
+
raise TypeError(self._lsp_violation_error_message_template.format(type(configuration).__name__))
|
|
@@ -1,117 +1,167 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
4
|
+
from asyncio import Lock
|
|
3
5
|
from logging import getLogger
|
|
4
|
-
from typing import TYPE_CHECKING,
|
|
6
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
5
7
|
|
|
6
8
|
from apify_client import ApifyClientAsync
|
|
9
|
+
from crawlee._utils.crypto import compute_short_hash
|
|
10
|
+
|
|
11
|
+
from apify._configuration import Configuration
|
|
7
12
|
|
|
8
13
|
if TYPE_CHECKING:
|
|
14
|
+
from types import TracebackType
|
|
15
|
+
|
|
9
16
|
from apify_client.clients import KeyValueStoreClientAsync
|
|
17
|
+
from crawlee.storages import Dataset, KeyValueStore, RequestQueue
|
|
10
18
|
|
|
11
|
-
from apify import Configuration
|
|
12
19
|
|
|
13
20
|
logger = getLogger(__name__)
|
|
14
21
|
|
|
15
|
-
_ALIAS_MAPPING_KEY = '__STORAGE_ALIASES_MAPPING'
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
async def resolve_alias_to_id(
|
|
19
|
-
alias: str,
|
|
20
|
-
storage_type: Literal['dataset', 'kvs', 'rq'],
|
|
21
|
-
configuration: Configuration,
|
|
22
|
-
) -> str | None:
|
|
23
|
-
"""Resolve a storage alias to its corresponding storage ID.
|
|
24
|
-
|
|
25
|
-
Args:
|
|
26
|
-
alias: The alias to resolve.
|
|
27
|
-
storage_type: Type of storage ('dataset', 'key_value_store', or 'request_queue').
|
|
28
|
-
configuration: The configuration object containing API credentials.
|
|
29
|
-
|
|
30
|
-
Returns:
|
|
31
|
-
The storage ID if found, None if the alias doesn't exist.
|
|
32
|
-
"""
|
|
33
|
-
default_kvs_client = await _get_default_kvs_client(configuration)
|
|
34
|
-
|
|
35
|
-
# Create the dictionary key for this alias.
|
|
36
|
-
alias_key = f'alias-{storage_type}-{alias}'
|
|
37
|
-
|
|
38
|
-
try:
|
|
39
|
-
record = await default_kvs_client.get_record(_ALIAS_MAPPING_KEY)
|
|
40
|
-
|
|
41
|
-
# get_record can return {key: ..., value: ..., content_type: ...}
|
|
42
|
-
if isinstance(record, dict) and 'value' in record:
|
|
43
|
-
record = record['value']
|
|
44
|
-
|
|
45
|
-
# Extract the actual data from the KVS record
|
|
46
|
-
if isinstance(record, dict) and alias_key in record:
|
|
47
|
-
storage_id = record[alias_key]
|
|
48
|
-
return str(storage_id)
|
|
49
22
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
logger.warning(f'Error accessing alias mapping for {alias}: {exc}')
|
|
23
|
+
class AliasResolver:
|
|
24
|
+
"""Class for handling aliases.
|
|
53
25
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
alias: str,
|
|
59
|
-
storage_type: Literal['dataset', 'kvs', 'rq'],
|
|
60
|
-
storage_id: str,
|
|
61
|
-
configuration: Configuration,
|
|
62
|
-
) -> None:
|
|
63
|
-
"""Store a mapping from alias to storage ID in the default key-value store.
|
|
64
|
-
|
|
65
|
-
Args:
|
|
66
|
-
alias: The alias to store.
|
|
67
|
-
storage_type: Type of storage ('dataset', 'key_value_store', or 'request_queue').
|
|
68
|
-
storage_id: The storage ID to map the alias to.
|
|
69
|
-
configuration: The configuration object containing API credentials.
|
|
26
|
+
The purpose of this class is to ensure that alias storages are created with the correct id. This is achieved by using
|
|
27
|
+
the default kvs as storage for a global mapping of aliases to storage ids. The same mapping is also kept in memory to avoid
|
|
28
|
+
unnecessary calls to the API and to provide limited support for alias storages when not running on the Apify platform. When on
|
|
29
|
+
the Apify platform, storages created with an alias are accessible by the same alias even after migration or reboot.
|
|
70
30
|
"""
|
|
71
|
-
default_kvs_client = await _get_default_kvs_client(configuration)
|
|
72
|
-
|
|
73
|
-
# Create the dictionary key for this alias.
|
|
74
|
-
alias_key = f'alias-{storage_type}-{alias}'
|
|
75
|
-
|
|
76
|
-
try:
|
|
77
|
-
record = await default_kvs_client.get_record(_ALIAS_MAPPING_KEY)
|
|
78
|
-
|
|
79
|
-
# get_record can return {key: ..., value: ..., content_type: ...}
|
|
80
|
-
if isinstance(record, dict) and 'value' in record:
|
|
81
|
-
record = record['value']
|
|
82
31
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
32
|
+
_alias_map: ClassVar[dict[str, str]] = {}
|
|
33
|
+
"""Map containing pre-existing alias storages and their ids. Global for all instances."""
|
|
34
|
+
_alias_init_lock: Lock | None = None
|
|
35
|
+
"""Lock for creating alias storages. Only one alias storage can be created at the time. Global for all instances."""
|
|
36
|
+
|
|
37
|
+
_ALIAS_STORAGE_KEY_SEPARATOR = ','
|
|
38
|
+
_ALIAS_MAPPING_KEY = '__STORAGE_ALIASES_MAPPING'
|
|
39
|
+
|
|
40
|
+
def __init__(
|
|
41
|
+
self, storage_type: type[Dataset | KeyValueStore | RequestQueue], alias: str, configuration: Configuration
|
|
42
|
+
) -> None:
|
|
43
|
+
self._storage_type = storage_type
|
|
44
|
+
self._alias = alias
|
|
45
|
+
self._additional_cache_key = hash_api_base_url_and_token(configuration)
|
|
46
|
+
|
|
47
|
+
async def __aenter__(self) -> AliasResolver:
|
|
48
|
+
"""Context manager to prevent race condition in alias creation."""
|
|
49
|
+
lock = await self._get_alias_init_lock()
|
|
50
|
+
await lock.acquire()
|
|
51
|
+
return self
|
|
52
|
+
|
|
53
|
+
async def __aexit__(
|
|
54
|
+
self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
|
|
55
|
+
) -> None:
|
|
56
|
+
lock = await self._get_alias_init_lock()
|
|
57
|
+
lock.release()
|
|
58
|
+
|
|
59
|
+
@classmethod
|
|
60
|
+
async def _get_alias_init_lock(cls) -> Lock:
|
|
61
|
+
"""Get lock for controlling the creation of the alias storages.
|
|
62
|
+
|
|
63
|
+
The lock is shared for all instances of the AliasResolver class.
|
|
64
|
+
It is created in an async method to ensure that some event loop is already running.
|
|
65
|
+
"""
|
|
66
|
+
if cls._alias_init_lock is None:
|
|
67
|
+
cls._alias_init_lock = Lock()
|
|
68
|
+
return cls._alias_init_lock
|
|
69
|
+
|
|
70
|
+
@classmethod
|
|
71
|
+
async def _get_alias_map(cls) -> dict[str, str]:
|
|
72
|
+
"""Get the aliases and storage ids mapping from the default kvs.
|
|
73
|
+
|
|
74
|
+
Mapping is loaded from kvs only once and is shared for all instances of the AliasResolver class.
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
Map of aliases and storage ids.
|
|
78
|
+
"""
|
|
79
|
+
if not cls._alias_map:
|
|
80
|
+
default_kvs_client = await _get_default_kvs_client()
|
|
81
|
+
|
|
82
|
+
record = await default_kvs_client.get_record(cls._ALIAS_MAPPING_KEY)
|
|
83
|
+
|
|
84
|
+
# get_record can return {key: ..., value: ..., content_type: ...}
|
|
85
|
+
if isinstance(record, dict):
|
|
86
|
+
if 'value' in record and isinstance(record['value'], dict):
|
|
87
|
+
cls._alias_map = record['value']
|
|
88
|
+
else:
|
|
89
|
+
cls._alias_map = record
|
|
90
|
+
else:
|
|
91
|
+
cls._alias_map = dict[str, str]()
|
|
92
|
+
|
|
93
|
+
return cls._alias_map
|
|
94
|
+
|
|
95
|
+
async def resolve_id(self) -> str | None:
|
|
96
|
+
"""Get id of the aliased storage.
|
|
97
|
+
|
|
98
|
+
Look up the id in the alias mapping; this method does not create a new storage.
|
|
99
|
+
|
|
100
|
+
Returns:
|
|
101
|
+
Storage id if it exists, None otherwise.
|
|
102
|
+
"""
|
|
103
|
+
return (await self._get_alias_map()).get(self._storage_key, None)
|
|
104
|
+
|
|
105
|
+
async def store_mapping(self, storage_id: str) -> None:
|
|
106
|
+
"""Add alias and related storage id to the mapping in default kvs and local in-memory mapping."""
|
|
107
|
+
# Update in-memory mapping
|
|
108
|
+
(await self._get_alias_map())[self._storage_key] = storage_id
|
|
109
|
+
if not Configuration.get_global_configuration().is_at_home:
|
|
110
|
+
logging.getLogger(__name__).warning(
|
|
111
|
+
'AliasResolver storage limited retention is only supported on Apify platform. Storage is not exported.'
|
|
112
|
+
)
|
|
113
|
+
return
|
|
114
|
+
|
|
115
|
+
default_kvs_client = await _get_default_kvs_client()
|
|
116
|
+
await default_kvs_client.get()
|
|
117
|
+
|
|
118
|
+
try:
|
|
119
|
+
record = await default_kvs_client.get_record(self._ALIAS_MAPPING_KEY)
|
|
120
|
+
|
|
121
|
+
# get_record can return {key: ..., value: ..., content_type: ...}
|
|
122
|
+
if isinstance(record, dict) and 'value' in record:
|
|
123
|
+
record = record['value']
|
|
124
|
+
|
|
125
|
+
# Update or create the record with the new alias mapping
|
|
126
|
+
if isinstance(record, dict):
|
|
127
|
+
record[self._storage_key] = storage_id
|
|
128
|
+
else:
|
|
129
|
+
record = {self._storage_key: storage_id}
|
|
130
|
+
|
|
131
|
+
# Store the mapping back in the KVS.
|
|
132
|
+
await default_kvs_client.set_record(self._ALIAS_MAPPING_KEY, record)
|
|
133
|
+
except Exception as exc:
|
|
134
|
+
logger.warning(f'Error storing alias mapping for {self._alias}: {exc}')
|
|
135
|
+
|
|
136
|
+
@property
|
|
137
|
+
def _storage_key(self) -> str:
|
|
138
|
+
"""Get a unique storage key used for storing the alias in the mapping."""
|
|
139
|
+
return self._ALIAS_STORAGE_KEY_SEPARATOR.join(
|
|
140
|
+
[
|
|
141
|
+
self._storage_type.__name__,
|
|
142
|
+
self._alias,
|
|
143
|
+
self._additional_cache_key,
|
|
144
|
+
]
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
async def _get_default_kvs_client() -> KeyValueStoreClientAsync:
|
|
96
149
|
"""Get a client for the default key-value store."""
|
|
97
|
-
|
|
98
|
-
if not token:
|
|
99
|
-
raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
|
|
150
|
+
configuration = Configuration.get_global_configuration()
|
|
100
151
|
|
|
101
|
-
api_url = configuration.api_base_url
|
|
102
|
-
if not api_url:
|
|
103
|
-
raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
|
|
104
|
-
|
|
105
|
-
# Create Apify client with the provided token and API URL
|
|
106
152
|
apify_client_async = ApifyClientAsync(
|
|
107
|
-
token=token,
|
|
108
|
-
api_url=
|
|
153
|
+
token=configuration.token,
|
|
154
|
+
api_url=configuration.api_base_url,
|
|
109
155
|
max_retries=8,
|
|
110
156
|
min_delay_between_retries_millis=500,
|
|
111
157
|
timeout_secs=360,
|
|
112
158
|
)
|
|
113
159
|
|
|
114
|
-
|
|
115
|
-
|
|
160
|
+
return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id)
|
|
161
|
+
|
|
116
162
|
|
|
117
|
-
|
|
163
|
+
def hash_api_base_url_and_token(configuration: Configuration) -> str:
|
|
164
|
+
"""Hash configuration.api_public_base_url and configuration.token in deterministic way."""
|
|
165
|
+
if configuration.api_public_base_url is None or configuration.token is None:
|
|
166
|
+
raise ValueError("'Configuration.api_public_base_url' and 'Configuration.token' must be set.")
|
|
167
|
+
return compute_short_hash(f'{configuration.api_public_base_url}{configuration.token}'.encode())
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: apify
|
|
3
|
-
Version: 2.7.
|
|
3
|
+
Version: 2.7.1b18
|
|
4
4
|
Summary: Apify SDK for Python
|
|
5
5
|
Project-URL: Apify Homepage, https://apify.com
|
|
6
6
|
Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
|
|
@@ -228,7 +228,7 @@ Requires-Python: >=3.10
|
|
|
228
228
|
Requires-Dist: apify-client<3.0.0,>=2.0.0
|
|
229
229
|
Requires-Dist: apify-shared<3.0.0,>=2.0.0
|
|
230
230
|
Requires-Dist: cachetools>=5.5.0
|
|
231
|
-
Requires-Dist: crawlee==0.6.
|
|
231
|
+
Requires-Dist: crawlee==0.6.13b42
|
|
232
232
|
Requires-Dist: cryptography>=42.0.0
|
|
233
233
|
Requires-Dist: impit>=0.6.1
|
|
234
234
|
Requires-Dist: lazy-object-proxy>=1.11.0
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
apify/__init__.py,sha256=HpgKg2FZWJuSPfDygzJ62psylhw4NN4tKFnoYUIhcd4,838
|
|
2
|
-
apify/_actor.py,sha256=
|
|
2
|
+
apify/_actor.py,sha256=2Y_e7t2zlkZbuW7rQME7WdC4kjsoZBoCs4KCuowjbQE,56108
|
|
3
3
|
apify/_charging.py,sha256=KjZ2DnEMS0Tt8ibizmmt0RwBq8FOAsD1z-hKFgdazcY,13143
|
|
4
|
-
apify/_configuration.py,sha256=
|
|
4
|
+
apify/_configuration.py,sha256=DOKRjDGE2qU7LVPa5VZJzLShKGlkr7_207UOzfCAk_U,14676
|
|
5
5
|
apify/_consts.py,sha256=CjhyEJ4Mi0lcIrzfqz8dN7nPJWGjCeBrrXQy1PZ6zRI,440
|
|
6
6
|
apify/_crypto.py,sha256=tqUs13QkemDtGzvU41pIA2HUEawpDlgzqbwKjm4I8kM,6852
|
|
7
7
|
apify/_models.py,sha256=EzU-inWeJ7T5HNVYEwnYb79W-q4OAPhtrYctfRYzpTE,7848
|
|
@@ -35,19 +35,19 @@ apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
|
|
|
35
35
|
apify/storage_clients/__init__.py,sha256=9WLAKs2GnnP0yyKR0mc3AfJ1IqXF48V3KPMp6KaB8kU,277
|
|
36
36
|
apify/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
37
|
apify/storage_clients/_apify/__init__.py,sha256=mtbVDsxqWL3kx30elnh0kAn2kZ4s3BBsWa15Y5e7RMU,347
|
|
38
|
-
apify/storage_clients/_apify/_dataset_client.py,sha256=
|
|
39
|
-
apify/storage_clients/_apify/_key_value_store_client.py,sha256=
|
|
38
|
+
apify/storage_clients/_apify/_dataset_client.py,sha256=aKQsoIDLbuP6R8wo2DX6pzp-EHV5EWZlHWB8s8OLuG4,12096
|
|
39
|
+
apify/storage_clients/_apify/_key_value_store_client.py,sha256=EWcWE6HiXBGklCAilj4My3T95py7Nu6Zt3e_XzpySUM,10099
|
|
40
40
|
apify/storage_clients/_apify/_models.py,sha256=C6FpXswtO6kXE5RUumazm_conzJJS6PrXAGF9XBuDb8,3651
|
|
41
|
-
apify/storage_clients/_apify/_request_queue_client.py,sha256=
|
|
42
|
-
apify/storage_clients/_apify/_storage_client.py,sha256=
|
|
43
|
-
apify/storage_clients/_apify/_utils.py,sha256=
|
|
41
|
+
apify/storage_clients/_apify/_request_queue_client.py,sha256=4m7Evp0d_f-1rjatmyYIwnYYG3PGtNCBjw5TFiU0H0w,32640
|
|
42
|
+
apify/storage_clients/_apify/_storage_client.py,sha256=7oqn8-7zG7_cruw6jzmRl2htX2rOt-KPTzCRVNCcyTA,3304
|
|
43
|
+
apify/storage_clients/_apify/_utils.py,sha256=1g0oGqHmB-AeNfeMiB33om2bRZ_7wwhpijPXdWRPNsM,6609
|
|
44
44
|
apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
|
|
46
46
|
apify/storage_clients/_file_system/_key_value_store_client.py,sha256=fnSJ1EIOPCGfcE6e5S3Tux9VbnMVLCJjugkaQoH_9yo,2267
|
|
47
47
|
apify/storage_clients/_file_system/_storage_client.py,sha256=rcwpKYlrWzvlSA2xoxftg-EZAi_iGZ3vOCbu0C5lKDE,1396
|
|
48
48
|
apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
|
|
49
49
|
apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
|
-
apify-2.7.
|
|
51
|
-
apify-2.7.
|
|
52
|
-
apify-2.7.
|
|
53
|
-
apify-2.7.
|
|
50
|
+
apify-2.7.1b18.dist-info/METADATA,sha256=JJjECttKjzSG_t9zHcmc5CBlfwV2u3RuknNEOhXOEVY,22580
|
|
51
|
+
apify-2.7.1b18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
52
|
+
apify-2.7.1b18.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
53
|
+
apify-2.7.1b18.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|