apify 3.0.0rc1__py3-none-any.whl → 3.0.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

apify/_configuration.py CHANGED
@@ -8,6 +8,7 @@ from typing import Annotated, Any
8
8
  from pydantic import AliasChoices, BeforeValidator, Field, model_validator
9
9
  from typing_extensions import Self, deprecated
10
10
 
11
+ from crawlee import service_locator
11
12
  from crawlee._utils.models import timedelta_ms
12
13
  from crawlee._utils.urls import validate_http_url
13
14
  from crawlee.configuration import Configuration as CrawleeConfiguration
@@ -141,7 +142,7 @@ class Configuration(CrawleeConfiguration):
141
142
  ] = None
142
143
 
143
144
  default_dataset_id: Annotated[
144
- str,
145
+ str | None,
145
146
  Field(
146
147
  validation_alias=AliasChoices(
147
148
  'actor_default_dataset_id',
@@ -149,10 +150,10 @@ class Configuration(CrawleeConfiguration):
149
150
  ),
150
151
  description='Default dataset ID used by the Apify storage client when no ID or name is provided.',
151
152
  ),
152
- ] = 'default'
153
+ ] = None
153
154
 
154
155
  default_key_value_store_id: Annotated[
155
- str,
156
+ str | None,
156
157
  Field(
157
158
  validation_alias=AliasChoices(
158
159
  'actor_default_key_value_store_id',
@@ -160,10 +161,10 @@ class Configuration(CrawleeConfiguration):
160
161
  ),
161
162
  description='Default key-value store ID for the Apify storage client when no ID or name is provided.',
162
163
  ),
163
- ] = 'default'
164
+ ] = None
164
165
 
165
166
  default_request_queue_id: Annotated[
166
- str,
167
+ str | None,
167
168
  Field(
168
169
  validation_alias=AliasChoices(
169
170
  'actor_default_request_queue_id',
@@ -171,7 +172,7 @@ class Configuration(CrawleeConfiguration):
171
172
  ),
172
173
  description='Default request queue ID for the Apify storage client when no ID or name is provided.',
173
174
  ),
174
- ] = 'default'
175
+ ] = None
175
176
 
176
177
  disable_outdated_warning: Annotated[
177
178
  bool,
@@ -424,11 +425,41 @@ class Configuration(CrawleeConfiguration):
424
425
  def get_global_configuration(cls) -> Configuration:
425
426
  """Retrieve the global instance of the configuration.
426
427
 
427
- Mostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`
428
- instead.
428
+ This method ensures that the Apify Configuration is returned, even if CrawleeConfiguration was set in the
429
+ service locator.
430
+ """
431
+ global_configuration = service_locator.get_configuration()
432
+
433
+ if isinstance(global_configuration, Configuration):
434
+ # If Apify configuration was already stored in service locator, return it.
435
+ return global_configuration
436
+
437
+ logger.warning(
438
+ 'Non Apify Configration is set in the `service_locator` in the SDK context. '
439
+ 'It is recommended to set `apify.Configuration` explicitly as early as possible by using '
440
+ 'service_locator.set_configuration'
441
+ )
442
+
443
+ return cls.from_configuration(global_configuration)
444
+
445
+ @classmethod
446
+ def from_configuration(cls, configuration: CrawleeConfiguration) -> Configuration:
447
+ """Create Apify Configuration from existing Crawlee Configuration.
448
+
449
+ Args:
450
+ configuration: The existing Crawlee Configuration.
451
+
452
+ Returns:
453
+ The created Apify Configuration.
429
454
  """
430
- return cls()
455
+ apify_configuration = cls()
431
456
 
457
+ # Ensure the returned configuration is of type Apify Configuration.
458
+ # Most likely crawlee configuration was already set. Create Apify configuration from it.
459
+ # Due to known Pydantic issue https://github.com/pydantic/pydantic/issues/9516, creating new instance of
460
+ # Configuration from existing one in situation where environment can have some fields set by alias is very
461
+ # unpredictable. Use the stable workaround.
462
+ for name in configuration.model_fields:
463
+ setattr(apify_configuration, name, getattr(configuration, name))
432
464
 
433
- # Monkey-patch the base class so that it works with the extended configuration
434
- CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]
465
+ return apify_configuration
apify/events/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
- from crawlee.events import EventManager, LocalEventManager
1
+ from crawlee.events import Event, EventManager, LocalEventManager
2
2
 
3
3
  from ._apify_event_manager import ApifyEventManager
4
4
 
5
- __all__ = ['ApifyEventManager', 'EventManager', 'LocalEventManager']
5
+ __all__ = ['ApifyEventManager', 'Event', 'EventManager', 'LocalEventManager']
@@ -2,9 +2,11 @@ from crawlee.storage_clients import MemoryStorageClient
2
2
 
3
3
  from ._apify import ApifyStorageClient
4
4
  from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
5
+ from ._smart_apify import SmartApifyStorageClient
5
6
 
6
7
  __all__ = [
7
8
  'ApifyStorageClient',
8
9
  'FileSystemStorageClient',
9
10
  'MemoryStorageClient',
11
+ 'SmartApifyStorageClient',
10
12
  ]
@@ -11,6 +11,9 @@ from crawlee._utils.byte_size import ByteSize
11
11
  from crawlee._utils.file import json_dumps
12
12
  from crawlee.storage_clients._base import DatasetClient
13
13
  from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
14
+ from crawlee.storages import Dataset
15
+
16
+ from ._utils import AliasResolver
14
17
 
15
18
  if TYPE_CHECKING:
16
19
  from collections.abc import AsyncIterator
@@ -66,6 +69,7 @@ class ApifyDatasetClient(DatasetClient):
66
69
  *,
67
70
  id: str | None,
68
71
  name: str | None,
72
+ alias: str | None,
69
73
  configuration: Configuration,
70
74
  ) -> ApifyDatasetClient:
71
75
  """Open an Apify dataset client.
@@ -74,22 +78,27 @@ class ApifyDatasetClient(DatasetClient):
74
78
  It handles authentication, storage lookup/creation, and metadata retrieval.
75
79
 
76
80
  Args:
77
- id: The ID of an existing dataset to open. If provided, the client will connect to this specific storage.
78
- Cannot be used together with `name`.
79
- name: The name of a dataset to get or create. If a storage with this name exists, it will be opened;
80
- otherwise, a new one will be created. Cannot be used together with `id`.
81
+ id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
82
+ Mutually exclusive with name and alias.
83
+ name: The name of the dataset to open (global scope, persists across runs).
84
+ Mutually exclusive with id and alias.
85
+ alias: The alias of the dataset to open (run scope, creates unnamed storage).
86
+ Mutually exclusive with id and name.
81
87
  configuration: The configuration object containing API credentials and settings. Must include a valid
82
88
  `token` and `api_base_url`. May also contain a `default_dataset_id` for fallback when neither
83
- `id` nor `name` is provided.
89
+ `id`, `name`, nor `alias` is provided.
84
90
 
85
91
  Returns:
86
92
  An instance for the opened or created storage client.
87
93
 
88
94
  Raises:
89
- ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name`
90
- are provided, or if neither `id` nor `name` is provided and no default storage ID is available in
91
- the configuration.
95
+ ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
96
+ `id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
97
+ in the configuration.
92
98
  """
99
+ if sum(1 for param in [id, name, alias] if param is not None) > 1:
100
+ raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
101
+
93
102
  token = configuration.token
94
103
  if not token:
95
104
  raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
@@ -115,27 +124,42 @@ class ApifyDatasetClient(DatasetClient):
115
124
  )
116
125
  apify_datasets_client = apify_client_async.datasets()
117
126
 
118
- # If both id and name are provided, raise an error.
119
- if id and name:
120
- raise ValueError('Only one of "id" or "name" can be specified, not both.')
121
-
122
- # If id is provided, get the storage by ID.
123
- if id and name is None:
124
- apify_dataset_client = apify_client_async.dataset(dataset_id=id)
127
+ # If no `id`, `name`, or `alias` is given and `configuration.default_dataset_id` is not set, fall back to the
128
+ # unnamed storage aliased as `__default__`.
129
+ if not any([alias, name, id, configuration.default_dataset_id]):
130
+ alias = '__default__'
131
+
132
+ if alias:
133
+ # Check if there is pre-existing alias mapping in the default KVS.
134
+ async with AliasResolver(storage_type=Dataset, alias=alias, configuration=configuration) as _alias:
135
+ id = await _alias.resolve_id()
136
+
137
+ # There was no pre-existing alias in the mapping.
138
+ # Create a new unnamed storage and store the mapping.
139
+ if id is None:
140
+ new_storage_metadata = DatasetMetadata.model_validate(
141
+ await apify_datasets_client.get_or_create(),
142
+ )
143
+ id = new_storage_metadata.id
144
+ await _alias.store_mapping(storage_id=id)
125
145
 
126
146
  # If name is provided, get or create the storage by name.
127
- if name and id is None:
147
+ elif name:
128
148
  id = DatasetMetadata.model_validate(
129
149
  await apify_datasets_client.get_or_create(name=name),
130
150
  ).id
131
- apify_dataset_client = apify_client_async.dataset(dataset_id=id)
132
151
 
133
- # If both id and name are None, try to get the default storage ID from environment variables.
134
- # The default storage ID environment variable is set by the Apify platform. It also contains
135
- # a new storage ID after Actor's reboot or migration.
136
- if id is None and name is None:
152
+ # If none are provided, try to get the default storage ID from environment variables.
153
+ elif id is None:
137
154
  id = configuration.default_dataset_id
138
- apify_dataset_client = apify_client_async.dataset(dataset_id=id)
155
+ if not id:
156
+ raise ValueError(
157
+ 'Dataset "id", "name", or "alias" must be specified, '
158
+ 'or a default dataset ID must be set in the configuration.'
159
+ )
160
+
161
+ # Now create the client for the determined ID
162
+ apify_dataset_client = apify_client_async.dataset(dataset_id=id)
139
163
 
140
164
  # Fetch its metadata.
141
165
  metadata = await apify_dataset_client.get()
@@ -150,7 +174,7 @@ class ApifyDatasetClient(DatasetClient):
150
174
  # Verify that the storage exists by fetching its metadata again.
151
175
  metadata = await apify_dataset_client.get()
152
176
  if metadata is None:
153
- raise ValueError(f'Opening dataset with id={id} and name={name} failed.')
177
+ raise ValueError(f'Opening dataset with id={id}, name={name}, and alias={alias} failed.')
154
178
 
155
179
  return cls(
156
180
  api_client=apify_dataset_client,
@@ -10,8 +10,10 @@ from yarl import URL
10
10
  from apify_client import ApifyClientAsync
11
11
  from crawlee.storage_clients._base import KeyValueStoreClient
12
12
  from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
13
+ from crawlee.storages import KeyValueStore
13
14
 
14
15
  from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage
16
+ from ._utils import AliasResolver
15
17
  from apify._crypto import create_hmac_signature
16
18
 
17
19
  if TYPE_CHECKING:
@@ -58,6 +60,7 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
58
60
  *,
59
61
  id: str | None,
60
62
  name: str | None,
63
+ alias: str | None,
61
64
  configuration: Configuration,
62
65
  ) -> ApifyKeyValueStoreClient:
63
66
  """Open an Apify key-value store client.
@@ -66,22 +69,27 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
66
69
  It handles authentication, storage lookup/creation, and metadata retrieval.
67
70
 
68
71
  Args:
69
- id: The ID of an existing key-value store to open. If provided, the client will connect to this specific
70
- storage. Cannot be used together with `name`.
71
- name: The name of a key-value store to get or create. If a storage with this name exists, it will be
72
- opened; otherwise, a new one will be created. Cannot be used together with `id`.
72
+ id: The ID of the KVS to open. If provided, searches for existing KVS by ID.
73
+ Mutually exclusive with name and alias.
74
+ name: The name of the KVS to open (global scope, persists across runs).
75
+ Mutually exclusive with id and alias.
76
+ alias: The alias of the KVS to open (run scope, creates unnamed storage).
77
+ Mutually exclusive with id and name.
73
78
  configuration: The configuration object containing API credentials and settings. Must include a valid
74
79
  `token` and `api_base_url`. May also contain a `default_key_value_store_id` for fallback when
75
- neither `id` nor `name` is provided.
80
+ neither `id`, `name`, nor `alias` is provided.
76
81
 
77
82
  Returns:
78
83
  An instance for the opened or created storage client.
79
84
 
80
85
  Raises:
81
- ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name`
82
- are provided, or if neither `id` nor `name` is provided and no default storage ID is available
86
+ ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
87
+ `id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
83
88
  in the configuration.
84
89
  """
90
+ if sum(1 for param in [id, name, alias] if param is not None) > 1:
91
+ raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
92
+
85
93
  token = configuration.token
86
94
  if not token:
87
95
  raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
@@ -107,27 +115,43 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
107
115
  )
108
116
  apify_kvss_client = apify_client_async.key_value_stores()
109
117
 
110
- # If both id and name are provided, raise an error.
111
- if id and name:
112
- raise ValueError('Only one of "id" or "name" can be specified, not both.')
113
-
114
- # If id is provided, get the storage by ID.
115
- if id and name is None:
116
- apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
118
+ # If no `id`, `name`, or `alias` is given and `configuration.default_key_value_store_id` is not set, fall back
119
+ # to the unnamed storage aliased as `__default__`.
120
+ if not any([alias, name, id, configuration.default_key_value_store_id]):
121
+ alias = '__default__'
122
+
123
+ if alias:
124
+ # Check if there is pre-existing alias mapping in the default KVS.
125
+ async with AliasResolver(storage_type=KeyValueStore, alias=alias, configuration=configuration) as _alias:
126
+ id = await _alias.resolve_id()
127
+
128
+ # There was no pre-existing alias in the mapping.
129
+ # Create a new unnamed storage and store the mapping.
130
+ if id is None:
131
+ # Create a new storage and store the alias mapping
132
+ new_storage_metadata = ApifyKeyValueStoreMetadata.model_validate(
133
+ await apify_kvss_client.get_or_create(),
134
+ )
135
+ id = new_storage_metadata.id
136
+ await _alias.store_mapping(storage_id=id)
117
137
 
118
138
  # If name is provided, get or create the storage by name.
119
- if name and id is None:
139
+ elif name:
120
140
  id = ApifyKeyValueStoreMetadata.model_validate(
121
141
  await apify_kvss_client.get_or_create(name=name),
122
142
  ).id
123
- apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
124
143
 
125
- # If both id and name are None, try to get the default storage ID from environment variables.
126
- # The default storage ID environment variable is set by the Apify platform. It also contains
127
- # a new storage ID after Actor's reboot or migration.
128
- if id is None and name is None:
144
+ # If none are provided, try to get the default storage ID from environment variables.
145
+ elif id is None:
129
146
  id = configuration.default_key_value_store_id
130
- apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
147
+ if not id:
148
+ raise ValueError(
149
+ 'KeyValueStore "id", "name", or "alias" must be specified, '
150
+ 'or a default KeyValueStore ID must be set in the configuration.'
151
+ )
152
+
153
+ # Now create the client for the determined ID
154
+ apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
131
155
 
132
156
  # Fetch its metadata.
133
157
  metadata = await apify_kvs_client.get()
@@ -142,7 +166,7 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
142
166
  # Verify that the storage exists by fetching its metadata again.
143
167
  metadata = await apify_kvs_client.get()
144
168
  if metadata is None:
145
- raise ValueError(f'Opening key-value store with id={id} and name={name} failed.')
169
+ raise ValueError(f'Opening key-value store with id={id}, name={name}, and alias={alias} failed.')
146
170
 
147
171
  return cls(
148
172
  api_client=apify_kvs_client,
@@ -5,7 +5,7 @@ from typing import Annotated
5
5
 
6
6
  from pydantic import BaseModel, ConfigDict, Field
7
7
 
8
- from crawlee.storage_clients.models import KeyValueStoreMetadata
8
+ from crawlee.storage_clients.models import KeyValueStoreMetadata, RequestQueueMetadata
9
9
 
10
10
  from apify import Request
11
11
  from apify._utils import docs_group
@@ -105,3 +105,27 @@ class CachedRequest(BaseModel):
105
105
 
106
106
  lock_expires_at: datetime | None = None
107
107
  """The expiration time of the lock on the request."""
108
+
109
+
110
+ class RequestQueueStats(BaseModel):
111
+ model_config = ConfigDict(populate_by_name=True)
112
+
113
+ delete_count: Annotated[int, Field(alias='deleteCount', default=0)]
114
+ """The number of request queue deletes."""
115
+
116
+ head_item_read_count: Annotated[int, Field(alias='headItemReadCount', default=0)]
117
+ """The number of request queue head reads."""
118
+
119
+ read_count: Annotated[int, Field(alias='readCount', default=0)]
120
+ """The number of request queue reads."""
121
+
122
+ storage_bytes: Annotated[int, Field(alias='storageBytes', default=0)]
123
+ """Storage size in Bytes."""
124
+
125
+ write_count: Annotated[int, Field(alias='writeCount', default=0)]
126
+ """The number of request queue writes."""
127
+
128
+
129
+ class ApifyRequestQueueMetadata(RequestQueueMetadata):
130
+ stats: Annotated[RequestQueueStats, Field(alias='stats', default_factory=RequestQueueStats)]
131
+ """Additional statistics about the request queue."""