apify 3.0.0rc1__py3-none-any.whl → 3.0.1b1__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries and is provided for informational purposes only.
Potentially problematic release: this version of apify might be problematic.
- apify/_actor.py +150 -117
- apify/_charging.py +19 -0
- apify/_configuration.py +42 -11
- apify/events/__init__.py +2 -2
- apify/storage_clients/__init__.py +2 -0
- apify/storage_clients/_apify/_dataset_client.py +47 -23
- apify/storage_clients/_apify/_key_value_store_client.py +46 -22
- apify/storage_clients/_apify/_models.py +25 -1
- apify/storage_clients/_apify/_request_queue_client.py +188 -648
- apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
- apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
- apify/storage_clients/_apify/_storage_client.py +55 -29
- apify/storage_clients/_apify/_utils.py +194 -0
- apify/storage_clients/_file_system/_key_value_store_client.py +22 -1
- apify/storage_clients/_file_system/_storage_client.py +7 -1
- apify/storage_clients/_smart_apify/__init__.py +1 -0
- apify/storage_clients/_smart_apify/_storage_client.py +117 -0
- {apify-3.0.0rc1.dist-info → apify-3.0.1b1.dist-info}/METADATA +20 -5
- {apify-3.0.0rc1.dist-info → apify-3.0.1b1.dist-info}/RECORD +21 -16
- {apify-3.0.0rc1.dist-info → apify-3.0.1b1.dist-info}/WHEEL +0 -0
- {apify-3.0.0rc1.dist-info → apify-3.0.1b1.dist-info}/licenses/LICENSE +0 -0
apify/_configuration.py
CHANGED
@@ -8,6 +8,7 @@ from typing import Annotated, Any
 from pydantic import AliasChoices, BeforeValidator, Field, model_validator
 from typing_extensions import Self, deprecated
 
+from crawlee import service_locator
 from crawlee._utils.models import timedelta_ms
 from crawlee._utils.urls import validate_http_url
 from crawlee.configuration import Configuration as CrawleeConfiguration
@@ -141,7 +142,7 @@ class Configuration(CrawleeConfiguration):
     ] = None
 
     default_dataset_id: Annotated[
-        str,
+        str | None,
         Field(
             validation_alias=AliasChoices(
                 'actor_default_dataset_id',
@@ -149,10 +150,10 @@ class Configuration(CrawleeConfiguration):
             ),
             description='Default dataset ID used by the Apify storage client when no ID or name is provided.',
         ),
-    ] =
+    ] = None
 
     default_key_value_store_id: Annotated[
-        str,
+        str | None,
         Field(
             validation_alias=AliasChoices(
                 'actor_default_key_value_store_id',
@@ -160,10 +161,10 @@ class Configuration(CrawleeConfiguration):
             ),
             description='Default key-value store ID for the Apify storage client when no ID or name is provided.',
         ),
-    ] =
+    ] = None
 
     default_request_queue_id: Annotated[
-        str,
+        str | None,
         Field(
             validation_alias=AliasChoices(
                 'actor_default_request_queue_id',
@@ -171,7 +172,7 @@ class Configuration(CrawleeConfiguration):
             ),
             description='Default request queue ID for the Apify storage client when no ID or name is provided.',
         ),
-    ] =
+    ] = None
 
     disable_outdated_warning: Annotated[
         bool,
@@ -424,11 +425,41 @@ class Configuration(CrawleeConfiguration):
     def get_global_configuration(cls) -> Configuration:
         """Retrieve the global instance of the configuration.
 
-
-
+        This method ensures that ApifyConfigration is returned, even if CrawleeConfiguration was set in the
+        service locator.
+        """
+        global_configuration = service_locator.get_configuration()
+
+        if isinstance(global_configuration, Configuration):
+            # If Apify configuration was already stored in service locator, return it.
+            return global_configuration
+
+        logger.warning(
+            'Non Apify Configration is set in the `service_locator` in the SDK context. '
+            'It is recommended to set `apify.Configuration` explicitly as early as possible by using '
+            'service_locator.set_configuration'
+        )
+
+        return cls.from_configuration(global_configuration)
+
+    @classmethod
+    def from_configuration(cls, configuration: CrawleeConfiguration) -> Configuration:
+        """Create Apify Configuration from existing Crawlee Configuration.
+
+        Args:
+            configuration: The existing Crawlee Configuration.
+
+        Returns:
+            The created Apify Configuration.
         """
-
+        apify_configuration = cls()
 
+        # Ensure the returned configuration is of type Apify Configuration.
+        # Most likely crawlee configuration was already set. Create Apify configuration from it.
+        # Due to known Pydantic issue https://github.com/pydantic/pydantic/issues/9516, creating new instance of
+        # Configuration from existing one in situation where environment can have some fields set by alias is very
+        # unpredictable. Use the stable workaround.
+        for name in configuration.model_fields:
+            setattr(apify_configuration, name, getattr(configuration, name))
 
-
-CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration  # type: ignore[method-assign]
+        return apify_configuration
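Taken together, the new get_global_configuration/from_configuration logic means an Apify Configuration is always returned even when a plain Crawlee configuration was registered first. A minimal usage sketch of the conversion (not part of the diff; it assumes `Configuration` remains importable from the top-level `apify` package):

from crawlee.configuration import Configuration as CrawleeConfiguration

from apify import Configuration

# Start from a plain Crawlee configuration, e.g. one already stored in the service locator.
crawlee_config = CrawleeConfiguration()

# Copy its field values into a new Apify Configuration (the field-by-field workaround used in this diff).
apify_config = Configuration.from_configuration(crawlee_config)
assert isinstance(apify_config, Configuration)
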
apify/events/__init__.py
CHANGED
@@ -1,5 +1,5 @@
-from crawlee.events import EventManager, LocalEventManager
+from crawlee.events import Event, EventManager, LocalEventManager
 
 from ._apify_event_manager import ApifyEventManager
 
-__all__ = ['ApifyEventManager', 'EventManager', 'LocalEventManager']
+__all__ = ['ApifyEventManager', 'Event', 'EventManager', 'LocalEventManager']
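With this re-export, Actor code can import `Event` from `apify.events` instead of reaching into `crawlee.events`. A hedged sketch (the `Actor.on` registration mentioned in the comment is the SDK's existing listener API, not part of this diff):

from apify.events import Event  # re-exported as of this version


# Hypothetical listener that could be registered for state persistence events,
# e.g. via `Actor.on(Event.PERSIST_STATE, on_persist_state)` inside an Actor run.
async def on_persist_state(event_data: object) -> None:
    print('persisting state', event_data)


print(Event.PERSIST_STATE)
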
apify/storage_clients/__init__.py
CHANGED

@@ -2,9 +2,11 @@ from crawlee.storage_clients import MemoryStorageClient
 
 from ._apify import ApifyStorageClient
 from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
+from ._smart_apify import SmartApifyStorageClient
 
 __all__ = [
     'ApifyStorageClient',
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'SmartApifyStorageClient',
 ]
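A small import sketch confirming the new export sits alongside the existing clients; the constructor of `SmartApifyStorageClient` is defined in `_smart_apify/_storage_client.py`, which is not shown in this section, so instantiation is intentionally omitted:

from apify.storage_clients import (
    ApifyStorageClient,
    FileSystemStorageClient,
    MemoryStorageClient,
    SmartApifyStorageClient,
)

# Only the presence of the new export is demonstrated here.
print(SmartApifyStorageClient.__name__)
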
apify/storage_clients/_apify/_dataset_client.py
CHANGED

@@ -11,6 +11,9 @@ from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.file import json_dumps
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+from crawlee.storages import Dataset
+
+from ._utils import AliasResolver
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -66,6 +69,7 @@ class ApifyDatasetClient(DatasetClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
     ) -> ApifyDatasetClient:
         """Open an Apify dataset client.
@@ -74,22 +78,27 @@ class ApifyDatasetClient(DatasetClient):
         It handles authentication, storage lookup/creation, and metadata retrieval.
 
         Args:
-            id: The ID of
-
-            name: The name of
-
+            id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
+                Mutually exclusive with name and alias.
+            name: The name of the dataset to open (global scope, persists across runs).
+                Mutually exclusive with id and alias.
+            alias: The alias of the dataset to open (run scope, creates unnamed storage).
+                Mutually exclusive with id and name.
             configuration: The configuration object containing API credentials and settings. Must include a valid
                 `token` and `api_base_url`. May also contain a `default_dataset_id` for fallback when neither
-                `id` nor `
+                `id`, `name`, nor `alias` is provided.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If the configuration is missing required fields (token, api_base_url), if
-
-                the configuration.
+            ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
+                `id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
+                in the configuration.
         """
+        if sum(1 for param in [id, name, alias] if param is not None) > 1:
+            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+
         token = configuration.token
         if not token:
             raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
@@ -115,27 +124,42 @@ class ApifyDatasetClient(DatasetClient):
         )
         apify_datasets_client = apify_client_async.datasets()
 
-        #
-
-
-
-
-        if
-
+        # Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed
+        # storage aliased as `__default__`
+        if not any([alias, name, id, configuration.default_dataset_id]):
+            alias = '__default__'
+
+        if alias:
+            # Check if there is pre-existing alias mapping in the default KVS.
+            async with AliasResolver(storage_type=Dataset, alias=alias, configuration=configuration) as _alias:
+                id = await _alias.resolve_id()
+
+                # There was no pre-existing alias in the mapping.
+                # Create a new unnamed storage and store the mapping.
+                if id is None:
+                    new_storage_metadata = DatasetMetadata.model_validate(
+                        await apify_datasets_client.get_or_create(),
+                    )
+                    id = new_storage_metadata.id
+                    await _alias.store_mapping(storage_id=id)
 
         # If name is provided, get or create the storage by name.
-
+        elif name:
             id = DatasetMetadata.model_validate(
                 await apify_datasets_client.get_or_create(name=name),
             ).id
-            apify_dataset_client = apify_client_async.dataset(dataset_id=id)
 
-        # If
-
-        # a new storage ID after Actor's reboot or migration.
-        if id is None and name is None:
+        # If none are provided, try to get the default storage ID from environment variables.
+        elif id is None:
            id = configuration.default_dataset_id
-
+            if not id:
+                raise ValueError(
+                    'Dataset "id", "name", or "alias" must be specified, '
+                    'or a default dataset ID must be set in the configuration.'
+                )
+
+        # Now create the client for the determined ID
+        apify_dataset_client = apify_client_async.dataset(dataset_id=id)
 
         # Fetch its metadata.
         metadata = await apify_dataset_client.get()
@@ -150,7 +174,7 @@ class ApifyDatasetClient(DatasetClient):
         # Verify that the storage exists by fetching its metadata again.
         metadata = await apify_dataset_client.get()
         if metadata is None:
-            raise ValueError(f'Opening dataset with id={id}
+            raise ValueError(f'Opening dataset with id={id}, name={name}, and alias={alias} failed.')
 
         return cls(
             api_client=apify_dataset_client,
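The opening logic now resolves exactly one of `id`, `name`, or `alias`, falling back to the configured default or the reserved `__default__` alias. A standalone sketch of the same validation rule, written as a hypothetical helper rather than the client's actual method:

def validate_selector(id: str | None, name: str | None, alias: str | None) -> None:
    # Mirrors the check added in this diff: at most one selector may be given.
    if sum(1 for param in [id, name, alias] if param is not None) > 1:
        raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')


validate_selector(id=None, name='my-dataset', alias=None)  # passes
# validate_selector(id='abc123', name='my-dataset', alias=None)  # would raise ValueError
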
apify/storage_clients/_apify/_key_value_store_client.py
CHANGED

@@ -10,8 +10,10 @@ from yarl import URL
 from apify_client import ApifyClientAsync
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
+from crawlee.storages import KeyValueStore
 
 from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage
+from ._utils import AliasResolver
 from apify._crypto import create_hmac_signature
 
 if TYPE_CHECKING:
@@ -58,6 +60,7 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
     ) -> ApifyKeyValueStoreClient:
         """Open an Apify key-value store client.
@@ -66,22 +69,27 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         It handles authentication, storage lookup/creation, and metadata retrieval.
 
         Args:
-            id: The ID of
-
-            name: The name of
-
+            id: The ID of the KVS to open. If provided, searches for existing KVS by ID.
+                Mutually exclusive with name and alias.
+            name: The name of the KVS to open (global scope, persists across runs).
+                Mutually exclusive with id and alias.
+            alias: The alias of the KVS to open (run scope, creates unnamed storage).
+                Mutually exclusive with id and name.
             configuration: The configuration object containing API credentials and settings. Must include a valid
                 `token` and `api_base_url`. May also contain a `default_key_value_store_id` for fallback when
-                neither `id` nor `
+                neither `id`, `name`, nor `alias` is provided.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If the configuration is missing required fields (token, api_base_url), if
-
+            ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
+                `id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
                 in the configuration.
         """
+        if sum(1 for param in [id, name, alias] if param is not None) > 1:
+            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+
         token = configuration.token
         if not token:
             raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
@@ -107,27 +115,43 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         )
         apify_kvss_client = apify_client_async.key_value_stores()
 
-        #
-
-
-
-
-        if
-
+        # Normalize unnamed default storage in cases where not defined in `configuration.default_key_value_store_id` to
+        # unnamed storage aliased as `__default__`
+        if not any([alias, name, id, configuration.default_key_value_store_id]):
+            alias = '__default__'
+
+        if alias:
+            # Check if there is pre-existing alias mapping in the default KVS.
+            async with AliasResolver(storage_type=KeyValueStore, alias=alias, configuration=configuration) as _alias:
+                id = await _alias.resolve_id()
+
+                # There was no pre-existing alias in the mapping.
+                # Create a new unnamed storage and store the mapping.
+                if id is None:
+                    # Create a new storage and store the alias mapping
+                    new_storage_metadata = ApifyKeyValueStoreMetadata.model_validate(
+                        await apify_kvss_client.get_or_create(),
+                    )
+                    id = new_storage_metadata.id
+                    await _alias.store_mapping(storage_id=id)
 
         # If name is provided, get or create the storage by name.
-
+        elif name:
             id = ApifyKeyValueStoreMetadata.model_validate(
                 await apify_kvss_client.get_or_create(name=name),
             ).id
-            apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
 
-        # If
-
-        # a new storage ID after Actor's reboot or migration.
-        if id is None and name is None:
+        # If none are provided, try to get the default storage ID from environment variables.
+        elif id is None:
             id = configuration.default_key_value_store_id
-
+            if not id:
+                raise ValueError(
+                    'KeyValueStore "id", "name", or "alias" must be specified, '
+                    'or a default KeyValueStore ID must be set in the configuration.'
+                )
+
+        # Now create the client for the determined ID
+        apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
 
         # Fetch its metadata.
         metadata = await apify_kvs_client.get()
@@ -142,7 +166,7 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         # Verify that the storage exists by fetching its metadata again.
         metadata = await apify_kvs_client.get()
         if metadata is None:
-            raise ValueError(f'Opening key-value store with id={id}
+            raise ValueError(f'Opening key-value store with id={id}, name={name}, and alias={alias} failed.')
 
         return cls(
             api_client=apify_kvs_client,
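The key-value store client follows the same resolution order as the dataset client; the shared piece worth noting is the normalization of the default storage onto the reserved `__default__` alias. A hypothetical, self-contained restatement of that fallback:

def resolve_alias(
    id: str | None,
    name: str | None,
    alias: str | None,
    default_id: str | None,
) -> str | None:
    # When nothing identifies the storage, fall back to the reserved alias,
    # as both clients in this diff do before consulting the AliasResolver.
    if not any([alias, name, id, default_id]):
        return '__default__'
    return alias


print(resolve_alias(None, None, None, None))        # '__default__'
print(resolve_alias(None, None, None, 'store-id'))  # None -> the configured default ID is used instead
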
apify/storage_clients/_apify/_models.py
CHANGED

@@ -5,7 +5,7 @@ from typing import Annotated
 
 from pydantic import BaseModel, ConfigDict, Field
 
-from crawlee.storage_clients.models import KeyValueStoreMetadata
+from crawlee.storage_clients.models import KeyValueStoreMetadata, RequestQueueMetadata
 
 from apify import Request
 from apify._utils import docs_group
@@ -105,3 +105,27 @@ class CachedRequest(BaseModel):
 
     lock_expires_at: datetime | None = None
     """The expiration time of the lock on the request."""
+
+
+class RequestQueueStats(BaseModel):
+    model_config = ConfigDict(populate_by_name=True)
+
+    delete_count: Annotated[int, Field(alias='deleteCount', default=0)]
+    """"The number of request queue deletes."""
+
+    head_item_read_count: Annotated[int, Field(alias='headItemReadCount', default=0)]
+    """The number of request queue head reads."""
+
+    read_count: Annotated[int, Field(alias='readCount', default=0)]
+    """The number of request queue reads."""
+
+    storage_bytes: Annotated[int, Field(alias='storageBytes', default=0)]
+    """Storage size in Bytes."""
+
+    write_count: Annotated[int, Field(alias='writeCount', default=0)]
+    """The number of request queue writes."""
+
+
+class ApifyRequestQueueMetadata(RequestQueueMetadata):
+    stats: Annotated[RequestQueueStats, Field(alias='stats', default_factory=RequestQueueStats)]
+    """Additional statistics about the request queue."""