apify 2.7.3__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

Files changed (51)
  1. apify/_actor.py +194 -126
  2. apify/_charging.py +34 -9
  3. apify/_configuration.py +70 -6
  4. apify/_crypto.py +0 -6
  5. apify/_models.py +7 -7
  6. apify/_proxy_configuration.py +10 -10
  7. apify/_utils.py +25 -2
  8. apify/events/__init__.py +5 -0
  9. apify/events/_apify_event_manager.py +140 -0
  10. apify/events/_types.py +102 -0
  11. apify/log.py +0 -9
  12. apify/request_loaders/__init__.py +18 -0
  13. apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
  14. apify/request_loaders/py.typed +0 -0
  15. apify/scrapy/_logging_config.py +1 -4
  16. apify/scrapy/extensions/_httpcache.py +9 -5
  17. apify/scrapy/requests.py +3 -3
  18. apify/scrapy/scheduler.py +8 -5
  19. apify/storage_clients/__init__.py +12 -0
  20. apify/storage_clients/_apify/__init__.py +11 -0
  21. apify/storage_clients/_apify/_dataset_client.py +328 -0
  22. apify/storage_clients/_apify/_key_value_store_client.py +265 -0
  23. apify/storage_clients/_apify/_models.py +131 -0
  24. apify/storage_clients/_apify/_request_queue_client.py +327 -0
  25. apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
  26. apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
  27. apify/storage_clients/_apify/_storage_client.py +106 -0
  28. apify/storage_clients/_apify/_utils.py +194 -0
  29. apify/storage_clients/_apify/py.typed +0 -0
  30. apify/storage_clients/_file_system/__init__.py +2 -0
  31. apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
  32. apify/storage_clients/_file_system/_storage_client.py +41 -0
  33. apify/storage_clients/_smart_apify/__init__.py +1 -0
  34. apify/storage_clients/_smart_apify/_storage_client.py +117 -0
  35. apify/storage_clients/py.typed +0 -0
  36. apify/storages/__init__.py +1 -3
  37. {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
  38. apify-3.0.0.dist-info/RECORD +57 -0
  39. apify/_platform_event_manager.py +0 -231
  40. apify/apify_storage_client/__init__.py +0 -3
  41. apify/apify_storage_client/_apify_storage_client.py +0 -72
  42. apify/apify_storage_client/_dataset_client.py +0 -190
  43. apify/apify_storage_client/_dataset_collection_client.py +0 -51
  44. apify/apify_storage_client/_key_value_store_client.py +0 -109
  45. apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
  46. apify/apify_storage_client/_request_queue_client.py +0 -176
  47. apify/apify_storage_client/_request_queue_collection_client.py +0 -51
  48. apify-2.7.3.dist-info/RECORD +0 -44
  49. apify/{apify_storage_client → events}/py.typed +0 -0
  50. {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
  51. {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,265 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from logging import getLogger
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from typing_extensions import override
8
+ from yarl import URL
9
+
10
+ from apify_client import ApifyClientAsync
11
+ from crawlee.storage_clients._base import KeyValueStoreClient
12
+ from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
13
+ from crawlee.storages import KeyValueStore
14
+
15
+ from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage
16
+ from ._utils import AliasResolver
17
+ from apify._crypto import create_hmac_signature
18
+
19
+ if TYPE_CHECKING:
20
+ from collections.abc import AsyncIterator
21
+
22
+ from apify_client.clients import KeyValueStoreClientAsync
23
+
24
+ from apify import Configuration
25
+
26
+ logger = getLogger(__name__)
27
+
28
+
29
+ class ApifyKeyValueStoreClient(KeyValueStoreClient):
30
+ """An Apify platform implementation of the key-value store client."""
31
+
32
def __init__(
    self,
    *,
    api_client: KeyValueStoreClientAsync,
    api_public_base_url: str,
    lock: asyncio.Lock,
) -> None:
    """Store the collaborators this client works with.

    Instances are normally obtained through the `ApifyKeyValueStoreClient.open` class method rather than by
    calling this constructor directly.

    Args:
        api_client: The Apify KVS client for API operations.
        api_public_base_url: The public base URL for accessing the key-value store records.
        lock: A lock to ensure that only one operation is performed at a time.
    """
    # Lock guarding operations so that only one is performed at a time.
    self._lock = lock

    # Public base URL under which the store's records are reachable.
    self._api_public_base_url = api_public_base_url

    # Apify KVS client that performs the actual API operations.
    self._api_client = api_client
51
+
52
@override
async def get_metadata(self) -> ApifyKeyValueStoreMetadata:
    """Fetch the store's metadata through the API client and validate it into the Apify metadata model."""
    raw_metadata = await self._api_client.get()
    validated = ApifyKeyValueStoreMetadata.model_validate(raw_metadata)
    return validated
56
+
57
@classmethod
async def open(
    cls,
    *,
    id: str | None,
    name: str | None,
    alias: str | None,
    configuration: Configuration,
) -> ApifyKeyValueStoreClient:
    """Resolve (or create) a key-value store on the Apify platform and return a client bound to it.

    The store is selected by exactly one of `id`, `name` or `alias`. When none is given, the default store ID
    from the configuration is used; if the configuration has none either, the unnamed storage aliased as
    `__default__` is used.

    Args:
        id: ID of the KVS to open. Mutually exclusive with `name` and `alias`.
        name: Name of the KVS to open; the store is fetched or created by that name. Mutually exclusive with
            `id` and `alias`.
        alias: Alias of the KVS to open. An existing alias mapping is reused; otherwise a new unnamed store is
            created and the mapping is stored. Mutually exclusive with `id` and `name`.
        configuration: Source of `token`, `api_base_url`, `api_public_base_url` and, optionally,
            `default_key_value_store_id`.

    Returns:
        A client instance for the opened or created store.

    Raises:
        ValueError: If more than one of `id`, `name`, `alias` is given, if the configuration lacks a token,
            an API URL or a public API base URL, if no store can be determined, or if the store's metadata
            still cannot be fetched after the creation fallback.
    """
    provided = [param for param in (id, name, alias) if param is not None]
    if len(provided) > 1:
        raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

    # Validate the configuration values one by one, in the same order the error messages are documented.
    token = configuration.token
    if not token:
        raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')

    api_url = configuration.api_base_url
    if not api_url:
        raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')

    api_public_base_url = configuration.api_public_base_url
    if not api_public_base_url:
        raise ValueError(
            'Apify storage client requires a valid API public base URL in Configuration '
            f'(api_public_base_url={api_public_base_url}).'
        )

    # One root API client; the collection client creates stores, the per-store client is derived later.
    root_client = ApifyClientAsync(
        token=token,
        api_url=api_url,
        max_retries=8,
        min_delay_between_retries_millis=500,
        timeout_secs=360,
    )
    stores_client = root_client.key_value_stores()

    # Nothing requested and no default ID configured: fall back to the unnamed storage aliased `__default__`.
    nothing_requested = not any((alias, name, id, configuration.default_key_value_store_id))
    if nothing_requested:
        alias = '__default__'

    if alias:
        # Look for an existing alias mapping first; create and record a new unnamed store if there is none.
        async with AliasResolver(storage_type=KeyValueStore, alias=alias, configuration=configuration) as resolver:
            id = await resolver.resolve_id()

            if id is None:
                created_raw = await stores_client.get_or_create()
                created_metadata = ApifyKeyValueStoreMetadata.model_validate(created_raw)
                id = created_metadata.id
                await resolver.store_mapping(storage_id=id)

    elif name:
        # Named store: fetch it or create it by name.
        named_raw = await stores_client.get_or_create(name=name)
        id = ApifyKeyValueStoreMetadata.model_validate(named_raw).id

    elif id is None:
        # No identifier at all: use the default store ID from the configuration.
        id = configuration.default_key_value_store_id
        if not id:
            raise ValueError(
                'KeyValueStore "id", "name", or "alias" must be specified, '
                'or a default KeyValueStore ID must be set in the configuration.'
            )

    # Bind a client to the determined ID and probe the store's metadata.
    store_client = root_client.key_value_store(key_value_store_id=id)
    metadata = await store_client.get()

    # No metadata means the store does not exist, so a new one is created and the client is re-bound.
    if metadata is None:
        fallback_raw = await stores_client.get_or_create()
        id = ApifyKeyValueStoreMetadata.model_validate(fallback_raw).id
        store_client = root_client.key_value_store(key_value_store_id=id)

    # Verify that the store exists by fetching its metadata again.
    metadata = await store_client.get()
    if metadata is None:
        raise ValueError(f'Opening key-value store with id={id}, name={name}, and alias={alias} failed.')

    return cls(
        api_client=store_client,
        api_public_base_url=api_public_base_url,
        lock=asyncio.Lock(),
    )
176
+
177
@override
async def purge(self) -> None:
    """Always raise: purging is not offered by this client.

    Raises:
        NotImplementedError: On every call; the message points callers to `drop` instead.
    """
    message = (
        'Purging key-value stores is not supported in the Apify platform. '
        'Use the `drop` method to delete the key-value store instead.'
    )
    raise NotImplementedError(message)
183
+
184
@override
async def drop(self) -> None:
    """Delete the key-value store through the API client while holding the client's lock."""
    lock = self._lock
    async with lock:
        await self._api_client.delete()
188
+
189
@override
async def get_value(self, key: str) -> KeyValueStoreRecord | None:
    """Fetch the record stored under `key`.

    Args:
        key: Key of the record to fetch.

    Returns:
        The validated record, or `None` when the API client returns a falsy response.
    """
    raw_record = await self._api_client.get_record(key)
    if not raw_record:
        return None
    return KeyValueStoreRecord.model_validate(raw_record)
193
+
194
@override
async def set_value(self, key: str, value: Any, content_type: str | None = None) -> None:
    """Write a record through the API client while holding the client's lock.

    Args:
        key: Key under which the record is stored.
        value: Value to store; forwarded unchanged to the API client.
        content_type: Content type forwarded to the API client; `None` is passed through as is.
    """
    record_arguments = {'key': key, 'value': value, 'content_type': content_type}
    async with self._lock:
        await self._api_client.set_record(**record_arguments)
202
+
203
@override
async def delete_value(self, key: str) -> None:
    """Delete the record stored under `key` through the API client while holding the client's lock.

    Args:
        key: Key of the record to delete.
    """
    lock = self._lock
    async with lock:
        await self._api_client.delete_record(key=key)
207
+
208
@override
async def iterate_keys(
    self,
    *,
    exclusive_start_key: str | None = None,
    limit: int | None = None,
) -> AsyncIterator[KeyValueStoreRecordMetadata]:
    """Iterate over metadata of the keys stored in the key-value store.

    Keys are fetched page by page through the API client's `list_keys`. The next page is requested with the
    previous page's `next_exclusive_start_key` for as long as the page reports `is_truncated`.

    Args:
        exclusive_start_key: Start key forwarded to `list_keys` for the first page.
        limit: Maximum number of keys to yield. `None` means no limit; zero or a negative number yields
            nothing.

    Yields:
        Record metadata carrying the key and size of each listed key. The content type is always reported as
        `application/octet-stream` because `list_keys` does not provide it.
    """
    # An explicit non-positive limit asks for no keys at all; only `None` means "unlimited".
    if limit is not None and limit <= 0:
        return

    yielded = 0

    while True:
        response = await self._api_client.list_keys(exclusive_start_key=exclusive_start_key)
        page = KeyValueStoreListKeysPage.model_validate(response)

        for item in page.items:
            # Convert KeyValueStoreKeyInfo to KeyValueStoreRecordMetadata.
            yield KeyValueStoreRecordMetadata(
                key=item.key,
                size=item.size,
                content_type='application/octet-stream',  # Content type not available from list_keys
            )
            yielded += 1

            # Stop as soon as the requested number of keys has been produced.
            if limit is not None and yielded >= limit:
                return

        # No more pages to fetch.
        if not page.is_truncated:
            return

        # A truncated page without a continuation key cannot be followed; continuing with `None` would
        # restart the listing from the first page and never terminate.
        next_start_key = page.next_exclusive_start_key
        if next_start_key is None:
            return

        exclusive_start_key = next_start_key
240
+
241
@override
async def record_exists(self, key: str) -> bool:
    """Ask the API client whether a record is stored under `key`.

    Args:
        key: Key to check.

    Returns:
        The result reported by the API client's `record_exists`.
    """
    exists = await self._api_client.record_exists(key=key)
    return exists
244
+
245
async def get_public_url(self, key: str) -> str:
    """Build a URL under the public API base URL that addresses the record stored under `key`.

    When the store's metadata carries a URL signing secret key, a `signature` query parameter computed by
    `create_hmac_signature` from that secret and the key is attached to the URL.

    Args:
        key: The key for which the URL should be generated.

    Returns:
        The URL of the record, as a string.

    Raises:
        ValueError: If the underlying API client has no resource ID.
    """
    store_id = self._api_client.resource_id
    if store_id is None:
        raise ValueError('resource_id cannot be None when generating a public URL')

    # The URL is assembled before the metadata request, so an invalid path fails without a network call.
    record_url = URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / store_id / 'records' / key

    store_metadata = await self.get_metadata()
    signing_secret = store_metadata.url_signing_secret_key
    if signing_secret is None:
        return str(record_url)

    signature = create_hmac_signature(signing_secret, key)
    return str(record_url.with_query(signature=signature))
@@ -0,0 +1,131 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timedelta
4
+ from typing import Annotated
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
+ from crawlee.storage_clients.models import KeyValueStoreMetadata, RequestQueueMetadata
9
+
10
+ from apify import Request
11
+ from apify._utils import docs_group
12
+
13
+
14
@docs_group('Storage data')
class ApifyKeyValueStoreMetadata(KeyValueStoreMetadata):
    """Key-value store metadata model for the Apify platform.

    Extends the base key-value store metadata with additional Apify-specific fields.
    """

    url_signing_secret_key: Annotated[str | None, Field(alias='urlSigningSecretKey')] = None
    """Secret key used for signing URLs that give access to the store's records; `None` when not provided."""
23
+
24
+
25
@docs_group('Storage data')
class ProlongRequestLockResponse(BaseModel):
    """Response to prolong request lock calls.

    Carries a single value: the expiration timestamp of the lock.
    """

    # Let the field be populated by its Python name in addition to its alias.
    model_config = ConfigDict(populate_by_name=True)

    # Expiration time of the lock, read from the `lockExpiresAt` key of the validated data.
    lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')]
32
+
33
+
34
@docs_group('Storage data')
class RequestQueueHead(BaseModel):
    """Model of the head of a request queue.

    Holds the requests retrieved from the beginning of a queue together with metadata about the queue's state
    and lock information for those requests.
    """

    model_config = ConfigDict(populate_by_name=True)

    limit: Annotated[int | None, Field(alias='limit')] = None
    """Maximum number of requests that were requested from the queue."""

    had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] = False
    """Whether the queue has been accessed by multiple clients (consumers)."""

    queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')]
    """Timestamp of the last modification of the queue."""

    lock_time: Annotated[timedelta | None, Field(alias='lockSecs')] = None
    """Duration for which the returned requests are locked and cannot be processed by other clients."""

    queue_has_locked_requests: Annotated[bool | None, Field(alias='queueHasLockedRequests')] = False
    """Whether the queue contains any locked requests."""

    items: Annotated[list[Request], Field(alias='items', default_factory=list[Request])]
    """Request objects retrieved from the beginning of the queue."""
61
+
62
+
63
class KeyValueStoreKeyInfo(BaseModel):
    """Model for a key-value store key info.

    Describes one listed key by its name and size. Only internal structure.
    """

    # Let the fields be populated by their Python names in addition to their aliases.
    model_config = ConfigDict(populate_by_name=True)

    # The key itself.
    key: Annotated[str, Field(alias='key')]
    # Size reported for the record stored under the key (presumably in bytes; confirm against the API).
    size: Annotated[int, Field(alias='size')]
73
+
74
+
75
class KeyValueStoreListKeysPage(BaseModel):
    """Model of one page returned when listing keys of a key-value store.

    Only internal structure.
    """

    model_config = ConfigDict(populate_by_name=True)

    # Required values of the page.
    count: Annotated[int, Field(alias='count')]
    limit: Annotated[int, Field(alias='limit')]
    is_truncated: Annotated[bool, Field(alias='isTruncated')]

    # Listed keys; an empty list when the page carries none.
    items: Annotated[list[KeyValueStoreKeyInfo], Field(alias='items', default_factory=list)]

    # Pagination cursors; both are optional.
    exclusive_start_key: Annotated[str | None, Field(alias='exclusiveStartKey')] = None
    next_exclusive_start_key: Annotated[str | None, Field(alias='nextExclusiveStartKey')] = None
89
+
90
+
91
class CachedRequest(BaseModel):
    """Pydantic model holding cached information about a request.

    Only internal structure.
    """

    unique_key: str
    """Unique key of the request."""

    was_already_handled: bool
    """Whether the request was already handled."""

    hydrated: Request | None = Field(default=None)
    """The hydrated request object (the original one), if available."""

    lock_expires_at: datetime | None = Field(default=None)
    """The expiration time of the lock on the request, if any."""
108
+
109
+
110
class RequestQueueStats(BaseModel):
    """Usage counters of a request queue.

    Every counter defaults to 0 when it is absent from the validated data.
    """

    model_config = ConfigDict(populate_by_name=True)

    delete_count: Annotated[int, Field(alias='deleteCount', default=0)]
    """The number of request queue deletes."""

    head_item_read_count: Annotated[int, Field(alias='headItemReadCount', default=0)]
    """The number of request queue head reads."""

    read_count: Annotated[int, Field(alias='readCount', default=0)]
    """The number of request queue reads."""

    storage_bytes: Annotated[int, Field(alias='storageBytes', default=0)]
    """Storage size in Bytes."""

    write_count: Annotated[int, Field(alias='writeCount', default=0)]
    """The number of request queue writes."""
127
+
128
+
129
class ApifyRequestQueueMetadata(RequestQueueMetadata):
    """Request queue metadata extended with request queue statistics."""

    stats: Annotated[RequestQueueStats, Field(alias='stats', default_factory=RequestQueueStats)]
    """Additional statistics about the request queue."""