apify: apify-3.0.2b6-py3-none-any.whl → apify-3.0.3b1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.


apify/request_loaders/_apify_request_list.py

@@ -22,8 +22,8 @@ URL_NO_COMMAS_REGEX = re.compile(
 class _RequestDetails(BaseModel):
     method: HttpMethod = 'GET'
     payload: str = ''
-    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
-    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}
+    headers: Annotated[dict[str, str], Field(default_factory=dict)]
+    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')]


 class _RequestsFromUrlInput(_RequestDetails):
apify/storage_clients/_apify/_dataset_client.py

@@ -1,19 +1,19 @@
 from __future__ import annotations

 import asyncio
+import warnings
 from logging import getLogger
 from typing import TYPE_CHECKING, Any

 from typing_extensions import override

-from apify_client import ApifyClientAsync
 from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.file import json_dumps
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 from crawlee.storages import Dataset

-from ._utils import AliasResolver
+from ._utils import AliasResolver, create_apify_client

 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -52,12 +52,17 @@ class ApifyDatasetClient(DatasetClient):
         self._api_client = api_client
         """The Apify dataset client for API operations."""

-        self._api_public_base_url = api_public_base_url
-        """The public base URL for accessing the key-value store records."""
-
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""

+        if api_public_base_url:
+            # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
+            warnings.warn(
+                'api_public_base_url argument is deprecated and will be removed in version 4.0.0',
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
     @override
     async def get_metadata(self) -> DatasetMetadata:
         metadata = await self._api_client.get()
@@ -99,29 +104,7 @@ class ApifyDatasetClient(DatasetClient):
         if sum(1 for param in [id, name, alias] if param is not None) > 1:
             raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

-        token = configuration.token
-        if not token:
-            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
-
-        api_url = configuration.api_base_url
-        if not api_url:
-            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
-
-        api_public_base_url = configuration.api_public_base_url
-        if not api_public_base_url:
-            raise ValueError(
-                'Apify storage client requires a valid API public base URL in Configuration '
-                f'(api_public_base_url={api_public_base_url}).'
-            )
-
-        # Create Apify client with the provided token and API URL.
-        apify_client_async = ApifyClientAsync(
-            token=token,
-            api_url=api_url,
-            max_retries=8,
-            min_delay_between_retries_millis=500,
-            timeout_secs=360,
-        )
+        apify_client_async = create_apify_client(configuration)
         apify_datasets_client = apify_client_async.datasets()

         # Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed
@@ -178,7 +161,7 @@ class ApifyDatasetClient(DatasetClient):

         return cls(
             api_client=apify_dataset_client,
-            api_public_base_url=api_public_base_url,
+            api_public_base_url='',  # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
             lock=asyncio.Lock(),
         )

apify/storage_clients/_apify/_key_value_store_client.py

@@ -1,20 +1,18 @@
 from __future__ import annotations

 import asyncio
+import warnings
 from logging import getLogger
 from typing import TYPE_CHECKING, Any

 from typing_extensions import override
-from yarl import URL

-from apify_client import ApifyClientAsync
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
 from crawlee.storages import KeyValueStore

 from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage
-from ._utils import AliasResolver
-from apify._crypto import create_hmac_signature
+from ._utils import AliasResolver, create_apify_client

 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -43,12 +41,17 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         self._api_client = api_client
         """The Apify KVS client for API operations."""

-        self._api_public_base_url = api_public_base_url
-        """The public base URL for accessing the key-value store records."""
-
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""

+        if api_public_base_url:
+            # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
+            warnings.warn(
+                'api_public_base_url argument is deprecated and will be removed in version 4.0.0',
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
     @override
     async def get_metadata(self) -> ApifyKeyValueStoreMetadata:
         metadata = await self._api_client.get()
@@ -90,29 +93,7 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         if sum(1 for param in [id, name, alias] if param is not None) > 1:
             raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

-        token = configuration.token
-        if not token:
-            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
-
-        api_url = configuration.api_base_url
-        if not api_url:
-            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
-
-        api_public_base_url = configuration.api_public_base_url
-        if not api_public_base_url:
-            raise ValueError(
-                'Apify storage client requires a valid API public base URL in Configuration '
-                f'(api_public_base_url={api_public_base_url}).'
-            )
-
-        # Create Apify client with the provided token and API URL.
-        apify_client_async = ApifyClientAsync(
-            token=token,
-            api_url=api_url,
-            max_retries=8,
-            min_delay_between_retries_millis=500,
-            timeout_secs=360,
-        )
+        apify_client_async = create_apify_client(configuration)
         apify_kvss_client = apify_client_async.key_value_stores()

         # Normalize unnamed default storage in cases where not defined in `configuration.default_key_value_store_id` to
@@ -170,7 +151,7 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):

         return cls(
             api_client=apify_kvs_client,
-            api_public_base_url=api_public_base_url,
+            api_public_base_url='',  # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
             lock=asyncio.Lock(),
         )

@@ -251,15 +232,4 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         Returns:
             A public URL that can be used to access the value of the given key in the KVS.
         """
-        if self._api_client.resource_id is None:
-            raise ValueError('resource_id cannot be None when generating a public URL')
-
-        public_url = (
-            URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key
-        )
-        metadata = await self.get_metadata()
-
-        if metadata.url_signing_secret_key is not None:
-            public_url = public_url.with_query(signature=create_hmac_signature(metadata.url_signing_secret_key, key))
-
-        return str(public_url)
+        return await self._api_client.get_record_public_url(key=key)
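The last `_key_value_store_client.py` hunk replaces the hand-rolled `yarl` URL building and `create_hmac_signature` signing with a single call to `apify-client`'s `get_record_public_url`, which handles signing via `url_signing_secret_key` on its side. A hedged usage sketch (the public method name and storage setup are assumed from the hunk's docstring context):

    # kvs_client: an ApifyKeyValueStoreClient opened via ApifyKeyValueStoreClient.open(...)
    url = await kvs_client.get_public_url('INPUT')
    # The SDK now forwards to self._api_client.get_record_public_url(key='INPUT')
    # instead of assembling and signing the /v2/key-value-stores/... URL itself.
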
apify/storage_clients/_apify/_models.py

@@ -94,8 +94,8 @@ class CachedRequest(BaseModel):
     Only internal structure.
     """

-    unique_key: str
-    """Unique key of the request."""
+    id: str
+    """Id of the request."""

     was_already_handled: bool
     """Whether the request was already handled."""
apify/storage_clients/_apify/_request_queue_client.py

@@ -5,16 +5,15 @@ from typing import TYPE_CHECKING, Final, Literal

 from typing_extensions import override

-from apify_client import ApifyClientAsync
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
 from crawlee.storages import RequestQueue

 from ._models import ApifyRequestQueueMetadata, RequestQueueStats
-from ._request_queue_shared_client import _ApifyRequestQueueSharedClient
-from ._request_queue_single_client import _ApifyRequestQueueSingleClient
-from ._utils import AliasResolver
+from ._request_queue_shared_client import ApifyRequestQueueSharedClient
+from ._request_queue_single_client import ApifyRequestQueueSingleClient
+from ._utils import AliasResolver, create_apify_client

 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -47,14 +46,14 @@ class ApifyRequestQueueClient(RequestQueueClient):
         self._api_client = api_client
         """The Apify request queue client for API operations."""

-        self._implementation: _ApifyRequestQueueSingleClient | _ApifyRequestQueueSharedClient
+        self._implementation: ApifyRequestQueueSingleClient | ApifyRequestQueueSharedClient
         """Internal implementation used to communicate with the Apify platform based Request Queue."""
         if access == 'single':
-            self._implementation = _ApifyRequestQueueSingleClient(
+            self._implementation = ApifyRequestQueueSingleClient(
                 api_client=self._api_client, metadata=metadata, cache_size=self._MAX_CACHED_REQUESTS
             )
         elif access == 'shared':
-            self._implementation = _ApifyRequestQueueSharedClient(
+            self._implementation = ApifyRequestQueueSharedClient(
                 api_client=self._api_client,
                 metadata=metadata,
                 cache_size=self._MAX_CACHED_REQUESTS,
@@ -228,29 +227,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
         if sum(1 for param in [id, name, alias] if param is not None) > 1:
             raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

-        token = configuration.token
-        if not token:
-            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
-
-        api_url = configuration.api_base_url
-        if not api_url:
-            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
-
-        api_public_base_url = configuration.api_public_base_url
-        if not api_public_base_url:
-            raise ValueError(
-                'Apify storage client requires a valid API public base URL in Configuration '
-                f'(api_public_base_url={api_public_base_url}).'
-            )
-
-        # Create Apify client with the provided token and API URL.
-        apify_client_async = ApifyClientAsync(
-            token=token,
-            api_url=api_url,
-            max_retries=8,
-            min_delay_between_retries_millis=500,
-            timeout_secs=360,
-        )
+        apify_client_async = create_apify_client(configuration)
        apify_rqs_client = apify_client_async.request_queues()

         # Normalize unnamed default storage in cases where not defined in `configuration.default_request_queue_id` to
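Dropping the leading underscore gives the two implementation classes public names; within the SDK they are now imported as shown in the first hunk above, and the equivalent absolute paths (module locations taken from the RECORD section at the end of this diff) are:

    from apify.storage_clients._apify._request_queue_shared_client import ApifyRequestQueueSharedClient
    from apify.storage_clients._apify._request_queue_single_client import ApifyRequestQueueSingleClient
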
apify/storage_clients/_apify/_request_queue_shared_client.py

@@ -23,7 +23,7 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)


-class _ApifyRequestQueueSharedClient:
+class ApifyRequestQueueSharedClient:
     """An Apify platform implementation of the request queue client.

     This implementation supports multiple producers and multiple consumers scenario.
@@ -54,10 +54,10 @@ class _ApifyRequestQueueSharedClient:
         """The Apify request queue client for API operations."""

         self._queue_head = deque[str]()
-        """A deque to store request unique keys in the queue head."""
+        """A deque to store request ids in the queue head."""

         self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=cache_size)
-        """A cache to store request objects. Request unique key is used as the cache key."""
+        """A cache to store request objects. Request id is used as the cache key."""

         self._queue_has_locked_requests: bool | None = None
         """Whether the queue has requests locked by another client."""
@@ -101,31 +101,30 @@ class _ApifyRequestQueueSharedClient:
         already_present_requests: list[ProcessedRequest] = []

         for request in requests:
-            if self._requests_cache.get(request.unique_key):
+            request_id = unique_key_to_request_id(request.unique_key)
+            if self._requests_cache.get(request_id):
                 # We are not sure if it was already handled at this point, and it is not worth calling API for it.
                 # It could have been handled by another client in the meantime, so cached information about
                 # `request.was_already_handled` is not reliable.
                 already_present_requests.append(
-                    ProcessedRequest.model_validate(
-                        {
-                            'uniqueKey': request.unique_key,
-                            'wasAlreadyPresent': True,
-                            'wasAlreadyHandled': request.was_already_handled,
-                        }
+                    ProcessedRequest(
+                        id=request_id,
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=request.was_already_handled,
                     )
                 )

             else:
                 # Add new request to the cache.
-                processed_request = ProcessedRequest.model_validate(
-                    {
-                        'uniqueKey': request.unique_key,
-                        'wasAlreadyPresent': True,
-                        'wasAlreadyHandled': request.was_already_handled,
-                    }
+                processed_request = ProcessedRequest(
+                    id=request_id,
+                    unique_key=request.unique_key,
+                    was_already_present=True,
+                    was_already_handled=request.was_already_handled,
                 )
                 self._cache_request(
-                    request.unique_key,
+                    request_id,
                     processed_request,
                 )
                 new_requests.append(request)
@@ -135,7 +134,6 @@ class _ApifyRequestQueueSharedClient:
             requests_dict = [
                 request.model_dump(
                     by_alias=True,
-                    exclude={'id'},  # Exclude ID fields from requests since the API doesn't accept them.
                 )
                 for request in new_requests
             ]
@@ -150,7 +148,8 @@ class _ApifyRequestQueueSharedClient:

             # Remove unprocessed requests from the cache
             for unprocessed_request in api_response.unprocessed_requests:
-                self._requests_cache.pop(unprocessed_request.unique_key, None)
+                unprocessed_request_id = unique_key_to_request_id(unprocessed_request.unique_key)
+                self._requests_cache.pop(unprocessed_request_id, None)

         else:
             api_response = AddRequestsResponse.model_validate(
@@ -183,7 +182,10 @@ class _ApifyRequestQueueSharedClient:
         Returns:
             The request or None if not found.
         """
-        response = await self._api_client.get_request(unique_key_to_request_id(unique_key))
+        return await self._get_request_by_id(unique_key_to_request_id(unique_key))
+
+    async def _get_request_by_id(self, request_id: str) -> Request | None:
+        response = await self._api_client.get_request(request_id)

         if response is None:
             return None
@@ -210,15 +212,15 @@ class _ApifyRequestQueueSharedClient:
             return None

         # Get the next request ID from the queue head
-        next_unique_key = self._queue_head.popleft()
+        next_request_id = self._queue_head.popleft()

-        request = await self._get_or_hydrate_request(next_unique_key)
+        request = await self._get_or_hydrate_request(next_request_id)

         # Handle potential inconsistency where request might not be in the main table yet
         if request is None:
             logger.debug(
                 'Cannot find a request from the beginning of queue, will be retried later',
-                extra={'nextRequestUniqueKey': next_unique_key},
+                extra={'next_request_id': next_request_id},
             )
             return None
@@ -226,16 +228,16 @@ class _ApifyRequestQueueSharedClient:
         if request.handled_at is not None:
             logger.debug(
                 'Request fetched from the beginning of queue was already handled',
-                extra={'nextRequestUniqueKey': next_unique_key},
+                extra={'next_request_id': next_request_id},
             )
             return None

         # Use get request to ensure we have the full request object.
-        request = await self.get_request(request.unique_key)
+        request = await self._get_request_by_id(next_request_id)
         if request is None:
             logger.debug(
                 'Request fetched from the beginning of queue was not found in the RQ',
-                extra={'nextRequestUniqueKey': next_unique_key},
+                extra={'next_request_id': next_request_id},
             )
             return None
@@ -252,15 +254,17 @@ class _ApifyRequestQueueSharedClient:
         Returns:
             Information about the queue operation. `None` if the given request was not in progress.
         """
+        request_id = unique_key_to_request_id(request.unique_key)
         # Set the handled_at timestamp if not already set
         if request.handled_at is None:
             request.handled_at = datetime.now(tz=timezone.utc)

-        if cached_request := self._requests_cache[request.unique_key]:
+        if cached_request := self._requests_cache[request_id]:
             cached_request.was_already_handled = request.was_already_handled
         try:
             # Update the request in the API
             processed_request = await self._update_request(request)
+            processed_request.id = request_id
             processed_request.unique_key = request.unique_key

             # Update assumed handled count if this wasn't already handled
@@ -269,10 +273,9 @@ class _ApifyRequestQueueSharedClient:
                 self.metadata.pending_request_count -= 1

             # Update the cache with the handled request
-            cache_key = request.unique_key
             self._cache_request(
-                cache_key,
-                processed_request,
+                cache_key=request_id,
+                processed_request=processed_request,
                 hydrated_request=request,
             )
         except Exception as exc:
@@ -356,17 +359,17 @@ class _ApifyRequestQueueSharedClient:
         # Fetch requests from the API and populate the queue head
         await self._list_head()

-    async def _get_or_hydrate_request(self, unique_key: str) -> Request | None:
-        """Get a request by unique key, either from cache or by fetching from API.
+    async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
+        """Get a request by id, either from cache or by fetching from API.

         Args:
-            unique_key: Unique key of the request to get.
+            request_id: Id of the request to get.

         Returns:
             The request if found and valid, otherwise None.
         """
         # First check if the request is in our cache
-        cached_entry = self._requests_cache.get(unique_key)
+        cached_entry = self._requests_cache.get(request_id)

         if cached_entry and cached_entry.hydrated:
             # If we have the request hydrated in cache, return it
@@ -375,17 +378,17 @@ class _ApifyRequestQueueSharedClient:
         # If not in cache or not hydrated, fetch the request
         try:
             # Fetch the request data
-            request = await self.get_request(unique_key)
+            request = await self._get_request_by_id(request_id)

             # If request is not found and return None
             if not request:
                 return None

             # Update cache with hydrated request
-            cache_key = request.unique_key
             self._cache_request(
-                cache_key,
-                ProcessedRequest(
+                cache_key=request_id,
+                processed_request=ProcessedRequest(
+                    id=request_id,
                     unique_key=request.unique_key,
                     was_already_present=True,
                     was_already_handled=request.handled_at is not None,
@@ -393,7 +396,7 @@ class _ApifyRequestQueueSharedClient:
                 hydrated_request=request,
             )
         except Exception as exc:
-            logger.debug(f'Error fetching request {unique_key}: {exc!s}')
+            logger.debug(f'Error fetching request {request_id}: {exc!s}')
             return None
         else:
             return request
@@ -442,8 +445,8 @@ class _ApifyRequestQueueSharedClient:
             logger.debug(f'Using cached queue head with {len(self._queue_head)} requests')
             # Create a list of requests from the cached queue head
             items = []
-            for unique_key in list(self._queue_head)[:limit]:
-                cached_request = self._requests_cache.get(unique_key)
+            for request_id in list(self._queue_head)[:limit]:
+                cached_request = self._requests_cache.get(request_id)
                 if cached_request and cached_request.hydrated:
                     items.append(cached_request.hydrated)
@@ -476,32 +479,35 @@ class _ApifyRequestQueueSharedClient:

         for request_data in response.get('items', []):
             request = Request.model_validate(request_data)
+            request_id = request_data.get('id')

             # Skip requests without ID or unique key
-            if not request.unique_key:
+            if not request.unique_key or not request_id:
                 logger.debug(
-                    'Skipping request from queue head, missing unique key',
+                    'Skipping request from queue head, missing unique key or id',
                     extra={
                         'unique_key': request.unique_key,
+                        'id': request_id,
                     },
                 )
                 continue

             # Cache the request
             self._cache_request(
-                request.unique_key,
+                request_id,
                 ProcessedRequest(
+                    id=request_id,
                     unique_key=request.unique_key,
                     was_already_present=True,
                     was_already_handled=False,
                 ),
                 hydrated_request=request,
             )
-            self._queue_head.append(request.unique_key)
+            self._queue_head.append(request_id)

-        for leftover_unique_key in leftover_buffer:
+        for leftover_id in leftover_buffer:
             # After adding new requests to the forefront, any existing leftover locked request is kept in the end.
-            self._queue_head.append(leftover_unique_key)
+            self._queue_head.append(leftover_id)
         return RequestQueueHead.model_validate(response)

     def _cache_request(
@@ -520,7 +526,7 @@ class _ApifyRequestQueueSharedClient:
             hydrated_request: The hydrated request object, if available.
         """
         self._requests_cache[cache_key] = CachedRequest(
-            unique_key=processed_request.unique_key,
+            id=processed_request.id,
            was_already_handled=processed_request.was_already_handled,
            hydrated=hydrated_request,
            lock_expires_at=None,
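A pattern repeated throughout this file is replacing `ProcessedRequest.model_validate({...})` with direct keyword construction. Both forms produce the same model, since its fields carry camelCase aliases; a hedged sketch (values are illustrative, and it assumes the crawlee `ProcessedRequest` model with the `id` field used in the hunks above):

    from crawlee.storage_clients.models import ProcessedRequest

    direct = ProcessedRequest(
        id='abc123def456ghi',
        unique_key='https://example.com',
        was_already_present=True,
        was_already_handled=False,
    )
    validated = ProcessedRequest.model_validate({
        'id': 'abc123def456ghi',
        'uniqueKey': 'https://example.com',
        'wasAlreadyPresent': True,
        'wasAlreadyHandled': False,
    })
    assert direct == validated
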
apify/storage_clients/_apify/_request_queue_single_client.py

@@ -21,7 +21,7 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)


-class _ApifyRequestQueueSingleClient:
+class ApifyRequestQueueSingleClient:
     """An Apify platform implementation of the request queue client with limited capability.

     This client is designed to use as little resources as possible, but has to be used in constrained context.
@@ -56,21 +56,21 @@ class _ApifyRequestQueueSingleClient:
         """The Apify request queue client for API operations."""

         self._requests_cache: LRUCache[str, Request] = LRUCache(maxsize=cache_size)
-        """A cache to store request objects. Request unique key is used as the cache key."""
+        """A cache to store request objects. Request id is used as the cache key."""

         self._head_requests: deque[str] = deque()
-        """Ordered unique keys of requests that represent queue head."""
+        """Ordered ids of requests that represent queue head."""

         self._requests_already_handled: set[str] = set()
         """Local estimation of requests unique keys that are already present and handled on the platform.

         - To enhance local deduplication.
         - To reduce the _requests_cache size. Already handled requests are most likely not going to be needed again,
-          so no need to cache more than their unique_key.
+          so no need to cache more than their id.
         """

         self._requests_in_progress: set[str] = set()
-        """Set of requests unique keys that are being processed locally.
+        """Set of requests ids that are being processed locally.

         - To help decide if the RQ is finished or not. This is the only consumer, so it can be tracked locally.
         """
@@ -105,26 +105,27 @@ class _ApifyRequestQueueSingleClient:
         already_present_requests: list[ProcessedRequest] = []

         for request in requests:
+            # Calculate id for request
+            request_id = unique_key_to_request_id(request.unique_key)
+
             # Check if request is known to be already handled (it has to be present as well.)
-            if request.unique_key in self._requests_already_handled:
+            if request_id in self._requests_already_handled:
                 already_present_requests.append(
-                    ProcessedRequest.model_validate(
-                        {
-                            'uniqueKey': request.unique_key,
-                            'wasAlreadyPresent': True,
-                            'wasAlreadyHandled': True,
-                        }
+                    ProcessedRequest(
+                        id=request_id,
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=True,
                     )
                 )
             # Check if request is known to be already present, but unhandled
-            elif self._requests_cache.get(request.unique_key):
+            elif self._requests_cache.get(request_id):
                 already_present_requests.append(
-                    ProcessedRequest.model_validate(
-                        {
-                            'uniqueKey': request.unique_key,
-                            'wasAlreadyPresent': True,
-                            'wasAlreadyHandled': request.was_already_handled,
-                        }
+                    ProcessedRequest(
+                        id=request_id,
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=request.was_already_handled,
                     )
                 )
             else:
@@ -132,11 +133,11 @@ class _ApifyRequestQueueSingleClient:
                 new_requests.append(request)

                 # Update local caches
-                self._requests_cache[request.unique_key] = request
+                self._requests_cache[request_id] = request
                 if forefront:
-                    self._head_requests.append(request.unique_key)
+                    self._head_requests.append(request_id)
                 else:
-                    self._head_requests.appendleft(request.unique_key)
+                    self._head_requests.appendleft(request_id)

         if new_requests:
             # Prepare requests for API by converting to dictionaries.
@@ -155,11 +156,12 @@ class _ApifyRequestQueueSingleClient:
             api_response.processed_requests.extend(already_present_requests)
             # Remove unprocessed requests from the cache
             for unprocessed_request in api_response.unprocessed_requests:
-                self._requests_cache.pop(unprocessed_request.unique_key, None)
+                self._requests_cache.pop(unique_key_to_request_id(unprocessed_request.unique_key), None)

         else:
-            api_response = AddRequestsResponse.model_validate(
-                {'unprocessedRequests': [], 'processedRequests': already_present_requests}
+            api_response = AddRequestsResponse(
+                unprocessed_requests=[],
+                processed_requests=already_present_requests,
             )

         # Update assumed total count for newly added requests.
@@ -181,15 +183,39 @@ class _ApifyRequestQueueSingleClient:
         Returns:
             The request or None if not found.
         """
-        if unique_key in self._requests_cache:
-            return self._requests_cache[unique_key]
+        return await self._get_request(id=unique_key_to_request_id(unique_key))
+
+    async def _get_request(self, id: str) -> Request | None:
+        """Get a request by id.
+
+        Args:
+            id: Id of request to get.
+
+        Returns:
+            The request or None if not found.
+        """
+        if id in self._requests_cache:
+            return self._requests_cache[id]

-        response = await self._api_client.get_request(unique_key_to_request_id(unique_key))
+        # Requests that were not added by this client are not in local cache. Fetch them from platform.
+        response = await self._api_client.get_request(id)

         if response is None:
             return None

-        return Request.model_validate(response)
+        request = Request.model_validate(response)
+
+        # Updated local caches
+        if id in self._requests_in_progress:
+            # No caching of requests that are already in progress, client is already aware of them.
+            pass
+        elif request.was_already_handled:
+            # Cache only id for already handled requests
+            self._requests_already_handled.add(id)
+        else:
+            # Cache full request for unhandled requests that are not yet in progress and are not yet handled.
+            self._requests_cache[id] = request
+        return request

     async def fetch_next_request(self) -> Request | None:
         """Return the next request in the queue to be processed.
@@ -205,13 +231,10 @@ class _ApifyRequestQueueSingleClient:
         await self._ensure_head_is_non_empty()

         while self._head_requests:
-            request_unique_key = self._head_requests.pop()
-            if (
-                request_unique_key not in self._requests_in_progress
-                and request_unique_key not in self._requests_already_handled
-            ):
-                self._requests_in_progress.add(request_unique_key)
-                return await self.get_request(request_unique_key)
+            request_id = self._head_requests.pop()
+            if request_id not in self._requests_in_progress and request_id not in self._requests_already_handled:
+                self._requests_in_progress.add(request_id)
+                return await self._get_request(request_id)
         # No request locally and the ones returned from the platform are already in progress.
         return None
@@ -237,30 +260,18 @@ class _ApifyRequestQueueSingleClient:
         # Update the cached data
         for request_data in response.get('items', []):
             request = Request.model_validate(request_data)
+            request_id = request_data['id']

-            if request.unique_key in self._requests_in_progress:
+            if request_id in self._requests_in_progress:
                 # Ignore requests that are already in progress, we will not process them again.
                 continue
+
             if request.was_already_handled:
-                # Do not cache fully handled requests, we do not need them. Just cache their unique_key.
-                self._requests_already_handled.add(request.unique_key)
-            else:
-                # Only fetch the request if we do not know it yet.
-                if request.unique_key not in self._requests_cache:
-                    request_id = unique_key_to_request_id(request.unique_key)
-                    complete_request_data = await self._api_client.get_request(request_id)
-
-                    if complete_request_data is not None:
-                        request = Request.model_validate(complete_request_data)
-                        self._requests_cache[request.unique_key] = request
-                    else:
-                        logger.warning(
-                            f'Could not fetch request data for unique_key=`{request.unique_key}` (id=`{request_id}`)'
-                        )
-
-            # Add new requests to the end of the head, unless already present in head
-            if request.unique_key not in self._head_requests:
-                self._head_requests.appendleft(request.unique_key)
+                # Do not cache fully handled requests, we do not need them. Just cache their id.
+                self._requests_already_handled.add(request_id)
+            # Add new requests to the end of the head, unless already present in head
+            elif request_id not in self._head_requests:
+                self._head_requests.appendleft(request_id)

     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
         """Mark a request as handled after successful processing.
@@ -275,12 +286,14 @@ class _ApifyRequestQueueSingleClient:
         """
         # Set the handled_at timestamp if not already set

+        request_id = unique_key_to_request_id(request.unique_key)
+
         if request.handled_at is None:
             request.handled_at = datetime.now(tz=timezone.utc)
             self.metadata.handled_request_count += 1
             self.metadata.pending_request_count -= 1

-        if cached_request := self._requests_cache.get(request.unique_key):
+        if cached_request := self._requests_cache.get(request_id):
             cached_request.handled_at = request.handled_at

         try:
@@ -289,10 +302,10 @@ class _ApifyRequestQueueSingleClient:
             # adding to the queue.)
             processed_request = await self._update_request(request)
             # Remember that we handled this request, to optimize local deduplication.
-            self._requests_already_handled.add(request.unique_key)
+            self._requests_already_handled.add(request_id)
             # Remove request from cache. It will most likely not be needed.
-            self._requests_cache.pop(request.unique_key)
-            self._requests_in_progress.discard(request.unique_key)
+            self._requests_cache.pop(request_id)
+            self._requests_in_progress.discard(request_id)

         except Exception as exc:
             logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}')
@@ -319,23 +332,27 @@ class _ApifyRequestQueueSingleClient:
         """
         # Check if the request was marked as handled and clear it. When reclaiming,
         # we want to put the request back for processing.
+
+        request_id = unique_key_to_request_id(request.unique_key)
+
         if request.was_already_handled:
             request.handled_at = None

         try:
             # Make sure request is in the local cache. We might need it.
-            self._requests_cache[request.unique_key] = request
+            self._requests_cache[request_id] = request

             # No longer in progress
-            self._requests_in_progress.discard(request.unique_key)
+            self._requests_in_progress.discard(request_id)
             # No longer handled
-            self._requests_already_handled.discard(request.unique_key)
+            self._requests_already_handled.discard(request_id)

             if forefront:
                 # Append to top of the local head estimation
-                self._head_requests.append(request.unique_key)
+                self._head_requests.append(request_id)

             processed_request = await self._update_request(request, forefront=forefront)
+            processed_request.id = request_id
             processed_request.unique_key = request.unique_key
             # If the request was previously handled, decrement our handled count since
             # we're putting it back for processing.
@@ -397,9 +414,11 @@ class _ApifyRequestQueueSingleClient:
         response = await self._api_client.list_requests(limit=10_000)
         for request_data in response.get('items', []):
             request = Request.model_validate(request_data)
+            request_id = request_data['id']
+
             if request.was_already_handled:
-                # Cache just unique_key for deduplication
-                self._requests_already_handled.add(request.unique_key)
+                # Cache just id for deduplication
+                self._requests_already_handled.add(request_id)
             else:
                 # Cache full request
-                self._requests_cache[request.unique_key] = request
+                self._requests_cache[request_id] = request
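Every cache in this file (and in the shared client above) is re-keyed from `unique_key` to an id computed by `unique_key_to_request_id`, defined in `_utils.py` (next section). The scheme relies on the derivation being deterministic, so an id computed locally can line up with the id the platform reports for the same request; a hedged illustration:

    from apify.storage_clients._apify._utils import unique_key_to_request_id

    rid_a = unique_key_to_request_id('https://example.com')
    rid_b = unique_key_to_request_id('https://example.com')
    assert rid_a == rid_b    # same unique key -> same id
    assert len(rid_a) == 15  # default request_id_length, per the signature below
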
apify/storage_clients/_apify/_utils.py

@@ -192,3 +192,30 @@ def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str:

     # Truncate the key to the desired length
     return url_safe_key[:request_id_length]
+
+
+def create_apify_client(configuration: Configuration) -> ApifyClientAsync:
+    """Create and return an ApifyClientAsync instance using the provided configuration."""
+    if not configuration.token:
+        raise ValueError(f'Apify storage client requires a valid token in Configuration (token={configuration.token}).')
+
+    api_url = configuration.api_base_url
+    if not api_url:
+        raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
+
+    api_public_base_url = configuration.api_public_base_url
+    if not api_public_base_url:
+        raise ValueError(
+            'Apify storage client requires a valid API public base URL in Configuration '
+            f'(api_public_base_url={api_public_base_url}).'
+        )
+
+    # Create Apify client with the provided token and API URL.
+    return ApifyClientAsync(
+        token=configuration.token,
+        api_url=api_url,
+        api_public_url=api_public_base_url,
+        max_retries=8,
+        min_delay_between_retries_millis=500,
+        timeout_secs=360,
+    )
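A hedged sketch of the new `create_apify_client` helper in use, assuming a populated global `Configuration` (as on the Apify platform):

    from apify import Configuration

    configuration = Configuration.get_global_configuration()
    client = create_apify_client(configuration)  # raises ValueError on a missing token or URL
    datasets = client.datasets()                 # same collection clients the storage clients use

Note that the helper forwards `api_public_url` to `ApifyClientAsync`, which lines up with the `apify-client>=2.2.0` floor raised in METADATA below.
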
apify-3.0.2b6.dist-info/METADATA → apify-3.0.3b1.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: apify
-Version: 3.0.2b6
+Version: 3.0.3b1
 Summary: Apify SDK for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -225,7 +225,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.10
-Requires-Dist: apify-client<3.0.0,>=2.0.0
+Requires-Dist: apify-client<3.0.0,>=2.2.0
 Requires-Dist: apify-shared<3.0.0,>=2.0.0
 Requires-Dist: cachetools>=5.5.0
 Requires-Dist: crawlee<2.0.0,>=1.0.2
apify-3.0.2b6.dist-info/RECORD → apify-3.0.3b1.dist-info/RECORD

@@ -14,7 +14,7 @@ apify/events/_apify_event_manager.py,sha256=yArFrKa4wWDZo32iwaA3F_w36VSJf1Yaj_L1
 apify/events/_types.py,sha256=F0BHgACqnRfmdQ9GUcpnZvPxzw2bdRr8BqbGSA4cHeQ,3050
 apify/events/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/request_loaders/__init__.py,sha256=SJqW0FbdZKEtAMB5kBLgqROzh3KmQc2CNEIhJpTGdPQ,356
-apify/request_loaders/_apify_request_list.py,sha256=kurCxX2jAKzHJ5N1Co6KjIgptqgVmjR0WpT8bd6uK9A,6220
+apify/request_loaders/_apify_request_list.py,sha256=jbZTHK3ACbh4YauYVJgXHXxB3rPcots5JMcr3GdIMz8,6210
 apify/request_loaders/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/scrapy/__init__.py,sha256=m2a0ts_JY9xJkBy4JU5mV8PJqjA3GGKLXBFu4nl-n-A,1048
 apify/scrapy/_actor_runner.py,sha256=rXWSnlQWGskDUH8PtLCv5SkOIx4AiVa4QbCYeCett5c,938
@@ -35,14 +35,14 @@ apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
 apify/storage_clients/__init__.py,sha256=JheTvNpVD_luQXC1KTEgtr6yVnuMEC9ajBNLCX3HuSo,358
 apify/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/storage_clients/_apify/__init__.py,sha256=mtbVDsxqWL3kx30elnh0kAn2kZ4s3BBsWa15Y5e7RMU,347
-apify/storage_clients/_apify/_dataset_client.py,sha256=Bb3UwOaFkyuEY7tuBf8K46R4ZP_b1EaAkDOXOqwSoW8,12498
-apify/storage_clients/_apify/_key_value_store_client.py,sha256=42dARbLX2oeOW7uYYKkDyQbEriMuh55Mxh0SqvkOEGg,10529
-apify/storage_clients/_apify/_models.py,sha256=GEaN7Got1zIg42QPH36obHRWRDVNtzOkRuOWYRf9bFU,4572
-apify/storage_clients/_apify/_request_queue_client.py,sha256=QXCLdTBeNW8RKWnxQOE71KOpZ_lqvqisa89eeiWwZ38,14200
-apify/storage_clients/_apify/_request_queue_shared_client.py,sha256=CbvwcXRvfuBoy3wrQEdLX9_vKELPH_WhHQARP14audM,20709
-apify/storage_clients/_apify/_request_queue_single_client.py,sha256=bQMebZKyeN_gYF1ZcHcaWng_q0m41KsiNvV1YRyzD3M,17299
+apify/storage_clients/_apify/_dataset_client.py,sha256=qmCJyL1MN83tYRXmc31P6yMIXVZMyRrGjr7R6-86FSE,11869
+apify/storage_clients/_apify/_key_value_store_client.py,sha256=994a5bM_BGHIeirnny6QlXjy5CzMU2I9SmMksCbHCUY,9357
+apify/storage_clients/_apify/_models.py,sha256=XxBru5XFdj0jqX6V-uVahT-pMQU3pZ501aTNzXCuoMU,4556
+apify/storage_clients/_apify/_request_queue_client.py,sha256=tAyap34gpxvPiQ0McDjX5ojq1ZIZc4EI3PrW8VQqS4k,13292
+apify/storage_clients/_apify/_request_queue_shared_client.py,sha256=pWmd6aPxM-eZ6PC1MfsfTcjD2mGGpCDS3ZZ3cG_2MEA,20971
+apify/storage_clients/_apify/_request_queue_single_client.py,sha256=d2txMwxW1nlYnvjdOH8xpxhcOYNeyc1ousGHRE7jsPg,17468
 apify/storage_clients/_apify/_storage_client.py,sha256=hFl_PuX1UgOydBD6pieZ0u2NWbDmZV-i0qygKdsuHt4,4873
-apify/storage_clients/_apify/_utils.py,sha256=ywXoSM69amRokUZcshbAvQLIcSZq4L-bpYIGyeFxCGQ,7696
+apify/storage_clients/_apify/_utils.py,sha256=375gk_TJyMWIIgRbE9SS0hQup0h6sA3mzpTG53XIjkM,8769
 apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
 apify/storage_clients/_file_system/_key_value_store_client.py,sha256=gxM3ap67PnY80Rd7P3onPAf2pksYpU0LoAlJdayEMdU,4179
@@ -51,7 +51,7 @@ apify/storage_clients/_smart_apify/__init__.py,sha256=614B2AaWY-dx6RQ6mod7VVR8gF
 apify/storage_clients/_smart_apify/_storage_client.py,sha256=GCPmVe_xWAFcO2Cuej4su4i97_d33Q9Ih_Sc5xW2Wa4,4674
 apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
 apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-apify-3.0.2b6.dist-info/METADATA,sha256=VvY5YhIVBaPQf2fk7f62zq8RW0ss9R8mwqJZV2Wwchk,22582
-apify-3.0.2b6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-apify-3.0.2b6.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
-apify-3.0.2b6.dist-info/RECORD,,
+apify-3.0.3b1.dist-info/METADATA,sha256=g7A2Bi0KsFppsDByOMj3p5BtrriXTvFUnHlKg0BYdAM,22582
+apify-3.0.3b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+apify-3.0.3b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+apify-3.0.3b1.dist-info/RECORD,,