apify-3.0.2b7-py3-none-any.whl → apify-3.0.3b1-py3-none-any.whl

This diff compares two publicly released versions of the package as published to their respective public registries. It is provided for informational purposes only.

This release of apify has been flagged as potentially problematic.

@@ -22,8 +22,8 @@ URL_NO_COMMAS_REGEX = re.compile(
 class _RequestDetails(BaseModel):
     method: HttpMethod = 'GET'
     payload: str = ''
-    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
-    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}
+    headers: Annotated[dict[str, str], Field(default_factory=dict)]
+    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')]
 
 
 class _RequestsFromUrlInput(_RequestDetails):
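
Note on the hunk above: with Pydantic v2, Field(default_factory=dict) inside Annotated already supplies the default, so the assigned "= {}" was redundant and looked like a shared mutable default (some Pydantic releases also reject specifying both). A minimal standalone sketch of the remaining behavior, not part of the package:

    from typing import Annotated

    from pydantic import BaseModel, Field

    class Details(BaseModel):
        # default_factory builds a fresh dict for every instance; no `= {}` needed.
        headers: Annotated[dict[str, str], Field(default_factory=dict)]

    a, b = Details(), Details()
    a.headers['x'] = '1'
    assert b.headers == {}  # each instance owns its own dict
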
@@ -1,19 +1,19 @@
 from __future__ import annotations
 
 import asyncio
+import warnings
 from logging import getLogger
 from typing import TYPE_CHECKING, Any
 
 from typing_extensions import override
 
-from apify_client import ApifyClientAsync
 from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.file import json_dumps
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 from crawlee.storages import Dataset
 
-from ._utils import AliasResolver
+from ._utils import AliasResolver, create_apify_client
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -52,12 +52,17 @@ class ApifyDatasetClient(DatasetClient):
         self._api_client = api_client
         """The Apify dataset client for API operations."""
 
-        self._api_public_base_url = api_public_base_url
-        """The public base URL for accessing the key-value store records."""
-
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
 
+        if api_public_base_url:
+            # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
+            warnings.warn(
+                'api_public_base_url argument is deprecated and will be removed in version 4.0.0',
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
     @override
     async def get_metadata(self) -> DatasetMetadata:
         metadata = await self._api_client.get()
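
In the deprecation block above, stacklevel=2 attributes the warning to the caller that passed the deprecated argument rather than to the client's own __init__. A minimal standalone sketch of the same pattern, with a hypothetical function name:

    import warnings

    def make_client(api_public_base_url: str | None = None) -> None:
        # Hypothetical wrapper illustrating the deprecation pattern used above.
        if api_public_base_url:
            warnings.warn(
                'api_public_base_url argument is deprecated and will be removed in version 4.0.0',
                DeprecationWarning,
                stacklevel=2,  # report the caller's line, not this one
            )

    make_client(api_public_base_url='https://api.apify.com')  # warning points here
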
@@ -99,29 +104,7 @@ class ApifyDatasetClient(DatasetClient):
         if sum(1 for param in [id, name, alias] if param is not None) > 1:
             raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
 
-        token = configuration.token
-        if not token:
-            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
-
-        api_url = configuration.api_base_url
-        if not api_url:
-            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
-
-        api_public_base_url = configuration.api_public_base_url
-        if not api_public_base_url:
-            raise ValueError(
-                'Apify storage client requires a valid API public base URL in Configuration '
-                f'(api_public_base_url={api_public_base_url}).'
-            )
-
-        # Create Apify client with the provided token and API URL.
-        apify_client_async = ApifyClientAsync(
-            token=token,
-            api_url=api_url,
-            max_retries=8,
-            min_delay_between_retries_millis=500,
-            timeout_secs=360,
-        )
+        apify_client_async = create_apify_client(configuration)
         apify_datasets_client = apify_client_async.datasets()
 
         # Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed
@@ -178,7 +161,7 @@ class ApifyDatasetClient(DatasetClient):
 
         return cls(
             api_client=apify_dataset_client,
-            api_public_base_url=api_public_base_url,
+            api_public_base_url='',  # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
             lock=asyncio.Lock(),
         )
 
@@ -1,20 +1,18 @@
 from __future__ import annotations
 
 import asyncio
+import warnings
 from logging import getLogger
 from typing import TYPE_CHECKING, Any
 
 from typing_extensions import override
-from yarl import URL
 
-from apify_client import ApifyClientAsync
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
 from crawlee.storages import KeyValueStore
 
 from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage
-from ._utils import AliasResolver
-from apify._crypto import create_hmac_signature
+from ._utils import AliasResolver, create_apify_client
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -43,12 +41,17 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         self._api_client = api_client
         """The Apify KVS client for API operations."""
 
-        self._api_public_base_url = api_public_base_url
-        """The public base URL for accessing the key-value store records."""
-
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
 
+        if api_public_base_url:
+            # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
+            warnings.warn(
+                'api_public_base_url argument is deprecated and will be removed in version 4.0.0',
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
     @override
     async def get_metadata(self) -> ApifyKeyValueStoreMetadata:
         metadata = await self._api_client.get()
@@ -90,29 +93,7 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         if sum(1 for param in [id, name, alias] if param is not None) > 1:
             raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
 
-        token = configuration.token
-        if not token:
-            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
-
-        api_url = configuration.api_base_url
-        if not api_url:
-            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
-
-        api_public_base_url = configuration.api_public_base_url
-        if not api_public_base_url:
-            raise ValueError(
-                'Apify storage client requires a valid API public base URL in Configuration '
-                f'(api_public_base_url={api_public_base_url}).'
-            )
-
-        # Create Apify client with the provided token and API URL.
-        apify_client_async = ApifyClientAsync(
-            token=token,
-            api_url=api_url,
-            max_retries=8,
-            min_delay_between_retries_millis=500,
-            timeout_secs=360,
-        )
+        apify_client_async = create_apify_client(configuration)
         apify_kvss_client = apify_client_async.key_value_stores()
 
         # Normalize unnamed default storage in cases where not defined in `configuration.default_key_value_store_id` to
@@ -170,7 +151,7 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
 
         return cls(
            api_client=apify_kvs_client,
-            api_public_base_url=api_public_base_url,
+            api_public_base_url='',  # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
             lock=asyncio.Lock(),
         )
 
@@ -251,15 +232,4 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient):
         Returns:
             A public URL that can be used to access the value of the given key in the KVS.
         """
-        if self._api_client.resource_id is None:
-            raise ValueError('resource_id cannot be None when generating a public URL')
-
-        public_url = (
-            URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key
-        )
-        metadata = await self.get_metadata()
-
-        if metadata.url_signing_secret_key is not None:
-            public_url = public_url.with_query(signature=create_hmac_signature(metadata.url_signing_secret_key, key))
-
-        return str(public_url)
+        return await self._api_client.get_record_public_url(key=key)
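
The hand-rolled URL building and HMAC signing above is replaced by a single delegation to the API client; get_record_public_url presumably arrives with the apify-client>=2.2.0 floor raised in METADATA below. A hedged sketch of how this surfaces to SDK users, assuming the usual Actor entry point and the get_public_url wrapper on crawlee's KeyValueStore:

    from apify import Actor

    async def main() -> None:
        async with Actor:
            store = await Actor.open_key_value_store()
            await store.set_value('OUTPUT', {'hello': 'world'})
            # Resolves through ApifyKeyValueStoreClient.get_public_url, which now
            # delegates URL construction and signing to apify-client as shown above.
            url = await store.get_public_url('OUTPUT')
            Actor.log.info(f'Public record URL: {url}')
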
@@ -94,8 +94,8 @@ class CachedRequest(BaseModel):
     Only internal structure.
     """
 
-    unique_key: str
-    """Unique key of the request."""
+    id: str
+    """Id of the request."""
 
     was_already_handled: bool
     """Whether the request was already handled."""
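
The id that replaces unique_key throughout this release is derived deterministically via unique_key_to_request_id (only its tail is visible in the _utils.py hunk further below). A sketch of one plausible derivation, assuming a SHA-256 digest encoded as URL-safe base64 and truncated to 15 characters; the shipped implementation may differ in detail:

    import base64
    import hashlib

    def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str:
        # Hash the unique key and encode the digest with a URL-safe alphabet.
        digest = hashlib.sha256(unique_key.encode('utf-8')).digest()
        url_safe_key = base64.urlsafe_b64encode(digest).decode('ascii').rstrip('=')

        # Truncate the key to the desired length (matches the tail shown in _utils.py).
        return url_safe_key[:request_id_length]

    print(unique_key_to_request_id('https://example.com'))  # stable 15-char id
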
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Final, Literal
 
 from typing_extensions import override
 
-from apify_client import ApifyClientAsync
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
@@ -14,7 +13,7 @@ from crawlee.storages import RequestQueue
 from ._models import ApifyRequestQueueMetadata, RequestQueueStats
 from ._request_queue_shared_client import ApifyRequestQueueSharedClient
 from ._request_queue_single_client import ApifyRequestQueueSingleClient
-from ._utils import AliasResolver
+from ._utils import AliasResolver, create_apify_client
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -228,29 +227,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
         if sum(1 for param in [id, name, alias] if param is not None) > 1:
             raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
 
-        token = configuration.token
-        if not token:
-            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
-
-        api_url = configuration.api_base_url
-        if not api_url:
-            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
-
-        api_public_base_url = configuration.api_public_base_url
-        if not api_public_base_url:
-            raise ValueError(
-                'Apify storage client requires a valid API public base URL in Configuration '
-                f'(api_public_base_url={api_public_base_url}).'
-            )
-
-        # Create Apify client with the provided token and API URL.
-        apify_client_async = ApifyClientAsync(
-            token=token,
-            api_url=api_url,
-            max_retries=8,
-            min_delay_between_retries_millis=500,
-            timeout_secs=360,
-        )
+        apify_client_async = create_apify_client(configuration)
         apify_rqs_client = apify_client_async.request_queues()
 
         # Normalize unnamed default storage in cases where not defined in `configuration.default_request_queue_id` to
@@ -54,10 +54,10 @@ class ApifyRequestQueueSharedClient:
         """The Apify request queue client for API operations."""
 
         self._queue_head = deque[str]()
-        """A deque to store request unique keys in the queue head."""
+        """A deque to store request ids in the queue head."""
 
         self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=cache_size)
-        """A cache to store request objects. Request unique key is used as the cache key."""
+        """A cache to store request objects. Request id is used as the cache key."""
 
         self._queue_has_locked_requests: bool | None = None
         """Whether the queue has requests locked by another client."""
@@ -101,12 +101,14 @@ class ApifyRequestQueueSharedClient:
         already_present_requests: list[ProcessedRequest] = []
 
         for request in requests:
-            if self._requests_cache.get(request.unique_key):
+            request_id = unique_key_to_request_id(request.unique_key)
+            if self._requests_cache.get(request_id):
                 # We are not sure if it was already handled at this point, and it is not worth calling API for it.
                 # It could have been handled by another client in the meantime, so cached information about
                 # `request.was_already_handled` is not reliable.
                 already_present_requests.append(
                     ProcessedRequest(
+                        id=request_id,
                         unique_key=request.unique_key,
                         was_already_present=True,
                         was_already_handled=request.was_already_handled,
@@ -116,12 +118,13 @@ class ApifyRequestQueueSharedClient:
             else:
                 # Add new request to the cache.
                 processed_request = ProcessedRequest(
+                    id=request_id,
                     unique_key=request.unique_key,
                     was_already_present=True,
                     was_already_handled=request.was_already_handled,
                 )
                 self._cache_request(
-                    request.unique_key,
+                    request_id,
                     processed_request,
                 )
                 new_requests.append(request)
@@ -131,7 +134,6 @@ class ApifyRequestQueueSharedClient:
         requests_dict = [
             request.model_dump(
                 by_alias=True,
-                exclude={'id'},  # Exclude ID fields from requests since the API doesn't accept them.
             )
             for request in new_requests
         ]
@@ -146,7 +148,8 @@ class ApifyRequestQueueSharedClient:
 
             # Remove unprocessed requests from the cache
             for unprocessed_request in api_response.unprocessed_requests:
-                self._requests_cache.pop(unprocessed_request.unique_key, None)
+                unprocessed_request_id = unique_key_to_request_id(unprocessed_request.unique_key)
+                self._requests_cache.pop(unprocessed_request_id, None)
 
         else:
             api_response = AddRequestsResponse.model_validate(
@@ -179,7 +182,10 @@ class ApifyRequestQueueSharedClient:
         Returns:
             The request or None if not found.
         """
-        response = await self._api_client.get_request(unique_key_to_request_id(unique_key))
+        return await self._get_request_by_id(unique_key_to_request_id(unique_key))
+
+    async def _get_request_by_id(self, request_id: str) -> Request | None:
+        response = await self._api_client.get_request(request_id)
 
         if response is None:
             return None
@@ -206,15 +212,15 @@ class ApifyRequestQueueSharedClient:
             return None
 
         # Get the next request ID from the queue head
-        next_unique_key = self._queue_head.popleft()
+        next_request_id = self._queue_head.popleft()
 
-        request = await self._get_or_hydrate_request(next_unique_key)
+        request = await self._get_or_hydrate_request(next_request_id)
 
         # Handle potential inconsistency where request might not be in the main table yet
         if request is None:
             logger.debug(
                 'Cannot find a request from the beginning of queue, will be retried later',
-                extra={'nextRequestUniqueKey': next_unique_key},
+                extra={'next_request_id': next_request_id},
             )
             return None
 
@@ -222,16 +228,16 @@ class ApifyRequestQueueSharedClient:
         if request.handled_at is not None:
             logger.debug(
                 'Request fetched from the beginning of queue was already handled',
-                extra={'nextRequestUniqueKey': next_unique_key},
+                extra={'next_request_id': next_request_id},
             )
             return None
 
         # Use get request to ensure we have the full request object.
-        request = await self.get_request(request.unique_key)
+        request = await self._get_request_by_id(next_request_id)
         if request is None:
             logger.debug(
                 'Request fetched from the beginning of queue was not found in the RQ',
-                extra={'nextRequestUniqueKey': next_unique_key},
+                extra={'next_request_id': next_request_id},
             )
             return None
 
@@ -248,15 +254,17 @@ class ApifyRequestQueueSharedClient:
         Returns:
             Information about the queue operation. `None` if the given request was not in progress.
         """
+        request_id = unique_key_to_request_id(request.unique_key)
         # Set the handled_at timestamp if not already set
         if request.handled_at is None:
             request.handled_at = datetime.now(tz=timezone.utc)
 
-        if cached_request := self._requests_cache[request.unique_key]:
+        if cached_request := self._requests_cache[request_id]:
             cached_request.was_already_handled = request.was_already_handled
         try:
             # Update the request in the API
             processed_request = await self._update_request(request)
+            processed_request.id = request_id
             processed_request.unique_key = request.unique_key
 
             # Update assumed handled count if this wasn't already handled
@@ -265,10 +273,9 @@ class ApifyRequestQueueSharedClient:
                 self.metadata.pending_request_count -= 1
 
             # Update the cache with the handled request
-            cache_key = request.unique_key
             self._cache_request(
-                cache_key,
-                processed_request,
+                cache_key=request_id,
+                processed_request=processed_request,
                 hydrated_request=request,
             )
         except Exception as exc:
@@ -352,17 +359,17 @@ class ApifyRequestQueueSharedClient:
         # Fetch requests from the API and populate the queue head
         await self._list_head()
 
-    async def _get_or_hydrate_request(self, unique_key: str) -> Request | None:
-        """Get a request by unique key, either from cache or by fetching from API.
+    async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
+        """Get a request by id, either from cache or by fetching from API.
 
         Args:
-            unique_key: Unique key of the request to get.
+            request_id: Id of the request to get.
 
         Returns:
             The request if found and valid, otherwise None.
         """
         # First check if the request is in our cache
-        cached_entry = self._requests_cache.get(unique_key)
+        cached_entry = self._requests_cache.get(request_id)
 
         if cached_entry and cached_entry.hydrated:
             # If we have the request hydrated in cache, return it
@@ -371,17 +378,17 @@ class ApifyRequestQueueSharedClient:
         # If not in cache or not hydrated, fetch the request
         try:
             # Fetch the request data
-            request = await self.get_request(unique_key)
+            request = await self._get_request_by_id(request_id)
 
             # If request is not found and return None
             if not request:
                 return None
 
             # Update cache with hydrated request
-            cache_key = request.unique_key
             self._cache_request(
-                cache_key,
-                ProcessedRequest(
+                cache_key=request_id,
+                processed_request=ProcessedRequest(
+                    id=request_id,
                     unique_key=request.unique_key,
                     was_already_present=True,
                     was_already_handled=request.handled_at is not None,
@@ -389,7 +396,7 @@ class ApifyRequestQueueSharedClient:
                 hydrated_request=request,
             )
         except Exception as exc:
-            logger.debug(f'Error fetching request {unique_key}: {exc!s}')
+            logger.debug(f'Error fetching request {request_id}: {exc!s}')
             return None
         else:
             return request
@@ -438,8 +445,8 @@ class ApifyRequestQueueSharedClient:
             logger.debug(f'Using cached queue head with {len(self._queue_head)} requests')
             # Create a list of requests from the cached queue head
             items = []
-            for unique_key in list(self._queue_head)[:limit]:
-                cached_request = self._requests_cache.get(unique_key)
+            for request_id in list(self._queue_head)[:limit]:
+                cached_request = self._requests_cache.get(request_id)
                 if cached_request and cached_request.hydrated:
                     items.append(cached_request.hydrated)
 
@@ -472,32 +479,35 @@ class ApifyRequestQueueSharedClient:
 
         for request_data in response.get('items', []):
             request = Request.model_validate(request_data)
+            request_id = request_data.get('id')
 
             # Skip requests without ID or unique key
-            if not request.unique_key:
+            if not request.unique_key or not request_id:
                 logger.debug(
-                    'Skipping request from queue head, missing unique key',
+                    'Skipping request from queue head, missing unique key or id',
                     extra={
                         'unique_key': request.unique_key,
+                        'id': request_id,
                     },
                 )
                 continue
 
             # Cache the request
             self._cache_request(
-                request.unique_key,
+                request_id,
                 ProcessedRequest(
+                    id=request_id,
                     unique_key=request.unique_key,
                     was_already_present=True,
                    was_already_handled=False,
                ),
                hydrated_request=request,
            )
-            self._queue_head.append(request.unique_key)
+            self._queue_head.append(request_id)
 
-        for leftover_unique_key in leftover_buffer:
+        for leftover_id in leftover_buffer:
             # After adding new requests to the forefront, any existing leftover locked request is kept in the end.
-            self._queue_head.append(leftover_unique_key)
+            self._queue_head.append(leftover_id)
         return RequestQueueHead.model_validate(response)
 
     def _cache_request(
@@ -516,7 +526,7 @@ class ApifyRequestQueueSharedClient:
             hydrated_request: The hydrated request object, if available.
         """
         self._requests_cache[cache_key] = CachedRequest(
-            unique_key=processed_request.unique_key,
+            id=processed_request.id,
             was_already_handled=processed_request.was_already_handled,
             hydrated=hydrated_request,
             lock_expires_at=None,
@@ -56,21 +56,21 @@ class ApifyRequestQueueSingleClient:
         """The Apify request queue client for API operations."""
 
         self._requests_cache: LRUCache[str, Request] = LRUCache(maxsize=cache_size)
-        """A cache to store request objects. Request unique key is used as the cache key."""
+        """A cache to store request objects. Request id is used as the cache key."""
 
         self._head_requests: deque[str] = deque()
-        """Ordered unique keys of requests that represent queue head."""
+        """Ordered ids of requests that represent queue head."""
 
         self._requests_already_handled: set[str] = set()
         """Local estimation of requests unique keys that are already present and handled on the platform.
 
         - To enhance local deduplication.
         - To reduce the _requests_cache size. Already handled requests are most likely not going to be needed again,
-          so no need to cache more than their unique_key.
+          so no need to cache more than their id.
         """
 
         self._requests_in_progress: set[str] = set()
-        """Set of requests unique keys that are being processed locally.
+        """Set of requests ids that are being processed locally.
 
         - To help decide if the RQ is finished or not. This is the only consumer, so it can be tracked locally.
         """
@@ -105,19 +105,24 @@ class ApifyRequestQueueSingleClient:
         already_present_requests: list[ProcessedRequest] = []
 
         for request in requests:
+            # Calculate id for request
+            request_id = unique_key_to_request_id(request.unique_key)
+
             # Check if request is known to be already handled (it has to be present as well.)
-            if request.unique_key in self._requests_already_handled:
+            if request_id in self._requests_already_handled:
                 already_present_requests.append(
                     ProcessedRequest(
+                        id=request_id,
                         unique_key=request.unique_key,
                         was_already_present=True,
                         was_already_handled=True,
                     )
                 )
             # Check if request is known to be already present, but unhandled
-            elif self._requests_cache.get(request.unique_key):
+            elif self._requests_cache.get(request_id):
                 already_present_requests.append(
                     ProcessedRequest(
+                        id=request_id,
                         unique_key=request.unique_key,
                         was_already_present=True,
                         was_already_handled=request.was_already_handled,
@@ -128,11 +133,11 @@ class ApifyRequestQueueSingleClient:
                 new_requests.append(request)
 
                 # Update local caches
-                self._requests_cache[request.unique_key] = request
+                self._requests_cache[request_id] = request
                 if forefront:
-                    self._head_requests.append(request.unique_key)
+                    self._head_requests.append(request_id)
                 else:
-                    self._head_requests.appendleft(request.unique_key)
+                    self._head_requests.appendleft(request_id)
 
         if new_requests:
             # Prepare requests for API by converting to dictionaries.
@@ -151,7 +156,7 @@ class ApifyRequestQueueSingleClient:
             api_response.processed_requests.extend(already_present_requests)
             # Remove unprocessed requests from the cache
             for unprocessed_request in api_response.unprocessed_requests:
-                self._requests_cache.pop(unprocessed_request.unique_key, None)
+                self._requests_cache.pop(unique_key_to_request_id(unprocessed_request.unique_key), None)
 
         else:
             api_response = AddRequestsResponse(
@@ -178,15 +183,39 @@ class ApifyRequestQueueSingleClient:
         Returns:
             The request or None if not found.
         """
-        if unique_key in self._requests_cache:
-            return self._requests_cache[unique_key]
+        return await self._get_request(id=unique_key_to_request_id(unique_key))
+
+    async def _get_request(self, id: str) -> Request | None:
+        """Get a request by id.
+
+        Args:
+            id: Id of request to get.
+
+        Returns:
+            The request or None if not found.
+        """
+        if id in self._requests_cache:
+            return self._requests_cache[id]
 
-        response = await self._api_client.get_request(unique_key_to_request_id(unique_key))
+        # Requests that were not added by this client are not in local cache. Fetch them from platform.
+        response = await self._api_client.get_request(id)
 
         if response is None:
             return None
 
-        return Request.model_validate(response)
+        request = Request.model_validate(response)
+
+        # Updated local caches
+        if id in self._requests_in_progress:
+            # No caching of requests that are already in progress, client is already aware of them.
+            pass
+        elif request.was_already_handled:
+            # Cache only id for already handled requests
+            self._requests_already_handled.add(id)
+        else:
+            # Cache full request for unhandled requests that are not yet in progress and are not yet handled.
+            self._requests_cache[id] = request
+        return request
 
     async def fetch_next_request(self) -> Request | None:
         """Return the next request in the queue to be processed.
@@ -202,13 +231,10 @@ class ApifyRequestQueueSingleClient:
         await self._ensure_head_is_non_empty()
 
         while self._head_requests:
-            request_unique_key = self._head_requests.pop()
-            if (
-                request_unique_key not in self._requests_in_progress
-                and request_unique_key not in self._requests_already_handled
-            ):
-                self._requests_in_progress.add(request_unique_key)
-                return await self.get_request(request_unique_key)
+            request_id = self._head_requests.pop()
+            if request_id not in self._requests_in_progress and request_id not in self._requests_already_handled:
+                self._requests_in_progress.add(request_id)
+                return await self._get_request(request_id)
         # No request locally and the ones returned from the platform are already in progress.
         return None
 
@@ -233,44 +259,19 @@ class ApifyRequestQueueSingleClient:
 
         # Update the cached data
         for request_data in response.get('items', []):
-            # Due to https://github.com/apify/apify-core/blob/v0.1377.0/src/api/src/lib/request_queues/request_queue.ts#L53,
-            # the list_head endpoint may return truncated fields for long requests (e.g., long URLs or unique keys).
-            # If truncation is detected, fetch the full request data by its ID from the API.
-            # This is a temporary workaround - the caching will be refactored to use request IDs instead of unique keys.
-            # See https://github.com/apify/apify-sdk-python/issues/630 for details.
-            if '[truncated]' in request_data['uniqueKey'] or '[truncated]' in request_data['url']:
-                request_data = await self._api_client.get_request(request_id=request_data['id'])  # noqa: PLW2901
-
             request = Request.model_validate(request_data)
+            request_id = request_data['id']
 
-            if request.unique_key in self._requests_in_progress:
+            if request_id in self._requests_in_progress:
                 # Ignore requests that are already in progress, we will not process them again.
                 continue
 
             if request.was_already_handled:
-                # Do not cache fully handled requests, we do not need them. Just cache their unique_key.
-                self._requests_already_handled.add(request.unique_key)
-            else:
-                # Only fetch the request if we do not know it yet.
-                if request.unique_key not in self._requests_cache:
-                    request_id = unique_key_to_request_id(request.unique_key)
-
-                    if request_data is not None and request_id != request_data['id']:
-                        logger.warning(
-                            f'Request ID mismatch: {request_id} != {request_data["id"]}, '
-                            'this may cause unexpected behavior.'
-                        )
-
-                    # See https://github.com/apify/apify-sdk-python/issues/630 for details.
-                    if '[truncated]' not in request.unique_key:
-                        request_data = await self._api_client.get_request(request_id=request_id)  # noqa: PLW2901
-                        request = Request.model_validate(request_data)
-
-                self._requests_cache[request.unique_key] = request
-
-            # Add new requests to the end of the head, unless already present in head
-            if request.unique_key not in self._head_requests:
-                self._head_requests.appendleft(request.unique_key)
+                # Do not cache fully handled requests, we do not need them. Just cache their id.
+                self._requests_already_handled.add(request_id)
+            # Add new requests to the end of the head, unless already present in head
+            elif request_id not in self._head_requests:
+                self._head_requests.appendleft(request_id)
 
@@ -285,12 +286,14 @@ class ApifyRequestQueueSingleClient:
         """
         # Set the handled_at timestamp if not already set
 
+        request_id = unique_key_to_request_id(request.unique_key)
+
         if request.handled_at is None:
             request.handled_at = datetime.now(tz=timezone.utc)
             self.metadata.handled_request_count += 1
             self.metadata.pending_request_count -= 1
 
-        if cached_request := self._requests_cache.get(request.unique_key):
+        if cached_request := self._requests_cache.get(request_id):
             cached_request.handled_at = request.handled_at
 
         try:
@@ -299,10 +302,10 @@ class ApifyRequestQueueSingleClient:
             # adding to the queue.)
             processed_request = await self._update_request(request)
             # Remember that we handled this request, to optimize local deduplication.
-            self._requests_already_handled.add(request.unique_key)
+            self._requests_already_handled.add(request_id)
             # Remove request from cache. It will most likely not be needed.
-            self._requests_cache.pop(request.unique_key)
-            self._requests_in_progress.discard(request.unique_key)
+            self._requests_cache.pop(request_id)
+            self._requests_in_progress.discard(request_id)
 
         except Exception as exc:
             logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}')
@@ -329,23 +332,27 @@ class ApifyRequestQueueSingleClient:
         """
         # Check if the request was marked as handled and clear it. When reclaiming,
         # we want to put the request back for processing.
+
+        request_id = unique_key_to_request_id(request.unique_key)
+
         if request.was_already_handled:
             request.handled_at = None
 
         try:
             # Make sure request is in the local cache. We might need it.
-            self._requests_cache[request.unique_key] = request
+            self._requests_cache[request_id] = request
 
             # No longer in progress
-            self._requests_in_progress.discard(request.unique_key)
+            self._requests_in_progress.discard(request_id)
             # No longer handled
-            self._requests_already_handled.discard(request.unique_key)
+            self._requests_already_handled.discard(request_id)
 
             if forefront:
                 # Append to top of the local head estimation
-                self._head_requests.append(request.unique_key)
+                self._head_requests.append(request_id)
 
             processed_request = await self._update_request(request, forefront=forefront)
+            processed_request.id = request_id
             processed_request.unique_key = request.unique_key
             # If the request was previously handled, decrement our handled count since
             # we're putting it back for processing.
@@ -407,9 +414,11 @@ class ApifyRequestQueueSingleClient:
         response = await self._api_client.list_requests(limit=10_000)
         for request_data in response.get('items', []):
             request = Request.model_validate(request_data)
+            request_id = request_data['id']
+
             if request.was_already_handled:
-                # Cache just unique_key for deduplication
-                self._requests_already_handled.add(request.unique_key)
+                # Cache just id for deduplication
+                self._requests_already_handled.add(request_id)
             else:
                 # Cache full request
-                self._requests_cache[request.unique_key] = request
+                self._requests_cache[request_id] = request
@@ -192,3 +192,30 @@ def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) ->
 
     # Truncate the key to the desired length
     return url_safe_key[:request_id_length]
+
+
+def create_apify_client(configuration: Configuration) -> ApifyClientAsync:
+    """Create and return an ApifyClientAsync instance using the provided configuration."""
+    if not configuration.token:
+        raise ValueError(f'Apify storage client requires a valid token in Configuration (token={configuration.token}).')
+
+    api_url = configuration.api_base_url
+    if not api_url:
+        raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
+
+    api_public_base_url = configuration.api_public_base_url
+    if not api_public_base_url:
+        raise ValueError(
+            'Apify storage client requires a valid API public base URL in Configuration '
+            f'(api_public_base_url={api_public_base_url}).'
+        )
+
+    # Create Apify client with the provided token and API URL.
+    return ApifyClientAsync(
+        token=configuration.token,
+        api_url=api_url,
+        api_public_url=api_public_base_url,
+        max_retries=8,
+        min_delay_between_retries_millis=500,
+        timeout_secs=360,
+    )
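
The new create_apify_client helper centralizes the validation block that was previously duplicated across the dataset, key-value store, and request queue clients. A minimal sketch of how it is invoked, assuming apify.Configuration populated from the standard APIFY_* environment variables (note that _utils is a private module, so the import path is internal detail):

    from apify import Configuration
    from apify.storage_clients._apify._utils import create_apify_client

    # Configuration reads the token and API URLs from the environment.
    configuration = Configuration.get_global_configuration()

    # Raises ValueError with a descriptive message when the token or URLs are
    # missing; otherwise returns a retry-configured ApifyClientAsync.
    client = create_apify_client(configuration)
    datasets = client.datasets()
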
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: apify
-Version: 3.0.2b7
+Version: 3.0.3b1
 Summary: Apify SDK for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -225,7 +225,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.10
-Requires-Dist: apify-client<3.0.0,>=2.0.0
+Requires-Dist: apify-client<3.0.0,>=2.2.0
 Requires-Dist: apify-shared<3.0.0,>=2.0.0
 Requires-Dist: cachetools>=5.5.0
 Requires-Dist: crawlee<2.0.0,>=1.0.2
@@ -14,7 +14,7 @@ apify/events/_apify_event_manager.py,sha256=yArFrKa4wWDZo32iwaA3F_w36VSJf1Yaj_L1
 apify/events/_types.py,sha256=F0BHgACqnRfmdQ9GUcpnZvPxzw2bdRr8BqbGSA4cHeQ,3050
 apify/events/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/request_loaders/__init__.py,sha256=SJqW0FbdZKEtAMB5kBLgqROzh3KmQc2CNEIhJpTGdPQ,356
-apify/request_loaders/_apify_request_list.py,sha256=kurCxX2jAKzHJ5N1Co6KjIgptqgVmjR0WpT8bd6uK9A,6220
+apify/request_loaders/_apify_request_list.py,sha256=jbZTHK3ACbh4YauYVJgXHXxB3rPcots5JMcr3GdIMz8,6210
 apify/request_loaders/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/scrapy/__init__.py,sha256=m2a0ts_JY9xJkBy4JU5mV8PJqjA3GGKLXBFu4nl-n-A,1048
 apify/scrapy/_actor_runner.py,sha256=rXWSnlQWGskDUH8PtLCv5SkOIx4AiVa4QbCYeCett5c,938
@@ -35,14 +35,14 @@ apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
 apify/storage_clients/__init__.py,sha256=JheTvNpVD_luQXC1KTEgtr6yVnuMEC9ajBNLCX3HuSo,358
 apify/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/storage_clients/_apify/__init__.py,sha256=mtbVDsxqWL3kx30elnh0kAn2kZ4s3BBsWa15Y5e7RMU,347
-apify/storage_clients/_apify/_dataset_client.py,sha256=Bb3UwOaFkyuEY7tuBf8K46R4ZP_b1EaAkDOXOqwSoW8,12498
-apify/storage_clients/_apify/_key_value_store_client.py,sha256=42dARbLX2oeOW7uYYKkDyQbEriMuh55Mxh0SqvkOEGg,10529
-apify/storage_clients/_apify/_models.py,sha256=GEaN7Got1zIg42QPH36obHRWRDVNtzOkRuOWYRf9bFU,4572
-apify/storage_clients/_apify/_request_queue_client.py,sha256=PUIVmGQxqFTkRxW9FIFWjT0OeDyAGt-ULlW-rdQDTyc,14194
-apify/storage_clients/_apify/_request_queue_shared_client.py,sha256=uxkuIG1rgCArgs6agldC9vmB2bgrIlNnm1I214Gf6WA,20550
-apify/storage_clients/_apify/_request_queue_single_client.py,sha256=EuORHJnFLC1YAT6ZfQj-ayrfSJNpU4_61r_7uDyvwgA,18092
+apify/storage_clients/_apify/_dataset_client.py,sha256=qmCJyL1MN83tYRXmc31P6yMIXVZMyRrGjr7R6-86FSE,11869
+apify/storage_clients/_apify/_key_value_store_client.py,sha256=994a5bM_BGHIeirnny6QlXjy5CzMU2I9SmMksCbHCUY,9357
+apify/storage_clients/_apify/_models.py,sha256=XxBru5XFdj0jqX6V-uVahT-pMQU3pZ501aTNzXCuoMU,4556
+apify/storage_clients/_apify/_request_queue_client.py,sha256=tAyap34gpxvPiQ0McDjX5ojq1ZIZc4EI3PrW8VQqS4k,13292
+apify/storage_clients/_apify/_request_queue_shared_client.py,sha256=pWmd6aPxM-eZ6PC1MfsfTcjD2mGGpCDS3ZZ3cG_2MEA,20971
+apify/storage_clients/_apify/_request_queue_single_client.py,sha256=d2txMwxW1nlYnvjdOH8xpxhcOYNeyc1ousGHRE7jsPg,17468
 apify/storage_clients/_apify/_storage_client.py,sha256=hFl_PuX1UgOydBD6pieZ0u2NWbDmZV-i0qygKdsuHt4,4873
-apify/storage_clients/_apify/_utils.py,sha256=ywXoSM69amRokUZcshbAvQLIcSZq4L-bpYIGyeFxCGQ,7696
+apify/storage_clients/_apify/_utils.py,sha256=375gk_TJyMWIIgRbE9SS0hQup0h6sA3mzpTG53XIjkM,8769
 apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
 apify/storage_clients/_file_system/_key_value_store_client.py,sha256=gxM3ap67PnY80Rd7P3onPAf2pksYpU0LoAlJdayEMdU,4179
@@ -51,7 +51,7 @@ apify/storage_clients/_smart_apify/__init__.py,sha256=614B2AaWY-dx6RQ6mod7VVR8gF
 apify/storage_clients/_smart_apify/_storage_client.py,sha256=GCPmVe_xWAFcO2Cuej4su4i97_d33Q9Ih_Sc5xW2Wa4,4674
 apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
 apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-apify-3.0.2b7.dist-info/METADATA,sha256=nwfRWA3Q2QIjiKSLd-jAX3Y8j9ksLx3XA3NBeZ4MZnY,22582
-apify-3.0.2b7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-apify-3.0.2b7.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
-apify-3.0.2b7.dist-info/RECORD,,
+apify-3.0.3b1.dist-info/METADATA,sha256=g7A2Bi0KsFppsDByOMj3p5BtrriXTvFUnHlKg0BYdAM,22582
+apify-3.0.3b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+apify-3.0.3b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+apify-3.0.3b1.dist-info/RECORD,,