apify 3.0.1b1__py3-none-any.whl → 3.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

apify/_actor.py CHANGED
@@ -32,7 +32,7 @@ from apify._consts import EVENT_LISTENERS_TIMEOUT
32
32
  from apify._crypto import decrypt_input_secrets, load_private_key
33
33
  from apify._models import ActorRun
34
34
  from apify._proxy_configuration import ProxyConfiguration
35
- from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython, maybe_extract_enum_member_value
35
+ from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython
36
36
  from apify.events import ApifyEventManager, EventManager, LocalEventManager
37
37
  from apify.log import _configure_logging, logger
38
38
  from apify.storage_clients import ApifyStorageClient
@@ -48,10 +48,10 @@ if TYPE_CHECKING:
48
48
  from typing_extensions import Self
49
49
 
50
50
  from crawlee.proxy_configuration import _NewUrlFunction
51
+ from crawlee.storage_clients import StorageClient
51
52
 
52
53
  from apify._models import Webhook
53
54
 
54
-
55
55
  MainReturnType = TypeVar('MainReturnType')
56
56
 
57
57
 
@@ -98,7 +98,10 @@ class _ActorType:
98
98
  """
99
99
 
100
100
  _is_rebooting = False
101
+ """Whether the Actor is currently rebooting."""
102
+
101
103
  _is_any_instance_initialized = False
104
+ """Whether any Actor instance was initialized."""
102
105
 
103
106
  def __init__(
104
107
  self,
@@ -106,63 +109,149 @@ class _ActorType:
106
109
  *,
107
110
  configure_logging: bool = True,
108
111
  exit_process: bool | None = None,
112
+ exit_code: int = 0,
113
+ status_message: str | None = None,
114
+ event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT,
115
+ cleanup_timeout: timedelta = timedelta(seconds=30),
109
116
  ) -> None:
110
- """Create an Actor instance.
111
-
112
- Note that you don't have to do this, all the functionality is accessible using the default instance
113
- (e.g. `Actor.open_dataset()`).
117
+ """Initialize a new instance.
114
118
 
115
119
  Args:
116
- configuration: The Actor configuration to be used. If not passed, a new Configuration instance will
117
- be created.
118
- configure_logging: Should the default logging configuration be configured?
119
- exit_process: Whether the Actor should call `sys.exit` when the context manager exits. The default is
120
- True except for the IPython, Pytest and Scrapy environments.
120
+ configuration: The Actor configuration to use. If not provided, a default configuration is created.
121
+ configure_logging: Whether to set up the default logging configuration.
122
+ exit_process: Whether the Actor should call `sys.exit` when the context manager exits.
123
+ Defaults to True, except in IPython, Pytest, and Scrapy environments.
124
+ exit_code: The exit code the Actor should use when exiting.
125
+ status_message: Final status message to display upon Actor termination.
126
+ event_listeners_timeout: Maximum time to wait for Actor event listeners to complete before exiting.
127
+ cleanup_timeout: Maximum time to wait for cleanup tasks to finish.
121
128
  """
129
+ self._configuration = configuration
130
+ self._configure_logging = configure_logging
122
131
  self._exit_process = self._get_default_exit_process() if exit_process is None else exit_process
123
- self._is_exiting = False
132
+ self._exit_code = exit_code
133
+ self._status_message = status_message
134
+ self._event_listeners_timeout = event_listeners_timeout
135
+ self._cleanup_timeout = cleanup_timeout
124
136
 
125
137
  # Actor state when this method is being executed is unpredictable.
126
138
  # Actor can be initialized by lazy object proxy or by user directly, or by both.
127
139
  # Until `init` method is run, this state of uncertainty remains. This is the reason why any setting done here in
128
140
  # `__init__` method should not be considered final.
129
141
 
130
- self._configuration = configuration
131
- self._configure_logging = configure_logging
132
142
  self._apify_client: ApifyClientAsync | None = None
143
+ self._local_storage_client: StorageClient | None = None
133
144
 
145
+ self._is_exiting = False
134
146
  self._is_initialized = False
135
147
 
136
148
  async def __aenter__(self) -> Self:
137
- """Initialize the Actor.
149
+ """Enter the Actor context.
150
+
151
+ Initializes the Actor when used in an `async with` block. This method:
138
152
 
139
- Automatically initializes the Actor instance when you use it in an `async with ...` statement.
153
+ - Sets up local or cloud storage clients depending on whether the Actor runs locally or on the Apify platform.
154
+ - Configures the event manager and starts periodic state persistence.
155
+ - Initializes the charging manager for handling charging events.
156
+ - Configures logging after all core services are registered.
140
157
 
141
- When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
142
- executing the block code, the `Actor.fail` method is called.
158
+ This method must be called exactly once per Actor instance. Re-initializing an Actor or having multiple
159
+ active Actor instances is not standard usage and may lead to warnings or unexpected behavior.
143
160
  """
144
- await self.init()
161
+ if self._is_initialized:
162
+ raise RuntimeError('The Actor was already initialized!')
163
+
164
+ # Initialize configuration first - it's required for the next steps.
165
+ if self._configuration:
166
+ # User provided explicit configuration - register it in the service locator.
167
+ service_locator.set_configuration(self.configuration)
168
+ else:
169
+ # No explicit configuration provided - trigger creation of default configuration.
170
+ _ = self.configuration
171
+
172
+ # Configure logging based on the configuration, any logs before this point are lost.
173
+ if self._configure_logging:
174
+ _configure_logging()
175
+ self.log.debug('Logging configured')
176
+
177
+ self.log.info('Initializing Actor', extra=get_system_info())
178
+ self.log.debug('Configuration initialized')
179
+
180
+ # Warn about non-standard usage patterns.
181
+ if _ActorType._is_any_instance_initialized:
182
+ self.log.warning('Repeated Actor initialization detected - this is non-standard usage, proceed with care.')
183
+
184
+ # Update the global Actor proxy to refer to this instance.
185
+ cast('Proxy', Actor).__wrapped__ = self
186
+ self._is_exiting = False
187
+ self._was_final_persist_state_emitted = False
188
+
189
+ # Initialize the storage client and register it in the service locator.
190
+ _ = self._storage_client
191
+ self.log.debug('Storage client initialized')
192
+
193
+ # Initialize the event manager and register it in the service locator.
194
+ await self.event_manager.__aenter__()
195
+ self.log.debug('Event manager initialized')
196
+
197
+ # Initialize the charging manager.
198
+ await self._charging_manager_implementation.__aenter__()
199
+ self.log.debug('Charging manager initialized')
200
+
201
+ # Mark initialization as complete and update global state.
202
+ self._is_initialized = True
203
+ _ActorType._is_any_instance_initialized = True
145
204
  return self
146
205
 
147
206
  async def __aexit__(
148
207
  self,
149
- _exc_type: type[BaseException] | None,
208
+ exc_type: type[BaseException] | None,
150
209
  exc_value: BaseException | None,
151
- _exc_traceback: TracebackType | None,
210
+ exc_traceback: TracebackType | None,
152
211
  ) -> None:
153
- """Exit the Actor, handling any exceptions properly.
212
+ """Exit the Actor context.
213
+
214
+ If the block exits with an exception, the Actor fails with a non-zero exit code.
215
+ Otherwise, it exits cleanly. In both cases the Actor:
154
216
 
155
- When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
156
- executing the block code, the `Actor.fail` method is called.
217
+ - Cancels periodic `PERSIST_STATE` events.
218
+ - Sends a final `PERSIST_STATE` event.
219
+ - Waits for all event listeners to finish.
220
+ - Stops the event manager and the charging manager.
221
+ - Optionally terminates the process with the selected exit code.
157
222
  """
158
- if not self._is_exiting:
159
- if exc_value:
160
- await self.fail(
161
- exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW.value,
162
- exception=exc_value,
163
- )
164
- else:
165
- await self.exit()
223
+ if self._is_exiting:
224
+ return
225
+
226
+ self._raise_if_not_initialized()
227
+
228
+ if exc_value and not is_running_in_ipython():
229
+ # In IPython, we don't run `sys.exit()` during Actor exits,
230
+ # so the exception traceback will be printed on its own
231
+ self.log.exception('Actor failed with an exception', exc_info=exc_value)
232
+ self.exit_code = ActorExitCodes.ERROR_USER_FUNCTION_THREW.value
233
+
234
+ self._is_exiting = True
235
+ self.log.info('Exiting Actor', extra={'exit_code': self.exit_code})
236
+
237
+ async def finalize() -> None:
238
+ if self.status_message is not None:
239
+ await self.set_status_message(self.status_message, is_terminal=True)
240
+
241
+ # Sleep for a bit so that the listeners have a chance to trigger
242
+ await asyncio.sleep(0.1)
243
+
244
+ if self._event_listeners_timeout:
245
+ await self.event_manager.wait_for_all_listeners_to_complete(timeout=self._event_listeners_timeout)
246
+
247
+ await self.event_manager.__aexit__(None, None, None)
248
+ await self._charging_manager_implementation.__aexit__(None, None, None)
249
+
250
+ await asyncio.wait_for(finalize(), self._cleanup_timeout.total_seconds())
251
+ self._is_initialized = False
252
+
253
+ if self._exit_process:
254
+ sys.exit(self.exit_code)
166
255
 
167
256
  def __repr__(self) -> str:
168
257
  if self is cast('Proxy', Actor).__wrapped__:
@@ -176,24 +265,58 @@ class _ActorType:
176
265
  *,
177
266
  configure_logging: bool = True,
178
267
  exit_process: bool | None = None,
268
+ exit_code: int = 0,
269
+ event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT,
270
+ status_message: str | None = None,
271
+ cleanup_timeout: timedelta = timedelta(seconds=30),
179
272
  ) -> Self:
180
- """Make a new Actor instance with a non-default configuration."""
273
+ """Make a new Actor instance with a non-default configuration.
274
+
275
+ This is necessary due to the lazy object proxying of the global `Actor` instance.
276
+ """
181
277
  return self.__class__(
182
278
  configuration=configuration,
183
279
  configure_logging=configure_logging,
184
280
  exit_process=exit_process,
281
+ exit_code=exit_code,
282
+ event_listeners_timeout=event_listeners_timeout,
283
+ status_message=status_message,
284
+ cleanup_timeout=cleanup_timeout,
185
285
  )
186
286
 
287
+ @property
288
+ def log(self) -> logging.Logger:
289
+ """Logger configured for this Actor."""
290
+ return logger
291
+
292
+ @property
293
+ def exit_code(self) -> int:
294
+ """The exit code the Actor will use when exiting."""
295
+ return self._exit_code
296
+
297
+ @exit_code.setter
298
+ def exit_code(self, value: int) -> None:
299
+ self._exit_code = value
300
+
301
+ @property
302
+ def status_message(self) -> str | None:
303
+ """The final status message that the Actor will display upon termination."""
304
+ return self._status_message
305
+
306
+ @status_message.setter
307
+ def status_message(self, value: str | None) -> None:
308
+ self._status_message = value
309
+
187
310
  @property
188
311
  def apify_client(self) -> ApifyClientAsync:
189
- """The ApifyClientAsync instance the Actor instance uses."""
312
+ """Asynchronous Apify client for interacting with the Apify API."""
190
313
  if not self._apify_client:
191
314
  self._apify_client = self.new_client()
192
315
  return self._apify_client
193
316
 
194
317
  @cached_property
195
318
  def configuration(self) -> Configuration:
196
- """The Configuration instance the Actor instance uses."""
319
+ """Actor configuration, uses the default instance if not explicitly set."""
197
320
  if self._configuration:
198
321
  return self._configuration
199
322
 
@@ -214,8 +337,11 @@ class _ActorType:
214
337
 
215
338
  @cached_property
216
339
  def event_manager(self) -> EventManager:
217
- """The EventManager instance the Actor instance uses."""
218
- return (
340
+ """Manages Apify platform events.
341
+
342
+ It uses `ApifyEventManager` on the Apify platform and `LocalEventManager` otherwise.
343
+ """
344
+ event_manager = (
219
345
  ApifyEventManager(
220
346
  configuration=self.configuration,
221
347
  persist_state_interval=self.configuration.persist_state_interval,
@@ -226,19 +352,16 @@ class _ActorType:
226
352
  persist_state_interval=self.configuration.persist_state_interval,
227
353
  )
228
354
  )
355
+ service_locator.set_event_manager(event_manager)
356
+ return event_manager
229
357
 
230
- @property
231
- def log(self) -> logging.Logger:
232
- """The logging.Logger instance the Actor uses."""
233
- return logger
234
-
235
- def _raise_if_not_initialized(self) -> None:
236
- if not self._is_initialized:
237
- raise RuntimeError('The Actor was not initialized!')
358
+ @cached_property
359
+ def _charging_manager_implementation(self) -> ChargingManagerImplementation:
360
+ return ChargingManagerImplementation(self.configuration, self.apify_client)
238
361
 
239
362
  @cached_property
240
363
  def _storage_client(self) -> SmartApifyStorageClient:
241
- """Storage client used by the actor.
364
+ """Storage client used by the Actor.
242
365
 
243
366
  Depending on the initialization of the service locator the client can be created in different ways.
244
367
  """
@@ -250,7 +373,7 @@ class _ActorType:
250
373
  service_locator.set_storage_client(implicit_storage_client)
251
374
  except ServiceConflictError:
252
375
  self.log.debug(
253
- 'Storage client in service locator was set explicitly before Actor.init was called.'
376
+ 'Storage client in service locator was set explicitly before Actor.init was called. '
254
377
  'Using the existing storage client as implicit storage client for the Actor.'
255
378
  )
256
379
  else:
@@ -270,100 +393,35 @@ class _ActorType:
270
393
  )
271
394
 
272
395
  async def init(self) -> None:
273
- """Initialize the Actor instance.
274
-
275
- This initializes the Actor instance. It configures the right storage client based on whether the Actor is
276
- running locally or on the Apify platform, it initializes the event manager for processing Actor events,
277
- and starts an interval for regularly sending `PERSIST_STATE` events, so that the Actor can regularly persist
278
- its state in response to these events.
396
+ """Initialize the Actor without using context-manager syntax.
279
397
 
280
- This method should be called immediately before performing any additional Actor actions, and it should be
281
- called only once.
398
+ Equivalent to `await Actor.__aenter__()`.
282
399
  """
283
- self.log.info('Initializing Actor...')
284
- if self._configuration:
285
- # Set explicitly the configuration in the service locator
286
- service_locator.set_configuration(self.configuration)
287
- else:
288
- # Ensure that the configuration (cached property) is set
289
- _ = self.configuration
290
-
291
- if self._is_initialized:
292
- raise RuntimeError('The Actor was already initialized!')
293
-
294
- if _ActorType._is_any_instance_initialized:
295
- self.log.warning('Repeated Actor initialization detected - this is non-standard usage, proceed with care')
296
-
297
- # Make sure that the currently initialized instance is also available through the global `Actor` proxy
298
- cast('Proxy', Actor).__wrapped__ = self
299
-
300
- self._is_exiting = False
301
- self._was_final_persist_state_emitted = False
302
-
303
- self.log.debug(f'Storage client set to {self._storage_client}')
304
-
305
- service_locator.set_event_manager(self.event_manager)
306
-
307
- # The logging configuration has to be called after all service_locator set methods.
308
- if self._configure_logging:
309
- _configure_logging()
310
-
311
- self.log.info('System info', extra=get_system_info())
312
-
313
- await self.event_manager.__aenter__()
314
- self.log.debug('Event manager initialized')
315
-
316
- await self._charging_manager_implementation.__aenter__()
317
- self.log.debug('Charging manager initialized')
318
-
319
- self._is_initialized = True
320
- _ActorType._is_any_instance_initialized = True
400
+ await self.__aenter__()
321
401
 
322
402
  async def exit(
323
403
  self,
324
404
  *,
325
405
  exit_code: int = 0,
326
- event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT,
327
406
  status_message: str | None = None,
407
+ event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT,
328
408
  cleanup_timeout: timedelta = timedelta(seconds=30),
329
409
  ) -> None:
330
- """Exit the Actor instance.
410
+ """Exit the Actor without using context-manager syntax.
331
411
 
332
- This stops the Actor instance. It cancels all the intervals for regularly sending `PERSIST_STATE` events,
333
- sends a final `PERSIST_STATE` event, waits for all the event listeners to finish, and stops the event manager.
412
+ Equivalent to `await Actor.__aexit__()`.
334
413
 
335
414
  Args:
336
- exit_code: The exit code with which the Actor should fail (defaults to `0`).
337
- event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish before exiting.
338
- status_message: The final status message that the Actor should display.
339
- cleanup_timeout: How long we should wait for event listeners.
415
+ exit_code: The exit code the Actor should use when exiting.
416
+ status_message: Final status message to display upon Actor termination.
417
+ event_listeners_timeout: Maximum time to wait for Actor event listeners to complete before exiting.
418
+ cleanup_timeout: Maximum time to wait for cleanup tasks to finish.
340
419
  """
341
- self._raise_if_not_initialized()
342
-
343
- self._is_exiting = True
344
-
345
- exit_code = maybe_extract_enum_member_value(exit_code)
346
-
347
- self.log.info('Exiting Actor', extra={'exit_code': exit_code})
348
-
349
- async def finalize() -> None:
350
- if status_message is not None:
351
- await self.set_status_message(status_message, is_terminal=True)
352
-
353
- # Sleep for a bit so that the listeners have a chance to trigger
354
- await asyncio.sleep(0.1)
355
-
356
- if event_listeners_timeout:
357
- await self.event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)
358
-
359
- await self.event_manager.__aexit__(None, None, None)
360
- await self._charging_manager_implementation.__aexit__(None, None, None)
361
-
362
- await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
363
- self._is_initialized = False
364
-
365
- if self._exit_process:
366
- sys.exit(exit_code)
420
+ self.exit_code = exit_code
421
+ self.status_message = status_message
422
+ self._event_listeners_timeout = event_listeners_timeout
423
+ self._cleanup_timeout = cleanup_timeout
424
+ await self.__aexit__(None, None, None)
367
425
 
368
426
  async def fail(
369
427
  self,
@@ -372,23 +430,24 @@ class _ActorType:
372
430
  exception: BaseException | None = None,
373
431
  status_message: str | None = None,
374
432
  ) -> None:
375
- """Fail the Actor instance.
433
+ """Fail the Actor instance without using context-manager syntax.
376
434
 
377
- This performs all the same steps as Actor.exit(), but it additionally sets the exit code to `1` (by default).
435
+ Equivalent to setting the `self.exit_code` and `self.status_message` properties and using
436
+ `await Actor.__aexit__()`.
378
437
 
379
438
  Args:
380
439
  exit_code: The exit code with which the Actor should fail (defaults to `1`).
381
440
  exception: The exception with which the Actor failed.
382
441
  status_message: The final status message that the Actor should display.
383
442
  """
384
- self._raise_if_not_initialized()
385
-
386
- # In IPython, we don't run `sys.exit()` during Actor exits,
387
- # so the exception traceback will be printed on its own
388
- if exception and not is_running_in_ipython():
389
- self.log.exception('Actor failed with an exception', exc_info=exception)
443
+ self.exit_code = exit_code
444
+ self.status_message = status_message
390
445
 
391
- await self.exit(exit_code=exit_code, status_message=status_message)
446
+ await self.__aexit__(
447
+ exc_type=type(exception) if exception else None,
448
+ exc_value=exception,
449
+ exc_traceback=exception.__traceback__ if exception else None,
450
+ )
392
451
 
393
452
  def new_client(
394
453
  self,
@@ -626,10 +685,6 @@ class _ActorType:
626
685
  self._raise_if_not_initialized()
627
686
  return self._charging_manager_implementation
628
687
 
629
- @cached_property
630
- def _charging_manager_implementation(self) -> ChargingManagerImplementation:
631
- return ChargingManagerImplementation(self.configuration, self.apify_client)
632
-
633
688
  async def charge(self, event_name: str, count: int = 1) -> ChargeResult:
634
689
  """Charge for a specified number of events - sub-operations of the Actor.
635
690
 
@@ -822,18 +877,6 @@ class _ActorType:
822
877
 
823
878
  return ActorRun.model_validate(api_result)
824
879
 
825
- def _get_remaining_time(self) -> timedelta | None:
826
- """Get time remaining from the Actor timeout. Returns `None` if not on an Apify platform."""
827
- if self.is_at_home() and self.configuration.timeout_at:
828
- return self.configuration.timeout_at - datetime.now(tz=timezone.utc)
829
-
830
- self.log.warning(
831
- 'Returning `None` instead of remaining time. Using `RemainingTime` argument is only possible when the Actor'
832
- ' is running on the Apify platform and when the timeout for the Actor run is set. '
833
- f'{self.is_at_home()=}, {self.configuration.timeout_at=}'
834
- )
835
- return None
836
-
837
880
  async def abort(
838
881
  self,
839
882
  run_id: str,
@@ -1242,6 +1285,10 @@ class _ActorType:
1242
1285
 
1243
1286
  return proxy_configuration
1244
1287
 
1288
+ def _raise_if_not_initialized(self) -> None:
1289
+ if not self._is_initialized:
1290
+ raise RuntimeError('The Actor was not initialized!')
1291
+
1245
1292
  def _get_default_exit_process(self) -> bool:
1246
1293
  """Return False for IPython, Pytest, and Scrapy environments, True otherwise."""
1247
1294
  if is_running_in_ipython():
@@ -1262,6 +1309,18 @@ class _ActorType:
1262
1309
 
1263
1310
  return True
1264
1311
 
1312
+ def _get_remaining_time(self) -> timedelta | None:
1313
+ """Get time remaining from the Actor timeout. Returns `None` if not on an Apify platform."""
1314
+ if self.is_at_home() and self.configuration.timeout_at:
1315
+ return self.configuration.timeout_at - datetime.now(tz=timezone.utc)
1316
+
1317
+ self.log.warning(
1318
+ 'Returning `None` instead of remaining time. Using `RemainingTime` argument is only possible when the Actor'
1319
+ ' is running on the Apify platform and when the timeout for the Actor run is set. '
1320
+ f'{self.is_at_home()=}, {self.configuration.timeout_at=}'
1321
+ )
1322
+ return None
1323
+
1265
1324
 
1266
1325
  Actor = cast('_ActorType', Proxy(_ActorType))
1267
1326
  """The entry point of the SDK, through which all the Actor operations should be done."""
apify/_configuration.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from datetime import datetime, timedelta
4
4
  from decimal import Decimal
5
5
  from logging import getLogger
6
+ from pathlib import Path
6
7
  from typing import Annotated, Any
7
8
 
8
9
  from pydantic import AliasChoices, BeforeValidator, Field, model_validator
@@ -421,6 +422,14 @@ class Configuration(CrawleeConfiguration):
421
422
  logger.warning('Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.')
422
423
  return self
423
424
 
425
+ @property
426
+ def canonical_input_key(self) -> str:
427
+ return str(Path(self.input_key).with_suffix('.json'))
428
+
429
+ @property
430
+ def input_key_candidates(self) -> set[str]:
431
+ return {self.input_key, self.canonical_input_key, Path(self.canonical_input_key).stem}
432
+
424
433
  @classmethod
425
434
  def get_global_configuration(cls) -> Configuration:
426
435
  """Retrieve the global instance of the configuration.
@@ -49,7 +49,7 @@ class ApifyHttpProxyMiddleware:
49
49
  if proxy_settings is None:
50
50
  Actor.log.info(
51
51
  'ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing '
52
- ' in the Actor input.'
52
+ 'in the Actor input.'
53
53
  )
54
54
  raise NotConfigured
55
55
 
@@ -12,8 +12,8 @@ from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest
12
12
  from crawlee.storages import RequestQueue
13
13
 
14
14
  from ._models import ApifyRequestQueueMetadata, RequestQueueStats
15
- from ._request_queue_shared_client import _ApifyRequestQueueSharedClient
16
- from ._request_queue_single_client import _ApifyRequestQueueSingleClient
15
+ from ._request_queue_shared_client import ApifyRequestQueueSharedClient
16
+ from ._request_queue_single_client import ApifyRequestQueueSingleClient
17
17
  from ._utils import AliasResolver
18
18
 
19
19
  if TYPE_CHECKING:
@@ -47,14 +47,14 @@ class ApifyRequestQueueClient(RequestQueueClient):
47
47
  self._api_client = api_client
48
48
  """The Apify request queue client for API operations."""
49
49
 
50
- self._implementation: _ApifyRequestQueueSingleClient | _ApifyRequestQueueSharedClient
50
+ self._implementation: ApifyRequestQueueSingleClient | ApifyRequestQueueSharedClient
51
51
  """Internal implementation used to communicate with the Apify platform based Request Queue."""
52
52
  if access == 'single':
53
- self._implementation = _ApifyRequestQueueSingleClient(
53
+ self._implementation = ApifyRequestQueueSingleClient(
54
54
  api_client=self._api_client, metadata=metadata, cache_size=self._MAX_CACHED_REQUESTS
55
55
  )
56
56
  elif access == 'shared':
57
- self._implementation = _ApifyRequestQueueSharedClient(
57
+ self._implementation = ApifyRequestQueueSharedClient(
58
58
  api_client=self._api_client,
59
59
  metadata=metadata,
60
60
  cache_size=self._MAX_CACHED_REQUESTS,
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
23
23
  logger = getLogger(__name__)
24
24
 
25
25
 
26
- class _ApifyRequestQueueSharedClient:
26
+ class ApifyRequestQueueSharedClient:
27
27
  """An Apify platform implementation of the request queue client.
28
28
 
29
29
  This implementation supports multiple producers and multiple consumers scenario.
@@ -106,23 +106,19 @@ class _ApifyRequestQueueSharedClient:
106
106
  # It could have been handled by another client in the meantime, so cached information about
107
107
  # `request.was_already_handled` is not reliable.
108
108
  already_present_requests.append(
109
- ProcessedRequest.model_validate(
110
- {
111
- 'uniqueKey': request.unique_key,
112
- 'wasAlreadyPresent': True,
113
- 'wasAlreadyHandled': request.was_already_handled,
114
- }
109
+ ProcessedRequest(
110
+ unique_key=request.unique_key,
111
+ was_already_present=True,
112
+ was_already_handled=request.was_already_handled,
115
113
  )
116
114
  )
117
115
 
118
116
  else:
119
117
  # Add new request to the cache.
120
- processed_request = ProcessedRequest.model_validate(
121
- {
122
- 'uniqueKey': request.unique_key,
123
- 'wasAlreadyPresent': True,
124
- 'wasAlreadyHandled': request.was_already_handled,
125
- }
118
+ processed_request = ProcessedRequest(
119
+ unique_key=request.unique_key,
120
+ was_already_present=True,
121
+ was_already_handled=request.was_already_handled,
126
122
  )
127
123
  self._cache_request(
128
124
  request.unique_key,
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
21
21
  logger = getLogger(__name__)
22
22
 
23
23
 
24
- class _ApifyRequestQueueSingleClient:
24
+ class ApifyRequestQueueSingleClient:
25
25
  """An Apify platform implementation of the request queue client with limited capability.
26
26
 
27
27
  This client is designed to use as little resources as possible, but has to be used in constrained context.
@@ -108,23 +108,19 @@ class _ApifyRequestQueueSingleClient:
108
108
  # Check if request is known to be already handled (it has to be present as well.)
109
109
  if request.unique_key in self._requests_already_handled:
110
110
  already_present_requests.append(
111
- ProcessedRequest.model_validate(
112
- {
113
- 'uniqueKey': request.unique_key,
114
- 'wasAlreadyPresent': True,
115
- 'wasAlreadyHandled': True,
116
- }
111
+ ProcessedRequest(
112
+ unique_key=request.unique_key,
113
+ was_already_present=True,
114
+ was_already_handled=True,
117
115
  )
118
116
  )
119
117
  # Check if request is known to be already present, but unhandled
120
118
  elif self._requests_cache.get(request.unique_key):
121
119
  already_present_requests.append(
122
- ProcessedRequest.model_validate(
123
- {
124
- 'uniqueKey': request.unique_key,
125
- 'wasAlreadyPresent': True,
126
- 'wasAlreadyHandled': request.was_already_handled,
127
- }
120
+ ProcessedRequest(
121
+ unique_key=request.unique_key,
122
+ was_already_present=True,
123
+ was_already_handled=request.was_already_handled,
128
124
  )
129
125
  )
130
126
  else:
@@ -158,8 +154,9 @@ class _ApifyRequestQueueSingleClient:
158
154
  self._requests_cache.pop(unprocessed_request.unique_key, None)
159
155
 
160
156
  else:
161
- api_response = AddRequestsResponse.model_validate(
162
- {'unprocessedRequests': [], 'processedRequests': already_present_requests}
157
+ api_response = AddRequestsResponse(
158
+ unprocessed_requests=[],
159
+ processed_requests=already_present_requests,
163
160
  )
164
161
 
165
162
  # Update assumed total count for newly added requests.
@@ -236,20 +233,39 @@ class _ApifyRequestQueueSingleClient:
236
233
 
237
234
  # Update the cached data
238
235
  for request_data in response.get('items', []):
236
+ # Due to https://github.com/apify/apify-core/blob/v0.1377.0/src/api/src/lib/request_queues/request_queue.ts#L53,
237
+ # the list_head endpoint may return truncated fields for long requests (e.g., long URLs or unique keys).
238
+ # If truncation is detected, fetch the full request data by its ID from the API.
239
+ # This is a temporary workaround - the caching will be refactored to use request IDs instead of unique keys.
240
+ # See https://github.com/apify/apify-sdk-python/issues/630 for details.
241
+ if '[truncated]' in request_data['uniqueKey'] or '[truncated]' in request_data['url']:
242
+ request_data = await self._api_client.get_request(request_id=request_data['id']) # noqa: PLW2901
243
+
239
244
  request = Request.model_validate(request_data)
240
245
 
241
246
  if request.unique_key in self._requests_in_progress:
242
247
  # Ignore requests that are already in progress, we will not process them again.
243
248
  continue
249
+
244
250
  if request.was_already_handled:
245
251
  # Do not cache fully handled requests, we do not need them. Just cache their unique_key.
246
252
  self._requests_already_handled.add(request.unique_key)
247
253
  else:
248
254
  # Only fetch the request if we do not know it yet.
249
255
  if request.unique_key not in self._requests_cache:
250
- request = Request.model_validate(
251
- await self._api_client.get_request(unique_key_to_request_id(request.unique_key))
252
- )
256
+ request_id = unique_key_to_request_id(request.unique_key)
257
+
258
+ if request_data is not None and request_id != request_data['id']:
259
+ logger.warning(
260
+ f'Request ID mismatch: {request_id} != {request_data["id"]}, '
261
+ 'this may cause unexpected behavior.'
262
+ )
263
+
264
+ # See https://github.com/apify/apify-sdk-python/issues/630 for details.
265
+ if '[truncated]' not in request.unique_key:
266
+ request_data = await self._api_client.get_request(request_id=request_id) # noqa: PLW2901
267
+ request = Request.model_validate(request_data)
268
+
253
269
  self._requests_cache[request.unique_key] = request
254
270
 
255
271
  # Add new requests to the end of the head, unless already present in head
@@ -1,14 +1,19 @@
1
1
  import asyncio
2
2
  import json
3
- from pathlib import Path
3
+ import logging
4
4
 
5
- from typing_extensions import override
5
+ from more_itertools import flatten
6
+ from typing_extensions import Self, override
6
7
 
7
8
  from crawlee._consts import METADATA_FILENAME
9
+ from crawlee.configuration import Configuration as CrawleeConfiguration
8
10
  from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient
11
+ from crawlee.storage_clients.models import KeyValueStoreRecord
9
12
 
10
13
  from apify._configuration import Configuration
11
14
 
15
+ logger = logging.getLogger(__name__)
16
+
12
17
 
13
18
  class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
14
19
  """Apify-specific implementation of the `FileSystemKeyValueStoreClient`.
@@ -17,6 +22,22 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
17
22
  directory, except for the metadata file and the `INPUT.json` file.
18
23
  """
19
24
 
25
+ @override
26
+ @classmethod
27
+ async def open(
28
+ cls,
29
+ *,
30
+ id: str | None,
31
+ name: str | None,
32
+ alias: str | None,
33
+ configuration: CrawleeConfiguration,
34
+ ) -> Self:
35
+ client = await super().open(id=id, name=name, alias=alias, configuration=configuration)
36
+
37
+ await client._sanitize_input_json_files() # noqa: SLF001 - it's okay, this is a factory method
38
+
39
+ return client
40
+
20
41
  @override
21
42
  async def purge(self) -> None:
22
43
  """Purges the key-value store by deleting all its contents.
@@ -24,16 +45,16 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
24
45
  It deletes all files in the key-value store directory, except for the metadata file and
25
46
  the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged.
26
47
  """
27
- kvs_input_key = Configuration.get_global_configuration().input_key
28
-
29
- # First try to find the alternative format of the input file and process it if it exists.
30
- for file_path in self.path_to_kvs.glob('*'):
31
- if file_path.name == f'{kvs_input_key}.json':
32
- await self._process_input_json(file_path)
48
+ configuration = Configuration.get_global_configuration()
33
49
 
34
50
  async with self._lock:
51
+ files_to_keep = set(
52
+ flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates)
53
+ )
54
+ files_to_keep.add(METADATA_FILENAME)
55
+
35
56
  for file_path in self.path_to_kvs.glob('*'):
36
- if file_path.name in {METADATA_FILENAME, kvs_input_key, f'{kvs_input_key}.{METADATA_FILENAME}'}:
57
+ if file_path.name in files_to_keep:
37
58
  continue
38
59
  if file_path.is_file():
39
60
  await asyncio.to_thread(file_path.unlink, missing_ok=True)
@@ -43,15 +64,40 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
43
64
  update_modified_at=True,
44
65
  )
45
66
 
46
- async def _process_input_json(self, path: Path) -> None:
47
- """Process simple input json file to format expected by the FileSystemKeyValueStoreClient.
67
+ async def _sanitize_input_json_files(self) -> None:
68
+ """Handle missing metadata for input files."""
69
+ configuration = Configuration.get_global_configuration()
70
+ alternative_keys = configuration.input_key_candidates - {configuration.canonical_input_key}
48
71
 
49
- For example: INPUT.json -> INPUT, INPUT.json.metadata
50
- """
51
- try:
52
- f = await asyncio.to_thread(path.open)
53
- input_data = json.load(f)
54
- finally:
55
- f.close()
56
- await asyncio.to_thread(path.unlink, missing_ok=True)
57
- await self.set_value(key=path.stem, value=input_data)
72
+ if (self.path_to_kvs / configuration.canonical_input_key).exists():
73
+ # Refresh metadata to prevent inconsistencies
74
+ input_data = await asyncio.to_thread(
75
+ lambda: json.loads((self.path_to_kvs / configuration.canonical_input_key).read_text())
76
+ )
77
+ await self.set_value(key=configuration.canonical_input_key, value=input_data)
78
+
79
+ for alternative_key in alternative_keys:
80
+ if (alternative_input_file := self.path_to_kvs / alternative_key).exists():
81
+ logger.warning(f'Redundant input file found: {alternative_input_file}')
82
+ else:
83
+ for alternative_key in alternative_keys:
84
+ alternative_input_file = self.path_to_kvs / alternative_key
85
+
86
+ # Only process files that actually exist
87
+ if alternative_input_file.exists():
88
+ # Refresh metadata to prevent inconsistencies
89
+ with alternative_input_file.open() as f:
90
+ input_data = await asyncio.to_thread(lambda: json.load(f))
91
+ await self.set_value(key=alternative_key, value=input_data)
92
+
93
+ @override
94
+ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
95
+ configuration = Configuration.get_global_configuration()
96
+
97
+ if key in configuration.input_key_candidates:
98
+ for candidate in configuration.input_key_candidates:
99
+ value = await super().get_value(key=candidate)
100
+ if value is not None:
101
+ return value
102
+
103
+ return await super().get_value(key=key)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apify
3
- Version: 3.0.1b1
3
+ Version: 3.0.2
4
4
  Summary: Apify SDK for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -228,7 +228,7 @@ Requires-Python: >=3.10
228
228
  Requires-Dist: apify-client<3.0.0,>=2.0.0
229
229
  Requires-Dist: apify-shared<3.0.0,>=2.0.0
230
230
  Requires-Dist: cachetools>=5.5.0
231
- Requires-Dist: crawlee<2.0.0,>=1.0.0
231
+ Requires-Dist: crawlee<2.0.0,>=1.0.2
232
232
  Requires-Dist: cryptography>=42.0.0
233
233
  Requires-Dist: impit>=0.6.1
234
234
  Requires-Dist: lazy-object-proxy>=1.11.0
@@ -1,7 +1,7 @@
1
1
  apify/__init__.py,sha256=HpgKg2FZWJuSPfDygzJ62psylhw4NN4tKFnoYUIhcd4,838
2
- apify/_actor.py,sha256=DYHoyBAu6hDLs0BcTZL-IQveLK8gPTWvb6AgDnJc3EA,54755
2
+ apify/_actor.py,sha256=kfrwD8gaeN4NcdNMD_Pj66agNh78jJjwMuNOuwLdo-E,57370
3
3
  apify/_charging.py,sha256=KjZ2DnEMS0Tt8ibizmmt0RwBq8FOAsD1z-hKFgdazcY,13143
4
- apify/_configuration.py,sha256=gq_UfWTgcP1_0kEMLhXVg33SgSxXjShbuzoXyCFfK0w,14682
4
+ apify/_configuration.py,sha256=7ZHhgRp98kr35zx4k4EB2aImq7Dq1FJjPg7r5bucv_M,14984
5
5
  apify/_consts.py,sha256=CjhyEJ4Mi0lcIrzfqz8dN7nPJWGjCeBrrXQy1PZ6zRI,440
6
6
  apify/_crypto.py,sha256=tqUs13QkemDtGzvU41pIA2HUEawpDlgzqbwKjm4I8kM,6852
7
7
  apify/_models.py,sha256=EzU-inWeJ7T5HNVYEwnYb79W-q4OAPhtrYctfRYzpTE,7848
@@ -27,7 +27,7 @@ apify/scrapy/utils.py,sha256=Ssfa-P9-g9XYP1suDce6dQ8ta7PfijiPoMl2iplE6Ow,2126
27
27
  apify/scrapy/extensions/__init__.py,sha256=cVQ8CCtOsJsRP28YKZWSUsi4FBwxI-yPJRNSXPFSa_o,98
28
28
  apify/scrapy/extensions/_httpcache.py,sha256=XIS9vFCcUtdSfeKAKnxRnI9dX_GMmX2Od8OPnOaDhQ0,8870
29
29
  apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
30
- apify/scrapy/middlewares/apify_proxy.py,sha256=CDAOXS3bcVDZHM3B0GvhXbxEikMIadLF_0P73WL_nI4,5550
30
+ apify/scrapy/middlewares/apify_proxy.py,sha256=EtR0958hGfBZ8MfwOIc_XyfBYPdMSkul-Ew4eYQCalM,5549
31
31
  apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
32
  apify/scrapy/pipelines/__init__.py,sha256=GWPeLN_Zwj8vRBWtXW6DaxdB7mvyQ7Jw5Tz1ccgWlZI,119
33
33
  apify/scrapy/pipelines/actor_dataset_push.py,sha256=XUUyznQTD-E3wYUUFt2WAOnWhbnRrY0WuedlfYfYhDI,846
@@ -38,20 +38,20 @@ apify/storage_clients/_apify/__init__.py,sha256=mtbVDsxqWL3kx30elnh0kAn2kZ4s3BBs
38
38
  apify/storage_clients/_apify/_dataset_client.py,sha256=Bb3UwOaFkyuEY7tuBf8K46R4ZP_b1EaAkDOXOqwSoW8,12498
39
39
  apify/storage_clients/_apify/_key_value_store_client.py,sha256=42dARbLX2oeOW7uYYKkDyQbEriMuh55Mxh0SqvkOEGg,10529
40
40
  apify/storage_clients/_apify/_models.py,sha256=GEaN7Got1zIg42QPH36obHRWRDVNtzOkRuOWYRf9bFU,4572
41
- apify/storage_clients/_apify/_request_queue_client.py,sha256=QXCLdTBeNW8RKWnxQOE71KOpZ_lqvqisa89eeiWwZ38,14200
42
- apify/storage_clients/_apify/_request_queue_shared_client.py,sha256=CbvwcXRvfuBoy3wrQEdLX9_vKELPH_WhHQARP14audM,20709
43
- apify/storage_clients/_apify/_request_queue_single_client.py,sha256=6CRSyWZPbKQJy3i2JBrojVTnhTYIB3gE0CTymYjpkZA,16958
41
+ apify/storage_clients/_apify/_request_queue_client.py,sha256=PUIVmGQxqFTkRxW9FIFWjT0OeDyAGt-ULlW-rdQDTyc,14194
42
+ apify/storage_clients/_apify/_request_queue_shared_client.py,sha256=uxkuIG1rgCArgs6agldC9vmB2bgrIlNnm1I214Gf6WA,20550
43
+ apify/storage_clients/_apify/_request_queue_single_client.py,sha256=EuORHJnFLC1YAT6ZfQj-ayrfSJNpU4_61r_7uDyvwgA,18092
44
44
  apify/storage_clients/_apify/_storage_client.py,sha256=hFl_PuX1UgOydBD6pieZ0u2NWbDmZV-i0qygKdsuHt4,4873
45
45
  apify/storage_clients/_apify/_utils.py,sha256=ywXoSM69amRokUZcshbAvQLIcSZq4L-bpYIGyeFxCGQ,7696
46
46
  apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
47
  apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
48
- apify/storage_clients/_file_system/_key_value_store_client.py,sha256=fnSJ1EIOPCGfcE6e5S3Tux9VbnMVLCJjugkaQoH_9yo,2267
48
+ apify/storage_clients/_file_system/_key_value_store_client.py,sha256=gxM3ap67PnY80Rd7P3onPAf2pksYpU0LoAlJdayEMdU,4179
49
49
  apify/storage_clients/_file_system/_storage_client.py,sha256=rcwpKYlrWzvlSA2xoxftg-EZAi_iGZ3vOCbu0C5lKDE,1396
50
50
  apify/storage_clients/_smart_apify/__init__.py,sha256=614B2AaWY-dx6RQ6mod7VVR8gFh75-_jnq5BeDD7hSc,53
51
51
  apify/storage_clients/_smart_apify/_storage_client.py,sha256=GCPmVe_xWAFcO2Cuej4su4i97_d33Q9Ih_Sc5xW2Wa4,4674
52
52
  apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
53
53
  apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
- apify-3.0.1b1.dist-info/METADATA,sha256=Qy-fnT_4BnuEpoIhk_Aa0vIl6GVQtkqkk8diacKkzA0,22582
55
- apify-3.0.1b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
- apify-3.0.1b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
57
- apify-3.0.1b1.dist-info/RECORD,,
54
+ apify-3.0.2.dist-info/METADATA,sha256=YRyzVDZFQFOevC5s2rezm0kM-0OFXb2HarPNZlRHG9Y,22580
55
+ apify-3.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
+ apify-3.0.2.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
57
+ apify-3.0.2.dist-info/RECORD,,
File without changes