apify 1.7.1b1__py3-none-any.whl → 2.2.0b14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

Files changed (62) hide show
  1. apify/__init__.py +19 -4
  2. apify/_actor.py +1030 -0
  3. apify/_configuration.py +370 -0
  4. apify/_consts.py +10 -0
  5. apify/_crypto.py +31 -27
  6. apify/_models.py +117 -0
  7. apify/_platform_event_manager.py +231 -0
  8. apify/_proxy_configuration.py +320 -0
  9. apify/_utils.py +18 -484
  10. apify/apify_storage_client/__init__.py +3 -0
  11. apify/apify_storage_client/_apify_storage_client.py +68 -0
  12. apify/apify_storage_client/_dataset_client.py +190 -0
  13. apify/apify_storage_client/_dataset_collection_client.py +51 -0
  14. apify/apify_storage_client/_key_value_store_client.py +94 -0
  15. apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
  16. apify/apify_storage_client/_request_queue_client.py +176 -0
  17. apify/apify_storage_client/_request_queue_collection_client.py +51 -0
  18. apify/apify_storage_client/py.typed +0 -0
  19. apify/log.py +22 -105
  20. apify/scrapy/__init__.py +11 -3
  21. apify/scrapy/middlewares/__init__.py +3 -1
  22. apify/scrapy/middlewares/apify_proxy.py +29 -27
  23. apify/scrapy/middlewares/py.typed +0 -0
  24. apify/scrapy/pipelines/__init__.py +3 -1
  25. apify/scrapy/pipelines/actor_dataset_push.py +6 -3
  26. apify/scrapy/pipelines/py.typed +0 -0
  27. apify/scrapy/py.typed +0 -0
  28. apify/scrapy/requests.py +60 -58
  29. apify/scrapy/scheduler.py +28 -19
  30. apify/scrapy/utils.py +10 -32
  31. apify/storages/__init__.py +4 -10
  32. apify/storages/_request_list.py +150 -0
  33. apify/storages/py.typed +0 -0
  34. apify-2.2.0b14.dist-info/METADATA +211 -0
  35. apify-2.2.0b14.dist-info/RECORD +38 -0
  36. {apify-1.7.1b1.dist-info → apify-2.2.0b14.dist-info}/WHEEL +1 -2
  37. apify/_memory_storage/__init__.py +0 -3
  38. apify/_memory_storage/file_storage_utils.py +0 -71
  39. apify/_memory_storage/memory_storage_client.py +0 -219
  40. apify/_memory_storage/resource_clients/__init__.py +0 -19
  41. apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
  42. apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
  43. apify/_memory_storage/resource_clients/dataset.py +0 -452
  44. apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
  45. apify/_memory_storage/resource_clients/key_value_store.py +0 -533
  46. apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
  47. apify/_memory_storage/resource_clients/request_queue.py +0 -466
  48. apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
  49. apify/actor.py +0 -1351
  50. apify/config.py +0 -127
  51. apify/consts.py +0 -67
  52. apify/event_manager.py +0 -236
  53. apify/proxy_configuration.py +0 -365
  54. apify/storages/base_storage.py +0 -181
  55. apify/storages/dataset.py +0 -494
  56. apify/storages/key_value_store.py +0 -257
  57. apify/storages/request_queue.py +0 -602
  58. apify/storages/storage_client_manager.py +0 -72
  59. apify-1.7.1b1.dist-info/METADATA +0 -149
  60. apify-1.7.1b1.dist-info/RECORD +0 -41
  61. apify-1.7.1b1.dist-info/top_level.txt +0 -1
  62. {apify-1.7.1b1.dist-info → apify-2.2.0b14.dist-info}/LICENSE +0 -0
@@ -1,466 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import json
5
- import os
6
- from datetime import datetime, timezone
7
- from decimal import Decimal
8
- from typing import TYPE_CHECKING
9
-
10
- import aioshutil
11
- from apify_shared.utils import filter_out_none_values_recursively, ignore_docs, json_dumps
12
- from sortedcollections import ValueSortedDict
13
-
14
- from apify._crypto import crypto_random_object_id
15
- from apify._memory_storage.file_storage_utils import delete_request, update_metadata, update_request_queue_item
16
- from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient
17
- from apify._utils import force_rename, raise_on_duplicate_storage, raise_on_non_existing_storage, unique_key_to_request_id
18
- from apify.consts import StorageTypes
19
-
20
- if TYPE_CHECKING:
21
- from apify._memory_storage.memory_storage_client import MemoryStorageClient
22
-
23
-
24
@ignore_docs
class RequestQueueClient(BaseResourceClient):
    """Sub-client for manipulating a single request queue.

    Requests are kept in `_requests`, a `ValueSortedDict` keyed by request ID and ordered by each
    request's `orderNo`. A request whose `orderNo` is `None` is considered handled and sorts at
    negative infinity; pending requests carry a `Decimal` order number (negative when the request
    was added to the forefront).
    """

    _id: str
    _resource_directory: str
    _memory_storage_client: MemoryStorageClient
    _name: str | None
    _requests: ValueSortedDict
    _created_at: datetime
    _accessed_at: datetime
    _modified_at: datetime
    _handled_request_count = 0
    _pending_request_count = 0
    # Last timestamp handed out by `_calculate_order_no`; used to keep order numbers unique.
    _last_used_timestamp = Decimal(0)
    _file_operation_lock: asyncio.Lock

    def __init__(
        self: RequestQueueClient,
        *,
        base_storage_directory: str,
        memory_storage_client: MemoryStorageClient,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
    ) -> None:
        """Initialize the RequestQueueClient.

        Args:
            base_storage_directory (str): Directory under which the queue's own directory lives.
            memory_storage_client (MemoryStorageClient): The owning memory storage client.
            id (str, optional): ID of the queue; a random object ID is generated when omitted.
            name (str, optional): Name of the queue; also used as the directory name when given.
        """
        self._id = id or crypto_random_object_id()
        self._resource_directory = os.path.join(base_storage_directory, name or self._id)
        self._memory_storage_client = memory_storage_client
        self._name = name
        # Handled requests have no `orderNo`, so they are sorted at negative infinity.
        self._requests = ValueSortedDict(lambda req: req.get('orderNo') or -float('inf'))
        self._created_at = datetime.now(timezone.utc)
        self._accessed_at = datetime.now(timezone.utc)
        self._modified_at = datetime.now(timezone.utc)
        self._file_operation_lock = asyncio.Lock()

    async def get(self: RequestQueueClient) -> dict | None:
        """Retrieve the request queue.

        Returns:
            dict, optional: The retrieved request queue, or None, if it does not exist
        """
        found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)

        if found:
            async with found._file_operation_lock:
                await found._update_timestamps(has_been_modified=False)
                return found._to_resource_info()

        return None

    async def update(self: RequestQueueClient, *, name: str | None = None) -> dict:
        """Update the request queue with specified fields.

        Args:
            name (str, optional): The new name for the request queue

        Returns:
            dict: The updated request queue
        """
        # Check by id
        existing_queue_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client, id=self._id, name=self._name
        )

        if existing_queue_by_id is None:
            # NOTE(review): the helper presumably raises; the code below relies on that.
            raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)

        # Skip if no changes
        if name is None:
            return existing_queue_by_id._to_resource_info()

        async with existing_queue_by_id._file_operation_lock:
            # Check that name is not in use already (case-insensitive comparison)
            existing_queue_by_name = next(
                (queue for queue in self._memory_storage_client._request_queues_handled if queue._name and queue._name.lower() == name.lower()),
                None,
            )

            if existing_queue_by_name is not None:
                raise_on_duplicate_storage(StorageTypes.REQUEST_QUEUE, 'name', name)

            existing_queue_by_id._name = name

            previous_dir = existing_queue_by_id._resource_directory

            existing_queue_by_id._resource_directory = os.path.join(self._memory_storage_client._request_queues_directory, name)

            await force_rename(previous_dir, existing_queue_by_id._resource_directory)

            # Update timestamps
            await existing_queue_by_id._update_timestamps(has_been_modified=True)

            return existing_queue_by_id._to_resource_info()

    async def delete(self: RequestQueueClient) -> None:
        """Delete the request queue.

        Removes the queue from the storage client's cache, resets its counters and in-memory
        requests, and removes its directory if one exists. Does nothing when the queue is not cached.
        """
        queue = next((queue for queue in self._memory_storage_client._request_queues_handled if queue._id == self._id), None)

        if queue is not None:
            async with queue._file_operation_lock:
                self._memory_storage_client._request_queues_handled.remove(queue)
                queue._pending_request_count = 0
                queue._handled_request_count = 0
                queue._requests.clear()

                if os.path.exists(queue._resource_directory):
                    await aioshutil.rmtree(queue._resource_directory)

    async def list_head(self: RequestQueueClient, *, limit: int | None = None) -> dict:
        """Retrieve a given number of requests from the beginning of the queue.

        Args:
            limit (int, optional): How many requests to retrieve

        Returns:
            dict: The desired number of requests from the beginning of the queue.
        """
        existing_queue_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client, id=self._id, name=self._name
        )

        if existing_queue_by_id is None:
            raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)

        async with existing_queue_by_id._file_operation_lock:
            await existing_queue_by_id._update_timestamps(has_been_modified=False)

            items: list[dict] = []

            # Iterate all requests in the queue whose sort key is strictly larger than negative infinity,
            # which means their `orderNo` is not `None` (i.e. they are still pending).
            # This will iterate them in order of `orderNo`.
            for request_key in existing_queue_by_id._requests.irange_key(min_key=-float('inf'), inclusive=(False, True)):
                if len(items) == limit:
                    break

                request = existing_queue_by_id._requests.get(request_key)

                # Check that the request still exists and was not handled,
                # in case something deleted it or marked it as handled concurrently
                if request and request['orderNo']:
                    items.append(request)

            return {
                'limit': limit,
                'hadMultipleClients': False,
                'queueModifiedAt': existing_queue_by_id._modified_at,
                'items': [self._json_to_request(item['json']) for item in items],
            }

    async def add_request(self: RequestQueueClient, request: dict, *, forefront: bool | None = None) -> dict:
        """Add a request to the queue.

        Args:
            request (dict): The request to add to the queue
            forefront (bool, optional): Whether to add the request to the head or the end of the queue

        Returns:
            dict: The added request.
        """
        existing_queue_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client, id=self._id, name=self._name
        )

        if existing_queue_by_id is None:
            raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)

        request_model = self._create_internal_request(request, forefront)

        async with existing_queue_by_id._file_operation_lock:
            existing_request_with_id = existing_queue_by_id._requests.get(request_model['id'])

            # We already have the request present, so we return information about it
            if existing_request_with_id is not None:
                await existing_queue_by_id._update_timestamps(has_been_modified=False)

                return {
                    'requestId': existing_request_with_id['id'],
                    'wasAlreadyHandled': existing_request_with_id['orderNo'] is None,
                    'wasAlreadyPresent': True,
                }

            existing_queue_by_id._requests[request_model['id']] = request_model
            if request_model['orderNo'] is None:
                existing_queue_by_id._handled_request_count += 1
            else:
                existing_queue_by_id._pending_request_count += 1
            await existing_queue_by_id._update_timestamps(has_been_modified=True)
            await update_request_queue_item(
                request=request_model,
                request_id=request_model['id'],
                entity_directory=existing_queue_by_id._resource_directory,
                persist_storage=self._memory_storage_client._persist_storage,
            )

            return {
                'requestId': request_model['id'],
                # We return wasAlreadyHandled: false even though the request may
                # have been added as handled, because that's how API behaves.
                'wasAlreadyHandled': False,
                'wasAlreadyPresent': False,
            }

    async def get_request(self: RequestQueueClient, request_id: str) -> dict | None:
        """Retrieve a request from the queue.

        Args:
            request_id (str): ID of the request to retrieve

        Returns:
            dict, optional: The retrieved request, or None, if it did not exist.
        """
        existing_queue_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client, id=self._id, name=self._name
        )

        if existing_queue_by_id is None:
            raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)

        async with existing_queue_by_id._file_operation_lock:
            await existing_queue_by_id._update_timestamps(has_been_modified=False)

            request = existing_queue_by_id._requests.get(request_id)
            return self._json_to_request(request['json'] if request is not None else None)

    async def update_request(self: RequestQueueClient, request: dict, *, forefront: bool | None = None) -> dict:
        """Update a request in the queue.

        Args:
            request (dict): The updated request
            forefront (bool, optional): Whether to put the updated request in the beginning or the end of the queue

        Returns:
            dict: The updated request
        """
        existing_queue_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client, id=self._id, name=self._name
        )

        if existing_queue_by_id is None:
            raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)

        request_model = self._create_internal_request(request, forefront)

        # First we need to check the existing request to be
        # able to return information about its handled state.
        existing_request = existing_queue_by_id._requests.get(request_model['id'])

        # None means that the request is not present in the queue.
        # We need to insert it, to behave the same as API.
        if existing_request is None:
            return await self.add_request(request, forefront=forefront)

        async with existing_queue_by_id._file_operation_lock:
            # When updating the request, we need to make sure that
            # the handled counts are updated correctly in all cases.
            existing_queue_by_id._requests[request_model['id']] = request_model

            # A request is handled exactly when its `orderNo` is None, so the handled state
            # changes when exactly one of the old and new order numbers is None.
            request_was_handled_before_update = existing_request['orderNo'] is None
            request_is_handled_after_update = request_model['orderNo'] is None
            is_request_handled_state_changing = request_was_handled_before_update != request_is_handled_after_update

            # We add 1 pending request if previous state was handled, and remove 1 otherwise
            pending_count_adjustment = 0
            if is_request_handled_state_changing:
                pending_count_adjustment = 1 if request_was_handled_before_update else -1

            existing_queue_by_id._pending_request_count += pending_count_adjustment
            existing_queue_by_id._handled_request_count -= pending_count_adjustment
            await existing_queue_by_id._update_timestamps(has_been_modified=True)
            await update_request_queue_item(
                request=request_model,
                request_id=request_model['id'],
                entity_directory=existing_queue_by_id._resource_directory,
                persist_storage=self._memory_storage_client._persist_storage,
            )

            return {
                'requestId': request_model['id'],
                'wasAlreadyHandled': request_was_handled_before_update,
                'wasAlreadyPresent': True,
            }

    async def delete_request(self: RequestQueueClient, request_id: str) -> None:
        """Delete a request from the queue.

        Args:
            request_id (str): ID of the request to delete.
        """
        existing_queue_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client, id=self._id, name=self._name
        )

        if existing_queue_by_id is None:
            raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)

        async with existing_queue_by_id._file_operation_lock:
            request = existing_queue_by_id._requests.get(request_id)

            if request:
                del existing_queue_by_id._requests[request_id]
                if request['orderNo'] is None:
                    existing_queue_by_id._handled_request_count -= 1
                else:
                    existing_queue_by_id._pending_request_count -= 1
                await existing_queue_by_id._update_timestamps(has_been_modified=True)
                # Inside a method body this name resolves to the module-level `delete_request`
                # helper imported from `file_storage_utils`, not to this method.
                await delete_request(entity_directory=existing_queue_by_id._resource_directory, request_id=request_id)

    def _to_resource_info(self: RequestQueueClient) -> dict:
        """Retrieve the request queue store info."""
        return {
            'accessedAt': self._accessed_at,
            'createdAt': self._created_at,
            'hadMultipleClients': False,
            'handledRequestCount': self._handled_request_count,
            'id': self._id,
            'modifiedAt': self._modified_at,
            'name': self._name,
            'pendingRequestCount': self._pending_request_count,
            'stats': {},
            'totalRequestCount': len(self._requests),
            'userId': '1',
        }

    async def _update_timestamps(self: RequestQueueClient, has_been_modified: bool) -> None:  # noqa: FBT001
        """Refresh the access time (and the modification time when requested), then write the queue metadata.

        Args:
            has_been_modified (bool): Whether `modifiedAt` should be refreshed as well.
        """
        self._accessed_at = datetime.now(timezone.utc)

        if has_been_modified:
            self._modified_at = datetime.now(timezone.utc)

        request_queue_info = self._to_resource_info()
        await update_metadata(
            data=request_queue_info,
            entity_directory=self._resource_directory,
            write_metadata=self._memory_storage_client._write_metadata,
        )

    def _json_to_request(self: RequestQueueClient, request_json: str | None) -> dict | None:
        """Parse a stored request JSON string, passing `None` through unchanged.

        The parsed request is passed through `filter_out_none_values_recursively` before being returned.
        """
        if request_json is None:
            return None
        request = json.loads(request_json)
        return filter_out_none_values_recursively(request)

    def _create_internal_request(self: RequestQueueClient, request: dict, forefront: bool | None) -> dict:
        """Build the internal representation of a request that is stored in `_requests`.

        Args:
            request (dict): The request; must contain `uniqueKey` and `url`.
            forefront (bool, optional): Whether the request should be ordered at the head of the queue.

        Returns:
            dict: The internal request, including its serialized JSON form and computed `orderNo`.

        Raises:
            ValueError: If the request carries an `id` that does not match the ID derived from its `uniqueKey`.
        """
        order_no = self._calculate_order_no(request, forefront)
        request_id = unique_key_to_request_id(request['uniqueKey'])

        if request.get('id') is not None and request['id'] != request_id:
            raise ValueError('Request ID does not match its unique_key.')

        json_request = json_dumps({**request, 'id': request_id})
        return {
            'id': request_id,
            'json': json_request,
            'method': request.get('method'),
            'orderNo': order_no,
            'retryCount': request.get('retryCount', 0),
            'uniqueKey': request['uniqueKey'],
            'url': request['url'],
        }

    def _calculate_order_no(self: RequestQueueClient, request: dict, forefront: bool | None) -> Decimal | None:
        """Compute the `orderNo` of a request.

        Returns `None` for a request that has `handledAt` set. Otherwise returns a unique timestamp
        in milliseconds, negated when `forefront` is truthy so that the request sorts first.
        """
        if request.get('handledAt') is not None:
            return None

        # Get the current timestamp in milliseconds
        timestamp = Decimal(datetime.now(timezone.utc).timestamp()) * 1000
        timestamp = round(timestamp, 6)

        # Make sure that this timestamp was not used yet, so that we have unique orderNos.
        # The increment is built from a string so that it is exactly 1e-6; `Decimal(0.000001)`
        # would carry the binary float's representation error.
        if timestamp <= self._last_used_timestamp:
            timestamp = self._last_used_timestamp + Decimal('0.000001')

        self._last_used_timestamp = timestamp

        return -timestamp if forefront else timestamp

    @classmethod
    def _get_storages_dir(cls: type[RequestQueueClient], memory_storage_client: MemoryStorageClient) -> str:
        """Return the directory in which the storage client keeps request queues."""
        return memory_storage_client._request_queues_directory

    @classmethod
    def _get_storage_client_cache(
        cls: type[RequestQueueClient],
        memory_storage_client: MemoryStorageClient,
    ) -> list[RequestQueueClient]:
        """Return the storage client's list of request queue clients."""
        return memory_storage_client._request_queues_handled

    @classmethod
    def _create_from_directory(
        cls: type[RequestQueueClient],
        storage_directory: str,
        memory_storage_client: MemoryStorageClient,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
    ) -> RequestQueueClient:
        """Create a client from the files found in an existing request queue directory.

        Args:
            storage_directory (str): Directory containing the queue's files.
            memory_storage_client (MemoryStorageClient): The owning memory storage client.
            id (str, optional): ID to use when the directory has no `__metadata__.json`.
            name (str, optional): Name to use when the directory has no `__metadata__.json`.

        Returns:
            RequestQueueClient: The client populated with the metadata and requests read from disk.
        """
        created_at = datetime.now(timezone.utc)
        accessed_at = datetime.now(timezone.utc)
        modified_at = datetime.now(timezone.utc)
        handled_request_count = 0
        pending_request_count = 0
        entries: list[dict] = []

        # Access the request queue folder
        for entry in os.scandir(storage_directory):
            if not entry.is_file():
                continue

            entry_path = os.path.join(storage_directory, entry.name)

            if entry.name == '__metadata__.json':
                # We have found the queue's metadata file, build out information based on it
                with open(entry_path, encoding='utf-8') as f:
                    metadata = json.load(f)
                id = metadata['id']  # noqa: A001
                name = metadata['name']
                created_at = datetime.fromisoformat(metadata['createdAt'])
                accessed_at = datetime.fromisoformat(metadata['accessedAt'])
                modified_at = datetime.fromisoformat(metadata['modifiedAt'])
                handled_request_count = metadata['handledRequestCount']
                pending_request_count = metadata['pendingRequestCount']
                continue

            # Every other file is treated as a stored request
            with open(entry_path, encoding='utf-8') as f:
                request = json.load(f)
            if request.get('orderNo'):
                request['orderNo'] = Decimal(request.get('orderNo'))
            entries.append(request)

        new_client = cls(
            base_storage_directory=memory_storage_client._request_queues_directory,
            memory_storage_client=memory_storage_client,
            id=id,
            name=name,
        )

        # Overwrite properties
        new_client._accessed_at = accessed_at
        new_client._created_at = created_at
        new_client._modified_at = modified_at
        new_client._handled_request_count = handled_request_count
        new_client._pending_request_count = pending_request_count

        for request in entries:
            new_client._requests[request['id']] = request

        return new_client
@@ -1,48 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING
4
-
5
- from apify_shared.utils import ignore_docs
6
-
7
- from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient
8
- from apify._memory_storage.resource_clients.request_queue import RequestQueueClient
9
-
10
- if TYPE_CHECKING:
11
- from apify_shared.models import ListPage
12
-
13
-
14
- @ignore_docs
15
- class RequestQueueCollectionClient(BaseResourceCollectionClient):
16
- """Sub-client for manipulating request queues."""
17
-
18
- def _get_storage_client_cache(self: RequestQueueCollectionClient) -> list[RequestQueueClient]:
19
- return self._memory_storage_client._request_queues_handled
20
-
21
- def _get_resource_client_class(self: RequestQueueCollectionClient) -> type[RequestQueueClient]:
22
- return RequestQueueClient
23
-
24
- async def list(self: RequestQueueCollectionClient) -> ListPage:
25
- """List the available request queues.
26
-
27
- Returns:
28
- ListPage: The list of available request queues matching the specified filters.
29
- """
30
- return await super().list()
31
-
32
- async def get_or_create(
33
- self: RequestQueueCollectionClient,
34
- *,
35
- name: str | None = None,
36
- schema: dict | None = None,
37
- _id: str | None = None,
38
- ) -> dict:
39
- """Retrieve a named request queue, or create a new one when it doesn't exist.
40
-
41
- Args:
42
- name (str, optional): The name of the request queue to retrieve or create.
43
- schema (dict, optional): The schema of the request queue
44
-
45
- Returns:
46
- dict: The retrieved or newly-created request queue.
47
- """
48
- return await super().get_or_create(name=name, schema=schema, _id=_id)