apify 1.7.1b1__py3-none-any.whl → 2.2.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)
  1. apify/__init__.py +33 -4
  2. apify/_actor.py +1074 -0
  3. apify/_configuration.py +370 -0
  4. apify/_consts.py +10 -0
  5. apify/_crypto.py +31 -27
  6. apify/_models.py +117 -0
  7. apify/_platform_event_manager.py +231 -0
  8. apify/_proxy_configuration.py +320 -0
  9. apify/_utils.py +18 -484
  10. apify/apify_storage_client/__init__.py +3 -0
  11. apify/apify_storage_client/_apify_storage_client.py +68 -0
  12. apify/apify_storage_client/_dataset_client.py +190 -0
  13. apify/apify_storage_client/_dataset_collection_client.py +51 -0
  14. apify/apify_storage_client/_key_value_store_client.py +94 -0
  15. apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
  16. apify/apify_storage_client/_request_queue_client.py +176 -0
  17. apify/apify_storage_client/_request_queue_collection_client.py +51 -0
  18. apify/apify_storage_client/py.typed +0 -0
  19. apify/log.py +22 -105
  20. apify/scrapy/__init__.py +11 -3
  21. apify/scrapy/middlewares/__init__.py +3 -1
  22. apify/scrapy/middlewares/apify_proxy.py +29 -27
  23. apify/scrapy/middlewares/py.typed +0 -0
  24. apify/scrapy/pipelines/__init__.py +3 -1
  25. apify/scrapy/pipelines/actor_dataset_push.py +6 -3
  26. apify/scrapy/pipelines/py.typed +0 -0
  27. apify/scrapy/py.typed +0 -0
  28. apify/scrapy/requests.py +60 -58
  29. apify/scrapy/scheduler.py +28 -19
  30. apify/scrapy/utils.py +10 -32
  31. apify/storages/__init__.py +4 -10
  32. apify/storages/_request_list.py +150 -0
  33. apify/storages/py.typed +0 -0
  34. apify-2.2.1b1.dist-info/METADATA +211 -0
  35. apify-2.2.1b1.dist-info/RECORD +38 -0
  36. {apify-1.7.1b1.dist-info → apify-2.2.1b1.dist-info}/WHEEL +1 -2
  37. apify/_memory_storage/__init__.py +0 -3
  38. apify/_memory_storage/file_storage_utils.py +0 -71
  39. apify/_memory_storage/memory_storage_client.py +0 -219
  40. apify/_memory_storage/resource_clients/__init__.py +0 -19
  41. apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
  42. apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
  43. apify/_memory_storage/resource_clients/dataset.py +0 -452
  44. apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
  45. apify/_memory_storage/resource_clients/key_value_store.py +0 -533
  46. apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
  47. apify/_memory_storage/resource_clients/request_queue.py +0 -466
  48. apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
  49. apify/actor.py +0 -1351
  50. apify/config.py +0 -127
  51. apify/consts.py +0 -67
  52. apify/event_manager.py +0 -236
  53. apify/proxy_configuration.py +0 -365
  54. apify/storages/base_storage.py +0 -181
  55. apify/storages/dataset.py +0 -494
  56. apify/storages/key_value_store.py +0 -257
  57. apify/storages/request_queue.py +0 -602
  58. apify/storages/storage_client_manager.py +0 -72
  59. apify-1.7.1b1.dist-info/METADATA +0 -149
  60. apify-1.7.1b1.dist-info/RECORD +0 -41
  61. apify-1.7.1b1.dist-info/top_level.txt +0 -1
  62. {apify-1.7.1b1.dist-info → apify-2.2.1b1.dist-info}/LICENSE +0 -0
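The list above captures the 2.x restructuring: the old public modules (apify/actor.py, apify/config.py, apify/storages/dataset.py, and the whole apify/_memory_storage package) are removed, replaced by underscore-prefixed internals (apify/_actor.py, apify/_configuration.py, …) and a new apify/apify_storage_client package. The documented entry point in both wheels is the Actor interface exported from apify/__init__.py; the snippet below is an illustrative sketch of that public surface, not code taken from this diff.

import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        # Use the public Actor API rather than the removed _memory_storage
        # internals; the SDK resolves the storage backend for you.
        await Actor.push_data([{'url': 'https://example.com', 'status': 'ok'}])

        dataset = await Actor.open_dataset()
        async for item in dataset.iterate_items():
            Actor.log.info('item: %s', item)


if __name__ == '__main__':
    asyncio.run(main())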
apify/_memory_storage/resource_clients/dataset.py (deleted)
@@ -1,452 +0,0 @@
- from __future__ import annotations
-
- import asyncio
- import json
- import os
- from datetime import datetime, timezone
- from typing import TYPE_CHECKING, Any, AsyncIterator
-
- import aioshutil
- from apify_shared.models import ListPage
- from apify_shared.utils import ignore_docs
-
- from apify._crypto import crypto_random_object_id
- from apify._memory_storage.file_storage_utils import _update_dataset_items, update_metadata
- from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient
- from apify._utils import force_rename, raise_on_duplicate_storage, raise_on_non_existing_storage
- from apify.consts import StorageTypes
-
- if TYPE_CHECKING:
-     from apify_shared.types import JSONSerializable
-
-     from apify._memory_storage.memory_storage_client import MemoryStorageClient
-
- # This is what API returns in the x-apify-pagination-limit
- # header when no limit query parameter is used.
- LIST_ITEMS_LIMIT = 999_999_999_999
-
- # Number of characters of the dataset item file names.
- # E.g.: 000000019.json - 9 digits
- LOCAL_ENTRY_NAME_DIGITS = 9
-
-
- @ignore_docs
- class DatasetClient(BaseResourceClient):
-     """Sub-client for manipulating a single dataset."""
-
-     _id: str
-     _resource_directory: str
-     _memory_storage_client: MemoryStorageClient
-     _name: str | None
-     _dataset_entries: dict[str, dict]
-     _created_at: datetime
-     _accessed_at: datetime
-     _modified_at: datetime
-     _item_count = 0
-     _file_operation_lock: asyncio.Lock
-
-     def __init__(
-         self: DatasetClient,
-         *,
-         base_storage_directory: str,
-         memory_storage_client: MemoryStorageClient,
-         id: str | None = None,  # noqa: A002
-         name: str | None = None,
-     ) -> None:
-         """Initialize the DatasetClient."""
-         self._id = id or crypto_random_object_id()
-         self._resource_directory = os.path.join(base_storage_directory, name or self._id)
-         self._memory_storage_client = memory_storage_client
-         self._name = name
-         self._dataset_entries = {}
-         self._created_at = datetime.now(timezone.utc)
-         self._accessed_at = datetime.now(timezone.utc)
-         self._modified_at = datetime.now(timezone.utc)
-         self._file_operation_lock = asyncio.Lock()
-
-     async def get(self: DatasetClient) -> dict | None:
-         """Retrieve the dataset.
-
-         Returns:
-             dict, optional: The retrieved dataset, or None, if it does not exist
-         """
-         found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)
-
-         if found:
-             async with found._file_operation_lock:
-                 await found._update_timestamps(has_been_modified=False)
-                 return found._to_resource_info()
-
-         return None
-
-     async def update(self: DatasetClient, *, name: str | None = None) -> dict:
-         """Update the dataset with specified fields.
-
-         Args:
-             name (str, optional): The new name for the dataset
-
-         Returns:
-             dict: The updated dataset
-         """
-         # Check by id
-         existing_dataset_by_id = self._find_or_create_client_by_id_or_name(
-             memory_storage_client=self._memory_storage_client,
-             id=self._id,
-             name=self._name,
-         )
-
-         if existing_dataset_by_id is None:
-             raise_on_non_existing_storage(StorageTypes.DATASET, self._id)
-
-         # Skip if no changes
-         if name is None:
-             return existing_dataset_by_id._to_resource_info()
-
-         async with existing_dataset_by_id._file_operation_lock:
-             # Check that name is not in use already
-             existing_dataset_by_name = next(
-                 (dataset for dataset in self._memory_storage_client._datasets_handled if dataset._name and dataset._name.lower() == name.lower()),
-                 None,
-             )
-
-             if existing_dataset_by_name is not None:
-                 raise_on_duplicate_storage(StorageTypes.DATASET, 'name', name)
-
-             existing_dataset_by_id._name = name
-
-             previous_dir = existing_dataset_by_id._resource_directory
-
-             existing_dataset_by_id._resource_directory = os.path.join(self._memory_storage_client._datasets_directory, name)
-
-             await force_rename(previous_dir, existing_dataset_by_id._resource_directory)
-
-             # Update timestamps
-             await existing_dataset_by_id._update_timestamps(has_been_modified=True)
-
-         return existing_dataset_by_id._to_resource_info()
-
-     async def delete(self: DatasetClient) -> None:
-         """Delete the dataset."""
-         dataset = next((dataset for dataset in self._memory_storage_client._datasets_handled if dataset._id == self._id), None)
-
-         if dataset is not None:
-             async with dataset._file_operation_lock:
-                 self._memory_storage_client._datasets_handled.remove(dataset)
-                 dataset._item_count = 0
-                 dataset._dataset_entries.clear()
-
-                 if os.path.exists(dataset._resource_directory):
-                     await aioshutil.rmtree(dataset._resource_directory)
-
-     async def list_items(
-         self: DatasetClient,
-         *,
-         offset: int | None = 0,
-         limit: int | None = LIST_ITEMS_LIMIT,
-         clean: bool | None = None,  # noqa: ARG002
-         desc: bool | None = None,
-         fields: list[str] | None = None,  # noqa: ARG002
-         omit: list[str] | None = None,  # noqa: ARG002
-         unwind: str | None = None,  # noqa: ARG002
-         skip_empty: bool | None = None,  # noqa: ARG002
-         skip_hidden: bool | None = None,  # noqa: ARG002
-         flatten: list[str] | None = None,  # noqa: ARG002
-         view: str | None = None,  # noqa: ARG002
-     ) -> ListPage:
-         """List the items of the dataset.
-
-         Args:
-             offset (int, optional): Number of items that should be skipped at the start. The default value is 0
-             limit (int, optional): Maximum number of items to return. By default there is no limit.
-             desc (bool, optional): By default, results are returned in the same order as they were stored.
-                 To reverse the order, set this parameter to True.
-             clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
-                 The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters.
-                 Note that since some objects might be skipped from the output, that the result might contain less items than the limit value.
-             fields (list of str, optional): A list of fields which should be picked from the items,
-                 only these fields will remain in the resulting record objects.
-                 Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
-                 You can use this feature to effectively fix the output format.
-             omit (list of str, optional): A list of fields which should be omitted from the items.
-             unwind (str, optional): Name of a field which should be unwound.
-                 If the field is an array then every element of the array will become a separate record and merged with parent object.
-                 If the unwound field is an object then it is merged with the parent object.
-                 If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
-                 then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
-             skip_empty (bool, optional): If True, then empty items are skipped from the output.
-                 Note that if used, the results might contain less items than the limit value.
-             skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.
-             flatten (list of str, optional): A list of fields that should be flattened
-             view (str, optional): Name of the dataset view to be used
-
-         Returns:
-             ListPage: A page of the list of dataset items according to the specified filters.
-         """
-         # Check by id
-         existing_dataset_by_id = self._find_or_create_client_by_id_or_name(
-             memory_storage_client=self._memory_storage_client,
-             id=self._id,
-             name=self._name,
-         )
-
-         if existing_dataset_by_id is None:
-             raise_on_non_existing_storage(StorageTypes.DATASET, self._id)
-
-         async with existing_dataset_by_id._file_operation_lock:
-             start, end = existing_dataset_by_id._get_start_and_end_indexes(
-                 max(existing_dataset_by_id._item_count - (offset or 0) - (limit or LIST_ITEMS_LIMIT), 0) if desc else offset or 0,
-                 limit,
-             )
-
-             items = []
-
-             for idx in range(start, end):
-                 entry_number = self._generate_local_entry_name(idx)
-                 items.append(existing_dataset_by_id._dataset_entries[entry_number])
-
-             await existing_dataset_by_id._update_timestamps(has_been_modified=False)
-
-             if desc:
-                 items.reverse()
-
-             return ListPage(
-                 {
-                     'count': len(items),
-                     'desc': desc or False,
-                     'items': items,
-                     'limit': limit or LIST_ITEMS_LIMIT,
-                     'offset': offset or 0,
-                     'total': existing_dataset_by_id._item_count,
-                 }
-             )
-
-     async def iterate_items(
-         self: DatasetClient,
-         *,
-         offset: int = 0,
-         limit: int | None = None,
-         clean: bool | None = None,  # noqa: ARG002
-         desc: bool | None = None,
-         fields: list[str] | None = None,  # noqa: ARG002
-         omit: list[str] | None = None,  # noqa: ARG002
-         unwind: str | None = None,  # noqa: ARG002
-         skip_empty: bool | None = None,  # noqa: ARG002
-         skip_hidden: bool | None = None,  # noqa: ARG002
-     ) -> AsyncIterator[dict]:
-         """Iterate over the items in the dataset.
-
-         Args:
-             offset (int, optional): Number of items that should be skipped at the start. The default value is 0
-             limit (int, optional): Maximum number of items to return. By default there is no limit.
-             desc (bool, optional): By default, results are returned in the same order as they were stored.
-                 To reverse the order, set this parameter to True.
-             clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
-                 The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters.
-                 Note that since some objects might be skipped from the output, that the result might contain less items than the limit value.
-             fields (list of str, optional): A list of fields which should be picked from the items,
-                 only these fields will remain in the resulting record objects.
-                 Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
-                 You can use this feature to effectively fix the output format.
-             omit (list of str, optional): A list of fields which should be omitted from the items.
-             unwind (str, optional): Name of a field which should be unwound.
-                 If the field is an array then every element of the array will become a separate record and merged with parent object.
-                 If the unwound field is an object then it is merged with the parent object.
-                 If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
-                 then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
-             skip_empty (bool, optional): If True, then empty items are skipped from the output.
-                 Note that if used, the results might contain less items than the limit value.
-             skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.
-
-         Yields:
-             dict: An item from the dataset
-         """
-         cache_size = 1000
-         first_item = offset
-
-         # If there is no limit, set last_item to None until we get the total from the first API response
-         last_item = None if limit is None else offset + limit
-
-         current_offset = first_item
-         while last_item is None or current_offset < last_item:
-             current_limit = cache_size if last_item is None else min(cache_size, last_item - current_offset)
-
-             current_items_page = await self.list_items(
-                 offset=current_offset,
-                 limit=current_limit,
-                 desc=desc,
-             )
-
-             current_offset += current_items_page.count
-             if last_item is None or current_items_page.total < last_item:
-                 last_item = current_items_page.total
-
-             for item in current_items_page.items:
-                 yield item
-
-     async def get_items_as_bytes(self: DatasetClient, *_args: Any, **_kwargs: Any) -> bytes:
-         raise NotImplementedError('This method is not supported in local memory storage.')
-
-     async def stream_items(self: DatasetClient, *_args: Any, **_kwargs: Any) -> AsyncIterator:
-         raise NotImplementedError('This method is not supported in local memory storage.')
-
-     async def push_items(self: DatasetClient, items: JSONSerializable) -> None:
-         """Push items to the dataset.
-
-         Args:
-             items: The items which to push in the dataset. Either a stringified JSON, a dictionary, or a list of strings or dictionaries.
-         """
-         # Check by id
-         existing_dataset_by_id = self._find_or_create_client_by_id_or_name(
-             memory_storage_client=self._memory_storage_client, id=self._id, name=self._name
-         )
-
-         if existing_dataset_by_id is None:
-             raise_on_non_existing_storage(StorageTypes.DATASET, self._id)
-
-         normalized = self._normalize_items(items)
-
-         added_ids: list[str] = []
-         for entry in normalized:
-             existing_dataset_by_id._item_count += 1
-             idx = self._generate_local_entry_name(existing_dataset_by_id._item_count)
-
-             existing_dataset_by_id._dataset_entries[idx] = entry
-             added_ids.append(idx)
-
-         data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids]  # noqa: A001
-
-         async with existing_dataset_by_id._file_operation_lock:
-             await existing_dataset_by_id._update_timestamps(has_been_modified=True)
-
-             await _update_dataset_items(
-                 data=data_entries,
-                 entity_directory=existing_dataset_by_id._resource_directory,
-                 persist_storage=self._memory_storage_client._persist_storage,
-             )
-
-     def _to_resource_info(self: DatasetClient) -> dict:
-         """Retrieve the dataset info."""
-         return {
-             'id': self._id,
-             'name': self._name,
-             'itemCount': self._item_count,
-             'accessedAt': self._accessed_at,
-             'createdAt': self._created_at,
-             'modifiedAt': self._modified_at,
-         }
-
-     async def _update_timestamps(self: DatasetClient, has_been_modified: bool) -> None:  # noqa: FBT001
-         """Update the timestamps of the dataset."""
-         self._accessed_at = datetime.now(timezone.utc)
-
-         if has_been_modified:
-             self._modified_at = datetime.now(timezone.utc)
-
-         dataset_info = self._to_resource_info()
-         await update_metadata(
-             data=dataset_info,
-             entity_directory=self._resource_directory,
-             write_metadata=self._memory_storage_client._write_metadata,
-         )
-
-     def _get_start_and_end_indexes(self: DatasetClient, offset: int, limit: int | None = None) -> tuple[int, int]:
-         actual_limit = limit or self._item_count
-         start = offset + 1
-         end = min(offset + actual_limit, self._item_count) + 1
-         return (start, end)
-
-     def _generate_local_entry_name(self: DatasetClient, idx: int) -> str:
-         return str(idx).zfill(LOCAL_ENTRY_NAME_DIGITS)
-
-     def _normalize_items(self: DatasetClient, items: JSONSerializable) -> list[dict]:
-         def normalize_item(item: Any) -> dict | None:
-             if isinstance(item, str):
-                 item = json.loads(item)
-
-             if isinstance(item, list):
-                 received = ',\n'.join(item)
-                 raise TypeError(f'Each dataset item can only be a single JSON object, not an array. Received: [{received}]')
-
-             if (not isinstance(item, dict)) and item is not None:
-                 raise TypeError(f'Each dataset item must be a JSON object. Received: {item}')
-
-             return item
-
-         if isinstance(items, str):
-             items = json.loads(items)
-
-         result = list(map(normalize_item, items)) if isinstance(items, list) else [normalize_item(items)]
-         # filter(None, ..) returns items that are True
-         return list(filter(None, result))
-
-     @classmethod
-     def _get_storages_dir(cls: type[DatasetClient], memory_storage_client: MemoryStorageClient) -> str:
-         return memory_storage_client._datasets_directory
-
-     @classmethod
-     def _get_storage_client_cache(
-         cls: type[DatasetClient],
-         memory_storage_client: MemoryStorageClient,
-     ) -> list[DatasetClient]:
-         return memory_storage_client._datasets_handled
-
-     @classmethod
-     def _create_from_directory(
-         cls: type[DatasetClient],
-         storage_directory: str,
-         memory_storage_client: MemoryStorageClient,
-         id: str | None = None,  # noqa: A002
-         name: str | None = None,
-     ) -> DatasetClient:
-         item_count = 0
-         created_at = datetime.now(timezone.utc)
-         accessed_at = datetime.now(timezone.utc)
-         modified_at = datetime.now(timezone.utc)
-         entries: dict[str, dict] = {}
-
-         has_seen_metadata_file = False
-
-         # Access the dataset folder
-         for entry in os.scandir(storage_directory):
-             if entry.is_file():
-                 if entry.name == '__metadata__.json':
-                     has_seen_metadata_file = True
-
-                     # We have found the dataset's metadata file, build out information based on it
-                     with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
-                         metadata = json.load(f)
-                     id = metadata['id']  # noqa: A001
-                     name = metadata['name']
-                     item_count = metadata['itemCount']
-                     created_at = datetime.fromisoformat(metadata['createdAt'])
-                     accessed_at = datetime.fromisoformat(metadata['accessedAt'])
-                     modified_at = datetime.fromisoformat(metadata['modifiedAt'])
-
-                     continue
-
-                 with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
-                     entry_content = json.load(f)
-                 entry_name = entry.name.split('.')[0]
-
-                 entries[entry_name] = entry_content
-
-                 if not has_seen_metadata_file:
-                     item_count += 1
-
-         new_client = DatasetClient(
-             base_storage_directory=memory_storage_client._datasets_directory,
-             memory_storage_client=memory_storage_client,
-             id=id,
-             name=name,
-         )
-
-         # Overwrite properties
-         new_client._accessed_at = accessed_at
-         new_client._created_at = created_at
-         new_client._modified_at = modified_at
-         new_client._item_count = item_count
-
-         for entry_id, content in entries.items():
-             new_client._dataset_entries[entry_id] = content
-
-         return new_client
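For orientation, the removed client above was driven roughly like this: items are pushed as dicts (or stringified JSON) and read back through list_items or iterate_items, which pages through the in-memory entries in chunks of up to 1000. The sketch below is based only on the signatures shown above; how a DatasetClient instance is obtained from the MemoryStorageClient is left out and is hypothetical here.

async def demo(dataset_client) -> None:
    # dataset_client: a DatasetClient as defined in the removed file above
    # (obtaining one via MemoryStorageClient is omitted in this sketch).
    await dataset_client.push_items([{'url': 'https://example.com', 'title': 'Example'}])

    # One page of results, wrapped in an apify_shared ListPage.
    page = await dataset_client.list_items(offset=0, limit=10)
    print(page.total, page.count, page.items)

    # Lazy iteration; internally calls list_items() in chunks of up to 1000 items.
    async for item in dataset_client.iterate_items(desc=True):
        print(item)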
apify/_memory_storage/resource_clients/dataset_collection.py (deleted)
@@ -1,48 +0,0 @@
- from __future__ import annotations
-
- from typing import TYPE_CHECKING
-
- from apify_shared.utils import ignore_docs
-
- from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient
- from apify._memory_storage.resource_clients.dataset import DatasetClient
-
- if TYPE_CHECKING:
-     from apify_shared.models import ListPage
-
-
- @ignore_docs
- class DatasetCollectionClient(BaseResourceCollectionClient):
-     """Sub-client for manipulating datasets."""
-
-     def _get_storage_client_cache(self: DatasetCollectionClient) -> list[DatasetClient]:
-         return self._memory_storage_client._datasets_handled
-
-     def _get_resource_client_class(self: DatasetCollectionClient) -> type[DatasetClient]:
-         return DatasetClient
-
-     async def list(self: DatasetCollectionClient) -> ListPage:
-         """List the available datasets.
-
-         Returns:
-             ListPage: The list of available datasets matching the specified filters.
-         """
-         return await super().list()
-
-     async def get_or_create(
-         self: DatasetCollectionClient,
-         *,
-         name: str | None = None,
-         schema: dict | None = None,
-         _id: str | None = None,
-     ) -> dict:
-         """Retrieve a named dataset, or create a new one when it doesn't exist.
-
-         Args:
-             name (str, optional): The name of the dataset to retrieve or create.
-             schema (dict, optional): The schema of the dataset
-
-         Returns:
-             dict: The retrieved or newly-created dataset.
-         """
-         return await super().get_or_create(name=name, schema=schema, _id=_id)
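The collection client above is the piece that resolved a dataset by name, creating it on first use. Again a hedged sketch of the call pattern, based only on the signatures in the removed file; how the collection client itself is obtained from the memory storage client is omitted.

async def open_named_dataset(collection_client, name: str) -> dict:
    # Returns metadata of the existing dataset with this name, or creates it.
    dataset_info = await collection_client.get_or_create(name=name)

    # List all datasets known to the in-memory storage.
    page = await collection_client.list()
    print('datasets:', [d['name'] for d in page.items])

    return dataset_info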