apify 1.7.1b1__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of apify has been flagged as potentially problematic.
- apify/__init__.py +33 -4
- apify/_actor.py +1074 -0
- apify/_configuration.py +370 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +31 -27
- apify/_models.py +117 -0
- apify/_platform_event_manager.py +231 -0
- apify/_proxy_configuration.py +320 -0
- apify/_utils.py +18 -484
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +68 -0
- apify/apify_storage_client/_dataset_client.py +190 -0
- apify/apify_storage_client/_dataset_collection_client.py +51 -0
- apify/apify_storage_client/_key_value_store_client.py +94 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
- apify/apify_storage_client/_request_queue_client.py +176 -0
- apify/apify_storage_client/_request_queue_collection_client.py +51 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +22 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +29 -27
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +6 -3
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +60 -58
- apify/scrapy/scheduler.py +28 -19
- apify/scrapy/utils.py +10 -32
- apify/storages/__init__.py +4 -10
- apify/storages/_request_list.py +150 -0
- apify/storages/py.typed +0 -0
- apify-2.2.1.dist-info/METADATA +211 -0
- apify-2.2.1.dist-info/RECORD +38 -0
- {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1351
- apify/config.py +0 -127
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.1b1.dist-info/METADATA +0 -149
- apify-1.7.1b1.dist-info/RECORD +0 -41
- apify-1.7.1b1.dist-info/top_level.txt +0 -1
- {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/LICENSE +0 -0
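The shape of the change is visible from the file list alone: the bundled local-storage emulation (`apify/_memory_storage`) and the old storage classes (`apify/storages/dataset.py` and friends) are removed, while a platform storage client (`apify/apify_storage_client`) and a reworked actor module (`apify/_actor.py`, replacing `apify/actor.py`) are added. As a point of reference, here is a minimal sketch, not taken from this diff, of how an actor typically opens storages through the 2.x `Actor` API that the added modules provide; exact signatures should be checked against the 2.2.1 package itself.

```python
import asyncio

from apify import Actor


async def main() -> None:
    # Entering the Actor context initializes configuration, events and storages.
    async with Actor:
        dataset = await Actor.open_dataset()  # the run's default dataset
        await dataset.push_data({'url': 'https://example.com'})


if __name__ == '__main__':
    asyncio.run(main())
```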
apify/_memory_storage/file_storage_utils.py
@@ -1,71 +0,0 @@
-from __future__ import annotations
-
-import os
-
-import aiofiles
-from aiofiles.os import makedirs
-from apify_shared.utils import json_dumps
-
-from apify._utils import force_remove
-
-
-async def update_metadata(*, data: dict, entity_directory: str, write_metadata: bool) -> None:
-    # Skip writing the actual metadata file. This is done after ensuring the directory exists so we have the directory present
-    if not write_metadata:
-        return
-
-    # Ensure the directory for the entity exists
-    await makedirs(entity_directory, exist_ok=True)
-
-    # Write the metadata to the file
-    file_path = os.path.join(entity_directory, '__metadata__.json')
-    async with aiofiles.open(file_path, mode='wb') as f:
-        await f.write(json_dumps(data).encode('utf-8'))
-
-
-async def _update_dataset_items(
-    *,
-    data: list[tuple[str, dict]],
-    entity_directory: str,
-    persist_storage: bool,
-) -> None:
-    # Skip writing files to the disk if the client has the option set to false
-    if not persist_storage:
-        return
-
-    # Ensure the directory for the entity exists
-    await makedirs(entity_directory, exist_ok=True)
-
-    # Save all the new items to the disk
-    for idx, item in data:
-        file_path = os.path.join(entity_directory, f'{idx}.json')
-        async with aiofiles.open(file_path, mode='wb') as f:
-            await f.write(json_dumps(item).encode('utf-8'))
-
-
-async def update_request_queue_item(
-    *,
-    request_id: str,
-    request: dict,
-    entity_directory: str,
-    persist_storage: bool,
-) -> None:
-    # Skip writing files to the disk if the client has the option set to false
-    if not persist_storage:
-        return
-
-    # Ensure the directory for the entity exists
-    await makedirs(entity_directory, exist_ok=True)
-
-    # Write the request to the file
-    file_path = os.path.join(entity_directory, f'{request_id}.json')
-    async with aiofiles.open(file_path, mode='wb') as f:
-        await f.write(json_dumps(request).encode('utf-8'))
-
-
-async def delete_request(*, request_id: str, entity_directory: str) -> None:
-    # Ensure the directory for the entity exists
-    await makedirs(entity_directory, exist_ok=True)
-
-    file_path = os.path.join(entity_directory, f'{request_id}.json')
-    await force_remove(file_path)
apify/_memory_storage/memory_storage_client.py
@@ -1,219 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-import contextlib
-import os
-from pathlib import Path
-
-import aioshutil
-from aiofiles import ospath
-from aiofiles.os import rename, scandir
-from apify_shared.consts import ApifyEnvVars
-from apify_shared.utils import ignore_docs
-
-from apify._memory_storage.resource_clients.dataset import DatasetClient
-from apify._memory_storage.resource_clients.dataset_collection import DatasetCollectionClient
-from apify._memory_storage.resource_clients.key_value_store import KeyValueStoreClient
-from apify._memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient
-from apify._memory_storage.resource_clients.request_queue import RequestQueueClient
-from apify._memory_storage.resource_clients.request_queue_collection import RequestQueueCollectionClient
-from apify._utils import maybe_parse_bool
-
-"""
-Memory storage emulates data storages that are available on the Apify platform.
-Specifically, it emulates clients for datasets, key-value stores and request queues.
-The data are held in-memory and persisted locally if `persist_storage` is True.
-The metadata of the storages is also persisted if `write_metadata` is True.
-"""
-
-
-@ignore_docs
-class MemoryStorageClient:
-    """Class representing an in-memory storage."""
-
-    _local_data_directory: str
-    _datasets_directory: str
-    _key_value_stores_directory: str
-    _request_queues_directory: str
-    _write_metadata: bool
-    _persist_storage: bool
-    _datasets_handled: list[DatasetClient]
-    _key_value_stores_handled: list[KeyValueStoreClient]
-    _request_queues_handled: list[RequestQueueClient]
-
-    _purged_on_start: bool = False
-    _purge_lock: asyncio.Lock
-
-    """Indicates whether a purge was already performed on this instance"""
-
-    def __init__(
-        self: MemoryStorageClient,
-        *,
-        local_data_directory: str | None = None,
-        write_metadata: bool | None = None,
-        persist_storage: bool | None = None,
-    ) -> None:
-        """Initialize the MemoryStorageClient.
-
-        Args:
-            local_data_directory (str, optional): A local directory where all data will be persisted
-            persist_storage (bool, optional): Whether to persist the data to the `local_data_directory` or just keep them in memory
-            write_metadata (bool, optional): Whether to persist metadata of the storages as well
-        """
-        self._local_data_directory = local_data_directory or os.getenv(ApifyEnvVars.LOCAL_STORAGE_DIR) or './storage'
-        self._datasets_directory = os.path.join(self._local_data_directory, 'datasets')
-        self._key_value_stores_directory = os.path.join(self._local_data_directory, 'key_value_stores')
-        self._request_queues_directory = os.path.join(self._local_data_directory, 'request_queues')
-        self._write_metadata = write_metadata if write_metadata is not None else '*' in os.getenv('DEBUG', '')
-        self._persist_storage = persist_storage if persist_storage is not None else maybe_parse_bool(os.getenv(ApifyEnvVars.PERSIST_STORAGE, 'true'))
-        self._datasets_handled = []
-        self._key_value_stores_handled = []
-        self._request_queues_handled = []
-        self._purge_lock = asyncio.Lock()
-
-    def datasets(self: MemoryStorageClient) -> DatasetCollectionClient:
-        """Retrieve the sub-client for manipulating datasets."""
-        return DatasetCollectionClient(base_storage_directory=self._datasets_directory, memory_storage_client=self)
-
-    def dataset(self: MemoryStorageClient, dataset_id: str) -> DatasetClient:
-        """Retrieve the sub-client for manipulating a single dataset.
-
-        Args:
-            dataset_id (str): ID of the dataset to be manipulated
-        """
-        return DatasetClient(base_storage_directory=self._datasets_directory, memory_storage_client=self, id=dataset_id)
-
-    def key_value_stores(self: MemoryStorageClient) -> KeyValueStoreCollectionClient:
-        """Retrieve the sub-client for manipulating key-value stores."""
-        return KeyValueStoreCollectionClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self)
-
-    def key_value_store(self: MemoryStorageClient, key_value_store_id: str) -> KeyValueStoreClient:
-        """Retrieve the sub-client for manipulating a single key-value store.
-
-        Args:
-            key_value_store_id (str): ID of the key-value store to be manipulated
-        """
-        return KeyValueStoreClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self, id=key_value_store_id)
-
-    def request_queues(self: MemoryStorageClient) -> RequestQueueCollectionClient:
-        """Retrieve the sub-client for manipulating request queues."""
-        return RequestQueueCollectionClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self)
-
-    def request_queue(
-        self: MemoryStorageClient,
-        request_queue_id: str,
-        *,
-        client_key: str | None = None,  # noqa: ARG002
-    ) -> RequestQueueClient:
-        """Retrieve the sub-client for manipulating a single request queue.
-
-        Args:
-            request_queue_id (str): ID of the request queue to be manipulated
-            client_key (str): A unique identifier of the client accessing the request queue
-        """
-        return RequestQueueClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self, id=request_queue_id)
-
-    async def _purge_on_start(self: MemoryStorageClient) -> None:
-        # Optimistic, non-blocking check
-        if self._purged_on_start is True:
-            return
-
-        async with self._purge_lock:
-            # Another check under the lock just to be sure
-            if self._purged_on_start is True:
-                return  # type: ignore[unreachable] # Mypy doesn't understand that the _purged_on_start can change while we're getting the async lock
-
-            await self._purge()
-            self._purged_on_start = True
-
-    async def _purge(self: MemoryStorageClient) -> None:
-        """Clean up the default storage directories before the run starts.
-
-        Specifically, `purge` cleans up:
-         - local directory containing the default dataset;
-         - all records from the default key-value store in the local directory, except for the "INPUT" key;
-         - local directory containing the default request queue.
-        """
-        # Key-value stores
-        if await ospath.exists(self._key_value_stores_directory):
-            key_value_store_folders = await scandir(self._key_value_stores_directory)
-            for key_value_store_folder in key_value_store_folders:
-                if key_value_store_folder.name.startswith('__APIFY_TEMPORARY') or key_value_store_folder.name.startswith('__OLD'):
-                    await self._batch_remove_files(key_value_store_folder.path)
-                elif key_value_store_folder.name == 'default':
-                    await self._handle_default_key_value_store(key_value_store_folder.path)
-
-        # Datasets
-        if await ospath.exists(self._datasets_directory):
-            dataset_folders = await scandir(self._datasets_directory)
-            for dataset_folder in dataset_folders:
-                if dataset_folder.name == 'default' or dataset_folder.name.startswith('__APIFY_TEMPORARY'):
-                    await self._batch_remove_files(dataset_folder.path)
-        # Request queues
-        if await ospath.exists(self._request_queues_directory):
-            request_queue_folders = await scandir(self._request_queues_directory)
-            for request_queue_folder in request_queue_folders:
-                if request_queue_folder.name == 'default' or request_queue_folder.name.startswith('__APIFY_TEMPORARY'):
-                    await self._batch_remove_files(request_queue_folder.path)
-
-    async def _handle_default_key_value_store(self: MemoryStorageClient, folder: str) -> None:
-        """Remove everything from the default key-value store folder except `possible_input_keys`."""
-        folder_exists = await ospath.exists(folder)
-        temporary_path = os.path.normpath(os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__'))
-
-        # For optimization, we want to only attempt to copy a few files from the default key-value store
-        possible_input_keys = [
-            'INPUT',
-            'INPUT.json',
-            'INPUT.bin',
-            'INPUT.txt',
-        ]
-
-        if folder_exists:
-            # Create a temporary folder to save important files in
-            Path(temporary_path).mkdir(parents=True, exist_ok=True)
-
-            # Go through each file and save the ones that are important
-            for entity in possible_input_keys:
-                original_file_path = os.path.join(folder, entity)
-                temp_file_path = os.path.join(temporary_path, entity)
-                with contextlib.suppress(Exception):
-                    await rename(original_file_path, temp_file_path)
-
-            # Remove the original folder and all its content
-            counter = 0
-            temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__'))
-            done = False
-            try:
-                while not done:
-                    await rename(folder, temp_path_for_old_folder)
-                    done = True
-            except Exception:
-                counter += 1
-                temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__'))
-
-            # Replace the temporary folder with the original folder
-            await rename(temporary_path, folder)
-
-            # Remove the old folder
-            await self._batch_remove_files(temp_path_for_old_folder)
-
-    async def _batch_remove_files(self: MemoryStorageClient, folder: str, counter: int = 0) -> None:
-        folder_exists = await ospath.exists(folder)
-
-        if folder_exists:
-            temporary_folder = (
-                folder
-                if os.path.basename(folder).startswith('__APIFY_TEMPORARY_')
-                else os.path.normpath(os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__'))
-            )
-
-            try:
-                # Rename the old folder to the new one to allow background deletions
-                await rename(folder, temporary_folder)
-            except Exception:
-                # Folder exists already, try again with an incremented counter
-                return await self._batch_remove_files(folder, counter + 1)
-
-            await aioshutil.rmtree(temporary_folder, ignore_errors=True)
-        return None
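The `_purge_on_start` method removed above is a textbook double-checked lock: a cheap flag check on the hot path, then a re-check under an `asyncio.Lock` so that concurrent callers trigger the purge exactly once. A standalone sketch of the same pattern (the names here are illustrative, not part of the apify API):

```python
import asyncio


class OneTimeTask:
    """Runs an expensive coroutine at most once, even under concurrency."""

    def __init__(self) -> None:
        self._done = False
        self._lock = asyncio.Lock()

    async def run(self) -> None:
        # Optimistic, non-blocking check avoids taking the lock on the hot path.
        if self._done:
            return
        async with self._lock:
            # Re-check under the lock: another task may have finished meanwhile.
            if self._done:
                return
            await asyncio.sleep(0.1)  # stand-in for the actual purge work
            self._done = True


async def main() -> None:
    task = OneTimeTask()
    # All ten concurrent callers return, but the work runs only once.
    await asyncio.gather(*(task.run() for _ in range(10)))


asyncio.run(main())
```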
apify/_memory_storage/resource_clients/__init__.py
@@ -1,19 +0,0 @@
-from .base_resource_client import BaseResourceClient
-from .base_resource_collection_client import BaseResourceCollectionClient
-from .dataset import DatasetClient
-from .dataset_collection import DatasetCollectionClient
-from .key_value_store import KeyValueStoreClient
-from .key_value_store_collection import KeyValueStoreCollectionClient
-from .request_queue import RequestQueueClient
-from .request_queue_collection import RequestQueueCollectionClient
-
-__all__ = [
-    'BaseResourceClient',
-    'BaseResourceCollectionClient',
-    'DatasetClient',
-    'DatasetCollectionClient',
-    'KeyValueStoreClient',
-    'KeyValueStoreCollectionClient',
-    'RequestQueueClient',
-    'RequestQueueCollectionClient',
-]
apify/_memory_storage/resource_clients/base_resource_client.py
@@ -1,141 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING
-
-from apify_shared.utils import ignore_docs
-
-if TYPE_CHECKING:
-    from typing_extensions import Self
-
-    from apify._memory_storage.memory_storage_client import MemoryStorageClient
-
-
-@ignore_docs
-class BaseResourceClient(ABC):
-    """Base class for resource clients."""
-
-    _id: str
-    _name: str | None
-    _resource_directory: str
-
-    @abstractmethod
-    def __init__(
-        self: BaseResourceClient,
-        *,
-        base_storage_directory: str,
-        memory_storage_client: MemoryStorageClient,
-        id: str | None = None,  # noqa: A002
-        name: str | None = None,
-    ) -> None:
-        """Initialize the BaseResourceClient."""
-        raise NotImplementedError('You must override this method in the subclass!')
-
-    @abstractmethod
-    async def get(self: BaseResourceClient) -> dict | None:
-        """Retrieve the storage.
-
-        Returns:
-            dict, optional: The retrieved storage, or None, if it does not exist
-        """
-        raise NotImplementedError('You must override this method in the subclass!')
-
-    @classmethod
-    @abstractmethod
-    def _get_storages_dir(cls: type[BaseResourceClient], memory_storage_client: MemoryStorageClient) -> str:
-        raise NotImplementedError('You must override this method in the subclass!')
-
-    @classmethod
-    @abstractmethod
-    def _get_storage_client_cache(
-        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
-        memory_storage_client: MemoryStorageClient,
-    ) -> list[Self]:
-        raise NotImplementedError('You must override this method in the subclass!')
-
-    @abstractmethod
-    def _to_resource_info(self: BaseResourceClient) -> dict:
-        raise NotImplementedError('You must override this method in the subclass!')
-
-    @classmethod
-    @abstractmethod
-    def _create_from_directory(
-        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
-        storage_directory: str,
-        memory_storage_client: MemoryStorageClient,
-        id: str | None = None,  # noqa: A002
-        name: str | None = None,
-    ) -> Self:
-        raise NotImplementedError('You must override this method in the subclass!')
-
-    @classmethod
-    def _find_or_create_client_by_id_or_name(
-        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
-        memory_storage_client: MemoryStorageClient,
-        id: str | None = None,  # noqa: A002
-        name: str | None = None,
-    ) -> Self | None:
-        assert id is not None or name is not None  # noqa: S101
-
-        storage_client_cache = cls._get_storage_client_cache(memory_storage_client)
-        storages_dir = cls._get_storages_dir(memory_storage_client)
-
-        # First check memory cache
-        found = next(
-            (
-                storage_client
-                for storage_client in storage_client_cache
-                if storage_client._id == id or (storage_client._name and name and storage_client._name.lower() == name.lower())
-            ),
-            None,
-        )
-
-        if found is not None:
-            return found
-
-        storage_path = None
-
-        # First try to find the storage by looking up the directory by name
-        if name:
-            possible_storage_path = os.path.join(storages_dir, name)
-            if os.access(possible_storage_path, os.F_OK):
-                storage_path = possible_storage_path
-
-        # If it's not found, try going through the storages dir and finding it by metadata
-        if not storage_path and os.access(storages_dir, os.F_OK):
-            for entry in os.scandir(storages_dir):
-                if not entry.is_dir():
-                    continue
-                metadata_path = os.path.join(entry.path, '__metadata__.json')
-                if not os.access(metadata_path, os.F_OK):
-                    continue
-                with open(metadata_path, encoding='utf-8') as metadata_file:
-                    metadata = json.load(metadata_file)
-                if id and id == metadata.get('id'):
-                    storage_path = entry.path
-                    name = metadata.get(name)
-                    break
-                if name and name == metadata.get('name'):
-                    storage_path = entry.path
-                    id = metadata.get(id)  # noqa: A001
-                    break
-
-        # As a last resort, try to check if the accessed storage is the default one,
-        # and the folder has no metadata
-        # TODO: make this respect the APIFY_DEFAULT_XXX_ID env var
-        # https://github.com/apify/apify-sdk-python/issues/149
-        if id == 'default':
-            possible_storage_path = os.path.join(storages_dir, id)
-            if os.access(possible_storage_path, os.F_OK):
-                storage_path = possible_storage_path
-
-        if not storage_path:
-            return None
-
-        resource_client = cls._create_from_directory(storage_path, memory_storage_client, id, name)
-
-        storage_client_cache.append(resource_client)
-
-        return resource_client
apify/_memory_storage/resource_clients/base_resource_collection_client.py
@@ -1,114 +0,0 @@
-from __future__ import annotations
-
-from abc import ABC, abstractmethod
-from operator import itemgetter
-from typing import TYPE_CHECKING, Generic, TypeVar, cast
-
-from apify_shared.models import ListPage
-from apify_shared.utils import ignore_docs
-
-from apify._memory_storage.file_storage_utils import update_metadata
-from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient
-
-if TYPE_CHECKING:
-    from apify._memory_storage.memory_storage_client import MemoryStorageClient
-
-
-ResourceClientType = TypeVar('ResourceClientType', bound=BaseResourceClient, contravariant=True)  # noqa: PLC0105
-
-
-@ignore_docs
-class BaseResourceCollectionClient(ABC, Generic[ResourceClientType]):
-    """Base class for resource collection clients."""
-
-    _base_storage_directory: str
-    _memory_storage_client: MemoryStorageClient
-
-    def __init__(
-        self: BaseResourceCollectionClient,
-        *,
-        base_storage_directory: str,
-        memory_storage_client: MemoryStorageClient,
-    ) -> None:
-        """Initialize the DatasetCollectionClient with the passed arguments."""
-        self._base_storage_directory = base_storage_directory
-        self._memory_storage_client = memory_storage_client
-
-    @abstractmethod
-    def _get_storage_client_cache(self: BaseResourceCollectionClient) -> list[ResourceClientType]:
-        raise NotImplementedError('You must override this method in the subclass!')
-
-    @abstractmethod
-    def _get_resource_client_class(self: BaseResourceCollectionClient) -> type[ResourceClientType]:
-        raise NotImplementedError('You must override this method in the subclass!')
-
-    @abstractmethod
-    async def list(self: BaseResourceCollectionClient) -> ListPage:
-        """List the available storages.
-
-        Returns:
-            ListPage: The list of available storages matching the specified filters.
-        """
-        storage_client_cache = self._get_storage_client_cache()
-
-        items = [storage._to_resource_info() for storage in storage_client_cache]
-
-        return ListPage(
-            {
-                'total': len(items),
-                'count': len(items),
-                'offset': 0,
-                'limit': len(items),
-                'desc': False,
-                'items': sorted(items, key=itemgetter('createdAt')),
-            }
-        )
-
-    @abstractmethod
-    async def get_or_create(
-        self: BaseResourceCollectionClient,
-        *,
-        name: str | None = None,
-        schema: dict | None = None,
-        _id: str | None = None,
-    ) -> dict:
-        """Retrieve a named storage, or create a new one when it doesn't exist.
-
-        Args:
-            name (str, optional): The name of the storage to retrieve or create.
-            schema (Dict, optional): The schema of the storage
-
-        Returns:
-            dict: The retrieved or newly-created storage.
-        """
-        resource_client_class = self._get_resource_client_class()
-        storage_client_cache = self._get_storage_client_cache()
-
-        if name or _id:
-            found = resource_client_class._find_or_create_client_by_id_or_name(
-                memory_storage_client=self._memory_storage_client,
-                name=name,
-                id=_id,
-            )
-            if found:
-                resource_info = found._to_resource_info()
-                return cast(dict, resource_info)
-
-        new_resource = resource_client_class(
-            id=_id,
-            name=name,
-            base_storage_directory=self._base_storage_directory,
-            memory_storage_client=self._memory_storage_client,
-        )
-        storage_client_cache.append(new_resource)
-
-        resource_info = new_resource._to_resource_info()
-
-        # Write to the disk
-        await update_metadata(
-            data=resource_info,
-            entity_directory=new_resource._resource_directory,
-            write_metadata=self._memory_storage_client._write_metadata,
-        )
-
-        return cast(dict, resource_info)
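The `get_or_create` method removed above follows a common cache-first flow: look the resource up by name or id in an in-memory cache (and, in the real client, on disk via metadata files), and only construct, cache, and persist a new one when the lookup fails. A stripped-down sketch of that flow, with all names illustrative rather than part of the apify API:

```python
from __future__ import annotations


class Store:
    """Minimal get-or-create over an in-memory cache of resources."""

    def __init__(self) -> None:
        self._cache: list[dict] = []

    def get_or_create(self, name: str | None = None) -> dict:
        # Return the cached resource when the name matches.
        if name is not None:
            found = next((r for r in self._cache if r.get('name') == name), None)
            if found is not None:
                return found
        # Otherwise create it and add it to the cache.
        resource = {'id': str(len(self._cache) + 1), 'name': name}
        self._cache.append(resource)
        # A real implementation would also write __metadata__.json to disk here.
        return resource


store = Store()
a = store.get_or_create('default')
b = store.get_or_create('default')
assert a is b  # the second call hits the cache
```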