apify-1.7.1b1-py3-none-any.whl → apify-2.2.1-py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of apify might be problematic.
- apify/__init__.py +33 -4
- apify/_actor.py +1074 -0
- apify/_configuration.py +370 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +31 -27
- apify/_models.py +117 -0
- apify/_platform_event_manager.py +231 -0
- apify/_proxy_configuration.py +320 -0
- apify/_utils.py +18 -484
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +68 -0
- apify/apify_storage_client/_dataset_client.py +190 -0
- apify/apify_storage_client/_dataset_collection_client.py +51 -0
- apify/apify_storage_client/_key_value_store_client.py +94 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
- apify/apify_storage_client/_request_queue_client.py +176 -0
- apify/apify_storage_client/_request_queue_collection_client.py +51 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +22 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +29 -27
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +6 -3
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +60 -58
- apify/scrapy/scheduler.py +28 -19
- apify/scrapy/utils.py +10 -32
- apify/storages/__init__.py +4 -10
- apify/storages/_request_list.py +150 -0
- apify/storages/py.typed +0 -0
- apify-2.2.1.dist-info/METADATA +211 -0
- apify-2.2.1.dist-info/RECORD +38 -0
- {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1351
- apify/config.py +0 -127
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.1b1.dist-info/METADATA +0 -149
- apify-1.7.1b1.dist-info/RECORD +0 -41
- apify-1.7.1b1.dist-info/top_level.txt +0 -1
- {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/LICENSE +0 -0
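The listing above shows the 1.x modules (apify/actor.py, apify/_memory_storage/*, apify/storages/dataset.py and friends) being replaced by private 2.x modules such as apify/_actor.py and apify/apify_storage_client/*. The public entry point is still the Actor facade; the following is a minimal sketch of typical 2.x usage, based on the documented Actor interface rather than on code shown in this diff:

```python
# Minimal sketch of the 2.x entry point; not taken from this diff.
import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        # Read the actor input (a dict, or None when no input is set).
        actor_input = await Actor.get_input() or {}

        # Push results; local persistence is handled by the storage client
        # layer instead of the removed apify._memory_storage package.
        await Actor.push_data([{'url': actor_input.get('url'), 'status': 'done'}])


if __name__ == '__main__':
    asyncio.run(main())
```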
apify/_memory_storage/resource_clients/dataset.py (deleted, 452 lines)

@@ -1,452 +0,0 @@

```python
from __future__ import annotations

import asyncio
import json
import os
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, AsyncIterator

import aioshutil
from apify_shared.models import ListPage
from apify_shared.utils import ignore_docs

from apify._crypto import crypto_random_object_id
from apify._memory_storage.file_storage_utils import _update_dataset_items, update_metadata
from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient
from apify._utils import force_rename, raise_on_duplicate_storage, raise_on_non_existing_storage
from apify.consts import StorageTypes

if TYPE_CHECKING:
    from apify_shared.types import JSONSerializable

    from apify._memory_storage.memory_storage_client import MemoryStorageClient

# This is what API returns in the x-apify-pagination-limit
# header when no limit query parameter is used.
LIST_ITEMS_LIMIT = 999_999_999_999

# Number of characters of the dataset item file names.
# E.g.: 000000019.json - 9 digits
LOCAL_ENTRY_NAME_DIGITS = 9


@ignore_docs
class DatasetClient(BaseResourceClient):
    """Sub-client for manipulating a single dataset."""

    _id: str
    _resource_directory: str
    _memory_storage_client: MemoryStorageClient
    _name: str | None
    _dataset_entries: dict[str, dict]
    _created_at: datetime
    _accessed_at: datetime
    _modified_at: datetime
    _item_count = 0
    _file_operation_lock: asyncio.Lock

    def __init__(
        self: DatasetClient,
        *,
        base_storage_directory: str,
        memory_storage_client: MemoryStorageClient,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
    ) -> None:
        """Initialize the DatasetClient."""
        self._id = id or crypto_random_object_id()
        self._resource_directory = os.path.join(base_storage_directory, name or self._id)
        self._memory_storage_client = memory_storage_client
        self._name = name
        self._dataset_entries = {}
        self._created_at = datetime.now(timezone.utc)
        self._accessed_at = datetime.now(timezone.utc)
        self._modified_at = datetime.now(timezone.utc)
        self._file_operation_lock = asyncio.Lock()

    async def get(self: DatasetClient) -> dict | None:
        """Retrieve the dataset.

        Returns:
            dict, optional: The retrieved dataset, or None, if it does not exist
        """
        found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)

        if found:
            async with found._file_operation_lock:
                await found._update_timestamps(has_been_modified=False)
                return found._to_resource_info()

        return None

    async def update(self: DatasetClient, *, name: str | None = None) -> dict:
        """Update the dataset with specified fields.

        Args:
            name (str, optional): The new name for the dataset

        Returns:
            dict: The updated dataset
        """
        # Check by id
        existing_dataset_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client,
            id=self._id,
            name=self._name,
        )

        if existing_dataset_by_id is None:
            raise_on_non_existing_storage(StorageTypes.DATASET, self._id)

        # Skip if no changes
        if name is None:
            return existing_dataset_by_id._to_resource_info()

        async with existing_dataset_by_id._file_operation_lock:
            # Check that name is not in use already
            existing_dataset_by_name = next(
                (dataset for dataset in self._memory_storage_client._datasets_handled if dataset._name and dataset._name.lower() == name.lower()),
                None,
            )

            if existing_dataset_by_name is not None:
                raise_on_duplicate_storage(StorageTypes.DATASET, 'name', name)

            existing_dataset_by_id._name = name

            previous_dir = existing_dataset_by_id._resource_directory

            existing_dataset_by_id._resource_directory = os.path.join(self._memory_storage_client._datasets_directory, name)

            await force_rename(previous_dir, existing_dataset_by_id._resource_directory)

            # Update timestamps
            await existing_dataset_by_id._update_timestamps(has_been_modified=True)

        return existing_dataset_by_id._to_resource_info()

    async def delete(self: DatasetClient) -> None:
        """Delete the dataset."""
        dataset = next((dataset for dataset in self._memory_storage_client._datasets_handled if dataset._id == self._id), None)

        if dataset is not None:
            async with dataset._file_operation_lock:
                self._memory_storage_client._datasets_handled.remove(dataset)
                dataset._item_count = 0
                dataset._dataset_entries.clear()

                if os.path.exists(dataset._resource_directory):
                    await aioshutil.rmtree(dataset._resource_directory)

    async def list_items(
        self: DatasetClient,
        *,
        offset: int | None = 0,
        limit: int | None = LIST_ITEMS_LIMIT,
        clean: bool | None = None,  # noqa: ARG002
        desc: bool | None = None,
        fields: list[str] | None = None,  # noqa: ARG002
        omit: list[str] | None = None,  # noqa: ARG002
        unwind: str | None = None,  # noqa: ARG002
        skip_empty: bool | None = None,  # noqa: ARG002
        skip_hidden: bool | None = None,  # noqa: ARG002
        flatten: list[str] | None = None,  # noqa: ARG002
        view: str | None = None,  # noqa: ARG002
    ) -> ListPage:
        """List the items of the dataset.

        Args:
            offset (int, optional): Number of items that should be skipped at the start. The default value is 0
            limit (int, optional): Maximum number of items to return. By default there is no limit.
            desc (bool, optional): By default, results are returned in the same order as they were stored.
                To reverse the order, set this parameter to True.
            clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
                The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters.
                Note that since some objects might be skipped from the output, that the result might contain less items than the limit value.
            fields (list of str, optional): A list of fields which should be picked from the items,
                only these fields will remain in the resulting record objects.
                Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
                You can use this feature to effectively fix the output format.
            omit (list of str, optional): A list of fields which should be omitted from the items.
            unwind (str, optional): Name of a field which should be unwound.
                If the field is an array then every element of the array will become a separate record and merged with parent object.
                If the unwound field is an object then it is merged with the parent object.
                If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
                then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
            skip_empty (bool, optional): If True, then empty items are skipped from the output.
                Note that if used, the results might contain less items than the limit value.
            skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.
            flatten (list of str, optional): A list of fields that should be flattened
            view (str, optional): Name of the dataset view to be used

        Returns:
            ListPage: A page of the list of dataset items according to the specified filters.
        """
        # Check by id
        existing_dataset_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client,
            id=self._id,
            name=self._name,
        )

        if existing_dataset_by_id is None:
            raise_on_non_existing_storage(StorageTypes.DATASET, self._id)

        async with existing_dataset_by_id._file_operation_lock:
            start, end = existing_dataset_by_id._get_start_and_end_indexes(
                max(existing_dataset_by_id._item_count - (offset or 0) - (limit or LIST_ITEMS_LIMIT), 0) if desc else offset or 0,
                limit,
            )

            items = []

            for idx in range(start, end):
                entry_number = self._generate_local_entry_name(idx)
                items.append(existing_dataset_by_id._dataset_entries[entry_number])

            await existing_dataset_by_id._update_timestamps(has_been_modified=False)

            if desc:
                items.reverse()

            return ListPage(
                {
                    'count': len(items),
                    'desc': desc or False,
                    'items': items,
                    'limit': limit or LIST_ITEMS_LIMIT,
                    'offset': offset or 0,
                    'total': existing_dataset_by_id._item_count,
                }
            )

    async def iterate_items(
        self: DatasetClient,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool | None = None,  # noqa: ARG002
        desc: bool | None = None,
        fields: list[str] | None = None,  # noqa: ARG002
        omit: list[str] | None = None,  # noqa: ARG002
        unwind: str | None = None,  # noqa: ARG002
        skip_empty: bool | None = None,  # noqa: ARG002
        skip_hidden: bool | None = None,  # noqa: ARG002
    ) -> AsyncIterator[dict]:
        """Iterate over the items in the dataset.

        Args:
            offset (int, optional): Number of items that should be skipped at the start. The default value is 0
            limit (int, optional): Maximum number of items to return. By default there is no limit.
            desc (bool, optional): By default, results are returned in the same order as they were stored.
                To reverse the order, set this parameter to True.
            clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
                The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters.
                Note that since some objects might be skipped from the output, that the result might contain less items than the limit value.
            fields (list of str, optional): A list of fields which should be picked from the items,
                only these fields will remain in the resulting record objects.
                Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
                You can use this feature to effectively fix the output format.
            omit (list of str, optional): A list of fields which should be omitted from the items.
            unwind (str, optional): Name of a field which should be unwound.
                If the field is an array then every element of the array will become a separate record and merged with parent object.
                If the unwound field is an object then it is merged with the parent object.
                If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
                then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
            skip_empty (bool, optional): If True, then empty items are skipped from the output.
                Note that if used, the results might contain less items than the limit value.
            skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.

        Yields:
            dict: An item from the dataset
        """
        cache_size = 1000
        first_item = offset

        # If there is no limit, set last_item to None until we get the total from the first API response
        last_item = None if limit is None else offset + limit

        current_offset = first_item
        while last_item is None or current_offset < last_item:
            current_limit = cache_size if last_item is None else min(cache_size, last_item - current_offset)

            current_items_page = await self.list_items(
                offset=current_offset,
                limit=current_limit,
                desc=desc,
            )

            current_offset += current_items_page.count
            if last_item is None or current_items_page.total < last_item:
                last_item = current_items_page.total

            for item in current_items_page.items:
                yield item

    async def get_items_as_bytes(self: DatasetClient, *_args: Any, **_kwargs: Any) -> bytes:
        raise NotImplementedError('This method is not supported in local memory storage.')

    async def stream_items(self: DatasetClient, *_args: Any, **_kwargs: Any) -> AsyncIterator:
        raise NotImplementedError('This method is not supported in local memory storage.')

    async def push_items(self: DatasetClient, items: JSONSerializable) -> None:
        """Push items to the dataset.

        Args:
            items: The items which to push in the dataset. Either a stringified JSON, a dictionary, or a list of strings or dictionaries.
        """
        # Check by id
        existing_dataset_by_id = self._find_or_create_client_by_id_or_name(
            memory_storage_client=self._memory_storage_client, id=self._id, name=self._name
        )

        if existing_dataset_by_id is None:
            raise_on_non_existing_storage(StorageTypes.DATASET, self._id)

        normalized = self._normalize_items(items)

        added_ids: list[str] = []
        for entry in normalized:
            existing_dataset_by_id._item_count += 1
            idx = self._generate_local_entry_name(existing_dataset_by_id._item_count)

            existing_dataset_by_id._dataset_entries[idx] = entry
            added_ids.append(idx)

        data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids]  # noqa: A001

        async with existing_dataset_by_id._file_operation_lock:
            await existing_dataset_by_id._update_timestamps(has_been_modified=True)

            await _update_dataset_items(
                data=data_entries,
                entity_directory=existing_dataset_by_id._resource_directory,
                persist_storage=self._memory_storage_client._persist_storage,
            )

    def _to_resource_info(self: DatasetClient) -> dict:
        """Retrieve the dataset info."""
        return {
            'id': self._id,
            'name': self._name,
            'itemCount': self._item_count,
            'accessedAt': self._accessed_at,
            'createdAt': self._created_at,
            'modifiedAt': self._modified_at,
        }

    async def _update_timestamps(self: DatasetClient, has_been_modified: bool) -> None:  # noqa: FBT001
        """Update the timestamps of the dataset."""
        self._accessed_at = datetime.now(timezone.utc)

        if has_been_modified:
            self._modified_at = datetime.now(timezone.utc)

        dataset_info = self._to_resource_info()
        await update_metadata(
            data=dataset_info,
            entity_directory=self._resource_directory,
            write_metadata=self._memory_storage_client._write_metadata,
        )

    def _get_start_and_end_indexes(self: DatasetClient, offset: int, limit: int | None = None) -> tuple[int, int]:
        actual_limit = limit or self._item_count
        start = offset + 1
        end = min(offset + actual_limit, self._item_count) + 1
        return (start, end)

    def _generate_local_entry_name(self: DatasetClient, idx: int) -> str:
        return str(idx).zfill(LOCAL_ENTRY_NAME_DIGITS)

    def _normalize_items(self: DatasetClient, items: JSONSerializable) -> list[dict]:
        def normalize_item(item: Any) -> dict | None:
            if isinstance(item, str):
                item = json.loads(item)

            if isinstance(item, list):
                received = ',\n'.join(item)
                raise TypeError(f'Each dataset item can only be a single JSON object, not an array. Received: [{received}]')

            if (not isinstance(item, dict)) and item is not None:
                raise TypeError(f'Each dataset item must be a JSON object. Received: {item}')

            return item

        if isinstance(items, str):
            items = json.loads(items)

        result = list(map(normalize_item, items)) if isinstance(items, list) else [normalize_item(items)]
        # filter(None, ..) returns items that are True
        return list(filter(None, result))

    @classmethod
    def _get_storages_dir(cls: type[DatasetClient], memory_storage_client: MemoryStorageClient) -> str:
        return memory_storage_client._datasets_directory

    @classmethod
    def _get_storage_client_cache(
        cls: type[DatasetClient],
        memory_storage_client: MemoryStorageClient,
    ) -> list[DatasetClient]:
        return memory_storage_client._datasets_handled

    @classmethod
    def _create_from_directory(
        cls: type[DatasetClient],
        storage_directory: str,
        memory_storage_client: MemoryStorageClient,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
    ) -> DatasetClient:
        item_count = 0
        created_at = datetime.now(timezone.utc)
        accessed_at = datetime.now(timezone.utc)
        modified_at = datetime.now(timezone.utc)
        entries: dict[str, dict] = {}

        has_seen_metadata_file = False

        # Access the dataset folder
        for entry in os.scandir(storage_directory):
            if entry.is_file():
                if entry.name == '__metadata__.json':
                    has_seen_metadata_file = True

                    # We have found the dataset's metadata file, build out information based on it
                    with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
                        metadata = json.load(f)
                    id = metadata['id']  # noqa: A001
                    name = metadata['name']
                    item_count = metadata['itemCount']
                    created_at = datetime.fromisoformat(metadata['createdAt'])
                    accessed_at = datetime.fromisoformat(metadata['accessedAt'])
                    modified_at = datetime.fromisoformat(metadata['modifiedAt'])

                    continue

                with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
                    entry_content = json.load(f)
                entry_name = entry.name.split('.')[0]

                entries[entry_name] = entry_content

                if not has_seen_metadata_file:
                    item_count += 1

        new_client = DatasetClient(
            base_storage_directory=memory_storage_client._datasets_directory,
            memory_storage_client=memory_storage_client,
            id=id,
            name=name,
        )

        # Overwrite properties
        new_client._accessed_at = accessed_at
        new_client._created_at = created_at
        new_client._modified_at = modified_at
        new_client._item_count = item_count

        for entry_id, content in entries.items():
            new_client._dataset_entries[entry_id] = content

        return new_client
```
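For context, the deleted DatasetClient stored every item in its own zero-padded JSON file and paged over them with a 1-based, half-open index window. That arithmetic can be reproduced standalone; the snippet below mirrors the removed `_generate_local_entry_name` and `_get_start_and_end_indexes` helpers (the standalone function names are illustrative):

```python
# Standalone reproduction of the naming/paging arithmetic from the removed DatasetClient.
LOCAL_ENTRY_NAME_DIGITS = 9


def generate_local_entry_name(idx: int) -> str:
    # Item number 19 becomes '000000019', matching the 000000019.json file name.
    return str(idx).zfill(LOCAL_ENTRY_NAME_DIGITS)


def get_start_and_end_indexes(offset: int, item_count: int, limit: int | None = None) -> tuple[int, int]:
    # Entry names are 1-based, so the window is shifted by one and capped at item_count.
    actual_limit = limit or item_count
    start = offset + 1
    end = min(offset + actual_limit, item_count) + 1
    return (start, end)


assert generate_local_entry_name(19) == '000000019'
assert get_start_and_end_indexes(offset=0, item_count=5, limit=2) == (1, 3)
```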
apify/_memory_storage/resource_clients/dataset_collection.py (deleted, 48 lines)

@@ -1,48 +0,0 @@

```python
from __future__ import annotations

from typing import TYPE_CHECKING

from apify_shared.utils import ignore_docs

from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient
from apify._memory_storage.resource_clients.dataset import DatasetClient

if TYPE_CHECKING:
    from apify_shared.models import ListPage


@ignore_docs
class DatasetCollectionClient(BaseResourceCollectionClient):
    """Sub-client for manipulating datasets."""

    def _get_storage_client_cache(self: DatasetCollectionClient) -> list[DatasetClient]:
        return self._memory_storage_client._datasets_handled

    def _get_resource_client_class(self: DatasetCollectionClient) -> type[DatasetClient]:
        return DatasetClient

    async def list(self: DatasetCollectionClient) -> ListPage:
        """List the available datasets.

        Returns:
            ListPage: The list of available datasets matching the specified filters.
        """
        return await super().list()

    async def get_or_create(
        self: DatasetCollectionClient,
        *,
        name: str | None = None,
        schema: dict | None = None,
        _id: str | None = None,
    ) -> dict:
        """Retrieve a named dataset, or create a new one when it doesn't exist.

        Args:
            name (str, optional): The name of the dataset to retrieve or create.
            schema (dict, optional): The schema of the dataset

        Returns:
            dict: The retrieved or newly-created dataset.
        """
        return await super().get_or_create(name=name, schema=schema, _id=_id)
```
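In 2.x there is no local DatasetCollectionClient; named datasets are opened or created through the Actor facade instead. A rough equivalent of the removed get_or_create, assuming the public Actor.open_dataset helper (the dataset name below is illustrative), might look like this:

```python
# Rough 2.x equivalent of DatasetCollectionClient.get_or_create; a sketch, not code from this diff.
import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        # Opened if a dataset with this name exists, created otherwise.
        dataset = await Actor.open_dataset(name='my-results')
        await dataset.push_data({'example': True})


if __name__ == '__main__':
    asyncio.run(main())
```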