apify 1.7.0b1__py3-none-any.whl → 2.2.0b14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of apify might be problematic.
- apify/__init__.py +19 -4
- apify/_actor.py +1030 -0
- apify/_configuration.py +370 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +31 -27
- apify/_models.py +117 -0
- apify/_platform_event_manager.py +231 -0
- apify/_proxy_configuration.py +320 -0
- apify/_utils.py +18 -484
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +68 -0
- apify/apify_storage_client/_dataset_client.py +190 -0
- apify/apify_storage_client/_dataset_collection_client.py +51 -0
- apify/apify_storage_client/_key_value_store_client.py +94 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
- apify/apify_storage_client/_request_queue_client.py +176 -0
- apify/apify_storage_client/_request_queue_collection_client.py +51 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +22 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +29 -27
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +6 -3
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +60 -58
- apify/scrapy/scheduler.py +28 -19
- apify/scrapy/utils.py +10 -32
- apify/storages/__init__.py +4 -10
- apify/storages/_request_list.py +150 -0
- apify/storages/py.typed +0 -0
- apify-2.2.0b14.dist-info/METADATA +211 -0
- apify-2.2.0b14.dist-info/RECORD +38 -0
- {apify-1.7.0b1.dist-info → apify-2.2.0b14.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1351
- apify/config.py +0 -127
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.0b1.dist-info/METADATA +0 -149
- apify-1.7.0b1.dist-info/RECORD +0 -41
- apify-1.7.0b1.dist-info/top_level.txt +0 -1
- {apify-1.7.0b1.dist-info → apify-2.2.0b14.dist-info}/LICENSE +0 -0
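The listing above shows the extent of the restructuring: the public module `apify/actor.py` and the bundled storage implementations under `apify/storages/` (dataset, key-value store, request queue) were removed, while private modules (`apify/_actor.py`, `apify/_configuration.py`) and a new `apify/apify_storage_client/` package were added. The sketch below is a hedged illustration, not taken from this diff; it assumes the `Actor` facade re-exported from `apify/__init__.py` remains the supported entry point, so user code reaches storages through `Actor.open_dataset()` rather than importing `apify.storages.dataset` directly.

import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        # Reach storages through the Actor facade instead of importing
        # `apify.storages.dataset`, which this release removes.
        dataset = await Actor.open_dataset()
        await dataset.push_data({'url': 'https://example.com', 'status': 200})


if __name__ == '__main__':
    asyncio.run(main())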
apify/storages/dataset.py
DELETED
@@ -1,494 +0,0 @@
from __future__ import annotations

import csv
import io
import math
from typing import TYPE_CHECKING, AsyncIterator, Iterable, Iterator

from apify_shared.utils import ignore_docs, json_dumps

from apify._utils import wrap_internal
from apify.consts import MAX_PAYLOAD_SIZE_BYTES
from apify.storages.base_storage import BaseStorage
from apify.storages.key_value_store import KeyValueStore

if TYPE_CHECKING:
    from apify_client import ApifyClientAsync
    from apify_client.clients import DatasetClientAsync, DatasetCollectionClientAsync
    from apify_shared.models import ListPage
    from apify_shared.types import JSONSerializable

    from apify._memory_storage import MemoryStorageClient
    from apify._memory_storage.resource_clients import DatasetClient, DatasetCollectionClient
    from apify.config import Configuration

# 0.01%
SAFETY_BUFFER_PERCENT = 0.01 / 100
EFFECTIVE_LIMIT_BYTES = MAX_PAYLOAD_SIZE_BYTES - math.ceil(MAX_PAYLOAD_SIZE_BYTES * SAFETY_BUFFER_PERCENT)


def _check_and_serialize(item: JSONSerializable, index: int | None = None) -> str:
    """Accept a JSON serializable object as an input, validate its serializability and its serialized size against `EFFECTIVE_LIMIT_BYTES`."""
    s = ' ' if index is None else f' at index {index} '

    try:
        payload = json_dumps(item)
    except Exception as exc:
        raise ValueError(f'Data item{s}is not serializable to JSON.') from exc

    length_bytes = len(payload.encode('utf-8'))
    if length_bytes > EFFECTIVE_LIMIT_BYTES:
        raise ValueError(f'Data item{s}is too large (size: {length_bytes} bytes, limit: {EFFECTIVE_LIMIT_BYTES} bytes)')

    return payload


def _chunk_by_size(items: Iterable[str]) -> Iterator[str]:
    """Take an array of JSONs, produce iterator of chunked JSON arrays respecting `EFFECTIVE_LIMIT_BYTES`.

    Takes an array of JSONs (payloads) as input and produces an iterator of JSON strings
    where each string is a JSON array of payloads with a maximum size of `EFFECTIVE_LIMIT_BYTES` per one
    JSON array. Fits as many payloads as possible into a single JSON array and then moves
    on to the next, preserving item order.

    The function assumes that none of the items is larger than `EFFECTIVE_LIMIT_BYTES` and does not validate.
    """
    last_chunk_bytes = 2  # Add 2 bytes for [] wrapper.
    current_chunk = []

    for payload in items:
        length_bytes = len(payload.encode('utf-8'))

        if last_chunk_bytes + length_bytes <= EFFECTIVE_LIMIT_BYTES:
            current_chunk.append(payload)
            last_chunk_bytes += length_bytes + 1  # Add 1 byte for ',' separator.
        else:
            yield f'[{",".join(current_chunk)}]'
            current_chunk = [payload]
            last_chunk_bytes = length_bytes + 2  # Add 2 bytes for [] wrapper.

    yield f'[{",".join(current_chunk)}]'


class Dataset(BaseStorage):
    """The `Dataset` class represents a store for structured data where each object stored has the same attributes.

    You can imagine it as a table, where each object is a row and its attributes are columns.
    Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records.
    Typically it is used to store crawling results.

    Do not instantiate this class directly, use the `Actor.open_dataset()` function instead.

    `Dataset` stores its data either on local disk or in the Apify cloud,
    depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set.

    If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in
    the local directory in the following files:
    ```
    {APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json
    ```
    Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`,
    unless you override it by setting the `APIFY_DEFAULT_DATASET_ID` environment variable.
    Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the item in the dataset.

    If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
    [Apify Dataset](https://docs.apify.com/storage/dataset) cloud storage.
    """

    _id: str
    _name: str | None
    _dataset_client: DatasetClientAsync | DatasetClient

    @ignore_docs
    def __init__(
        self: Dataset,
        id: str,  # noqa: A002
        name: str | None,
        client: ApifyClientAsync | MemoryStorageClient,
        config: Configuration,
    ) -> None:
        """Create a `Dataset` instance.

        Do not use the constructor directly, use the `Actor.open_dataset()` function instead.

        Args:
            id (str): ID of the dataset.
            name (str, optional): Name of the dataset.
            client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used.
            config (Configuration): The configuration which should be used.
        """
        super().__init__(id=id, name=name, client=client, config=config)

        self.get_data = wrap_internal(self._get_data_internal, self.get_data)  # type: ignore
        self.push_data = wrap_internal(self._push_data_internal, self.push_data)  # type: ignore
        self.export_to_json = wrap_internal(self._export_to_json_internal, self.export_to_json)  # type: ignore
        self.export_to_csv = wrap_internal(self._export_to_csv_internal, self.export_to_csv)  # type: ignore

        self._dataset_client = client.dataset(self._id)

    @classmethod
    def _get_human_friendly_label(cls: type[Dataset]) -> str:
        return 'Dataset'

    @classmethod
    def _get_default_id(cls: type[Dataset], config: Configuration) -> str:
        return config.default_dataset_id

    @classmethod
    def _get_single_storage_client(
        cls: type[Dataset],
        id: str,  # noqa: A002
        client: ApifyClientAsync | MemoryStorageClient,
    ) -> DatasetClientAsync | DatasetClient:
        return client.dataset(id)

    @classmethod
    def _get_storage_collection_client(
        cls: type[Dataset],
        client: ApifyClientAsync | MemoryStorageClient,
    ) -> DatasetCollectionClientAsync | DatasetCollectionClient:
        return client.datasets()

    @classmethod
    async def push_data(cls: type[Dataset], data: JSONSerializable) -> None:
        """Store an object or an array of objects to the dataset.

        The size of the data is limited by the receiving API and therefore `push_data()` will only
        allow objects whose JSON representation is smaller than 9MB. When an array is passed,
        none of the included objects may be larger than 9MB, but the array itself may be of any size.

        Args:
            data (JSONSerializable): dict or array of dicts containing data to be stored in the default dataset.
                The JSON representation of each item must be smaller than 9MB.
        """
        dataset = await cls.open()
        return await dataset.push_data(data)

    async def _push_data_internal(self: Dataset, data: JSONSerializable) -> None:
        # Handle singular items
        if not isinstance(data, list):
            payload = _check_and_serialize(data)
            return await self._dataset_client.push_items(payload)

        # Handle lists
        payloads_generator = (_check_and_serialize(item, index) for index, item in enumerate(data))

        # Invoke client in series to preserve the order of data
        for chunk in _chunk_by_size(payloads_generator):
            await self._dataset_client.push_items(chunk)
        return None

    @classmethod
    async def get_data(
        cls: type[Dataset],
        *,
        offset: int | None = None,
        limit: int | None = None,
        clean: bool | None = None,
        desc: bool | None = None,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: str | None = None,
        skip_empty: bool | None = None,
        skip_hidden: bool | None = None,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> ListPage:
        """Get items from the dataset.

        Args:
            offset (int, optional): Number of items that should be skipped at the start. The default value is 0
            limit (int, optional): Maximum number of items to return. By default there is no limit.
            desc (bool, optional): By default, results are returned in the same order as they were stored.
                To reverse the order, set this parameter to True.
            clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
                The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters.
                Note that since some objects might be skipped from the output, that the result might contain less items than the limit value.
            fields (list of str, optional): A list of fields which should be picked from the items,
                only these fields will remain in the resulting record objects.
                Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
                You can use this feature to effectively fix the output format.
            omit (list of str, optional): A list of fields which should be omitted from the items.
            unwind (str, optional): Name of a field which should be unwound.
                If the field is an array then every element of the array will become a separate record and merged with parent object.
                If the unwound field is an object then it is merged with the parent object.
                If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
                then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
            skip_empty (bool, optional): If True, then empty items are skipped from the output.
                Note that if used, the results might contain less items than the limit value.
            skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.
            flatten (list of str, optional): A list of fields that should be flattened
            view (str, optional): Name of the dataset view to be used

        Returns:
            ListPage: A page of the list of dataset items according to the specified filters.
        """
        dataset = await cls.open()
        return await dataset.get_data(
            offset=offset,
            limit=limit,
            desc=desc,
            clean=clean,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
            flatten=flatten,
            view=view,
        )

    async def _get_data_internal(
        self: Dataset,
        *,
        offset: int | None = None,
        limit: int | None = None,
        clean: bool | None = None,
        desc: bool | None = None,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: str | None = None,
        skip_empty: bool | None = None,
        skip_hidden: bool | None = None,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> ListPage:
        # TODO: Improve error handling here
        # https://github.com/apify/apify-sdk-python/issues/140
        return await self._dataset_client.list_items(
            offset=offset,
            limit=limit,
            desc=desc,
            clean=clean,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
            flatten=flatten,
            view=view,
        )

    async def export_to(
        self: Dataset,
        key: str,
        *,
        to_key_value_store_id: str | None = None,
        to_key_value_store_name: str | None = None,
        content_type: str | None = None,
    ) -> None:
        """Save the entirety of the dataset's contents into one file within a key-value store.

        Args:
            key (str): The key to save the data under.
            to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved.
            to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved.
                You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments.
                If you omit both, it uses the default key-value store.
            content_type (str, optional): Either 'text/csv' or 'application/json'. Defaults to JSON.
        """
        key_value_store = await KeyValueStore.open(id=to_key_value_store_id, name=to_key_value_store_name)
        items: list[dict] = []
        limit = 1000
        offset = 0
        while True:
            list_items = await self._dataset_client.list_items(limit=limit, offset=offset)
            items.extend(list_items.items)
            if list_items.total <= offset + list_items.count:
                break
            offset += list_items.count

        if len(items) == 0:
            raise ValueError('Cannot export an empty dataset')

        if content_type == 'text/csv':
            output = io.StringIO()
            writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
            writer.writerows([items[0].keys(), *[item.values() for item in items]])
            value = output.getvalue()
            return await key_value_store.set_value(key, value, content_type)

        if content_type == 'application/json':
            return await key_value_store.set_value(key, items)

        raise ValueError(f'Unsupported content type: {content_type}')

    @classmethod
    async def export_to_json(
        cls: type[Dataset],
        key: str,
        *,
        from_dataset_id: str | None = None,
        from_dataset_name: str | None = None,
        to_key_value_store_id: str | None = None,
        to_key_value_store_name: str | None = None,
    ) -> None:
        """Save the entirety of the dataset's contents into one JSON file within a key-value store.

        Args:
            key (str): The key to save the data under.
            from_dataset_id (str, optional): The ID of the dataset in case of calling the class method. Uses default dataset if omitted.
            from_dataset_name (str, optional): The name of the dataset in case of calling the class method. Uses default dataset if omitted.
                You must specify only one of `from_dataset_id` and `from_dataset_name` arguments.
                If you omit both, it uses the default dataset.
            to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved.
            to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved.
                You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments.
                If you omit both, it uses the default key-value store.
        """
        dataset = await cls.open(id=from_dataset_id, name=from_dataset_name)
        await dataset.export_to_json(key, to_key_value_store_id=to_key_value_store_id, to_key_value_store_name=to_key_value_store_name)

    async def _export_to_json_internal(
        self: Dataset,
        key: str,
        *,
        from_dataset_id: str | None = None,  # noqa: ARG002
        from_dataset_name: str | None = None,  # noqa: ARG002
        to_key_value_store_id: str | None = None,
        to_key_value_store_name: str | None = None,
    ) -> None:
        await self.export_to(
            key,
            to_key_value_store_id=to_key_value_store_id,
            to_key_value_store_name=to_key_value_store_name,
            content_type='application/json',
        )

    @classmethod
    async def export_to_csv(
        cls: type[Dataset],
        key: str,
        *,
        from_dataset_id: str | None = None,
        from_dataset_name: str | None = None,
        to_key_value_store_id: str | None = None,
        to_key_value_store_name: str | None = None,
    ) -> None:
        """Save the entirety of the dataset's contents into one CSV file within a key-value store.

        Args:
            key (str): The key to save the data under.
            from_dataset_id (str, optional): The ID of the dataset in case of calling the class method. Uses default dataset if omitted.
            from_dataset_name (str, optional): The name of the dataset in case of calling the class method. Uses default dataset if omitted.
                You must specify only one of `from_dataset_id` and `from_dataset_name` arguments.
                If you omit both, it uses the default dataset.
            to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved.
            to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved.
                You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments.
                If you omit both, it uses the default key-value store.
        """
        dataset = await cls.open(id=from_dataset_id, name=from_dataset_name)
        await dataset.export_to_csv(key, to_key_value_store_id=to_key_value_store_id, to_key_value_store_name=to_key_value_store_name)

    async def _export_to_csv_internal(
        self: Dataset,
        key: str,
        *,
        from_dataset_id: str | None = None,  # noqa: ARG002
        from_dataset_name: str | None = None,  # noqa: ARG002
        to_key_value_store_id: str | None = None,
        to_key_value_store_name: str | None = None,
    ) -> None:
        await self.export_to(
            key,
            to_key_value_store_id=to_key_value_store_id,
            to_key_value_store_name=to_key_value_store_name,
            content_type='text/csv',
        )

    async def get_info(self: Dataset) -> dict | None:
        """Get an object containing general information about the dataset.

        Returns:
            dict: Object returned by calling the GET dataset API endpoint.
        """
        return await self._dataset_client.get()

    def iterate_items(
        self: Dataset,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool | None = None,
        desc: bool | None = None,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: str | None = None,
        skip_empty: bool | None = None,
        skip_hidden: bool | None = None,
    ) -> AsyncIterator[dict]:
        """Iterate over the items in the dataset.

        Args:
            offset (int, optional): Number of items that should be skipped at the start. The default value is 0
            limit (int, optional): Maximum number of items to return. By default there is no limit.
            desc (bool, optional): By default, results are returned in the same order as they were stored.
                To reverse the order, set this parameter to True.
            clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
                The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters.
                Note that since some objects might be skipped from the output, that the result might contain less items than the limit value.
            fields (list of str, optional): A list of fields which should be picked from the items,
                only these fields will remain in the resulting record objects.
                Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
                You can use this feature to effectively fix the output format.
            omit (list of str, optional): A list of fields which should be omitted from the items.
            unwind (str, optional): Name of a field which should be unwound.
                If the field is an array then every element of the array will become a separate record and merged with parent object.
                If the unwound field is an object then it is merged with the parent object.
                If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
                then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
            skip_empty (bool, optional): If True, then empty items are skipped from the output.
                Note that if used, the results might contain less items than the limit value.
            skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.

        Yields:
            dict: An item from the dataset
        """
        return self._dataset_client.iterate_items(
            offset=offset,
            limit=limit,
            clean=clean,
            desc=desc,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
        )

    async def drop(self: Dataset) -> None:
        """Remove the dataset either from the Apify cloud storage or from the local directory."""
        await self._dataset_client.delete()
        self._remove_from_cache()

    @classmethod
    async def open(
        cls: type[Dataset],
        *,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
        force_cloud: bool = False,
        config: Configuration | None = None,
    ) -> Dataset:
        """Open a dataset.

        Datasets are used to store structured data where each object stored has the same attributes,
        such as online store products or real estate offers.
        The actual data is stored either on the local filesystem or in the Apify cloud.

        Args:
            id (str, optional): ID of the dataset to be opened.
                If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run.
                If the dataset with the given ID does not exist, it raises an error.
            name (str, optional): Name of the dataset to be opened.
                If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run.
                If the dataset with the given name does not exist, it is created.
            force_cloud (bool, optional): If set to True, it will open a dataset on the Apify Platform even when running the actor locally.
                Defaults to False.
            config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted.

        Returns:
            Dataset: An instance of the `Dataset` class for the given ID or name.
        """
        return await super().open(id=id, name=name, force_cloud=force_cloud, config=config)  # type: ignore