apify 2.7.3__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of apify might be problematic.
- apify/_actor.py +194 -126
- apify/_charging.py +34 -9
- apify/_configuration.py +70 -6
- apify/_crypto.py +0 -6
- apify/_models.py +7 -7
- apify/_proxy_configuration.py +10 -10
- apify/_utils.py +25 -2
- apify/events/__init__.py +5 -0
- apify/events/_apify_event_manager.py +140 -0
- apify/events/_types.py +102 -0
- apify/log.py +0 -9
- apify/request_loaders/__init__.py +18 -0
- apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
- apify/request_loaders/py.typed +0 -0
- apify/scrapy/_logging_config.py +1 -4
- apify/scrapy/extensions/_httpcache.py +9 -5
- apify/scrapy/requests.py +3 -3
- apify/scrapy/scheduler.py +8 -5
- apify/storage_clients/__init__.py +12 -0
- apify/storage_clients/_apify/__init__.py +11 -0
- apify/storage_clients/_apify/_dataset_client.py +328 -0
- apify/storage_clients/_apify/_key_value_store_client.py +265 -0
- apify/storage_clients/_apify/_models.py +131 -0
- apify/storage_clients/_apify/_request_queue_client.py +327 -0
- apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
- apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
- apify/storage_clients/_apify/_storage_client.py +106 -0
- apify/storage_clients/_apify/_utils.py +194 -0
- apify/storage_clients/_apify/py.typed +0 -0
- apify/storage_clients/_file_system/__init__.py +2 -0
- apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
- apify/storage_clients/_file_system/_storage_client.py +41 -0
- apify/storage_clients/_smart_apify/__init__.py +1 -0
- apify/storage_clients/_smart_apify/_storage_client.py +117 -0
- apify/storage_clients/py.typed +0 -0
- apify/storages/__init__.py +1 -3
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
- apify-3.0.0.dist-info/RECORD +57 -0
- apify/_platform_event_manager.py +0 -231
- apify/apify_storage_client/__init__.py +0 -3
- apify/apify_storage_client/_apify_storage_client.py +0 -72
- apify/apify_storage_client/_dataset_client.py +0 -190
- apify/apify_storage_client/_dataset_collection_client.py +0 -51
- apify/apify_storage_client/_key_value_store_client.py +0 -109
- apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
- apify/apify_storage_client/_request_queue_client.py +0 -176
- apify/apify_storage_client/_request_queue_collection_client.py +0 -51
- apify-2.7.3.dist-info/RECORD +0 -44
- /apify/{apify_storage_client → events}/py.typed +0 -0
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
apify/log.py
CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import logging
 
-from apify_shared.utils import ignore_docs
 from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
 
 # Name of the logger used throughout the library (resolves to 'apify')
@@ -12,7 +11,6 @@ logger_name = __name__.split('.')[0]
 logger = logging.getLogger(logger_name)
 
 
-@ignore_docs
 class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 (Inherited from parent class)
     pass
 
@@ -29,13 +27,6 @@ def _configure_logging() -> None:
     else:
         apify_client_logger.setLevel(level)
 
-    # Silence HTTPX logger unless debug logging is requested
-    httpx_logger = logging.getLogger('httpx')
-    if level > logging.DEBUG:
-        httpx_logger.setLevel(logging.WARNING)
-    else:
-        httpx_logger.setLevel(level)
-
     # Use configured log level for apify logger
     apify_logger = logging.getLogger('apify')
     configure_logger(apify_logger, remove_old_handlers=True)

apify/request_loaders/__init__.py
ADDED
@@ -0,0 +1,18 @@
+from crawlee.request_loaders import (
+    RequestList,
+    RequestLoader,
+    RequestManager,
+    RequestManagerTandem,
+    SitemapRequestLoader,
+)
+
+from ._apify_request_list import ApifyRequestList
+
+__all__ = [
+    'ApifyRequestList',
+    'RequestList',
+    'RequestLoader',
+    'RequestManager',
+    'RequestManagerTandem',
+    'SitemapRequestLoader',
+]

apify/{storages/_request_list.py → request_loaders/_apify_request_list.py}
RENAMED
@@ -3,16 +3,15 @@ from __future__ import annotations
 import asyncio
 import re
 from asyncio import Task
-from
-from typing import Annotated, Any, Union
+from typing import Annotated, Any
 
 from pydantic import BaseModel, Field, TypeAdapter
 
-from crawlee import Request
 from crawlee._types import HttpMethod
-from crawlee.http_clients import HttpClient,
-from crawlee.request_loaders import RequestList as CrawleeRequestList
+from crawlee.http_clients import HttpClient, ImpitHttpClient
+from crawlee.request_loaders import RequestList
 
+from apify import Request
 from apify._utils import docs_group
 
 URL_NO_COMMAS_REGEX = re.compile(
@@ -35,11 +34,11 @@ class _SimpleUrlInput(_RequestDetails):
     url: str
 
 
-url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])
+url_input_adapter = TypeAdapter(list[_RequestsFromUrlInput | _SimpleUrlInput])
 
 
-@docs_group('
-class RequestList(CrawleeRequestList):
+@docs_group('Request loaders')
+class ApifyRequestList(RequestList):
     """Extends crawlee RequestList.
 
     Method open is used to create RequestList from actor's requestListSources input.
@@ -50,7 +49,7 @@ class RequestList(CrawleeRequestList):
         name: str | None = None,
         request_list_sources_input: list[dict[str, Any]] | None = None,
         http_client: HttpClient | None = None,
-    ) -> RequestList:
+    ) -> ApifyRequestList:
         """Initialize a new instance from request list source input.
 
         Args:
@@ -74,24 +73,26 @@ class RequestList(CrawleeRequestList):
         ```
         """
         request_list_sources_input = request_list_sources_input or []
-        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+        return await ApifyRequestList._create_request_list(name, request_list_sources_input, http_client)
 
     @staticmethod
     async def _create_request_list(
        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None
-    ) -> RequestList:
+    ) -> ApifyRequestList:
         if not http_client:
-            http_client =
+            http_client = ImpitHttpClient()
 
         url_inputs = url_input_adapter.validate_python(request_list_sources_input)
 
         simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
         remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
 
-        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
-        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+        simple_url_requests = ApifyRequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await ApifyRequestList._fetch_requests_from_url(
+            remote_url_inputs, http_client=http_client
+        )
 
-        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+        return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests)
 
     @staticmethod
     def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
@@ -119,13 +120,15 @@ class RequestList(CrawleeRequestList):
         """
         created_requests: list[Request] = []
 
-        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+        async def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
             """Extract links from response body and use them to create `Request` objects.
 
             Use the regular expression to find all matching links in the response body, then create `Request`
             objects from these links and the provided input attributes.
             """
-
+            response = await (task.result()).read()
+            matches = re.finditer(URL_NO_COMMAS_REGEX, response.decode('utf-8'))
+
             created_requests.extend(
                 [
                     Request.from_url(
@@ -148,7 +151,11 @@ class RequestList(CrawleeRequestList):
                 )
             )
 
-            get_response_task.add_done_callback(
+            get_response_task.add_done_callback(
+                lambda task, inp=remote_url_requests_input: asyncio.create_task(  # type: ignore[misc]
+                    create_requests_from_response(inp, task)
+                )
+            )
             remote_url_requests.append(get_response_task)
 
         await asyncio.gather(*remote_url_requests)

apify/request_loaders/py.typed
File without changes
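
For orientation, the following is a minimal, hypothetical usage sketch of the new request_loaders module. It assumes only what the diff above shows (the ApifyRequestList.open signature) plus the standard loader methods inherited from crawlee's RequestList (fetch_next_request, mark_request_as_handled); the input dictionaries are illustrative, not taken from this release.

import asyncio

from apify.request_loaders import ApifyRequestList


async def main() -> None:
    # Illustrative `requestListSources`-style actor input; a remote list would
    # instead use an entry such as {'requestsFromUrl': '<url of a URL list>'}.
    sources = [
        {'url': 'https://example.com', 'method': 'GET'},
        {'url': 'https://apify.com', 'method': 'GET'},
    ]

    # Build the request list from the sources input (default HTTP client is used).
    request_list = await ApifyRequestList.open(request_list_sources_input=sources)

    # Drain the loader using the interface inherited from crawlee's RequestList.
    while request := await request_list.fetch_next_request():
        print(request.url)
        await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())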
apify/scrapy/_logging_config.py
CHANGED
@@ -10,7 +10,7 @@ from apify.log import ActorLogFormatter
 
 # Define logger names.
 _PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
-_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', '
+_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'protego', 'twisted']
 _ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
 
 
@@ -37,9 +37,6 @@ def initialize_logging() -> None:
     for logger_name in [None, *_ALL_LOGGERS]:
         _configure_logger(logger_name, logging_level, handler)
 
-    # Set the 'httpx' logger to a less verbose level.
-    logging.getLogger('httpx').setLevel('WARNING')
-
     # Monkey-patch Scrapy's logging configuration to re-apply our settings.
     original_configure_logging = scrapy_logging.configure_logging
 

apify/scrapy/extensions/_httpcache.py
CHANGED
@@ -13,8 +13,8 @@ from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
 
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
 from apify.scrapy._async_thread import AsyncThread
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import KeyValueStore
 
 if TYPE_CHECKING:
@@ -51,10 +51,14 @@ class ApifyCacheStorage:
         kvs_name = get_kvs_name(spider.name)
 
         async def open_kvs() -> KeyValueStore:
-
-            if
-                storage_client = ApifyStorageClient
-                return await KeyValueStore.open(
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await KeyValueStore.open(
+                    name=kvs_name,
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
             return await KeyValueStore.open(name=kvs_name)
 
         logger.debug("Starting background thread for cache storage's event loop")
apify/scrapy/requests.py
CHANGED
@@ -10,9 +10,10 @@ from scrapy import Spider
 from scrapy.http.headers import Headers
 from scrapy.utils.request import request_from_dict
 
-from crawlee import Request as ApifyRequest
 from crawlee._types import HttpHeaders
 
+from apify import Request as ApifyRequest
+
 logger = getLogger(__name__)
 
 
@@ -121,7 +122,7 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
 
         # Update the meta field with the meta field from the apify_request
         meta = scrapy_request.meta or {}
-        meta.update({'
+        meta.update({'apify_request_unique_key': apify_request.unique_key})
         # scrapy_request.meta is a property, so we have to set it like this
         scrapy_request._meta = meta  # noqa: SLF001
 
@@ -133,7 +134,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
             url=apify_request.url,
             method=apify_request.method,
             meta={
-                'apify_request_id': apify_request.id,
                 'apify_request_unique_key': apify_request.unique_key,
             },
         )
apify/scrapy/scheduler.py
CHANGED
@@ -11,7 +11,7 @@ from scrapy.utils.reactor import is_asyncio_reactor_installed
 from ._async_thread import AsyncThread
 from .requests import to_apify_request, to_scrapy_request
 from apify import Configuration
-from apify.
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import RequestQueue
 
 if TYPE_CHECKING:
@@ -49,10 +49,13 @@ class ApifyScheduler(BaseScheduler):
         self.spider = spider
 
         async def open_rq() -> RequestQueue:
-
-            if
-                storage_client = ApifyStorageClient
-                return await RequestQueue.open(
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await RequestQueue.open(
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
             return await RequestQueue.open()
 
         try:

apify/storage_clients/__init__.py
ADDED
@@ -0,0 +1,12 @@
+from crawlee.storage_clients import MemoryStorageClient
+
+from ._apify import ApifyStorageClient
+from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
+from ._smart_apify import SmartApifyStorageClient
+
+__all__ = [
+    'ApifyStorageClient',
+    'FileSystemStorageClient',
+    'MemoryStorageClient',
+    'SmartApifyStorageClient',
+]
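
As a brief, hedged sketch of how the clients re-exported above can be wired together, the snippet below mirrors the is_at_home pattern used in the Scrapy scheduler and cache-storage diffs; everything in it comes from code shown elsewhere in this diff, and the function name is illustrative.

import asyncio

from apify import Configuration
from apify.storage_clients import ApifyStorageClient, FileSystemStorageClient
from apify.storages import KeyValueStore


async def open_default_kvs() -> KeyValueStore:
    configuration = Configuration.get_global_configuration()

    # On the Apify platform, use the platform API; locally, fall back to file-system storage.
    storage_client = ApifyStorageClient() if configuration.is_at_home else FileSystemStorageClient()

    return await KeyValueStore.open(
        configuration=configuration,
        storage_client=storage_client,
    )


if __name__ == '__main__':
    asyncio.run(open_default_kvs())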

apify/storage_clients/_apify/__init__.py
ADDED
@@ -0,0 +1,11 @@
+from ._dataset_client import ApifyDatasetClient
+from ._key_value_store_client import ApifyKeyValueStoreClient
+from ._request_queue_client import ApifyRequestQueueClient
+from ._storage_client import ApifyStorageClient
+
+__all__ = [
+    'ApifyDatasetClient',
+    'ApifyKeyValueStoreClient',
+    'ApifyRequestQueueClient',
+    'ApifyStorageClient',
+]

apify/storage_clients/_apify/_dataset_client.py
ADDED
@@ -0,0 +1,328 @@
+from __future__ import annotations
+
+import asyncio
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from typing_extensions import override
+
+from apify_client import ApifyClientAsync
+from crawlee._utils.byte_size import ByteSize
+from crawlee._utils.file import json_dumps
+from crawlee.storage_clients._base import DatasetClient
+from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+from crawlee.storages import Dataset
+
+from ._utils import AliasResolver
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from apify_client.clients import DatasetClientAsync
+    from crawlee._types import JsonSerializable
+
+    from apify import Configuration
+
+logger = getLogger(__name__)
+
+
+class ApifyDatasetClient(DatasetClient):
+    """An Apify platform implementation of the dataset client."""
+
+    _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9)
+    """Maximum size for a single payload."""
+
+    _SAFETY_BUFFER_COEFFICIENT = 0.01 / 100  # 0.01%
+    """Percentage buffer to reduce payload limit slightly for safety."""
+
+    _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_COEFFICIENT)
+    """Calculated payload limit considering safety buffer."""
+
+    def __init__(
+        self,
+        *,
+        api_client: DatasetClientAsync,
+        api_public_base_url: str,
+        lock: asyncio.Lock,
+    ) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `ApifyDatasetClient.open` class method to create a new instance.
+        """
+        self._api_client = api_client
+        """The Apify dataset client for API operations."""
+
+        self._api_public_base_url = api_public_base_url
+        """The public base URL for accessing the key-value store records."""
+
+        self._lock = lock
+        """A lock to ensure that only one operation is performed at a time."""
+
+    @override
+    async def get_metadata(self) -> DatasetMetadata:
+        metadata = await self._api_client.get()
+        return DatasetMetadata.model_validate(metadata)
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        configuration: Configuration,
+    ) -> ApifyDatasetClient:
+        """Open an Apify dataset client.
+
+        This method creates and initializes a new instance of the Apify dataset client.
+        It handles authentication, storage lookup/creation, and metadata retrieval.
+
+        Args:
+            id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
+                Mutually exclusive with name and alias.
+            name: The name of the dataset to open (global scope, persists across runs).
+                Mutually exclusive with id and alias.
+            alias: The alias of the dataset to open (run scope, creates unnamed storage).
+                Mutually exclusive with id and name.
+            configuration: The configuration object containing API credentials and settings. Must include a valid
+                `token` and `api_base_url`. May also contain a `default_dataset_id` for fallback when neither
+                `id`, `name`, nor `alias` is provided.
+
+        Returns:
+            An instance for the opened or created storage client.
+
+        Raises:
+            ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
+                `id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
+                in the configuration.
+        """
+        if sum(1 for param in [id, name, alias] if param is not None) > 1:
+            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+
+        token = configuration.token
+        if not token:
+            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
+
+        api_url = configuration.api_base_url
+        if not api_url:
+            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
+
+        api_public_base_url = configuration.api_public_base_url
+        if not api_public_base_url:
+            raise ValueError(
+                'Apify storage client requires a valid API public base URL in Configuration '
+                f'(api_public_base_url={api_public_base_url}).'
+            )
+
+        # Create Apify client with the provided token and API URL.
+        apify_client_async = ApifyClientAsync(
+            token=token,
+            api_url=api_url,
+            max_retries=8,
+            min_delay_between_retries_millis=500,
+            timeout_secs=360,
+        )
+        apify_datasets_client = apify_client_async.datasets()
+
+        # Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed
+        # storage aliased as `__default__`
+        if not any([alias, name, id, configuration.default_dataset_id]):
+            alias = '__default__'
+
+        if alias:
+            # Check if there is pre-existing alias mapping in the default KVS.
+            async with AliasResolver(storage_type=Dataset, alias=alias, configuration=configuration) as _alias:
+                id = await _alias.resolve_id()
+
+                # There was no pre-existing alias in the mapping.
+                # Create a new unnamed storage and store the mapping.
+                if id is None:
+                    new_storage_metadata = DatasetMetadata.model_validate(
+                        await apify_datasets_client.get_or_create(),
+                    )
+                    id = new_storage_metadata.id
+                    await _alias.store_mapping(storage_id=id)
+
+        # If name is provided, get or create the storage by name.
+        elif name:
+            id = DatasetMetadata.model_validate(
+                await apify_datasets_client.get_or_create(name=name),
+            ).id
+
+        # If none are provided, try to get the default storage ID from environment variables.
+        elif id is None:
+            id = configuration.default_dataset_id
+            if not id:
+                raise ValueError(
+                    'Dataset "id", "name", or "alias" must be specified, '
+                    'or a default dataset ID must be set in the configuration.'
+                )
+
+        # Now create the client for the determined ID
+        apify_dataset_client = apify_client_async.dataset(dataset_id=id)
+
+        # Fetch its metadata.
+        metadata = await apify_dataset_client.get()
+
+        # If metadata is None, it means the storage does not exist, so we create it.
+        if metadata is None:
+            id = DatasetMetadata.model_validate(
+                await apify_datasets_client.get_or_create(),
+            ).id
+            apify_dataset_client = apify_client_async.dataset(dataset_id=id)
+
+        # Verify that the storage exists by fetching its metadata again.
+        metadata = await apify_dataset_client.get()
+        if metadata is None:
+            raise ValueError(f'Opening dataset with id={id}, name={name}, and alias={alias} failed.')
+
+        return cls(
+            api_client=apify_dataset_client,
+            api_public_base_url=api_public_base_url,
+            lock=asyncio.Lock(),
+        )
+
+    @override
+    async def purge(self) -> None:
+        raise NotImplementedError(
+            'Purging datasets is not supported in the Apify platform. '
+            'Use the `drop` method to delete the dataset instead.'
+        )
+
+    @override
+    async def drop(self) -> None:
+        async with self._lock:
+            await self._api_client.delete()
+
+    @override
+    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
+        async def payloads_generator() -> AsyncIterator[str]:
+            for index, item in enumerate(data):
+                yield await self._check_and_serialize(item, index)
+
+        async with self._lock:
+            # Handle lists
+            if isinstance(data, list):
+                # Invoke client in series to preserve the order of data
+                async for items in self._chunk_by_size(payloads_generator()):
+                    await self._api_client.push_items(items=items)
+
+            # Handle singular items
+            else:
+                items = await self._check_and_serialize(data)
+                await self._api_client.push_items(items=items)
+
+    @override
+    async def get_data(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = 999_999_999_999,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+        flatten: list[str] | None = None,
+        view: str | None = None,
+    ) -> DatasetItemsListPage:
+        response = await self._api_client.list_items(
+            offset=offset,
+            limit=limit,
+            clean=clean,
+            desc=desc,
+            fields=fields,
+            omit=omit,
+            unwind=unwind,
+            skip_empty=skip_empty,
+            skip_hidden=skip_hidden,
+            flatten=flatten,
+            view=view,
+        )
+        return DatasetItemsListPage.model_validate(vars(response))
+
+    @override
+    async def iterate_items(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = None,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+    ) -> AsyncIterator[dict]:
+        async for item in self._api_client.iterate_items(
+            offset=offset,
+            limit=limit,
+            clean=clean,
+            desc=desc,
+            fields=fields,
+            omit=omit,
+            unwind=unwind,
+            skip_empty=skip_empty,
+            skip_hidden=skip_hidden,
+        ):
+            yield item
+
+    @classmethod
+    async def _check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str:
+        """Serialize a given item to JSON, checks its serializability and size against a limit.
+
+        Args:
+            item: The item to serialize.
+            index: Index of the item, used for error context.
+
+        Returns:
+            Serialized JSON string.
+
+        Raises:
+            ValueError: If item is not JSON serializable or exceeds size limit.
+        """
+        s = ' ' if index is None else f' at index {index} '
+
+        try:
+            payload = await json_dumps(item)
+        except Exception as exc:
+            raise ValueError(f'Data item{s}is not serializable to JSON.') from exc
+
+        payload_size = ByteSize(len(payload.encode('utf-8')))
+        if payload_size > cls._EFFECTIVE_LIMIT_SIZE:
+            raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})')
+
+        return payload
+
+    async def _chunk_by_size(self, items: AsyncIterator[str]) -> AsyncIterator[str]:
+        """Yield chunks of JSON arrays composed of input strings, respecting a size limit.
+
+        Groups an iterable of JSON string payloads into larger JSON arrays, ensuring the total size
+        of each array does not exceed `EFFECTIVE_LIMIT_SIZE`. Each output is a JSON array string that
+        contains as many payloads as possible without breaching the size threshold, maintaining the
+        order of the original payloads. Assumes individual items are below the size limit.
+
+        Args:
+            items: Iterable of JSON string payloads.
+
+        Yields:
+            Strings representing JSON arrays of payloads, each staying within the size limit.
+        """
+        last_chunk_size = ByteSize(2)  # Add 2 bytes for [] wrapper.
+        current_chunk = []
+
+        async for payload in items:
+            payload_size = ByteSize(len(payload.encode('utf-8')))
+
+            if last_chunk_size + payload_size <= self._EFFECTIVE_LIMIT_SIZE:
+                current_chunk.append(payload)
+                last_chunk_size += payload_size + ByteSize(1)  # Add 1 byte for ',' separator.
+            else:
+                yield f'[{",".join(current_chunk)}]'
+                current_chunk = [payload]
+                last_chunk_size = payload_size + ByteSize(2)  # Add 2 bytes for [] wrapper.
+
+        yield f'[{",".join(current_chunk)}]'