apify 2.7.3__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/_actor.py +194 -126
- apify/_charging.py +34 -9
- apify/_configuration.py +70 -6
- apify/_crypto.py +0 -6
- apify/_models.py +7 -7
- apify/_proxy_configuration.py +10 -10
- apify/_utils.py +25 -2
- apify/events/__init__.py +5 -0
- apify/events/_apify_event_manager.py +140 -0
- apify/events/_types.py +102 -0
- apify/log.py +0 -9
- apify/request_loaders/__init__.py +18 -0
- apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
- apify/request_loaders/py.typed +0 -0
- apify/scrapy/_logging_config.py +1 -4
- apify/scrapy/extensions/_httpcache.py +9 -5
- apify/scrapy/requests.py +3 -3
- apify/scrapy/scheduler.py +8 -5
- apify/storage_clients/__init__.py +12 -0
- apify/storage_clients/_apify/__init__.py +11 -0
- apify/storage_clients/_apify/_dataset_client.py +328 -0
- apify/storage_clients/_apify/_key_value_store_client.py +265 -0
- apify/storage_clients/_apify/_models.py +131 -0
- apify/storage_clients/_apify/_request_queue_client.py +327 -0
- apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
- apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
- apify/storage_clients/_apify/_storage_client.py +106 -0
- apify/storage_clients/_apify/_utils.py +194 -0
- apify/storage_clients/_apify/py.typed +0 -0
- apify/storage_clients/_file_system/__init__.py +2 -0
- apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
- apify/storage_clients/_file_system/_storage_client.py +41 -0
- apify/storage_clients/_smart_apify/__init__.py +1 -0
- apify/storage_clients/_smart_apify/_storage_client.py +117 -0
- apify/storage_clients/py.typed +0 -0
- apify/storages/__init__.py +1 -3
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
- apify-3.0.0.dist-info/RECORD +57 -0
- apify/_platform_event_manager.py +0 -231
- apify/apify_storage_client/__init__.py +0 -3
- apify/apify_storage_client/_apify_storage_client.py +0 -72
- apify/apify_storage_client/_dataset_client.py +0 -190
- apify/apify_storage_client/_dataset_collection_client.py +0 -51
- apify/apify_storage_client/_key_value_store_client.py +0 -109
- apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
- apify/apify_storage_client/_request_queue_client.py +0 -176
- apify/apify_storage_client/_request_queue_collection_client.py +0 -51
- apify-2.7.3.dist-info/RECORD +0 -44
- /apify/{apify_storage_client → events}/py.typed +0 -0
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
apify/_platform_event_manager.py
DELETED
|
@@ -1,231 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from typing import TYPE_CHECKING, Annotated, Any, Literal, Union
|
|
6
|
-
|
|
7
|
-
import websockets.asyncio.client
|
|
8
|
-
from pydantic import BaseModel, Discriminator, Field, TypeAdapter
|
|
9
|
-
from typing_extensions import Self, Unpack, override
|
|
10
|
-
|
|
11
|
-
from crawlee.events._event_manager import EventManager, EventManagerOptions
|
|
12
|
-
from crawlee.events._local_event_manager import LocalEventManager
|
|
13
|
-
from crawlee.events._types import (
|
|
14
|
-
Event,
|
|
15
|
-
EventAbortingData,
|
|
16
|
-
EventExitData,
|
|
17
|
-
EventMigratingData,
|
|
18
|
-
EventPersistStateData,
|
|
19
|
-
EventSystemInfoData,
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
from apify._utils import docs_group
|
|
23
|
-
from apify.log import logger
|
|
24
|
-
|
|
25
|
-
if TYPE_CHECKING:
|
|
26
|
-
from types import TracebackType
|
|
27
|
-
|
|
28
|
-
from apify._configuration import Configuration
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
__all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager']
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
@docs_group('Data structures')
class PersistStateEvent(BaseModel):
    """Platform message whose `name` is `Event.PERSIST_STATE`."""

    # Tag used to recognise this message type.
    name: Literal[Event.PERSIST_STATE]
    # Payload; when the message carries none, a payload with `is_migrating=False` is used.
    data: EventPersistStateData = Field(default_factory=lambda: EventPersistStateData(is_migrating=False))
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@docs_group('Data structures')
class SystemInfoEventData(BaseModel):
    """Payload of a system-info platform message.

    Each field is populated from the camelCase alias given in its `Field`.
    """

    mem_avg_bytes: float = Field(alias='memAvgBytes')
    mem_current_bytes: float = Field(alias='memCurrentBytes')
    mem_max_bytes: float = Field(alias='memMaxBytes')
    cpu_avg_usage: float = Field(alias='cpuAvgUsage')
    cpu_max_usage: float = Field(alias='cpuMaxUsage')
    cpu_current_usage: float = Field(alias='cpuCurrentUsage')
    is_cpu_overloaded: bool = Field(alias='isCpuOverloaded')
    created_at: datetime = Field(alias='createdAt')

    def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
        """Build the crawlee `EventSystemInfoData` equivalent of this payload.

        Args:
            dedicated_cpus: Divisor applied to the current CPU usage after it
                has been scaled down by 100. Must be non-zero.

        Returns:
            An `EventSystemInfoData` validated from the current CPU usage,
            the current and maximum memory figures, and `created_at`.
        """
        timestamp = self.created_at
        cpu_section = {
            'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus,
            'created_at': timestamp,
        }
        memory_section = {
            'total_size': self.mem_max_bytes,
            'current_size': self.mem_current_bytes,
            'created_at': timestamp,
        }
        return EventSystemInfoData.model_validate({'cpu_info': cpu_section, 'memory_info': memory_section})
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
@docs_group('Data structures')
class SystemInfoEvent(BaseModel):
    """Platform message whose `name` is `Event.SYSTEM_INFO`."""

    # Tag used to recognise this message type.
    name: Literal[Event.SYSTEM_INFO]
    # Required payload; there is no default for this event.
    data: SystemInfoEventData
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@docs_group('Data structures')
class MigratingEvent(BaseModel):
    """Platform message whose `name` is `Event.MIGRATING`."""

    # Tag used to recognise this message type.
    name: Literal[Event.MIGRATING]
    # Payload; a default-constructed `EventMigratingData` is used when absent.
    data: EventMigratingData = Field(default_factory=EventMigratingData)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
@docs_group('Data structures')
class AbortingEvent(BaseModel):
    """Platform message whose `name` is `Event.ABORTING`."""

    # Tag used to recognise this message type.
    name: Literal[Event.ABORTING]
    # Payload; a default-constructed `EventAbortingData` is used when absent.
    data: EventAbortingData = Field(default_factory=EventAbortingData)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
@docs_group('Data structures')
class ExitEvent(BaseModel):
    """Platform message whose `name` is `Event.EXIT`."""

    # Tag used to recognise this message type.
    name: Literal[Event.EXIT]
    # Payload; a default-constructed `EventExitData` is used when absent.
    data: EventExitData = Field(default_factory=EventExitData)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
@docs_group('Data structures')
class EventWithoutData(BaseModel):
    """Platform message for the session, browser and page events listed in `name`."""

    # Any one of these event names selects this model.
    name: Literal[
        Event.SESSION_RETIRED,
        Event.BROWSER_LAUNCHED,
        Event.BROWSER_RETIRED,
        Event.BROWSER_CLOSED,
        Event.PAGE_CREATED,
        Event.PAGE_CLOSED,
    ]
    # Untyped payload; defaults to None when the message has no `data`.
    data: Any = None
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
@docs_group('Data structures')
class DeprecatedEvent(BaseModel):
    """Platform message named `cpuInfo`, modelled separately from the current events."""

    # The only event name matched by this model.
    name: Literal['cpuInfo']
    # Free-form payload; an empty dict when the message has no `data`.
    data: dict[str, Any] = Field(default_factory=dict)
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
@docs_group('Data structures')
class UnknownEvent(BaseModel):
    """Catch-all platform message: any string `name` with a free-form payload."""

    # Unconstrained event name.
    name: str
    # Free-form payload; an empty dict when the message has no `data`.
    data: dict[str, Any] = Field(default_factory=dict)
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
# The event models that are told apart by the literal value of their `name` field.
EventMessage = Union[
    PersistStateEvent,
    SystemInfoEvent,
    MigratingEvent,
    AbortingEvent,
    ExitEvent,
    EventWithoutData,
]

# Adapter for raw platform messages. A message validates as one of the
# `name`-discriminated `EventMessage` models, as a `DeprecatedEvent`, or as an `UnknownEvent`.
event_data_adapter: TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent] = TypeAdapter(
    Union[Annotated[EventMessage, Discriminator('name')], DeprecatedEvent, UnknownEvent]
)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
@docs_group('Classes')
class PlatformEventManager(EventManager):
    """A class for managing Actor events.

    You shouldn't use this class directly,
    but instead use it via the `Actor.on()` and `Actor.off()` methods.
    """

    _platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
    _process_platform_messages_task: asyncio.Task | None = None
    _send_system_info_interval_task: asyncio.Task | None = None
    # Resolved with True once the websocket is connected, or with False when connecting fails.
    # It is created in `__aenter__` (inside the running loop) rather than at class-definition
    # or construction time, because building a Future requires an event loop.
    _connected_to_platform_websocket: asyncio.Future[bool] | None = None

    def __init__(self, config: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
        """Create an instance of the EventManager.

        Args:
            config: The Actor configuration to be used in this event manager.
            kwargs: Event manager options - forwarded to the base class
        """
        super().__init__(**kwargs)

        self._config = config
        self._listener_tasks = set()
        self._connected_to_platform_websocket = None

    @override
    async def __aenter__(self) -> Self:
        """Enter the base event manager and, if a websocket URL is configured, connect to it.

        Returns:
            This event manager.

        Raises:
            RuntimeError: If the platform events websocket could not be connected.
        """
        await super().__aenter__()
        connected = asyncio.get_running_loop().create_future()
        self._connected_to_platform_websocket = connected

        ws_url = self._config.actor_events_ws_url
        if not ws_url:
            logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
            return self

        # The reader runs in the background; only the outcome of the connection attempt is awaited here.
        self._process_platform_messages_task = asyncio.create_task(self._process_platform_messages(ws_url))
        if not await connected:
            raise RuntimeError('Error connecting to platform events websocket!')

        return self

    @override
    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Close the websocket, wait for the reader task to finish, then exit the base event manager."""
        if self._platform_events_websocket:
            await self._platform_events_websocket.close()

        if self._process_platform_messages_task:
            await self._process_platform_messages_task

        await super().__aexit__(exc_type, exc_value, exc_traceback)

    def _resolve_connection(self, *, connected: bool) -> None:
        """Record the outcome of the connection attempt, at most once."""
        future = self._connected_to_platform_websocket
        # A failure after a successful connect must not touch the already-resolved
        # Future: `set_result` on a done Future raises `InvalidStateError`.
        if future is not None and not future.done():
            future.set_result(connected)

    async def _handle_platform_message(self, message: str | bytes) -> None:
        """Parse one raw websocket message and emit the matching event, if any."""
        parsed_message = event_data_adapter.validate_json(message)

        if isinstance(parsed_message, DeprecatedEvent):
            return

        if isinstance(parsed_message, UnknownEvent):
            logger.info(
                f'Unknown message received: event_name={parsed_message.name}, '
                f'event_data={parsed_message.data}'
            )
            return

        self.emit(
            event=parsed_message.name,
            event_data=parsed_message.data
            if not isinstance(parsed_message.data, SystemInfoEventData)
            else parsed_message.data.to_crawlee_format(self._config.dedicated_cpus or 1),
        )

        if parsed_message.name == Event.MIGRATING:
            # Stop the recurring persist-state task, then emit one final persist-state
            # event flagged as migrating.
            await self._emit_persist_state_event_rec_task.stop()
            self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))

    async def _process_platform_messages(self, ws_url: str) -> None:
        """Connect to `ws_url` and handle incoming messages until the connection ends.

        A failure while handling a single message is logged and the loop continues.
        A failure of the connection itself is logged and, if the connection outcome
        has not been recorded yet, recorded as unsuccessful.
        """
        try:
            async with websockets.asyncio.client.connect(ws_url) as websocket:
                self._platform_events_websocket = websocket
                self._resolve_connection(connected=True)

                async for message in websocket:
                    try:
                        await self._handle_platform_message(message)
                    except Exception:
                        # 'message' is a reserved LogRecord attribute and would make the
                        # logging call itself raise KeyError, so a distinct key is used.
                        logger.exception('Cannot parse Actor event', extra={'event_message': message})
        except Exception:
            logger.exception('Error in websocket connection')
            self._resolve_connection(connected=False)
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
4
|
-
|
|
5
|
-
from typing_extensions import override
|
|
6
|
-
|
|
7
|
-
from apify_client import ApifyClientAsync
|
|
8
|
-
from crawlee._utils.crypto import crypto_random_object_id
|
|
9
|
-
from crawlee.storage_clients import StorageClient
|
|
10
|
-
|
|
11
|
-
from apify._utils import docs_group
|
|
12
|
-
from apify.apify_storage_client._dataset_client import DatasetClient
|
|
13
|
-
from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
|
|
14
|
-
from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient
|
|
15
|
-
from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient
|
|
16
|
-
from apify.apify_storage_client._request_queue_client import RequestQueueClient
|
|
17
|
-
from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
|
|
18
|
-
|
|
19
|
-
if TYPE_CHECKING:
|
|
20
|
-
from apify._configuration import Configuration
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@docs_group('Classes')
class ApifyStorageClient(StorageClient):
    """A storage client implementation based on the Apify platform storage."""

    def __init__(self, *, configuration: Configuration) -> None:
        """Create the underlying `ApifyClientAsync` from the given configuration.

        Args:
            configuration: Supplies the API token and base URL for the client;
                also kept for later use by the key-value store clients.
        """
        self._configuration = configuration
        # Random identifier handed to every request queue client created by this instance.
        self._client_key = crypto_random_object_id()
        self._apify_client = ApifyClientAsync(
            token=configuration.token,
            api_url=configuration.api_base_url,
            max_retries=8,
            min_delay_between_retries_millis=500,
            timeout_secs=360,
        )

    @classmethod
    def from_config(cls, config: Configuration) -> ApifyStorageClient:
        """Alternate constructor taking the configuration positionally."""
        return cls(configuration=config)

    @override
    def dataset(self, id: str) -> DatasetClient:
        """Return a client for the dataset with the given ID."""
        api_client = self._apify_client.dataset(id)
        return DatasetClient(api_client)

    @override
    def datasets(self) -> DatasetCollectionClient:
        """Return a client for the dataset collection."""
        api_client = self._apify_client.datasets()
        return DatasetCollectionClient(api_client)

    @override
    def key_value_store(self, id: str) -> KeyValueStoreClient:
        """Return a client for the key-value store with the given ID."""
        api_client = self._apify_client.key_value_store(id)
        return KeyValueStoreClient(api_client, self._configuration.api_public_base_url)

    @override
    def key_value_stores(self) -> KeyValueStoreCollectionClient:
        """Return a client for the key-value store collection."""
        api_client = self._apify_client.key_value_stores()
        return KeyValueStoreCollectionClient(api_client)

    @override
    def request_queue(self, id: str) -> RequestQueueClient:
        """Return a client for the request queue with the given ID, tagged with this instance's client key."""
        api_client = self._apify_client.request_queue(id, client_key=self._client_key)
        return RequestQueueClient(api_client)

    @override
    def request_queues(self) -> RequestQueueCollectionClient:
        """Return a client for the request queue collection."""
        api_client = self._apify_client.request_queues()
        return RequestQueueCollectionClient(api_client)

    @override
    async def purge_on_start(self) -> None:
        """Do nothing; this implementation performs no purge."""
        return

    @override
    def get_rate_limit_errors(self) -> dict[int, int]:
        """Return the rate-limit error statistics kept by the underlying API client."""
        stats = self._apify_client.stats
        return stats.rate_limit_errors
|
|
@@ -1,190 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
4
|
-
|
|
5
|
-
from typing_extensions import override
|
|
6
|
-
|
|
7
|
-
from crawlee.storage_clients._base import DatasetClient as BaseDatasetClient
|
|
8
|
-
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
|
|
9
|
-
|
|
10
|
-
if TYPE_CHECKING:
|
|
11
|
-
from collections.abc import AsyncIterator
|
|
12
|
-
from contextlib import AbstractAsyncContextManager
|
|
13
|
-
|
|
14
|
-
from httpx import Response
|
|
15
|
-
|
|
16
|
-
from apify_client.clients import DatasetClientAsync
|
|
17
|
-
from crawlee._types import JsonSerializable
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class DatasetClient(BaseDatasetClient):
    """Dataset resource client implementation based on the Apify platform storage."""

    def __init__(self, apify_dataset_client: DatasetClientAsync) -> None:
        """Wrap the given Apify API dataset client; every method delegates to it."""
        self._client = apify_dataset_client

    @override
    async def get(self) -> DatasetMetadata | None:
        """Fetch the dataset metadata, or return None when the API returns nothing."""
        raw = await self._client.get()
        if not raw:
            return None
        return DatasetMetadata.model_validate(raw)

    @override
    async def update(
        self,
        *,
        name: str | None = None,
    ) -> DatasetMetadata:
        """Update the dataset with the given name and return the validated metadata."""
        raw = await self._client.update(name=name)
        return DatasetMetadata.model_validate(raw)

    @override
    async def delete(self) -> None:
        """Delete the dataset."""
        await self._client.delete()

    @override
    async def list_items(
        self,
        *,
        offset: int | None = 0,
        limit: int | None = BaseDatasetClient._LIST_ITEMS_LIMIT,  # noqa: SLF001
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: str | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        """Fetch one page of items; all arguments are forwarded unchanged to the API client."""
        query = {
            'offset': offset,
            'limit': limit,
            'clean': clean,
            'desc': desc,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_empty': skip_empty,
            'skip_hidden': skip_hidden,
            'flatten': flatten,
            'view': view,
        }
        page = await self._client.list_items(**query)
        # The API client returns an object; its attributes are validated as a mapping.
        return DatasetItemsListPage.model_validate(vars(page))

    @override
    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: str | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict]:
        """Yield items one by one; all arguments are forwarded unchanged to the API client."""
        query = {
            'offset': offset,
            'limit': limit,
            'clean': clean,
            'desc': desc,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_empty': skip_empty,
            'skip_hidden': skip_hidden,
        }
        async for entry in self._client.iterate_items(**query):
            yield entry

    @override
    async def get_items_as_bytes(
        self,
        *,
        item_format: str = 'json',
        offset: int | None = None,
        limit: int | None = None,
        desc: bool = False,
        clean: bool = False,
        bom: bool = False,
        delimiter: str | None = None,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: str | None = None,
        skip_empty: bool = False,
        skip_header_row: bool = False,
        skip_hidden: bool = False,
        xml_root: str | None = None,
        xml_row: str | None = None,
        flatten: list[str] | None = None,
    ) -> bytes:
        """Fetch the items serialized as bytes; all arguments are forwarded unchanged to the API client."""
        query = {
            'item_format': item_format,
            'offset': offset,
            'limit': limit,
            'desc': desc,
            'clean': clean,
            'bom': bom,
            'delimiter': delimiter,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_empty': skip_empty,
            'skip_header_row': skip_header_row,
            'skip_hidden': skip_hidden,
            'xml_root': xml_root,
            'xml_row': xml_row,
            'flatten': flatten,
        }
        payload = await self._client.get_items_as_bytes(**query)
        return payload

    @override
    async def stream_items(
        self,
        *,
        item_format: str = 'json',
        offset: int | None = None,
        limit: int | None = None,
        desc: bool = False,
        clean: bool = False,
        bom: bool = False,
        delimiter: str | None = None,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: str | None = None,
        skip_empty: bool = False,
        skip_header_row: bool = False,
        skip_hidden: bool = False,
        xml_root: str | None = None,
        xml_row: str | None = None,
    ) -> AbstractAsyncContextManager[Response | None]:
        """Return the API client's streaming context manager without entering it.

        All arguments are forwarded unchanged to the API client.
        """
        query = {
            'item_format': item_format,
            'offset': offset,
            'limit': limit,
            'desc': desc,
            'clean': clean,
            'bom': bom,
            'delimiter': delimiter,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_empty': skip_empty,
            'skip_header_row': skip_header_row,
            'skip_hidden': skip_hidden,
            'xml_root': xml_root,
            'xml_row': xml_row,
        }
        return self._client.stream_items(**query)

    @override
    async def push_items(self, items: JsonSerializable) -> None:
        """Push the given items to the dataset."""
        await self._client.push_items(items=items)
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
4
|
-
|
|
5
|
-
from typing_extensions import override
|
|
6
|
-
|
|
7
|
-
from crawlee.storage_clients._base import DatasetCollectionClient as BaseDatasetCollectionClient
|
|
8
|
-
from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata
|
|
9
|
-
|
|
10
|
-
if TYPE_CHECKING:
|
|
11
|
-
from apify_client.clients import DatasetCollectionClientAsync
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class DatasetCollectionClient(BaseDatasetCollectionClient):
    """Dataset collection resource client implementation based on the Apify platform storage."""

    def __init__(self, apify_dataset_collection_client: DatasetCollectionClientAsync) -> None:
        """Wrap the given Apify API dataset collection client; every method delegates to it."""
        self._client = apify_dataset_collection_client

    @override
    async def get_or_create(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        schema: dict | None = None,
    ) -> DatasetMetadata:
        """Get or create a dataset and return its validated metadata.

        Args:
            id: When given, it is what gets sent to the API as the `name` argument.
            name: Sent as the `name` argument only when `id` is None.
            schema: Forwarded unchanged to the API client.
        """
        requested_name = name if id is None else id
        raw = await self._client.get_or_create(name=requested_name, schema=schema)
        return DatasetMetadata.model_validate(raw)

    @override
    async def list(
        self,
        *,
        unnamed: bool = False,
        limit: int | None = None,
        offset: int | None = None,
        desc: bool = False,
    ) -> DatasetListPage:
        """Fetch one page of datasets; all arguments are forwarded unchanged to the API client."""
        raw_page = await self._client.list(unnamed=unnamed, limit=limit, offset=offset, desc=desc)
        return DatasetListPage.model_validate(raw_page)
|
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from contextlib import asynccontextmanager
|
|
4
|
-
from typing import TYPE_CHECKING, Any
|
|
5
|
-
|
|
6
|
-
from typing_extensions import override
|
|
7
|
-
from yarl import URL
|
|
8
|
-
|
|
9
|
-
from crawlee.storage_clients._base import KeyValueStoreClient as BaseKeyValueStoreClient
|
|
10
|
-
from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
|
|
11
|
-
|
|
12
|
-
from apify._crypto import create_hmac_signature
|
|
13
|
-
|
|
14
|
-
if TYPE_CHECKING:
|
|
15
|
-
from collections.abc import AsyncIterator
|
|
16
|
-
from contextlib import AbstractAsyncContextManager
|
|
17
|
-
|
|
18
|
-
from httpx import Response
|
|
19
|
-
|
|
20
|
-
from apify_client.clients import KeyValueStoreClientAsync
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class KeyValueStoreClient(BaseKeyValueStoreClient):
    """Key-value store resource client implementation based on the Apify platform storage."""

    def __init__(self, apify_key_value_store_client: KeyValueStoreClientAsync, api_public_base_url: str) -> None:
        """Wrap the given Apify API key-value store client.

        Args:
            apify_key_value_store_client: The API client every method delegates to.
            api_public_base_url: Base URL used by `get_public_url` to build record URLs.
        """
        self._client = apify_key_value_store_client
        self._api_public_base_url = api_public_base_url

    @override
    async def get(self) -> KeyValueStoreMetadata | None:
        """Fetch the store metadata, or return None when the API returns nothing."""
        result = await self._client.get()
        return KeyValueStoreMetadata.model_validate(result) if result else None

    @override
    async def update(
        self,
        *,
        name: str | None = None,
    ) -> KeyValueStoreMetadata:
        """Update the store with the given name and return the validated metadata."""
        # The requested name must be forwarded; calling update() without it changes nothing.
        return KeyValueStoreMetadata.model_validate(await self._client.update(name=name))

    @override
    async def delete(self) -> None:
        """Delete the key-value store."""
        await self._client.delete()

    @override
    async def list_keys(
        self,
        *,
        limit: int = 1000,
        exclusive_start_key: str | None = None,
    ) -> KeyValueStoreListKeysPage:
        """Fetch one page of keys.

        Args:
            limit: Maximum number of keys requested from the API.
            exclusive_start_key: Key after which the listing should start.
        """
        # Both paging arguments are forwarded so that callers can actually page through the keys.
        page = await self._client.list_keys(limit=limit, exclusive_start_key=exclusive_start_key)
        return KeyValueStoreListKeysPage.model_validate(page)

    @override
    async def get_record(self, key: str) -> KeyValueStoreRecord | None:
        """Fetch the record stored under `key`, or return None when the API returns nothing."""
        result = await self._client.get_record(key)
        return KeyValueStoreRecord.model_validate(result) if result else None

    @override
    async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord | None:
        """Fetch the record stored under `key` as bytes, or return None when the API returns nothing."""
        result = await self._client.get_record_as_bytes(key)
        return KeyValueStoreRecord.model_validate(result) if result else None

    @override
    async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]:
        """Return a context manager that streams the record stored under `key`."""
        return self._stream_record_internal(key)

    @asynccontextmanager
    async def _stream_record_internal(self, key: str) -> AsyncIterator[KeyValueStoreRecord[Response] | None]:
        """Open the API client's record stream and yield the validated record, or None if there is none."""
        async with self._client.stream_record(key) as response:
            # Mirrors get_record(): an empty response yields None instead of failing validation.
            yield KeyValueStoreRecord.model_validate(response) if response else None

    @override
    async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None:
        """Store `value` under `key`, optionally with an explicit content type."""
        await self._client.set_record(
            key=key,
            value=value,
            content_type=content_type,
        )

    @override
    async def delete_record(self, key: str) -> None:
        """Delete the record stored under `key`."""
        await self._client.delete_record(
            key=key,
        )

    async def get_public_url(self, key: str) -> str:
        """Get a URL for the given key that may be used to publicly access the value in the remote key-value store.

        Args:
            key: The key for which the URL should be generated.

        Returns:
            The record URL; it carries a `signature` query parameter when the store
            metadata exposes a URL signing secret key.

        Raises:
            ValueError: If the underlying API client has no resource ID.
        """
        if self._client.resource_id is None:
            raise ValueError('resource_id cannot be None when generating a public URL')

        public_url = (
            URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._client.resource_id / 'records' / key
        )

        key_value_store = await self.get()

        if key_value_store is not None and isinstance(key_value_store.model_extra, dict):
            url_signing_secret_key = key_value_store.model_extra.get('urlSigningSecretKey')
            if url_signing_secret_key:
                public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key))

        return str(public_url)
|