apify 1.7.1b1__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/__init__.py +33 -4
- apify/_actor.py +1074 -0
- apify/_configuration.py +370 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +31 -27
- apify/_models.py +117 -0
- apify/_platform_event_manager.py +231 -0
- apify/_proxy_configuration.py +320 -0
- apify/_utils.py +18 -484
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +68 -0
- apify/apify_storage_client/_dataset_client.py +190 -0
- apify/apify_storage_client/_dataset_collection_client.py +51 -0
- apify/apify_storage_client/_key_value_store_client.py +94 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
- apify/apify_storage_client/_request_queue_client.py +176 -0
- apify/apify_storage_client/_request_queue_collection_client.py +51 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +22 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +29 -27
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +6 -3
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +60 -58
- apify/scrapy/scheduler.py +28 -19
- apify/scrapy/utils.py +10 -32
- apify/storages/__init__.py +4 -10
- apify/storages/_request_list.py +150 -0
- apify/storages/py.typed +0 -0
- apify-2.2.1.dist-info/METADATA +211 -0
- apify-2.2.1.dist-info/RECORD +38 -0
- {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1351
- apify/config.py +0 -127
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.1b1.dist-info/METADATA +0 -149
- apify-1.7.1b1.dist-info/RECORD +0 -41
- apify-1.7.1b1.dist-info/top_level.txt +0 -1
- {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/LICENSE +0 -0
apify/proxy_configuration.py
DELETED
|
@@ -1,365 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import inspect
|
|
4
|
-
import ipaddress
|
|
5
|
-
import re
|
|
6
|
-
from typing import TYPE_CHECKING, Any, Awaitable, Callable, Pattern, TypedDict
|
|
7
|
-
from urllib.parse import urljoin, urlparse
|
|
8
|
-
|
|
9
|
-
import httpx
|
|
10
|
-
from apify_shared.consts import ApifyEnvVars
|
|
11
|
-
from apify_shared.utils import ignore_docs
|
|
12
|
-
|
|
13
|
-
from apify.config import Configuration
|
|
14
|
-
from apify.log import logger
|
|
15
|
-
|
|
16
|
-
if TYPE_CHECKING:
|
|
17
|
-
from apify_client import ApifyClientAsync
|
|
18
|
-
from typing_extensions import NotRequired
|
|
19
|
-
|
|
20
|
-
# Characters allowed in values embedded in the Apify Proxy username (group names, session IDs):
# ASCII letters, digits, "_", "." and "~". `re.ASCII` is required because `\w` would otherwise
# also match non-ASCII letters and digits, which are outside the documented character set.
APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$', re.ASCII)

# A country code made of exactly two upper-case ASCII letters (ISO 3166-1 alpha-2 format).
COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$')

# Maximum number of characters allowed in a proxy session identifier.
SESSION_ID_MAX_LENGTH = 50
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def is_url(url: str) -> bool:
    """Check if the given string is a valid URL.

    A string is accepted when it has a scheme and a network location, and its host is either
    a dotted domain name, `localhost`, or an IP address literal (IPv4 or IPv6).

    Args:
        url: The string to check.

    Returns:
        True if the string looks like a valid URL, False otherwise (including when it cannot be parsed).
    """
    try:
        # Joining with '/' reduces the URL to its root, so the path component is never empty.
        parsed_url = urlparse(urljoin(url, '/'))
        has_all_parts = all([parsed_url.scheme, parsed_url.netloc, parsed_url.path])

        # Inspect the bare host instead of the whole netloc. The netloc may also carry credentials
        # and a port (e.g. `user:pass@localhost:8000` or `[::1]:8080`), which would make the
        # localhost and IP address checks fail and would let a dot in the credentials pass as a domain.
        hostname = parsed_url.hostname or ''
        is_domain = '.' in hostname
        is_localhost = hostname == 'localhost'
        try:
            ipaddress.ip_address(hostname)
            is_ip_address = True
        except ValueError:
            is_ip_address = False

        return has_all_parts and any([is_domain, is_localhost, is_ip_address])
    except Exception:
        # Any parsing failure (malformed IPv6 literal, non-string input, ...) means "not a URL".
        return False
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def _check(
    value: Any,
    *,
    label: str | None,
    pattern: Pattern | None = None,
    min_length: int | None = None,
    max_length: int | None = None,
) -> None:
    """Validate a value against optional length bounds and an optional regular expression.

    Args:
        value: The value to validate. It must support `len()` when a length bound is given
            and be a string when `pattern` is given.
        label: Name of the validated argument; when truthy it is included in the error message.
        pattern: Compiled regular expression that the whole value must match.
        min_length: Minimum allowed length of the value.
        max_length: Maximum allowed length of the value.

    Raises:
        ValueError: If the value is too short, too long, or does not match the pattern.
    """
    error_str = f'Value {value}'
    if label:
        error_str += f' of argument {label}'

    # Bounds are compared against None (not tested for truthiness) so that an explicit
    # bound of 0 is enforced instead of being silently skipped.
    if min_length is not None and len(value) < min_length:
        raise ValueError(f'{error_str} is shorter than minimum allowed length {min_length}')

    if max_length is not None and len(value) > max_length:
        raise ValueError(f'{error_str} is longer than maximum allowed length {max_length}')

    if pattern is not None and not re.fullmatch(pattern, value):
        raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}')
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
class ProxyInfo(TypedDict):
    """Provides information about a proxy connection that is used for requests.

    Instances are produced by `ProxyConfiguration.new_proxy_info`. For Apify Proxy the dictionary
    carries `url`, `hostname`, `port`, `username`, `password` and `groups`, plus `country_code` and
    `session_id` when they are set. For custom proxy URLs it carries `url`, `hostname`, `port` and
    `password`, plus `username` when the URL contains one.
    """

    # NOTE: `NotRequired` is imported only under `TYPE_CHECKING` in this module, so the optional
    # keys below are described for static type checkers.

    url: str
    """The URL of the proxy."""

    hostname: str
    """The hostname of the proxy."""

    port: int
    """The proxy port."""

    username: NotRequired[str]
    """The username for the proxy."""

    password: str
    """The password for the proxy."""

    groups: NotRequired[list[str]]
    """An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy).
    If not provided, the proxy will select the groups automatically.
    """

    country_code: NotRequired[str]
    """If set and relevant proxies are available in your Apify account, all proxied requests will
    use IP addresses that are geolocated to the specified country. For example `GB` for IPs
    from Great Britain. Note that online services often have their own rules for handling
    geolocation and thus the country selection is a best attempt at geolocation, rather than
    a guaranteed hit. This parameter is optional, by default, each proxied request is assigned
    an IP address from a random country. The country code needs to be a two letter ISO country code.
    See the [full list of available country codes](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements).
    This parameter is optional, by default, the proxy uses all available proxy servers from all countries.
    """

    session_id: NotRequired[str]
    """The identifier of the used proxy session, if used. Using the same session ID guarantees getting the same proxy URL."""
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
class ProxyConfiguration:
    """Configures a connection to a proxy server with the provided options.

    Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or blacklists.
    The default servers used by this class are managed by [Apify Proxy](https://docs.apify.com/proxy).
    To be able to use Apify Proxy, you need an Apify account and access to the selected proxies. If you provide no configuration option,
    the proxies will be managed automatically using a smart algorithm.

    If you want to use your own proxies, use the `proxy_urls` or `new_url_function` constructor options.
    Your list of proxy URLs will be rotated by the configuration, if this option is provided.
    """

    # Set from the proxy status response in `_check_access`.
    is_man_in_the_middle = False

    # Index of the custom proxy URL that will be handed out next (round-robin over `_proxy_urls`).
    _next_custom_url_index = 0
    _proxy_urls: list[str]
    # Maps a session ID to the custom proxy URL that was assigned to it.
    _used_proxy_urls: dict[str, str]
    _new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None
    _groups: list[str]
    _country_code: str | None = None
    _password: str | None = None
    _hostname: str
    _port: int
    _uses_apify_proxy: bool | None = None
    _actor_config: Configuration
    _apify_client: ApifyClientAsync | None = None

    @ignore_docs
    def __init__(
        self: ProxyConfiguration,
        *,
        password: str | None = None,
        groups: list[str] | None = None,
        country_code: str | None = None,
        proxy_urls: list[str] | None = None,
        new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None,
        _actor_config: Configuration | None = None,
        _apify_client: ApifyClientAsync | None = None,
    ) -> None:
        """Create a ProxyConfiguration instance. It is highly recommended to use `Actor.create_proxy_configuration()` instead of this.

        Args:
            password (str, optional): Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available.
            groups (list of str, optional): Proxy groups which the Apify Proxy should use, if provided.
            country_code (str, optional): Country which the Apify Proxy should use, if provided.
            proxy_urls (list of str, optional): Custom proxy server URLs which should be rotated through.
            new_url_function (Callable, optional): Function which returns a custom proxy URL to be used.

        Raises:
            ValueError: If any of the values is malformed, or if custom proxies are combined
                with each other or with Apify Proxy options.
        """
        # Validate the individual values first.
        if groups:
            groups = [str(group) for group in groups]
            for group in groups:
                _check(group, label='groups', pattern=APIFY_PROXY_VALUE_REGEX)
        if country_code:
            country_code = str(country_code)
            _check(country_code, label='country_code', pattern=COUNTRY_CODE_REGEX)
        if proxy_urls:
            for i, url in enumerate(proxy_urls):
                if not is_url(url):
                    raise ValueError(f'proxy_urls[{i}] ("{url}") is not a valid URL')

        # Then validate that the options are not combined in an unsupported way.
        if proxy_urls and new_url_function:
            raise ValueError('Cannot combine custom proxies in "proxy_urls" with custom generating function in "new_url_function".')

        if (proxy_urls or new_url_function) and (groups or country_code):
            raise ValueError(
                'Cannot combine custom proxies with Apify Proxy!'
                ' It is not allowed to set "proxy_urls" or "new_url_function" combined with'
                ' "groups" or "country_code".'
            )

        if proxy_urls and any('apify.com' in url for url in proxy_urls):
            logger.warning(
                'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties instead of `proxy_urls`.\n'
                'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration'
            )

        self._actor_config = _actor_config or Configuration._get_default_instance()
        self._apify_client = _apify_client

        self._hostname = self._actor_config.proxy_hostname
        self._port = self._actor_config.proxy_port
        self._password = password or self._actor_config.proxy_password

        self._proxy_urls = list(proxy_urls) if proxy_urls else []
        self._used_proxy_urls = {}
        self._new_url_function = new_url_function
        self._groups = list(groups) if groups else []
        self._country_code = country_code
        # Apify Proxy is used whenever no custom proxy source was configured.
        self._uses_apify_proxy = not (proxy_urls or new_url_function)

    async def initialize(self: ProxyConfiguration) -> None:
        """Load the Apify Proxy password if the API token is provided and check access to Apify Proxy and provided proxy groups.

        Only called if Apify Proxy configuration is used.
        Also checks if country has access to Apify Proxy groups if the country code is provided.

        You should use the Actor.create_proxy_configuration function
        to create a pre-initialized `ProxyConfiguration` instance instead of calling this manually.
        """
        if self._uses_apify_proxy:
            await self._maybe_fetch_password()
            await self._check_access()

    async def new_url(self: ProxyConfiguration, session_id: int | str | None = None) -> str:
        """Return a new proxy URL based on provided configuration options and the `session_id` parameter.

        Args:
            session_id (int or str, optional): Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions).
                All the HTTP requests going through the proxy with the same session identifier
                will use the same target proxy server (i.e. the same IP address).
                The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.

        Returns:
            str: A string with a proxy URL, including authentication credentials and port number.
                 For example, `http://bob:password123@proxy.example.com:8000`

        Raises:
            ValueError: If the session ID is invalid, or if the custom `new_url_function` fails.
        """
        if session_id is not None:
            session_id = f'{session_id}'
            _check(session_id, label='session_id', max_length=SESSION_ID_MAX_LENGTH, pattern=APIFY_PROXY_VALUE_REGEX)

        if self._new_url_function:
            try:
                res = self._new_url_function(session_id)
                # The custom function may be either synchronous or asynchronous.
                if inspect.isawaitable(res):
                    res = await res
                return str(res)
            except Exception as exc:
                raise ValueError('The provided "new_url_function" did not return a valid URL') from exc

        if self._proxy_urls:
            if not session_id:
                return self._pick_next_custom_url()

            # A session keeps the URL it was given the first time it was seen.
            if session_id not in self._used_proxy_urls:
                self._used_proxy_urls[session_id] = self._pick_next_custom_url()

            return self._used_proxy_urls[session_id]

        username = self._get_username(session_id)

        return f'http://{username}:{self._password}@{self._hostname}:{self._port}'

    async def new_proxy_info(self: ProxyConfiguration, session_id: int | str | None = None) -> ProxyInfo:
        """Create a new ProxyInfo object.

        Use it if you want to work with a rich representation of a proxy URL.
        If you need the URL string only, use `ProxyConfiguration.new_url`.

        Args:
            session_id (int or str, optional): Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions).
                All the HTTP requests going through the proxy with the same session identifier
                will use the same target proxy server (i.e. the same IP address).
                The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.

        Returns:
            ProxyInfo: Dictionary that represents information about the proxy and its configuration.
        """
        if session_id is not None:
            session_id = f'{session_id}'
            _check(session_id, label='session_id', max_length=SESSION_ID_MAX_LENGTH, pattern=APIFY_PROXY_VALUE_REGEX)

        url = await self.new_url(session_id)
        res: ProxyInfo
        if self._uses_apify_proxy:
            res = {
                'url': url,
                'hostname': self._hostname,
                'port': self._port,
                'username': self._get_username(session_id),
                'password': self._password or '',
                # Hand out a copy so that callers cannot mutate the configured groups.
                'groups': list(self._groups),
            }
            if self._country_code:
                res['country_code'] = self._country_code
            if session_id is not None:
                res['session_id'] = session_id
            return res

        # Custom proxy: derive the individual parts from the generated URL.
        parsed_url = urlparse(url)
        assert parsed_url.hostname is not None  # noqa: S101
        assert parsed_url.port is not None  # noqa: S101
        res = {
            'url': url,
            'hostname': parsed_url.hostname,
            'port': parsed_url.port,
            'password': parsed_url.password or '',
        }
        if parsed_url.username:
            res['username'] = parsed_url.username
        return res

    def _pick_next_custom_url(self: ProxyConfiguration) -> str:
        """Return the next URL from the custom proxy URLs in round-robin order and advance the rotation index."""
        index = self._next_custom_url_index
        self._next_custom_url_index = (index + 1) % len(self._proxy_urls)
        return self._proxy_urls[index]

    async def _maybe_fetch_password(self: ProxyConfiguration) -> None:
        """Resolve the Apify Proxy password, fetching it through the API client when a token is available.

        Raises:
            ValueError: If no password was provided and none could be fetched.
        """
        token = self._actor_config.token

        if token and self._apify_client:
            user_info = await self._apify_client.user().get()
            if user_info:
                password = user_info['proxy']['password']

                if self._password:
                    # An explicitly provided password wins, but a mismatch is worth a warning.
                    if self._password != password:
                        logger.warning(
                            'The Apify Proxy password you provided belongs to'
                            ' a different user than the Apify token you are using. Are you sure this is correct?'
                        )
                else:
                    self._password = password

        if not self._password:
            raise ValueError(
                'Apify Proxy password must be provided using the "password" constructor argument'
                f' or the "{ApifyEnvVars.PROXY_PASSWORD}" environment variable.'
                f' If you add the "{ApifyEnvVars.TOKEN}" environment variable, the password will be automatically inferred.'
            )

    async def _check_access(self: ProxyConfiguration) -> None:
        """Query the proxy status endpoint through the proxy and verify that the connection works.

        Raises:
            ConnectionError: If the status response reports that the proxy is not connected.
        """
        proxy_status_url = f'{self._actor_config.proxy_status_url}/?format=json'

        status = None
        async with httpx.AsyncClient(proxies=await self.new_url()) as client:
            # Two attempts in total; any failure of the first one is retried once.
            for _ in range(2):
                try:
                    response = await client.get(proxy_status_url)
                    status = response.json()
                    break
                except Exception:  # noqa: S110
                    # retry on connection errors
                    pass

        if status:
            if not status['connected']:
                raise ConnectionError(status['connectionError'])

            self.is_man_in_the_middle = status['isManInTheMiddle']
        else:
            # The check is best-effort: without a status response only a warning is logged.
            logger.warning(
                'Apify Proxy access check timed out. Watch out for errors with status code 407. '
                "If you see some, it most likely means you don't have access to either all or some of the proxies you're trying to use."
            )

    def _get_username(self: ProxyConfiguration, session_id: int | str | None = None) -> str:
        """Build the proxy username that encodes the groups, session and country, or `auto` if none is set."""
        if session_id is not None:
            session_id = f'{session_id}'

        parts: list[str] = []

        if self._groups:
            parts.append(f'groups-{"+".join(self._groups)}')
        if session_id is not None:
            parts.append(f'session-{session_id}')
        if self._country_code:
            parts.append(f'country-{self._country_code}')

        if not parts:
            return 'auto'

        return ','.join(parts)
|
apify/storages/base_storage.py
DELETED
|
@@ -1,181 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
from abc import ABC, abstractmethod
|
|
5
|
-
from typing import TYPE_CHECKING, Generic, TypeVar, cast
|
|
6
|
-
|
|
7
|
-
from apify_shared.utils import ignore_docs
|
|
8
|
-
|
|
9
|
-
from apify._memory_storage import MemoryStorageClient
|
|
10
|
-
from apify._memory_storage.resource_clients import BaseResourceClient, BaseResourceCollectionClient
|
|
11
|
-
from apify.config import Configuration
|
|
12
|
-
from apify.storages.storage_client_manager import StorageClientManager
|
|
13
|
-
|
|
14
|
-
if TYPE_CHECKING:
|
|
15
|
-
from apify_client import ApifyClientAsync
|
|
16
|
-
|
|
17
|
-
# Type variables used to parametrize `BaseStorage`: the first stands for the client of a single
# storage resource, the second for the client of the corresponding storage collection.
BaseResourceClientType = TypeVar('BaseResourceClientType', bound=BaseResourceClient)
BaseResourceCollectionClientType = TypeVar('BaseResourceCollectionClientType', bound=BaseResourceCollectionClient)
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@ignore_docs
class BaseStorage(ABC, Generic[BaseResourceClientType, BaseResourceCollectionClientType]):
    """A class for managing storages.

    Opened storages are cached per class, by ID and by name, so that repeated `open()` calls
    return the same object.
    """

    _id: str
    _name: str | None
    _storage_client: ApifyClientAsync | MemoryStorageClient
    _config: Configuration

    # Created lazily in `_ensure_class_initialized`.
    _cache_by_id: dict | None = None
    _cache_by_name: dict | None = None
    _storage_creating_lock: asyncio.Lock | None = None

    def __init__(
        self: BaseStorage,
        id: str,  # noqa: A002
        name: str | None,
        client: ApifyClientAsync | MemoryStorageClient,
        config: Configuration,
    ) -> None:
        """Initialize the storage.

        Do not use this method directly, but use `Actor.open_<STORAGE>()` instead.

        Args:
            id (str): The storage id
            name (str, optional): The storage name
            client (ApifyClientAsync or MemoryStorageClient): The storage client
            config (Configuration): The configuration
        """
        self._id = id
        self._name = name
        self._storage_client = client
        self._config = config

    @classmethod
    @abstractmethod
    def _get_human_friendly_label(cls: type[BaseStorage]) -> str:
        """Return the storage label used in error messages."""
        raise NotImplementedError('You must override this method in the subclass!')

    @classmethod
    @abstractmethod
    def _get_default_id(cls: type[BaseStorage], config: Configuration) -> str:
        """Return the ID of the default storage for the given configuration."""
        raise NotImplementedError('You must override this method in the subclass!')

    @classmethod
    @abstractmethod
    def _get_single_storage_client(
        cls: type[BaseStorage],
        id: str,  # noqa: A002
        client: ApifyClientAsync | MemoryStorageClient,
    ) -> BaseResourceClientType:
        """Return the client of the single storage with the given ID."""
        raise NotImplementedError('You must override this method in the subclass!')

    @classmethod
    @abstractmethod
    def _get_storage_collection_client(
        cls: type[BaseStorage],
        client: ApifyClientAsync | MemoryStorageClient,
    ) -> BaseResourceCollectionClientType:
        """Return the client of the storage collection."""
        raise NotImplementedError('You must override this method in the subclass!')

    @classmethod
    def _ensure_class_initialized(cls: type[BaseStorage]) -> None:
        """Create the caches and the creation lock on first use."""
        if cls._cache_by_id is None:
            cls._cache_by_id = {}
        if cls._cache_by_name is None:
            cls._cache_by_name = {}
        if cls._storage_creating_lock is None:
            cls._storage_creating_lock = asyncio.Lock()

    @classmethod
    def _get_cached_storage(
        cls: type[BaseStorage],
        id: str | None,  # noqa: A002
        name: str | None,
    ) -> BaseStorage | None:
        """Return the cached storage for the given ID (preferred) or name, or None if it is not cached."""
        assert cls._cache_by_id is not None  # noqa: S101
        assert cls._cache_by_name is not None  # noqa: S101

        if id:
            return cls._cache_by_id.get(id)
        if name:
            return cls._cache_by_name.get(name)
        return None

    @classmethod
    @abstractmethod
    async def open(
        cls: type[BaseStorage],
        *,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
        force_cloud: bool = False,
        config: Configuration | None = None,
    ) -> BaseStorage:
        """Open a storage, or return a cached storage object if it was opened before.

        Opens a storage with the given ID or name.
        Returns the cached storage object if the storage was opened before.

        Args:
            id (str, optional): ID of the storage to be opened.
                If neither `id` nor `name` are provided, the method returns the default storage associated with the actor run.
                If the storage with the given ID does not exist, it raises an error.
            name (str, optional): Name of the storage to be opened.
                If neither `id` nor `name` are provided, the method returns the default storage associated with the actor run.
                If the storage with the given name does not exist, it is created.
            force_cloud (bool, optional): If set to True, it will open a storage on the Apify Platform even when running the actor locally.
                Defaults to False.
            config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted.

        Returns:
            An instance of the storage.

        Raises:
            RuntimeError: If a storage with the given ID does not exist.
        """
        cls._ensure_class_initialized()
        assert cls._cache_by_id is not None  # noqa: S101
        assert cls._cache_by_name is not None  # noqa: S101
        assert not (id and name)  # noqa: S101

        used_config = config or Configuration.get_global_configuration()
        used_client = StorageClientManager.get_storage_client(force_cloud=force_cloud)

        is_default_storage_on_local = False
        # Fetch default ID if no ID or name was passed
        if not id and not name:
            if isinstance(used_client, MemoryStorageClient):
                is_default_storage_on_local = True
            id = cls._get_default_id(used_config)  # noqa: A001

        # Try to get the storage instance from cache
        cached_storage = cls._get_cached_storage(id, name)
        if cached_storage is not None:
            # This cast is needed since MyPy doesn't understand very well that Self and Storage are the same
            return cast(BaseStorage, cached_storage)

        # Purge default storages if configured
        if used_config.purge_on_start and isinstance(used_client, MemoryStorageClient):
            await used_client._purge_on_start()

        assert cls._storage_creating_lock is not None  # noqa: S101
        async with cls._storage_creating_lock:
            # Another task may have opened the same storage while this one was waiting
            # (for the purge or for the lock), so look into the cache once more before creating it.
            cached_storage = cls._get_cached_storage(id, name)
            if cached_storage is not None:
                return cast(BaseStorage, cached_storage)

            # Create the storage
            if id and not is_default_storage_on_local:
                single_storage_client = cls._get_single_storage_client(id, used_client)
                storage_info = await single_storage_client.get()
                if not storage_info:
                    storage_label = cls._get_human_friendly_label()
                    raise RuntimeError(f'{storage_label} with id "{id}" does not exist!')
            elif is_default_storage_on_local:
                storage_collection_client = cls._get_storage_collection_client(used_client)
                storage_info = await storage_collection_client.get_or_create(name=name, _id=id)
            else:
                storage_collection_client = cls._get_storage_collection_client(used_client)
                storage_info = await storage_collection_client.get_or_create(name=name)

            storage = cls(storage_info['id'], storage_info.get('name'), used_client, used_config)

            # Cache by id and name
            cls._cache_by_id[storage._id] = storage
            if storage._name is not None:
                cls._cache_by_name[storage._name] = storage

        return storage

    def _remove_from_cache(self: BaseStorage) -> None:
        """Remove this storage from the class-level caches; a storage that is not cached is ignored."""
        if self.__class__._cache_by_id is not None:
            self.__class__._cache_by_id.pop(self._id, None)

        if self._name and self.__class__._cache_by_name is not None:
            self.__class__._cache_by_name.pop(self._name, None)
|