apify 1.7.3b4__py3-none-any.whl → 2.0.0a1__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release.
This version of apify might be problematic.
- apify/__init__.py +17 -4
- apify/_actor.py +963 -0
- apify/_configuration.py +310 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +29 -27
- apify/_models.py +110 -0
- apify/_platform_event_manager.py +222 -0
- apify/_proxy_configuration.py +316 -0
- apify/_utils.py +0 -497
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +56 -0
- apify/apify_storage_client/_dataset_client.py +188 -0
- apify/apify_storage_client/_dataset_collection_client.py +50 -0
- apify/apify_storage_client/_key_value_store_client.py +98 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +50 -0
- apify/apify_storage_client/_request_queue_client.py +196 -0
- apify/apify_storage_client/_request_queue_collection_client.py +50 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +3 -112
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +21 -21
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +1 -1
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +55 -54
- apify/scrapy/scheduler.py +19 -13
- apify/scrapy/utils.py +2 -31
- apify/storages/__init__.py +2 -10
- apify/storages/py.typed +0 -0
- {apify-1.7.3b4.dist-info → apify-2.0.0a1.dist-info}/METADATA +24 -46
- apify-2.0.0a1.dist-info/RECORD +37 -0
- {apify-1.7.3b4.dist-info → apify-2.0.0a1.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1357
- apify/config.py +0 -130
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.3b4.dist-info/RECORD +0 -41
- apify-1.7.3b4.dist-info/top_level.txt +0 -1
- {apify-1.7.3b4.dist-info → apify-2.0.0a1.dist-info}/LICENSE +0 -0
apify/_utils.py
CHANGED
@@ -1,67 +1,8 @@
 from __future__ import annotations

-import asyncio
 import builtins
-import contextlib
-import functools
-import inspect
-import json
-import mimetypes
-import os
-import re
 import sys
-import time
-from base64 import b64encode
-from collections import OrderedDict
-from collections.abc import MutableMapping
-from datetime import datetime, timezone
-from hashlib import sha256
 from importlib import metadata
-from logging import getLogger
-from typing import (
-    Any,
-    Callable,
-    Generic,
-    ItemsView,
-    Iterator,
-    NoReturn,
-    TypeVar,
-    ValuesView,
-    cast,
-    overload,
-)
-from typing import OrderedDict as OrderedDictType
-from urllib.parse import parse_qsl, urlencode, urlparse
-
-import aioshutil
-import psutil
-from aiofiles import ospath
-from aiofiles.os import remove, rename
-from apify_shared.consts import (
-    BOOL_ENV_VARS,
-    BOOL_ENV_VARS_TYPE,
-    DATETIME_ENV_VARS,
-    DATETIME_ENV_VARS_TYPE,
-    FLOAT_ENV_VARS,
-    FLOAT_ENV_VARS_TYPE,
-    INTEGER_ENV_VARS,
-    INTEGER_ENV_VARS_TYPE,
-    STRING_ENV_VARS_TYPE,
-    ActorEnvVars,
-    ApifyEnvVars,
-)
-from apify_shared.utils import (
-    ignore_docs,
-    is_content_type_json,
-    is_content_type_text,
-    is_content_type_xml,
-    maybe_extract_enum_member_value,
-)
-
-from apify.consts import REQUEST_ID_LENGTH, StorageTypes
-
-T = TypeVar('T')
-logger = getLogger(__name__)


 def get_system_info() -> dict:
@@ -80,443 +21,5 @@ def get_system_info() -> dict:
     return system_info


-DualPropertyType = TypeVar('DualPropertyType')
-DualPropertyOwner = TypeVar('DualPropertyOwner')
-
-
-@ignore_docs
-class dualproperty(Generic[DualPropertyType]):  # noqa: N801
-    """Descriptor combining `property` and `classproperty`.
-
-    When accessing the decorated attribute on an instance, it calls the getter with the instance as the first argument,
-    and when accessing it on a class, it calls the getter with the class as the first argument.
-    """
-
-    def __init__(self: dualproperty, getter: Callable[..., DualPropertyType]) -> None:
-        """Initialize the dualproperty.
-
-        Args:
-            getter (Callable): The getter of the property.
-                It should accept either an instance or a class as its first argument.
-        """
-        self.getter = getter
-
-    def __get__(self: dualproperty, obj: DualPropertyOwner | None, owner: type[DualPropertyOwner]) -> DualPropertyType:
-        """Call the getter with the right object.
-
-        Args:
-            obj (T | None): The instance of class T on which the getter will be called
-            owner (type[T]): The class object of class T on which the getter will be called, if obj is None
-
-        Returns:
-            The result of the getter.
-        """
-        val = self.getter(obj or owner)
-        return cast(DualPropertyType, val)
-
-
-@overload
-def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE) -> bool | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE, default: bool) -> bool:  # noqa: FBT001
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE) -> datetime | str | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE, default: datetime) -> datetime | str:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE) -> float | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE, default: float) -> float:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE) -> int | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE, default: int) -> int:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE, default: str) -> str:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE) -> str | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: ActorEnvVars | ApifyEnvVars) -> Any:
-    ...
-
-
-def fetch_and_parse_env_var(env_var: Any, default: Any = None) -> Any:
-    env_var_name = str(maybe_extract_enum_member_value(env_var))
-
-    val = os.getenv(env_var_name)
-    if not val:
-        return default
-
-    if env_var in BOOL_ENV_VARS:
-        return maybe_parse_bool(val)
-    if env_var in FLOAT_ENV_VARS:
-        parsed_float = maybe_parse_float(val)
-        if parsed_float is None:
-            return default
-        return parsed_float
-    if env_var in INTEGER_ENV_VARS:
-        parsed_int = maybe_parse_int(val)
-        if parsed_int is None:
-            return default
-        return parsed_int
-    if env_var in DATETIME_ENV_VARS:
-        return maybe_parse_datetime(val)
-    return val
-
-
-def get_cpu_usage_percent() -> float:
-    return psutil.cpu_percent()
-
-
-def get_memory_usage_bytes() -> int:
-    current_process = psutil.Process(os.getpid())
-    mem = int(current_process.memory_info().rss or 0)
-    for child in current_process.children(recursive=True):
-        with contextlib.suppress(psutil.NoSuchProcess):
-            mem += int(child.memory_info().rss or 0)
-    return mem
-
-
-def maybe_parse_bool(val: str | None) -> bool:
-    return val in {'true', 'True', '1'}
-
-
-def maybe_parse_datetime(val: str) -> datetime | str:
-    try:
-        return datetime.strptime(val, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
-    except ValueError:
-        return val
-
-
-def maybe_parse_float(val: str) -> float | None:
-    try:
-        return float(val)
-    except ValueError:
-        return None
-
-
-def maybe_parse_int(val: str) -> int | None:
-    try:
-        return int(val)
-    except ValueError:
-        return None
-
-
-async def run_func_at_interval_async(func: Callable, interval_secs: float) -> None:
-    started_at = time.perf_counter()
-    sleep_until = started_at
-    while True:
-        now = time.perf_counter()
-        sleep_until += interval_secs
-        while sleep_until < now:
-            sleep_until += interval_secs
-
-        sleep_for_secs = sleep_until - now
-        await asyncio.sleep(sleep_for_secs)
-
-        res = func()
-        if inspect.isawaitable(res):
-            await res
-
-
-async def force_remove(filename: str) -> None:
-    """JS-like rm(filename, { force: true })."""
-    with contextlib.suppress(FileNotFoundError):
-        await remove(filename)
-
-
-def raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn:  # noqa: A002
-    client_type = maybe_extract_enum_member_value(client_type)
-    raise ValueError(f'{client_type} with id "{id}" does not exist.')
-
-
-def raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn:
-    client_type = maybe_extract_enum_member_value(client_type)
-    raise ValueError(f'{client_type} with {key_name} "{value}" already exists.')
-
-
-def guess_file_extension(content_type: str) -> str | None:
-    """Guess the file extension based on content type."""
-    # e.g. mimetypes.guess_extension('application/json ') does not work...
-    actual_content_type = content_type.split(';')[0].strip()
-
-    # mimetypes.guess_extension returns 'xsl' in this case, because 'application/xxx' is "structured"
-    # ('text/xml' would be "unstructured" and return 'xml')
-    # we have to explicitly override it here
-    if actual_content_type == 'application/xml':
-        return 'xml'
-
-    # Guess the extension from the mime type
-    ext = mimetypes.guess_extension(actual_content_type)
-
-    # Remove the leading dot if extension successfully parsed
-    return ext[1:] if ext is not None else ext
-
-
-def maybe_parse_body(body: bytes, content_type: str) -> Any:
-    if is_content_type_json(content_type):
-        return json.loads(body.decode('utf-8'))  # Returns any
-    if is_content_type_xml(content_type) or is_content_type_text(content_type):
-        return body.decode('utf-8')
-    return body
-
-
-def unique_key_to_request_id(unique_key: str) -> str:
-    """Generate request ID based on unique key in a deterministic way."""
-    request_id = re.sub(r'(\+|\/|=)', '', b64encode(sha256(unique_key.encode('utf-8')).digest()).decode('utf-8'))
-    return request_id[:REQUEST_ID_LENGTH] if len(request_id) > REQUEST_ID_LENGTH else request_id
-
-
-async def force_rename(src_dir: str, dst_dir: str) -> None:
-    """Rename a directory. Checks for existence of soruce directory and removes destination directory if it exists."""
-    # Make sure source directory exists
-    if await ospath.exists(src_dir):
-        # Remove destination directory if it exists
-        if await ospath.exists(dst_dir):
-            await aioshutil.rmtree(dst_dir, ignore_errors=True)
-        await rename(src_dir, dst_dir)
-
-
-ImplementationType = TypeVar('ImplementationType', bound=Callable)
-MetadataType = TypeVar('MetadataType', bound=Callable)
-
-
-def wrap_internal(implementation: ImplementationType, metadata_source: MetadataType) -> MetadataType:
-    @functools.wraps(metadata_source)
-    def wrapper(*args: Any, **kwargs: Any) -> Any:
-        return implementation(*args, **kwargs)
-
-    return cast(MetadataType, wrapper)
-
-
-@ignore_docs
-class LRUCache(MutableMapping, Generic[T]):
-    """Attempt to reimplement LRUCache from `@apify/datastructures` using `OrderedDict`."""
-
-    _cache: OrderedDictType[str, T]
-
-    _max_length: int
-
-    def __init__(self: LRUCache, max_length: int) -> None:
-        """Create a LRUCache with a specific max_length."""
-        self._cache = OrderedDict()
-        self._max_length = max_length
-
-    def __getitem__(self: LRUCache, key: str) -> T:
-        """Get an item from the cache. Move it to the end if present."""
-        val = self._cache[key]
-        # No 'key in cache' condition since the previous line would raise KeyError
-        self._cache.move_to_end(key)
-        return cast(T, val)
-
-    # Sadly TS impl returns bool indicating whether the key was already present or not
-    def __setitem__(self: LRUCache, key: str, value: T) -> None:
-        """Add an item to the cache. Remove least used item if max_length exceeded."""
-        self._cache[key] = value
-        if len(self._cache) > self._max_length:
-            self._cache.popitem(last=False)
-
-    def __delitem__(self: LRUCache, key: str) -> None:
-        """Remove an item from the cache."""
-        del self._cache[key]
-
-    def __iter__(self: LRUCache) -> Iterator[str]:
-        """Iterate over the keys of the cache in order of insertion."""
-        return self._cache.__iter__()
-
-    def __len__(self: LRUCache) -> int:
-        """Get the number of items in the cache."""
-        return len(self._cache)
-
-    def values(self: LRUCache) -> ValuesView[T]:  # Needed so we don't mutate the cache by __getitem__
-        """Iterate over the values in the cache in order of insertion."""
-        return self._cache.values()
-
-    def items(self: LRUCache) -> ItemsView[str, T]:  # Needed so we don't mutate the cache by __getitem__
-        """Iterate over the pairs of (key, value) in the cache in order of insertion."""
-        return self._cache.items()
-
-
 def is_running_in_ipython() -> bool:
     return getattr(builtins, '__IPYTHON__', False)
-
-
-@overload
-def budget_ow(value: str | float | bool, predicate: tuple[type, bool], value_name: str) -> None:
-    ...
-
-
-@overload
-def budget_ow(value: dict, predicate: dict[str, tuple[type, bool]]) -> None:
-    ...
-
-
-def budget_ow(
-    value: dict | str | float | bool,
-    predicate: dict[str, tuple[type, bool]] | tuple[type, bool],
-    value_name: str | None = None,
-) -> None:
-    """Budget version of ow."""
-
-    def validate_single(field_value: Any, expected_type: type, required: bool, name: str) -> None:  # noqa: FBT001
-        if field_value is None and required:
-            raise ValueError(f'"{name}" is required!')
-        if (field_value is not None or required) and not isinstance(field_value, expected_type):
-            raise ValueError(f'"{name}" must be of type "{expected_type.__name__}" but it is "{type(field_value).__name__}"!')
-
-    # Validate object
-    if isinstance(value, dict) and isinstance(predicate, dict):
-        for key, (field_type, required) in predicate.items():
-            field_value = value.get(key)
-            validate_single(field_value, field_type, required, key)
-    # Validate "primitive"
-    elif isinstance(value, (int, str, float, bool)) and isinstance(predicate, tuple) and value_name is not None:
-        field_type, required = predicate
-        validate_single(value, field_type, required, value_name)
-    else:
-        raise ValueError('Wrong input!')
-
-
-PARSE_DATE_FIELDS_MAX_DEPTH = 3
-PARSE_DATE_FIELDS_KEY_SUFFIX = 'At'
-ListOrDictOrAny = TypeVar('ListOrDictOrAny', list, dict, Any)
-
-
-def compute_short_hash(data: bytes, *, length: int = 8) -> str:
-    """Computes a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.
-
-    Args:
-        data: The binary data to be hashed.
-        length: The length of the hash to be returned.
-
-    Returns:
-        A substring (prefix) of the hexadecimal hash of the data.
-    """
-    hash_object = sha256(data)
-    return hash_object.hexdigest()[:length]
-
-
-def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
-    """Normalizes a URL.
-
-    This function cleans and standardizes a URL by removing leading and trailing whitespaces,
-    converting the scheme and netloc to lower case, stripping unwanted tracking parameters
-    (specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,
-    and optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally
-    identical but differ in trivial ways (such as parameter order or casing) are treated as the same.
-
-    Args:
-        url: The URL to be normalized.
-        keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained.
-
-    Returns:
-        A string containing the normalized URL.
-    """
-    # Parse the URL
-    parsed_url = urlparse(url.strip())
-    search_params = dict(parse_qsl(parsed_url.query))  # Convert query to a dict
-
-    # Remove any 'utm_' parameters
-    search_params = {k: v for k, v in search_params.items() if not k.startswith('utm_')}
-
-    # Construct the new query string
-    sorted_keys = sorted(search_params.keys())
-    sorted_query = urlencode([(k, search_params[k]) for k in sorted_keys])
-
-    # Construct the final URL
-    new_url = (
-        parsed_url._replace(
-            query=sorted_query,
-            scheme=parsed_url.scheme,
-            netloc=parsed_url.netloc,
-            path=parsed_url.path.rstrip('/'),
-        )
-        .geturl()
-        .lower()
-    )
-
-    # Retain the URL fragment if required
-    if not keep_url_fragment:
-        new_url = new_url.split('#')[0]
-
-    return new_url
-
-
-def compute_unique_key(
-    url: str,
-    method: str = 'GET',
-    payload: bytes | None = None,
-    *,
-    keep_url_fragment: bool = False,
-    use_extended_unique_key: bool = False,
-) -> str:
-    """Computes a unique key for caching & deduplication of requests.
-
-    This function computes a unique key by normalizing the provided URL and method.
-    If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and
-    included in the key. Otherwise, the unique key is just the normalized URL.
-
-    Args:
-        url: The request URL.
-        method: The HTTP method, defaults to 'GET'.
-        payload: The request payload, defaults to None.
-        keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
-        use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
-
-    Returns:
-        A string representing the unique key for the request.
-    """
-    # Normalize the URL and method.
-    try:
-        normalized_url = normalize_url(url, keep_url_fragment=keep_url_fragment)
-    except Exception as exc:
-        logger.warning(f'Failed to normalize URL: {exc}')
-        normalized_url = url
-
-    normalized_method = method.upper()
-
-    # Compute and return the extended unique key if required.
-    if use_extended_unique_key:
-        payload_hash = compute_short_hash(payload) if payload else ''
-        return f'{normalized_method}({payload_hash}):{normalized_url}'
-
-    # Log information if there is a non-GET request with a payload.
-    if normalized_method != 'GET' and payload:
-        logger.info(
-            f'We have encountered a {normalized_method} Request with a payload. This is fine. Just letting you know '
-            'that if your requests point to the same URL and differ only in method and payload, you should consider '
-            'using the "use_extended_unique_key" option.'
-        )
-
-    # Return the normalized URL as the unique key.
-    return normalized_url
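The two functions removed at the end of this file, normalize_url and compute_unique_key, carried the SDK's request deduplication: two requests are treated as the same queue entry when their normalized URLs match (and, with the extended key, when method and payload hash match too). A minimal self-contained sketch of that behavior, condensed from the removed code above — the SDK imports, the keep_url_fragment handling in compute_unique_key, and the logger fallback for unparseable URLs are dropped:

from hashlib import sha256
from urllib.parse import parse_qsl, urlencode, urlparse


def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
    # Strip whitespace, drop 'utm_' params, sort the rest, lower-case the result.
    parsed = urlparse(url.strip())
    params = {k: v for k, v in parse_qsl(parsed.query) if not k.startswith('utm_')}
    query = urlencode([(k, params[k]) for k in sorted(params)])
    new_url = parsed._replace(query=query, path=parsed.path.rstrip('/')).geturl().lower()
    return new_url if keep_url_fragment else new_url.split('#')[0]


def compute_unique_key(url: str, method: str = 'GET', payload: bytes | None = None,
                       *, use_extended_unique_key: bool = False) -> str:
    # With the extended key, the method and a short payload hash become part of the key.
    if use_extended_unique_key:
        payload_hash = sha256(payload).hexdigest()[:8] if payload else ''
        return f'{method.upper()}({payload_hash}):{normalize_url(url)}'
    return normalize_url(url)


# Trivially different URLs collapse to the same unique key:
assert compute_unique_key('https://Example.com/a/?utm_source=x&b=2&a=1') == \
       compute_unique_key('https://example.com/a?a=1&b=2')

The 2.0.0a1 SDK no longer ships these helpers itself; the crawlee package it now builds on (note the crawlee imports in the added files below) presumably provides the equivalents.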
apify/apify_storage_client/_apify_storage_client.py
ADDED
@@ -0,0 +1,56 @@
+from typing_extensions import override
+
+from apify_client import ApifyClientAsync
+from crawlee._utils.crypto import crypto_random_object_id
+from crawlee.base_storage_client import BaseStorageClient
+
+from apify._configuration import Configuration
+from apify.apify_storage_client._dataset_client import DatasetClient
+from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
+from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient
+from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient
+from apify.apify_storage_client._request_queue_client import RequestQueueClient
+from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
+
+
+class ApifyStorageClient(BaseStorageClient):
+    """A storage client implementation based on the Apify platform storage."""
+
+    def __init__(self, *, configuration: Configuration) -> None:
+        self._client_key = crypto_random_object_id()
+        self._apify_client = ApifyClientAsync(
+            token=configuration.token,
+            api_url=configuration.api_base_url,
+            max_retries=8,
+            min_delay_between_retries_millis=500,
+            timeout_secs=360,
+        )
+        self._configuration = configuration
+
+    @override
+    def dataset(self, id: str) -> DatasetClient:
+        return DatasetClient(self._apify_client.dataset(id))
+
+    @override
+    def datasets(self) -> DatasetCollectionClient:
+        return DatasetCollectionClient(self._apify_client.datasets())
+
+    @override
+    def key_value_store(self, id: str) -> KeyValueStoreClient:
+        return KeyValueStoreClient(self._apify_client.key_value_store(id), self._configuration.api_public_base_url)
+
+    @override
+    def key_value_stores(self) -> KeyValueStoreCollectionClient:
+        return KeyValueStoreCollectionClient(self._apify_client.key_value_stores())
+
+    @override
+    def request_queue(self, id: str) -> RequestQueueClient:
+        return RequestQueueClient(self._apify_client.request_queue(id, client_key=self._client_key))
+
+    @override
+    def request_queues(self) -> RequestQueueCollectionClient:
+        return RequestQueueCollectionClient(self._apify_client.request_queues())
+
+    @override
+    async def purge_on_start(self) -> None:
+        pass
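Each accessor above is a thin wrapper over the matching ApifyClientAsync sub-client, and every request queue client shares the single random client_key generated in __init__, so the platform can attribute queue operations to one client instance. A hypothetical wiring sketch — it assumes Configuration accepts its token and api_base_url fields as constructor keywords (the diff only shows them being read) and that the package __init__.py (three added lines, not shown here) re-exports ApifyStorageClient:

from apify._configuration import Configuration
from apify.apify_storage_client import ApifyStorageClient

# Assumption: Configuration is constructed with keyword fields; only the
# reads of .token, .api_base_url and .api_public_base_url appear in the diff.
config = Configuration(token='my-apify-token', api_base_url='https://api.apify.com')
storage = ApifyStorageClient(configuration=config)

# Accessors are cheap wrappers; no API call happens until the returned
# client is actually used.
dataset_client = storage.dataset('my-dataset-id')
request_queue_client = storage.request_queue('my-queue-id')

purge_on_start is a deliberate no-op here, presumably because the platform itself provisions fresh default storages for each run.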