apify 1.7.0b1__py3-none-any.whl → 2.2.0b14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- apify/__init__.py +19 -4
- apify/_actor.py +1030 -0
- apify/_configuration.py +370 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +31 -27
- apify/_models.py +117 -0
- apify/_platform_event_manager.py +231 -0
- apify/_proxy_configuration.py +320 -0
- apify/_utils.py +18 -484
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +68 -0
- apify/apify_storage_client/_dataset_client.py +190 -0
- apify/apify_storage_client/_dataset_collection_client.py +51 -0
- apify/apify_storage_client/_key_value_store_client.py +94 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
- apify/apify_storage_client/_request_queue_client.py +176 -0
- apify/apify_storage_client/_request_queue_collection_client.py +51 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +22 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +29 -27
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +6 -3
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +60 -58
- apify/scrapy/scheduler.py +28 -19
- apify/scrapy/utils.py +10 -32
- apify/storages/__init__.py +4 -10
- apify/storages/_request_list.py +150 -0
- apify/storages/py.typed +0 -0
- apify-2.2.0b14.dist-info/METADATA +211 -0
- apify-2.2.0b14.dist-info/RECORD +38 -0
- {apify-1.7.0b1.dist-info → apify-2.2.0b14.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1351
- apify/config.py +0 -127
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.0b1.dist-info/METADATA +0 -149
- apify-1.7.0b1.dist-info/RECORD +0 -41
- apify-1.7.0b1.dist-info/top_level.txt +0 -1
- {apify-1.7.0b1.dist-info → apify-2.2.0b14.dist-info}/LICENSE +0 -0
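Despite the extensive internal reshuffle visible above (the 1.x `apify/actor.py`, `apify/config.py`, and `apify/_memory_storage/` modules are removed, replaced by private `_actor.py` and `_configuration.py` modules plus a crawlee-backed `apify_storage_client` package), the package root keeps re-exporting the public API. A minimal sanity check, assuming your code imports only from the package root:

import asyncio

# Works on both apify 1.7.x and 2.2.x: Actor is re-exported from the
# package root, so code that avoids the private modules keeps working.
from apify import Actor


async def main() -> None:
    async with Actor:
        await Actor.push_data([{'hello': 'world'}])


asyncio.run(main())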
apify/_utils.py
CHANGED
@@ -1,67 +1,9 @@
 from __future__ import annotations

-import asyncio
 import builtins
-import contextlib
-import functools
-import inspect
-import json
-import mimetypes
-import os
-import re
 import sys
-import time
-from base64 import b64encode
-from collections import OrderedDict
-from collections.abc import MutableMapping
-from datetime import datetime, timezone
-from hashlib import sha256
 from importlib import metadata
-from logging import getLogger
-from typing import (
-    Any,
-    Callable,
-    Generic,
-    ItemsView,
-    Iterator,
-    NoReturn,
-    TypeVar,
-    ValuesView,
-    cast,
-    overload,
-)
-from typing import OrderedDict as OrderedDictType
-from urllib.parse import parse_qsl, urlencode, urlparse
-
-import aioshutil
-import psutil
-from aiofiles import ospath
-from aiofiles.os import remove, rename
-from apify_shared.consts import (
-    BOOL_ENV_VARS,
-    BOOL_ENV_VARS_TYPE,
-    DATETIME_ENV_VARS,
-    DATETIME_ENV_VARS_TYPE,
-    FLOAT_ENV_VARS,
-    FLOAT_ENV_VARS_TYPE,
-    INTEGER_ENV_VARS,
-    INTEGER_ENV_VARS_TYPE,
-    STRING_ENV_VARS_TYPE,
-    ActorEnvVars,
-    ApifyEnvVars,
-)
-from apify_shared.utils import (
-    ignore_docs,
-    is_content_type_json,
-    is_content_type_text,
-    is_content_type_xml,
-    maybe_extract_enum_member_value,
-)
-
-from apify.consts import REQUEST_ID_LENGTH, StorageTypes
-
-T = TypeVar('T')
-logger = getLogger(__name__)
+from typing import Callable, Literal


 def get_system_info() -> dict:
@@ -70,6 +12,7 @@ def get_system_info() -> dict:
     system_info: dict[str, str | bool] = {
         'apify_sdk_version': metadata.version('apify'),
         'apify_client_version': metadata.version('apify-client'),
+        'crawlee_version': metadata.version('crawlee'),
         'python_version': python_version,
         'os': sys.platform,
     }
@@ -80,445 +23,36 @@ def get_system_info() -> dict:
     return system_info


-DualPropertyType = TypeVar('DualPropertyType')
-DualPropertyOwner = TypeVar('DualPropertyOwner')
-
-
-@ignore_docs
-class dualproperty(Generic[DualPropertyType]):  # noqa: N801
-    """Descriptor combining `property` and `classproperty`.
-
-    When accessing the decorated attribute on an instance, it calls the getter with the instance as the first argument,
-    and when accessing it on a class, it calls the getter with the class as the first argument.
-    """
-
-    def __init__(self: dualproperty, getter: Callable[..., DualPropertyType]) -> None:
-        """Initialize the dualproperty.
-
-        Args:
-            getter (Callable): The getter of the property.
-                It should accept either an instance or a class as its first argument.
-        """
-        self.getter = getter
-
-    def __get__(self: dualproperty, obj: DualPropertyOwner | None, owner: type[DualPropertyOwner]) -> DualPropertyType:
-        """Call the getter with the right object.
-
-        Args:
-            obj (T | None): The instance of class T on which the getter will be called
-            owner (type[T]): The class object of class T on which the getter will be called, if obj is None
-
-        Returns:
-            The result of the getter.
-        """
-        val = self.getter(obj or owner)
-        return cast(DualPropertyType, val)
-
-
-@overload
-def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE) -> bool | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE, default: bool) -> bool:  # noqa: FBT001
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE) -> datetime | str | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE, default: datetime) -> datetime | str:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE) -> float | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE, default: float) -> float:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE) -> int | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE, default: int) -> int:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE, default: str) -> str:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE) -> str | None:
-    ...
-
-
-@overload
-def fetch_and_parse_env_var(env_var: ActorEnvVars | ApifyEnvVars) -> Any:
-    ...
-
-
-def fetch_and_parse_env_var(env_var: Any, default: Any = None) -> Any:
-    env_var_name = str(maybe_extract_enum_member_value(env_var))
-
-    val = os.getenv(env_var_name)
-    if not val:
-        return default
-
-    if env_var in BOOL_ENV_VARS:
-        return maybe_parse_bool(val)
-    if env_var in FLOAT_ENV_VARS:
-        parsed_float = maybe_parse_float(val)
-        if parsed_float is None:
-            return default
-        return parsed_float
-    if env_var in INTEGER_ENV_VARS:
-        parsed_int = maybe_parse_int(val)
-        if parsed_int is None:
-            return default
-        return parsed_int
-    if env_var in DATETIME_ENV_VARS:
-        return maybe_parse_datetime(val)
-    return val
-
-
-def get_cpu_usage_percent() -> float:
-    return psutil.cpu_percent()
-
-
-def get_memory_usage_bytes() -> int:
-    current_process = psutil.Process(os.getpid())
-    mem = int(current_process.memory_info().rss or 0)
-    for child in current_process.children(recursive=True):
-        with contextlib.suppress(psutil.NoSuchProcess):
-            mem += int(child.memory_info().rss or 0)
-    return mem
-
-
-def maybe_parse_bool(val: str | None) -> bool:
-    if val in {'true', 'True', '1'}:
-        return True
-    return False
-
-
-def maybe_parse_datetime(val: str) -> datetime | str:
-    try:
-        return datetime.strptime(val, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
-    except ValueError:
-        return val
-
-
-def maybe_parse_float(val: str) -> float | None:
-    try:
-        return float(val)
-    except ValueError:
-        return None
-
-
-def maybe_parse_int(val: str) -> int | None:
-    try:
-        return int(val)
-    except ValueError:
-        return None
-
-
-async def run_func_at_interval_async(func: Callable, interval_secs: float) -> None:
-    started_at = time.perf_counter()
-    sleep_until = started_at
-    while True:
-        now = time.perf_counter()
-        sleep_until += interval_secs
-        while sleep_until < now:
-            sleep_until += interval_secs
-
-        sleep_for_secs = sleep_until - now
-        await asyncio.sleep(sleep_for_secs)
-
-        res = func()
-        if inspect.isawaitable(res):
-            await res
-
-
-async def force_remove(filename: str) -> None:
-    """JS-like rm(filename, { force: true })."""
-    with contextlib.suppress(FileNotFoundError):
-        await remove(filename)
-
-
-def raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn:  # noqa: A002
-    client_type = maybe_extract_enum_member_value(client_type)
-    raise ValueError(f'{client_type} with id "{id}" does not exist.')
-
-
-def raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn:
-    client_type = maybe_extract_enum_member_value(client_type)
-    raise ValueError(f'{client_type} with {key_name} "{value}" already exists.')
-
-
-def guess_file_extension(content_type: str) -> str | None:
-    """Guess the file extension based on content type."""
-    # e.g. mimetypes.guess_extension('application/json ') does not work...
-    actual_content_type = content_type.split(';')[0].strip()
-
-    # mimetypes.guess_extension returns 'xsl' in this case, because 'application/xxx' is "structured"
-    # ('text/xml' would be "unstructured" and return 'xml')
-    # we have to explicitly override it here
-    if actual_content_type == 'application/xml':
-        return 'xml'
-
-    # Guess the extension from the mime type
-    ext = mimetypes.guess_extension(actual_content_type)
-
-    # Remove the leading dot if extension successfully parsed
-    return ext[1:] if ext is not None else ext
-
-
-def maybe_parse_body(body: bytes, content_type: str) -> Any:
-    if is_content_type_json(content_type):
-        return json.loads(body.decode('utf-8'))  # Returns any
-    if is_content_type_xml(content_type) or is_content_type_text(content_type):
-        return body.decode('utf-8')
-    return body
-
-
-def unique_key_to_request_id(unique_key: str) -> str:
-    """Generate request ID based on unique key in a deterministic way."""
-    request_id = re.sub(r'(\+|\/|=)', '', b64encode(sha256(unique_key.encode('utf-8')).digest()).decode('utf-8'))
-    return request_id[:REQUEST_ID_LENGTH] if len(request_id) > REQUEST_ID_LENGTH else request_id
-
-
-async def force_rename(src_dir: str, dst_dir: str) -> None:
-    """Rename a directory. Checks for existence of soruce directory and removes destination directory if it exists."""
-    # Make sure source directory exists
-    if await ospath.exists(src_dir):
-        # Remove destination directory if it exists
-        if await ospath.exists(dst_dir):
-            await aioshutil.rmtree(dst_dir, ignore_errors=True)
-        await rename(src_dir, dst_dir)
-
-
-ImplementationType = TypeVar('ImplementationType', bound=Callable)
-MetadataType = TypeVar('MetadataType', bound=Callable)
-
-
-def wrap_internal(implementation: ImplementationType, metadata_source: MetadataType) -> MetadataType:
-    @functools.wraps(metadata_source)
-    def wrapper(*args: Any, **kwargs: Any) -> Any:
-        return implementation(*args, **kwargs)
-
-    return cast(MetadataType, wrapper)
-
-
-@ignore_docs
-class LRUCache(MutableMapping, Generic[T]):
-    """Attempt to reimplement LRUCache from `@apify/datastructures` using `OrderedDict`."""
-
-    _cache: OrderedDictType[str, T]
-
-    _max_length: int
-
-    def __init__(self: LRUCache, max_length: int) -> None:
-        """Create a LRUCache with a specific max_length."""
-        self._cache = OrderedDict()
-        self._max_length = max_length
-
-    def __getitem__(self: LRUCache, key: str) -> T:
-        """Get an item from the cache. Move it to the end if present."""
-        val = self._cache[key]
-        # No 'key in cache' condition since the previous line would raise KeyError
-        self._cache.move_to_end(key)
-        return cast(T, val)
-
-    # Sadly TS impl returns bool indicating whether the key was already present or not
-    def __setitem__(self: LRUCache, key: str, value: T) -> None:
-        """Add an item to the cache. Remove least used item if max_length exceeded."""
-        self._cache[key] = value
-        if len(self._cache) > self._max_length:
-            self._cache.popitem(last=False)
-
-    def __delitem__(self: LRUCache, key: str) -> None:
-        """Remove an item from the cache."""
-        del self._cache[key]
-
-    def __iter__(self: LRUCache) -> Iterator[str]:
-        """Iterate over the keys of the cache in order of insertion."""
-        return self._cache.__iter__()
-
-    def __len__(self: LRUCache) -> int:
-        """Get the number of items in the cache."""
-        return len(self._cache)
-
-    def values(self: LRUCache) -> ValuesView[T]:  # Needed so we don't mutate the cache by __getitem__
-        """Iterate over the values in the cache in order of insertion."""
-        return self._cache.values()
-
-    def items(self: LRUCache) -> ItemsView[str, T]:  # Needed so we don't mutate the cache by __getitem__
-        """Iterate over the pairs of (key, value) in the cache in order of insertion."""
-        return self._cache.items()
-
-
 def is_running_in_ipython() -> bool:
     return getattr(builtins, '__IPYTHON__', False)


-@overload
-def budget_ow(value: str | float | bool, predicate: tuple[type, bool], value_name: str) -> None:
-    ...
-
-
-@overload
-def budget_ow(value: dict, predicate: dict[str, tuple[type, bool]]) -> None:
-    ...
-
-
-def budget_ow(
-    value: dict | str | float | bool,
-    predicate: dict[str, tuple[type, bool]] | tuple[type, bool],
-    value_name: str | None = None,
-) -> None:
-    """Budget version of ow."""
-
-    def validate_single(field_value: Any, expected_type: type, required: bool, name: str) -> None:  # noqa: FBT001
-        if field_value is None and required:
-            raise ValueError(f'"{name}" is required!')
-        if (field_value is not None or required) and not isinstance(field_value, expected_type):
-            raise ValueError(f'"{name}" must be of type "{expected_type.__name__}" but it is "{type(field_value).__name__}"!')
-
-    # Validate object
-    if isinstance(value, dict) and isinstance(predicate, dict):
-        for key, (field_type, required) in predicate.items():
-            field_value = value.get(key)
-            validate_single(field_value, field_type, required, key)
-    # Validate "primitive"
-    elif isinstance(value, (int, str, float, bool)) and isinstance(predicate, tuple) and value_name is not None:
-        field_type, required = predicate
-        validate_single(value, field_type, required, value_name)
-    else:
-        raise ValueError('Wrong input!')
-
-
-PARSE_DATE_FIELDS_MAX_DEPTH = 3
-PARSE_DATE_FIELDS_KEY_SUFFIX = 'At'
-ListOrDictOrAny = TypeVar('ListOrDictOrAny', list, dict, Any)
+GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Errors', 'Functions']


-def compute_short_hash(data: bytes, *, length: int = 8) -> str:
-    """Computes a hexadecimal SHA-256 hash of the provided data truncated to the specified length.
+def docs_group(group_name: GroupName) -> Callable:  # noqa: ARG001
+    """Decorator to mark symbols for rendering and grouping in documentation.

-    Args:
-        data: The binary data to be hashed.
-        length: The length of the hash to be returned.
-
-    Returns:
-        A substring (prefix) of the hexadecimal hash of the data.
-    """
-    hash_object = sha256(data)
-    return hash_object.hexdigest()[:length]
-
-
-def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
-    """Normalizes a URL.
-
-    This function cleans and standardizes a URL by removing leading and trailing whitespaces,
-    converting the scheme and netloc to lower case, stripping unwanted tracking parameters
-    (specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,
-    and optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally
-    identical but differ in trivial ways (such as parameter order or casing) are treated as the same.
-
-    Args:
-        url: The URL to be normalized.
-        keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained.
-
-    Returns:
-        A string containing the normalized URL.
+    This decorator is used purely for documentation purposes and does not alter the behavior
+    of the decorated callable.
     """
-    # Parse the URL
-    parsed_url = urlparse(url.strip())
-    search_params = dict(parse_qsl(parsed_url.query))  # Convert query to a dict

-    # Remove any 'utm_' parameters
-    search_params = {k: v for k, v in search_params.items() if not k.startswith('utm_')}
+    def wrapper(func: Callable) -> Callable:
+        return func

-    # Construct the new query string
-    sorted_keys = sorted(search_params.keys())
-    sorted_query = urlencode([(k, search_params[k]) for k in sorted_keys])
+    return wrapper

-    # Construct the final URL
-    new_url = (
-        parsed_url._replace(
-            query=sorted_query,
-            scheme=parsed_url.scheme,
-            netloc=parsed_url.netloc,
-            path=parsed_url.path.rstrip('/'),
-        )
-        .geturl()
-        .lower()
-    )

-
-    if not keep_url_fragment:
-        new_url = new_url.split('#')[0]
+def docs_name(symbol_name: str) -> Callable:  # noqa: ARG001
+    """Decorator for renaming symbols in documentation.

-    return new_url
+    This changes the rendered name of the symbol only in the rendered web documentation.

-
-def compute_unique_key(
-    url: str,
-    method: str = 'GET',
-    payload: bytes | None = None,
-    *,
-    keep_url_fragment: bool = False,
-    use_extended_unique_key: bool = False,
-) -> str:
-    """Computes a unique key for caching & deduplication of requests.
-
-    This function computes a unique key by normalizing the provided URL and method.
-    If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and
-    included in the key. Otherwise, the unique key is just the normalized URL.
-
-    Args:
-        url: The request URL.
-        method: The HTTP method, defaults to 'GET'.
-        payload: The request payload, defaults to None.
-        keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
-        use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
-
-    Returns:
-        A string representing the unique key for the request.
+    This decorator is used purely for documentation purposes and does not alter the behavior
+    of the decorated callable.
     """
-    # Normalize the URL and method.
-    try:
-        normalized_url = normalize_url(url, keep_url_fragment=keep_url_fragment)
-    except Exception as exc:
-        logger.warning(f'Failed to normalize URL: {exc}')
-        normalized_url = url
-
-    normalized_method = method.upper()
-
-    # Compute and return the extended unique key if required.
-    if use_extended_unique_key:
-        payload_hash = compute_short_hash(payload) if payload else ''
-        return f'{normalized_method}({payload_hash}):{normalized_url}'

-
-    if payload:
-        logger.info(
-            f'We have encountered a {normalized_method} Request with a payload. This is fine. Just letting you know '
-            'that if your requests point to the same URL and differ only in method and payload, you should consider '
-            'using the "use_extended_unique_key" option.'
-        )
+    def wrapper(func: Callable) -> Callable:
+        return func

-
-    return normalized_url
+    return wrapper
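As the diff above shows, the new `docs_group` and `docs_name` helpers that replace the removed utilities are identity decorators: `wrapper` hands the callable back untouched, so they exist solely as markers for the documentation renderer. A minimal sketch of applying them (the `ExampleCache` class and its rendered name are hypothetical):

from apify._utils import docs_group, docs_name


@docs_group('Classes')       # Grouped under "Classes" in the rendered docs.
@docs_name('ExampleCache')   # Hypothetical display name; runtime behavior is unchanged.
class _ExampleCache:
    """Illustrative class; both decorators return it unchanged."""


# The decorators are no-ops at runtime, so the real name is untouched.
assert _ExampleCache.__name__ == '_ExampleCache'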
apify/apify_storage_client/_apify_storage_client.py
ADDED
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from typing_extensions import override
+
+from apify_client import ApifyClientAsync
+from crawlee._utils.crypto import crypto_random_object_id
+from crawlee.storage_clients import BaseStorageClient
+
+from apify._utils import docs_group
+from apify.apify_storage_client._dataset_client import DatasetClient
+from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
+from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient
+from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient
+from apify.apify_storage_client._request_queue_client import RequestQueueClient
+from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
+
+if TYPE_CHECKING:
+    from apify._configuration import Configuration
+
+
+@docs_group('Classes')
+class ApifyStorageClient(BaseStorageClient):
+    """A storage client implementation based on the Apify platform storage."""
+
+    def __init__(self, *, configuration: Configuration) -> None:
+        self._client_key = crypto_random_object_id()
+        self._apify_client = ApifyClientAsync(
+            token=configuration.token,
+            api_url=configuration.api_base_url,
+            max_retries=8,
+            min_delay_between_retries_millis=500,
+            timeout_secs=360,
+        )
+        self._configuration = configuration
+
+    @classmethod
+    def from_config(cls, config: Configuration) -> ApifyStorageClient:
+        return cls(configuration=config)
+
+    @override
+    def dataset(self, id: str) -> DatasetClient:
+        return DatasetClient(self._apify_client.dataset(id))
+
+    @override
+    def datasets(self) -> DatasetCollectionClient:
+        return DatasetCollectionClient(self._apify_client.datasets())
+
+    @override
+    def key_value_store(self, id: str) -> KeyValueStoreClient:
+        return KeyValueStoreClient(self._apify_client.key_value_store(id), self._configuration.api_public_base_url)
+
+    @override
+    def key_value_stores(self) -> KeyValueStoreCollectionClient:
+        return KeyValueStoreCollectionClient(self._apify_client.key_value_stores())
+
+    @override
+    def request_queue(self, id: str) -> RequestQueueClient:
+        return RequestQueueClient(self._apify_client.request_queue(id, client_key=self._client_key))
+
+    @override
+    def request_queues(self) -> RequestQueueCollectionClient:
+        return RequestQueueCollectionClient(self._apify_client.request_queues())
+
+    @override
+    async def purge_on_start(self) -> None:
+        pass