apify 1.7.1b1-py3-none-any.whl → 2.2.0b14-py3-none-any.whl

This diff shows the content of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of apify might be problematic.
Files changed (62)
  1. apify/__init__.py +19 -4
  2. apify/_actor.py +1030 -0
  3. apify/_configuration.py +370 -0
  4. apify/_consts.py +10 -0
  5. apify/_crypto.py +31 -27
  6. apify/_models.py +117 -0
  7. apify/_platform_event_manager.py +231 -0
  8. apify/_proxy_configuration.py +320 -0
  9. apify/_utils.py +18 -484
  10. apify/apify_storage_client/__init__.py +3 -0
  11. apify/apify_storage_client/_apify_storage_client.py +68 -0
  12. apify/apify_storage_client/_dataset_client.py +190 -0
  13. apify/apify_storage_client/_dataset_collection_client.py +51 -0
  14. apify/apify_storage_client/_key_value_store_client.py +94 -0
  15. apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
  16. apify/apify_storage_client/_request_queue_client.py +176 -0
  17. apify/apify_storage_client/_request_queue_collection_client.py +51 -0
  18. apify/apify_storage_client/py.typed +0 -0
  19. apify/log.py +22 -105
  20. apify/scrapy/__init__.py +11 -3
  21. apify/scrapy/middlewares/__init__.py +3 -1
  22. apify/scrapy/middlewares/apify_proxy.py +29 -27
  23. apify/scrapy/middlewares/py.typed +0 -0
  24. apify/scrapy/pipelines/__init__.py +3 -1
  25. apify/scrapy/pipelines/actor_dataset_push.py +6 -3
  26. apify/scrapy/pipelines/py.typed +0 -0
  27. apify/scrapy/py.typed +0 -0
  28. apify/scrapy/requests.py +60 -58
  29. apify/scrapy/scheduler.py +28 -19
  30. apify/scrapy/utils.py +10 -32
  31. apify/storages/__init__.py +4 -10
  32. apify/storages/_request_list.py +150 -0
  33. apify/storages/py.typed +0 -0
  34. apify-2.2.0b14.dist-info/METADATA +211 -0
  35. apify-2.2.0b14.dist-info/RECORD +38 -0
  36. {apify-1.7.1b1.dist-info → apify-2.2.0b14.dist-info}/WHEEL +1 -2
  37. apify/_memory_storage/__init__.py +0 -3
  38. apify/_memory_storage/file_storage_utils.py +0 -71
  39. apify/_memory_storage/memory_storage_client.py +0 -219
  40. apify/_memory_storage/resource_clients/__init__.py +0 -19
  41. apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
  42. apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
  43. apify/_memory_storage/resource_clients/dataset.py +0 -452
  44. apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
  45. apify/_memory_storage/resource_clients/key_value_store.py +0 -533
  46. apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
  47. apify/_memory_storage/resource_clients/request_queue.py +0 -466
  48. apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
  49. apify/actor.py +0 -1351
  50. apify/config.py +0 -127
  51. apify/consts.py +0 -67
  52. apify/event_manager.py +0 -236
  53. apify/proxy_configuration.py +0 -365
  54. apify/storages/base_storage.py +0 -181
  55. apify/storages/dataset.py +0 -494
  56. apify/storages/key_value_store.py +0 -257
  57. apify/storages/request_queue.py +0 -602
  58. apify/storages/storage_client_manager.py +0 -72
  59. apify-1.7.1b1.dist-info/METADATA +0 -149
  60. apify-1.7.1b1.dist-info/RECORD +0 -41
  61. apify-1.7.1b1.dist-info/top_level.txt +0 -1
  62. {apify-1.7.1b1.dist-info → apify-2.2.0b14.dist-info}/LICENSE +0 -0
apify/_utils.py CHANGED
@@ -1,67 +1,9 @@
  from __future__ import annotations
 
- import asyncio
  import builtins
- import contextlib
- import functools
- import inspect
- import json
- import mimetypes
- import os
- import re
  import sys
- import time
- from base64 import b64encode
- from collections import OrderedDict
- from collections.abc import MutableMapping
- from datetime import datetime, timezone
- from hashlib import sha256
  from importlib import metadata
- from logging import getLogger
- from typing import (
-     Any,
-     Callable,
-     Generic,
-     ItemsView,
-     Iterator,
-     NoReturn,
-     TypeVar,
-     ValuesView,
-     cast,
-     overload,
- )
- from typing import OrderedDict as OrderedDictType
- from urllib.parse import parse_qsl, urlencode, urlparse
-
- import aioshutil
- import psutil
- from aiofiles import ospath
- from aiofiles.os import remove, rename
- from apify_shared.consts import (
-     BOOL_ENV_VARS,
-     BOOL_ENV_VARS_TYPE,
-     DATETIME_ENV_VARS,
-     DATETIME_ENV_VARS_TYPE,
-     FLOAT_ENV_VARS,
-     FLOAT_ENV_VARS_TYPE,
-     INTEGER_ENV_VARS,
-     INTEGER_ENV_VARS_TYPE,
-     STRING_ENV_VARS_TYPE,
-     ActorEnvVars,
-     ApifyEnvVars,
- )
- from apify_shared.utils import (
-     ignore_docs,
-     is_content_type_json,
-     is_content_type_text,
-     is_content_type_xml,
-     maybe_extract_enum_member_value,
- )
-
- from apify.consts import REQUEST_ID_LENGTH, StorageTypes
-
- T = TypeVar('T')
- logger = getLogger(__name__)
+ from typing import Callable, Literal
 
 
  def get_system_info() -> dict:
@@ -70,6 +12,7 @@ def get_system_info() -> dict:
      system_info: dict[str, str | bool] = {
          'apify_sdk_version': metadata.version('apify'),
          'apify_client_version': metadata.version('apify-client'),
+         'crawlee_version': metadata.version('crawlee'),
          'python_version': python_version,
          'os': sys.platform,
      }
@@ -80,445 +23,36 @@ def get_system_info() -> dict:
      return system_info
 
 
- DualPropertyType = TypeVar('DualPropertyType')
- DualPropertyOwner = TypeVar('DualPropertyOwner')
-
-
- @ignore_docs
- class dualproperty(Generic[DualPropertyType]):  # noqa: N801
-     """Descriptor combining `property` and `classproperty`.
-
-     When accessing the decorated attribute on an instance, it calls the getter with the instance as the first argument,
-     and when accessing it on a class, it calls the getter with the class as the first argument.
-     """
-
-     def __init__(self: dualproperty, getter: Callable[..., DualPropertyType]) -> None:
-         """Initialize the dualproperty.
-
-         Args:
-             getter (Callable): The getter of the property.
-                 It should accept either an instance or a class as its first argument.
-         """
-         self.getter = getter
-
-     def __get__(self: dualproperty, obj: DualPropertyOwner | None, owner: type[DualPropertyOwner]) -> DualPropertyType:
-         """Call the getter with the right object.
-
-         Args:
-             obj (T | None): The instance of class T on which the getter will be called
-             owner (type[T]): The class object of class T on which the getter will be called, if obj is None
-
-         Returns:
-             The result of the getter.
-         """
-         val = self.getter(obj or owner)
-         return cast(DualPropertyType, val)
-
-
- @overload
- def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE) -> bool | None:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE, default: bool) -> bool:  # noqa: FBT001
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE) -> datetime | str | None:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE, default: datetime) -> datetime | str:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE) -> float | None:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE, default: float) -> float:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE) -> int | None:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE, default: int) -> int:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE, default: str) -> str:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE) -> str | None:
-     ...
-
-
- @overload
- def fetch_and_parse_env_var(env_var: ActorEnvVars | ApifyEnvVars) -> Any:
-     ...
-
-
- def fetch_and_parse_env_var(env_var: Any, default: Any = None) -> Any:
-     env_var_name = str(maybe_extract_enum_member_value(env_var))
-
-     val = os.getenv(env_var_name)
-     if not val:
-         return default
-
-     if env_var in BOOL_ENV_VARS:
-         return maybe_parse_bool(val)
-     if env_var in FLOAT_ENV_VARS:
-         parsed_float = maybe_parse_float(val)
-         if parsed_float is None:
-             return default
-         return parsed_float
-     if env_var in INTEGER_ENV_VARS:
-         parsed_int = maybe_parse_int(val)
-         if parsed_int is None:
-             return default
-         return parsed_int
-     if env_var in DATETIME_ENV_VARS:
-         return maybe_parse_datetime(val)
-     return val
-
-
- def get_cpu_usage_percent() -> float:
-     return psutil.cpu_percent()
-
-
- def get_memory_usage_bytes() -> int:
-     current_process = psutil.Process(os.getpid())
-     mem = int(current_process.memory_info().rss or 0)
-     for child in current_process.children(recursive=True):
-         with contextlib.suppress(psutil.NoSuchProcess):
-             mem += int(child.memory_info().rss or 0)
-     return mem
-
-
- def maybe_parse_bool(val: str | None) -> bool:
-     if val in {'true', 'True', '1'}:
-         return True
-     return False
-
-
- def maybe_parse_datetime(val: str) -> datetime | str:
-     try:
-         return datetime.strptime(val, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
-     except ValueError:
-         return val
-
-
- def maybe_parse_float(val: str) -> float | None:
-     try:
-         return float(val)
-     except ValueError:
-         return None
-
-
- def maybe_parse_int(val: str) -> int | None:
-     try:
-         return int(val)
-     except ValueError:
-         return None
-
-
- async def run_func_at_interval_async(func: Callable, interval_secs: float) -> None:
-     started_at = time.perf_counter()
-     sleep_until = started_at
-     while True:
-         now = time.perf_counter()
-         sleep_until += interval_secs
-         while sleep_until < now:
-             sleep_until += interval_secs
-
-         sleep_for_secs = sleep_until - now
-         await asyncio.sleep(sleep_for_secs)
-
-         res = func()
-         if inspect.isawaitable(res):
-             await res
-
-
- async def force_remove(filename: str) -> None:
-     """JS-like rm(filename, { force: true })."""
-     with contextlib.suppress(FileNotFoundError):
-         await remove(filename)
-
-
- def raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn:  # noqa: A002
-     client_type = maybe_extract_enum_member_value(client_type)
-     raise ValueError(f'{client_type} with id "{id}" does not exist.')
-
-
- def raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn:
-     client_type = maybe_extract_enum_member_value(client_type)
-     raise ValueError(f'{client_type} with {key_name} "{value}" already exists.')
-
-
- def guess_file_extension(content_type: str) -> str | None:
-     """Guess the file extension based on content type."""
-     # e.g. mimetypes.guess_extension('application/json ') does not work...
-     actual_content_type = content_type.split(';')[0].strip()
-
-     # mimetypes.guess_extension returns 'xsl' in this case, because 'application/xxx' is "structured"
-     # ('text/xml' would be "unstructured" and return 'xml')
-     # we have to explicitly override it here
-     if actual_content_type == 'application/xml':
-         return 'xml'
-
-     # Guess the extension from the mime type
-     ext = mimetypes.guess_extension(actual_content_type)
-
-     # Remove the leading dot if extension successfully parsed
-     return ext[1:] if ext is not None else ext
-
-
- def maybe_parse_body(body: bytes, content_type: str) -> Any:
-     if is_content_type_json(content_type):
-         return json.loads(body.decode('utf-8'))  # Returns any
-     if is_content_type_xml(content_type) or is_content_type_text(content_type):
-         return body.decode('utf-8')
-     return body
-
-
- def unique_key_to_request_id(unique_key: str) -> str:
-     """Generate request ID based on unique key in a deterministic way."""
-     request_id = re.sub(r'(\+|\/|=)', '', b64encode(sha256(unique_key.encode('utf-8')).digest()).decode('utf-8'))
-     return request_id[:REQUEST_ID_LENGTH] if len(request_id) > REQUEST_ID_LENGTH else request_id
-
-
- async def force_rename(src_dir: str, dst_dir: str) -> None:
-     """Rename a directory. Checks for existence of soruce directory and removes destination directory if it exists."""
-     # Make sure source directory exists
-     if await ospath.exists(src_dir):
-         # Remove destination directory if it exists
-         if await ospath.exists(dst_dir):
-             await aioshutil.rmtree(dst_dir, ignore_errors=True)
-         await rename(src_dir, dst_dir)
-
-
- ImplementationType = TypeVar('ImplementationType', bound=Callable)
- MetadataType = TypeVar('MetadataType', bound=Callable)
-
-
- def wrap_internal(implementation: ImplementationType, metadata_source: MetadataType) -> MetadataType:
-     @functools.wraps(metadata_source)
-     def wrapper(*args: Any, **kwargs: Any) -> Any:
-         return implementation(*args, **kwargs)
-
-     return cast(MetadataType, wrapper)
-
-
- @ignore_docs
- class LRUCache(MutableMapping, Generic[T]):
-     """Attempt to reimplement LRUCache from `@apify/datastructures` using `OrderedDict`."""
-
-     _cache: OrderedDictType[str, T]
-
-     _max_length: int
-
-     def __init__(self: LRUCache, max_length: int) -> None:
-         """Create a LRUCache with a specific max_length."""
-         self._cache = OrderedDict()
-         self._max_length = max_length
-
-     def __getitem__(self: LRUCache, key: str) -> T:
-         """Get an item from the cache. Move it to the end if present."""
-         val = self._cache[key]
-         # No 'key in cache' condition since the previous line would raise KeyError
-         self._cache.move_to_end(key)
-         return cast(T, val)
-
-     # Sadly TS impl returns bool indicating whether the key was already present or not
-     def __setitem__(self: LRUCache, key: str, value: T) -> None:
-         """Add an item to the cache. Remove least used item if max_length exceeded."""
-         self._cache[key] = value
-         if len(self._cache) > self._max_length:
-             self._cache.popitem(last=False)
-
-     def __delitem__(self: LRUCache, key: str) -> None:
-         """Remove an item from the cache."""
-         del self._cache[key]
-
-     def __iter__(self: LRUCache) -> Iterator[str]:
-         """Iterate over the keys of the cache in order of insertion."""
-         return self._cache.__iter__()
-
-     def __len__(self: LRUCache) -> int:
-         """Get the number of items in the cache."""
-         return len(self._cache)
-
-     def values(self: LRUCache) -> ValuesView[T]:  # Needed so we don't mutate the cache by __getitem__
-         """Iterate over the values in the cache in order of insertion."""
-         return self._cache.values()
-
-     def items(self: LRUCache) -> ItemsView[str, T]:  # Needed so we don't mutate the cache by __getitem__
-         """Iterate over the pairs of (key, value) in the cache in order of insertion."""
-         return self._cache.items()
-
-
  def is_running_in_ipython() -> bool:
      return getattr(builtins, '__IPYTHON__', False)
 
 
- @overload
- def budget_ow(value: str | float | bool, predicate: tuple[type, bool], value_name: str) -> None:
-     ...
-
-
- @overload
- def budget_ow(value: dict, predicate: dict[str, tuple[type, bool]]) -> None:
-     ...
-
-
- def budget_ow(
-     value: dict | str | float | bool,
-     predicate: dict[str, tuple[type, bool]] | tuple[type, bool],
-     value_name: str | None = None,
- ) -> None:
-     """Budget version of ow."""
-
-     def validate_single(field_value: Any, expected_type: type, required: bool, name: str) -> None:  # noqa: FBT001
-         if field_value is None and required:
-             raise ValueError(f'"{name}" is required!')
-         if (field_value is not None or required) and not isinstance(field_value, expected_type):
-             raise ValueError(f'"{name}" must be of type "{expected_type.__name__}" but it is "{type(field_value).__name__}"!')
-
-     # Validate object
-     if isinstance(value, dict) and isinstance(predicate, dict):
-         for key, (field_type, required) in predicate.items():
-             field_value = value.get(key)
-             validate_single(field_value, field_type, required, key)
-     # Validate "primitive"
-     elif isinstance(value, (int, str, float, bool)) and isinstance(predicate, tuple) and value_name is not None:
-         field_type, required = predicate
-         validate_single(value, field_type, required, value_name)
-     else:
-         raise ValueError('Wrong input!')
-
-
- PARSE_DATE_FIELDS_MAX_DEPTH = 3
- PARSE_DATE_FIELDS_KEY_SUFFIX = 'At'
- ListOrDictOrAny = TypeVar('ListOrDictOrAny', list, dict, Any)
+ GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Errors', 'Functions']
 
 
- def compute_short_hash(data: bytes, *, length: int = 8) -> str:
-     """Computes a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.
+ def docs_group(group_name: GroupName) -> Callable:  # noqa: ARG001
+     """Decorator to mark symbols for rendering and grouping in documentation.
 
-     Args:
-         data: The binary data to be hashed.
-         length: The length of the hash to be returned.
-
-     Returns:
-         A substring (prefix) of the hexadecimal hash of the data.
-     """
-     hash_object = sha256(data)
-     return hash_object.hexdigest()[:length]
-
-
- def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
-     """Normalizes a URL.
-
-     This function cleans and standardizes a URL by removing leading and trailing whitespaces,
-     converting the scheme and netloc to lower case, stripping unwanted tracking parameters
-     (specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,
-     and optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally
-     identical but differ in trivial ways (such as parameter order or casing) are treated as the same.
-
-     Args:
-         url: The URL to be normalized.
-         keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained.
-
-     Returns:
-         A string containing the normalized URL.
+     This decorator is used purely for documentation purposes and does not alter the behavior
+     of the decorated callable.
      """
-     # Parse the URL
-     parsed_url = urlparse(url.strip())
-     search_params = dict(parse_qsl(parsed_url.query))  # Convert query to a dict
 
-     # Remove any 'utm_' parameters
-     search_params = {k: v for k, v in search_params.items() if not k.startswith('utm_')}
+     def wrapper(func: Callable) -> Callable:
+         return func
 
-     # Construct the new query string
-     sorted_keys = sorted(search_params.keys())
-     sorted_query = urlencode([(k, search_params[k]) for k in sorted_keys])
+     return wrapper
 
-     # Construct the final URL
-     new_url = (
-         parsed_url._replace(
-             query=sorted_query,
-             scheme=parsed_url.scheme,
-             netloc=parsed_url.netloc,
-             path=parsed_url.path.rstrip('/'),
-         )
-         .geturl()
-         .lower()
-     )
 
-     # Retain the URL fragment if required
-     if not keep_url_fragment:
-         new_url = new_url.split('#')[0]
+ def docs_name(symbol_name: str) -> Callable:  # noqa: ARG001
+     """Decorator for renaming symbols in documentation.
 
-     return new_url
+     This changes the rendered name of the symbol only in the rendered web documentation.
 
-
- def compute_unique_key(
-     url: str,
-     method: str = 'GET',
-     payload: bytes | None = None,
-     *,
-     keep_url_fragment: bool = False,
-     use_extended_unique_key: bool = False,
- ) -> str:
-     """Computes a unique key for caching & deduplication of requests.
-
-     This function computes a unique key by normalizing the provided URL and method.
-     If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and
-     included in the key. Otherwise, the unique key is just the normalized URL.
-
-     Args:
-         url: The request URL.
-         method: The HTTP method, defaults to 'GET'.
-         payload: The request payload, defaults to None.
-         keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
-         use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
-
-     Returns:
-         A string representing the unique key for the request.
+     This decorator is used purely for documentation purposes and does not alter the behavior
+     of the decorated callable.
      """
-     # Normalize the URL and method.
-     try:
-         normalized_url = normalize_url(url, keep_url_fragment=keep_url_fragment)
-     except Exception as exc:
-         logger.warning(f'Failed to normalize URL: {exc}')
-         normalized_url = url
-
-     normalized_method = method.upper()
-
-     # Compute and return the extended unique key if required.
-     if use_extended_unique_key:
-         payload_hash = compute_short_hash(payload) if payload else ''
-         return f'{normalized_method}({payload_hash}):{normalized_url}'
 
-     # Log information if there is a non-GET request with a payload.
-     if normalized_method != 'GET' and payload:
-         logger.info(
-             f'We have encountered a {normalized_method} Request with a payload. This is fine. Just letting you know '
-             'that if your requests point to the same URL and differ only in method and payload, you should consider '
-             'using the "use_extended_unique_key" option.'
-         )
+     def wrapper(func: Callable) -> Callable:
+         return func
 
-     # Return the normalized URL as the unique key.
-     return normalized_url
+     return wrapper
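After this rewrite, _utils.py keeps only documentation helpers. As a minimal sketch of how the two new decorators behave at runtime (the MyStorage class below is hypothetical, used only for illustration):

    from apify._utils import docs_group, docs_name

    @docs_group('Classes')        # groups the symbol under 'Classes' in the rendered docs
    @docs_name('PrettyStorage')   # renders it under a different name in the docs
    class MyStorage:
        """A hypothetical class; both decorators are runtime no-ops."""

    # Both decorators return the decorated object unchanged.
    assert MyStorage.__name__ == 'MyStorage'

Since the decorators only tag symbols for the documentation tooling, they add no runtime behavior or overhead.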
apify/apify_storage_client/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from apify.apify_storage_client._apify_storage_client import ApifyStorageClient
+
+ __all__ = ['ApifyStorageClient']
apify/apify_storage_client/_apify_storage_client.py ADDED
@@ -0,0 +1,68 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ from typing_extensions import override
+
+ from apify_client import ApifyClientAsync
+ from crawlee._utils.crypto import crypto_random_object_id
+ from crawlee.storage_clients import BaseStorageClient
+
+ from apify._utils import docs_group
+ from apify.apify_storage_client._dataset_client import DatasetClient
+ from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
+ from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient
+ from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient
+ from apify.apify_storage_client._request_queue_client import RequestQueueClient
+ from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
+
+ if TYPE_CHECKING:
+     from apify._configuration import Configuration
+
+
+ @docs_group('Classes')
+ class ApifyStorageClient(BaseStorageClient):
+     """A storage client implementation based on the Apify platform storage."""
+
+     def __init__(self, *, configuration: Configuration) -> None:
+         self._client_key = crypto_random_object_id()
+         self._apify_client = ApifyClientAsync(
+             token=configuration.token,
+             api_url=configuration.api_base_url,
+             max_retries=8,
+             min_delay_between_retries_millis=500,
+             timeout_secs=360,
+         )
+         self._configuration = configuration
+
+     @classmethod
+     def from_config(cls, config: Configuration) -> ApifyStorageClient:
+         return cls(configuration=config)
+
+     @override
+     def dataset(self, id: str) -> DatasetClient:
+         return DatasetClient(self._apify_client.dataset(id))
+
+     @override
+     def datasets(self) -> DatasetCollectionClient:
+         return DatasetCollectionClient(self._apify_client.datasets())
+
+     @override
+     def key_value_store(self, id: str) -> KeyValueStoreClient:
+         return KeyValueStoreClient(self._apify_client.key_value_store(id), self._configuration.api_public_base_url)
+
+     @override
+     def key_value_stores(self) -> KeyValueStoreCollectionClient:
+         return KeyValueStoreCollectionClient(self._apify_client.key_value_stores())
+
+     @override
+     def request_queue(self, id: str) -> RequestQueueClient:
+         return RequestQueueClient(self._apify_client.request_queue(id, client_key=self._client_key))
+
+     @override
+     def request_queues(self) -> RequestQueueCollectionClient:
+         return RequestQueueCollectionClient(self._apify_client.request_queues())
+
+     @override
+     async def purge_on_start(self) -> None:
+         pass
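For orientation, a minimal usage sketch of the new storage client. It assumes the standard Apify environment variables (such as APIFY_TOKEN) are set, since Configuration reads its values from the environment; 'my-dataset-id' is a placeholder:

    from apify._configuration import Configuration
    from apify.apify_storage_client import ApifyStorageClient

    # Build the storage client from a Configuration instance; the token and
    # API base URL come from the configuration.
    storage_client = ApifyStorageClient.from_config(Configuration())

    # Each accessor returns a thin wrapper around the corresponding
    # apify-client resource client; no API call happens until it is used.
    dataset_client = storage_client.dataset('my-dataset-id')

Note that the client key generated in __init__ is passed only to request queue clients, so the platform can distinguish requests made by this particular client instance.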