apify 1.7.3b3__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (61)
  1. apify/__init__.py +19 -4
  2. apify/_actor.py +979 -0
  3. apify/_configuration.py +310 -0
  4. apify/_consts.py +10 -0
  5. apify/_crypto.py +29 -27
  6. apify/_models.py +110 -0
  7. apify/_platform_event_manager.py +222 -0
  8. apify/_proxy_configuration.py +316 -0
  9. apify/_utils.py +0 -497
  10. apify/apify_storage_client/__init__.py +3 -0
  11. apify/apify_storage_client/_apify_storage_client.py +56 -0
  12. apify/apify_storage_client/_dataset_client.py +188 -0
  13. apify/apify_storage_client/_dataset_collection_client.py +50 -0
  14. apify/apify_storage_client/_key_value_store_client.py +98 -0
  15. apify/apify_storage_client/_key_value_store_collection_client.py +50 -0
  16. apify/apify_storage_client/_request_queue_client.py +208 -0
  17. apify/apify_storage_client/_request_queue_collection_client.py +50 -0
  18. apify/apify_storage_client/py.typed +0 -0
  19. apify/log.py +24 -105
  20. apify/scrapy/__init__.py +11 -3
  21. apify/scrapy/middlewares/__init__.py +3 -1
  22. apify/scrapy/middlewares/apify_proxy.py +21 -21
  23. apify/scrapy/middlewares/py.typed +0 -0
  24. apify/scrapy/pipelines/__init__.py +3 -1
  25. apify/scrapy/pipelines/actor_dataset_push.py +1 -1
  26. apify/scrapy/pipelines/py.typed +0 -0
  27. apify/scrapy/py.typed +0 -0
  28. apify/scrapy/requests.py +55 -54
  29. apify/scrapy/scheduler.py +19 -13
  30. apify/scrapy/utils.py +2 -31
  31. apify/storages/__init__.py +2 -10
  32. apify/storages/py.typed +0 -0
  33. apify-2.0.0.dist-info/METADATA +209 -0
  34. apify-2.0.0.dist-info/RECORD +37 -0
  35. {apify-1.7.3b3.dist-info → apify-2.0.0.dist-info}/WHEEL +1 -2
  36. apify/_memory_storage/__init__.py +0 -3
  37. apify/_memory_storage/file_storage_utils.py +0 -71
  38. apify/_memory_storage/memory_storage_client.py +0 -219
  39. apify/_memory_storage/resource_clients/__init__.py +0 -19
  40. apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
  41. apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
  42. apify/_memory_storage/resource_clients/dataset.py +0 -452
  43. apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
  44. apify/_memory_storage/resource_clients/key_value_store.py +0 -533
  45. apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
  46. apify/_memory_storage/resource_clients/request_queue.py +0 -466
  47. apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
  48. apify/actor.py +0 -1357
  49. apify/config.py +0 -130
  50. apify/consts.py +0 -67
  51. apify/event_manager.py +0 -236
  52. apify/proxy_configuration.py +0 -365
  53. apify/storages/base_storage.py +0 -181
  54. apify/storages/dataset.py +0 -494
  55. apify/storages/key_value_store.py +0 -257
  56. apify/storages/request_queue.py +0 -602
  57. apify/storages/storage_client_manager.py +0 -72
  58. apify-1.7.3b3.dist-info/METADATA +0 -150
  59. apify-1.7.3b3.dist-info/RECORD +0 -41
  60. apify-1.7.3b3.dist-info/top_level.txt +0 -1
  61. {apify-1.7.3b3.dist-info → apify-2.0.0.dist-info}/LICENSE +0 -0
apify/log.py CHANGED
@@ -1,16 +1,12 @@
 from __future__ import annotations

-import json
 import logging
-import textwrap
-import traceback
-from typing import Any
+from typing import TYPE_CHECKING

-from apify_shared.utils import ignore_docs
-from colorama import Fore, Style, just_fix_windows_console
-
-just_fix_windows_console()
+from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level

+if TYPE_CHECKING:
+    from apify import Configuration

 # Name of the logger used throughout the library (resolves to 'apify')
 logger_name = __name__.split('.')[0]
@@ -18,107 +14,30 @@ logger_name = __name__.split('.')[0]
 # Logger used throughout the library
 logger = logging.getLogger(logger_name)

-_LOG_NAME_COLOR = Fore.LIGHTBLACK_EX
-
-_LOG_LEVEL_COLOR = {
-    logging.DEBUG: Fore.BLUE,
-    logging.INFO: Fore.GREEN,
-    logging.WARNING: Fore.YELLOW,
-    logging.ERROR: Fore.RED,
-    logging.CRITICAL: Fore.RED,
-}
-
-_LOG_LEVEL_SHORT_ALIAS = {
-    logging.DEBUG: 'DEBUG',
-    logging.INFO: 'INFO ',
-    logging.WARNING: 'WARN ',
-    logging.ERROR: 'ERROR',
-}
-
-# So that all the log messages have the same alignment
-_LOG_MESSAGE_INDENT = ' ' * 6
-
-
-class ActorLogFormatter(logging.Formatter):
-    """Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.
-
-    It formats the log records so that they:
-    - start with the level (colorized, and padded to 5 chars so that it is nicely aligned)
-    - then have the actual log message, if it's multiline then it's nicely indented
-    - then have the stringified extra log fields
-    - then, if an exception is a part of the log record, prints the formatted exception.
-    """
-
-    # The fields that are added to the log record with `logger.log(..., extra={...})`
-    # are just merged in the log record with the other log record properties, and you can't get them in some nice, isolated way.
-    # So, to get the extra fields, we just compare all the properties present in the log record
-    # with properties present in an empty log record,
-    # and extract all the extra ones not present in the empty log record
-    empty_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)
-
-    def __init__(
-        self: ActorLogFormatter,
-        include_logger_name: bool = False,  # noqa: FBT001, FBT002
-        *args: Any,
-        **kwargs: Any,
-    ) -> None:
-        """Create an instance of the ActorLogFormatter.
-
-        Args:
-            include_logger_name: Include logger name at the beginning of the log line. Defaults to False.
-            args: Arguments passed to the parent class.
-            kwargs: Keyword arguments passed to the parent class.
-        """
-        super().__init__(*args, **kwargs)
-        self.include_logger_name = include_logger_name
-
-    def _get_extra_fields(self: ActorLogFormatter, record: logging.LogRecord) -> dict[str, Any]:
-        extra_fields: dict[str, Any] = {}
-        for key, value in record.__dict__.items():
-            if key not in self.empty_record.__dict__:
-                extra_fields[key] = value  # noqa: PERF403
-
-        return extra_fields
-
-    @ignore_docs
-    def format(self: ActorLogFormatter, record: logging.LogRecord) -> str:
-        """Format the log record nicely.

-        This formats the log record so that it:
-        - starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)
-        - then has the actual log message, if it's multiline then it's nicely indented
-        - then has the stringified extra log fields
-        - then, if an exception is a part of the log record, prints the formatted exception.
-        """
-        logger_name_string = f'{_LOG_NAME_COLOR}[{record.name}]{Style.RESET_ALL} '
+class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 Inherited from parent class
+    pass

-        # Colorize the log level, and shorten it to 6 chars tops
-        level_color_code = _LOG_LEVEL_COLOR.get(record.levelno, '')
-        level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname)
-        level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} '

-        # Format the exception, if there is some
-        # Basically just print the traceback and indent it a bit
-        exception_string = ''
-        if record.exc_info:
-            exc_info = record.exc_info
-            record.exc_info = None
-            exception_string = ''.join(traceback.format_exception(*exc_info)).rstrip()
-            exception_string = '\n' + textwrap.indent(exception_string, _LOG_MESSAGE_INDENT)
+def _configure_logging(configuration: Configuration) -> None:
+    apify_client_logger = logging.getLogger('apify_client')
+    configure_logger(apify_client_logger, configuration, remove_old_handlers=True)

-        # Format the extra log record fields, if there were some
-        # Just stringify them to JSON and color them gray
-        extra_string = ''
-        extra = self._get_extra_fields(record)
-        if extra:
-            extra_string = f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}'
+    level = get_configured_log_level(configuration)

-        # Format the actual log message, and indent everything but the first line
-        log_string = super().format(record)
-        log_string = textwrap.indent(log_string, _LOG_MESSAGE_INDENT).lstrip()
+    # Keep apify_client logger quiet unless debug logging is requested
+    if level > logging.DEBUG:
+        apify_client_logger.setLevel(logging.INFO)
+    else:
+        apify_client_logger.setLevel(level)

-        if self.include_logger_name:
-            # Include logger name at the beginning of the log line
-            return f'{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}'
+    # Silence HTTPX logger unless debug logging is requested
+    httpx_logger = logging.getLogger('httpx')
+    if level > logging.DEBUG:
+        httpx_logger.setLevel(logging.WARNING)
+    else:
+        httpx_logger.setLevel(level)

-        return f'{level_string}{log_string}{extra_string}{exception_string}'
+    # Use configured log level for apify logger
+    apify_logger = logging.getLogger('apify')
+    configure_logger(apify_logger, configuration, remove_old_handlers=True)
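Note: the colorama-based formatter is gone; ActorLogFormatter is now a thin alias for Crawlee's formatter, and attaching it is the standard logging-handler pattern. A minimal sketch of wiring it up manually (handler choice and log level here are illustrative, not taken from this release):

    import logging
    from apify.log import ActorLogFormatter

    # Attach the SDK formatter to a plain stream handler.
    handler = logging.StreamHandler()
    handler.setFormatter(ActorLogFormatter())

    apify_logger = logging.getLogger('apify')
    apify_logger.setLevel(logging.DEBUG)  # illustrative level
    apify_logger.addHandler(handler)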
apify/scrapy/__init__.py CHANGED
@@ -1,3 +1,11 @@
-from .requests import to_apify_request, to_scrapy_request
-from .scheduler import ApifyScheduler
-from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client
+from apify.scrapy.requests import to_apify_request, to_scrapy_request
+from apify.scrapy.scheduler import ApifyScheduler
+from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id
+
+__all__ = [
+    'to_apify_request',
+    'to_scrapy_request',
+    'ApifyScheduler',
+    'get_basic_auth_header',
+    'get_running_event_loop_id',
+]
apify/scrapy/middlewares/__init__.py CHANGED
@@ -1 +1,3 @@
-from .apify_proxy import ApifyHttpProxyMiddleware
+from apify.scrapy.middlewares.apify_proxy import ApifyHttpProxyMiddleware
+
+__all__ = ['ApifyHttpProxyMiddleware']
apify/scrapy/middlewares/apify_proxy.py CHANGED
@@ -12,8 +12,7 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from apify.actor import Actor
-from apify.proxy_configuration import ProxyConfiguration
+from apify import Actor, ProxyConfiguration
 from apify.scrapy.utils import get_basic_auth_header


@@ -43,23 +42,25 @@ class ApifyHttpProxyMiddleware:
         """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.

         Args:
-            cls: Class type.
             crawler: Scrapy Crawler object.

-        Returns:
-            ApifyHttpProxyMiddleware: Instance of the class.
+        Returns: Instance of the class.
         """
         proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')

         if proxy_settings is None:
-            Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.')
+            Actor.log.warning(
+                'ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing '
+                ' in the Actor input.'
+            )
             raise NotConfigured

         use_apify_proxy = proxy_settings.get('useApifyProxy', False)

         if use_apify_proxy is not True:
             Actor.log.warning(
-                'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.'
+                'ApifyHttpProxyMiddleware is not going to be used. Actor input field '
+                '"proxyConfiguration.useApifyProxy" is probably set to False.'
             )
             raise NotConfigured

@@ -74,9 +75,6 @@ class ApifyHttpProxyMiddleware:

         Raises:
             ValueError: If username and password are not provided in the proxy URL.
-
-        Returns:
-            None: The request is processed and middleware pipeline can continue.
         """
         Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
         url = await self._get_new_proxy_url()
@@ -95,7 +93,7 @@
         request: Request,
         exception: Exception,
         spider: Spider,
-    ) -> None | Request:
+    ) -> None:
         """Process an exception that occurs during request processing.

         Args:
@@ -104,18 +102,19 @@ class ApifyHttpProxyMiddleware:
             spider: Scrapy Spider object.

         Returns:
-            If a TunnelError occurs, return the request object to halt its processing in the middleware pipeline.
-            Return None otherwise to allow the continuation of request processing.
+            Returning None, meaning Scrapy will continue processing this exception, executing any other
+            process_exception() methods of installed middleware, until no middleware is left and the default
+            exception handling kicks in.
         """
         Actor.log.debug(
             f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
         )

         if isinstance(exception, TunnelError):
-            Actor.log.warning(f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", reason="{exception}", skipping...')
-            return request
-
-        return None
+            Actor.log.warning(
+                f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", '
+                'reason="{exception}", skipping...'
+            )

     async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
         """Get a new proxy URL.
@@ -123,8 +122,7 @@ class ApifyHttpProxyMiddleware:
         Raises:
             NotConfigured: If creation of the proxy configuration fails.

-        Returns:
-            ParseResult: New proxy URL.
+        Returns: New proxy URL.
         """
         # Get proxy configuration, creating it if necessary
         proxy_cfg = (
@@ -136,7 +134,9 @@
         # If the proxy configuration is still not available, raise an error. However, this should not happen due
         # to the checks in the `from_crawler` method.
         if proxy_cfg is None:
-            Actor.log.error('Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.')
+            Actor.log.error(
+                'Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.'
+            )
             raise NotConfigured

         # Store the proxy configuration for future use
@@ -144,4 +144,4 @@

         # Get a new proxy URL and return it
         new_url = await proxy_cfg.new_url()
-        return urlparse(new_url)
+        return urlparse(str(new_url))
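Note: the middleware still reads its configuration from the APIFY_PROXY_SETTINGS Scrapy setting (see from_crawler above). A sketch of enabling it in a project's settings.py; the middleware priority value is illustrative, not prescribed by this diff:

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,  # priority is illustrative
    }

    # Read by ApifyHttpProxyMiddleware.from_crawler(); 'useApifyProxy' must be True,
    # otherwise the middleware raises NotConfigured and stays disabled.
    APIFY_PROXY_SETTINGS = {'useApifyProxy': True}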
apify/scrapy/middlewares/py.typed ADDED
File without changes
apify/scrapy/pipelines/__init__.py CHANGED
@@ -1 +1,3 @@
-from .actor_dataset_push import ActorDatasetPushPipeline
+from apify.scrapy.pipelines.actor_dataset_push import ActorDatasetPushPipeline
+
+__all__ = ['ActorDatasetPushPipeline']
apify/scrapy/pipelines/actor_dataset_push.py CHANGED
@@ -9,7 +9,7 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from apify.actor import Actor
+from apify import Actor


 class ActorDatasetPushPipeline:
apify/scrapy/pipelines/py.typed ADDED
File without changes
apify/scrapy/py.typed ADDED
File without changes
apify/scrapy/requests.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations

 import codecs
 import pickle
+from typing import Any, cast

 try:
     from scrapy import Request, Spider
@@ -12,9 +13,11 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from apify._crypto import crypto_random_object_id
-from apify._utils import compute_unique_key
-from apify.actor import Actor
+from crawlee import Request as CrawleeRequest
+from crawlee._utils.crypto import crypto_random_object_id
+from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
+
+from apify import Actor


 def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
@@ -25,7 +28,7 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
     return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))


-def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
+def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None:
     """Convert a Scrapy request to an Apify request.

     Args:
@@ -35,7 +38,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:

     Returns:
         The converted Apify request if the conversion was successful, otherwise None.
-    if not isinstance(scrapy_request, Request):
+    if not isinstance(cast(Any, scrapy_request), Request):
         Actor.log.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.')
         return None

@@ -43,39 +46,41 @@
     Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

     try:
-        apify_request = {
-            'url': scrapy_request.url,
-            'method': scrapy_request.method,
-            'payload': scrapy_request.body,
-            'userData': scrapy_request.meta.get('userData', {}),
-        }
-
-        # Convert Scrapy's headers to a dictionary and store them in the apify_request
-        if isinstance(scrapy_request.headers, Headers):
-            apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict())
-        else:
-            Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}')
-
-        # If the request was produced by the middleware (e.g. retry or redirect), we must compute the unique key here
         if _is_request_produced_by_middleware(scrapy_request):
-            apify_request['uniqueKey'] = compute_unique_key(
+            unique_key = compute_unique_key(
                 url=scrapy_request.url,
                 method=scrapy_request.method,
                 payload=scrapy_request.body,
                 use_extended_unique_key=True,
             )
-        # Othwerwise, we can use the unique key (also the id) from the meta
+        elif scrapy_request.dont_filter:
+            unique_key = crypto_random_object_id(8)
+        elif scrapy_request.meta.get('apify_request_unique_key'):
+            unique_key = scrapy_request.meta['apify_request_unique_key']
         else:
-            if scrapy_request.meta.get('apify_request_id'):
-                apify_request['id'] = scrapy_request.meta['apify_request_id']
+            unique_key = crypto_random_object_id(8)

-            if scrapy_request.meta.get('apify_request_unique_key'):
-                apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
+        if scrapy_request.meta.get('apify_request_id'):
+            request_id = scrapy_request.meta['apify_request_id']
+        else:
+            request_id = unique_key_to_request_id(unique_key)
+
+        apify_request = CrawleeRequest(
+            url=scrapy_request.url,
+            method=scrapy_request.method,
+            payload=scrapy_request.body,
+            user_data=scrapy_request.meta.get('userData', {}),
+            unique_key=unique_key,
+            id=request_id,
+        )

-        # If the request's dont_filter field is set, we must generate a random `uniqueKey` to avoid deduplication
-        # of the request in the Request Queue.
-        if scrapy_request.dont_filter:
-            apify_request['uniqueKey'] = crypto_random_object_id(8)
+        # Convert Scrapy's headers to a dictionary and store them in the apify_request
+        if isinstance(scrapy_request.headers, Headers):
+            apify_request.headers = dict(scrapy_request.headers.to_unicode_dict())
+        else:
+            Actor.log.warning(
+                f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
+            )

         # Serialize the Scrapy Request and store it in the apify_request.
         # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
@@ -83,7 +88,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
         # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
         scrapy_request_dict = scrapy_request.to_dict(spider=spider)
         scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
-        apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded
+        apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded

     except Exception as exc:
         Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
@@ -93,7 +98,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
     return apify_request


-def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
+def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
     """Convert an Apify request to a Scrapy request.

     Args:
@@ -101,32 +106,26 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
         spider: The Scrapy spider that the request is associated with.

     Raises:
-        TypeError: If the apify_request is not a dictionary.
+        TypeError: If the apify_request is not a crawlee request.
         ValueError: If the apify_request does not contain the required keys.

     Returns:
         The converted Scrapy request.
     """
-    if not isinstance(apify_request, dict):
-        raise TypeError('apify_request must be a dictionary')
-
-    required_keys = ['url', 'method', 'id', 'uniqueKey']
-    missing_keys = [key for key in required_keys if key not in apify_request]
-
-    if missing_keys:
-        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')
+    if not isinstance(cast(Any, apify_request), CrawleeRequest):
+        raise TypeError('apify_request must be a crawlee.Request instance')

     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

     # If the apify_request comes from the Scrapy
-    if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']:
+    if 'scrapy_request' in apify_request.user_data:
         # Deserialize the Scrapy Request from the apify_request.
         # - This process involves decoding the base64-encoded request data and reconstructing
         #   the Scrapy Request object from its dictionary representation.
         Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

-        scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
+        scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
         if not isinstance(scrapy_request_dict_encoded, str):
             raise TypeError('scrapy_request_dict_encoded must be a string')

@@ -142,34 +141,36 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:

         # Update the meta field with the meta field from the apify_request
         meta = scrapy_request.meta or {}
-        meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']})
-        scrapy_request._meta = meta  # scrapy_request.meta is a property, so we have to set it like this
+        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
+        # scrapy_request.meta is a property, so we have to set it like this
+        scrapy_request._meta = meta  # noqa: SLF001

     # If the apify_request comes directly from the Request Queue, typically start URLs
     else:
         Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

         scrapy_request = Request(
-            url=apify_request['url'],
-            method=apify_request['method'],
+            url=apify_request.url,
+            method=apify_request.method,
             meta={
-                'apify_request_id': apify_request['id'],
-                'apify_request_unique_key': apify_request['uniqueKey'],
+                'apify_request_id': apify_request.id,
+                'apify_request_unique_key': apify_request.unique_key,
             },
         )

         # Add optional 'headers' field
-        if 'headers' in apify_request:
-            if isinstance(apify_request['headers'], dict):
-                scrapy_request.headers = Headers(apify_request['headers'])
+        if apify_request.headers:
+            if isinstance(cast(Any, apify_request.headers), dict):
+                scrapy_request.headers = Headers(apify_request.headers)
             else:
                 Actor.log.warning(
-                    f'apify_request[headers] is not an instance of the dict class, apify_request[headers] = {apify_request["headers"]}',
+                    'apify_request[headers] is not an instance of the dict class, '
+                    f'apify_request[headers] = {apify_request.headers}',
                 )

         # Add optional 'userData' field
-        if 'userData' in apify_request:
-            scrapy_request.meta['userData'] = apify_request['userData']
+        if apify_request.user_data:
+            scrapy_request.meta['userData'] = apify_request.user_data

     Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
     return scrapy_request
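Note: to_apify_request now returns a crawlee.Request instead of a plain dict, and to_scrapy_request accepts one. A rough round-trip sketch under the 2.0.0 signatures shown above; the spider class is a stand-in, not part of this release:

    from scrapy import Request, Spider
    from apify.scrapy import to_apify_request, to_scrapy_request

    class DummySpider(Spider):  # stand-in spider, only the name matters here
        name = 'dummy'

    spider = DummySpider()
    scrapy_request = Request(url='https://example.com')

    apify_request = to_apify_request(scrapy_request, spider=spider)  # crawlee.Request | None
    if apify_request is not None:
        restored = to_scrapy_request(apify_request, spider=spider)   # back to a scrapy.Request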
apify/scrapy/scheduler.py CHANGED
@@ -2,6 +2,9 @@ from __future__ import annotations

 import traceback

+from apify._configuration import Configuration
+from apify.apify_storage_client import ApifyStorageClient
+
 try:
     from scrapy import Spider
     from scrapy.core.scheduler import BaseScheduler
@@ -12,10 +15,11 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from apify._crypto import crypto_random_object_id
-from apify.actor import Actor
+from crawlee._utils.crypto import crypto_random_object_id
+
+from apify import Actor
 from apify.scrapy.requests import to_apify_request, to_scrapy_request
-from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client
+from apify.scrapy.utils import nested_event_loop
 from apify.storages import RequestQueue


@@ -44,8 +48,12 @@
         """
         self.spider = spider

+        async def open_queue() -> RequestQueue:
+            custom_loop_apify_client = ApifyStorageClient(configuration=Configuration.get_global_configuration())
+            return await RequestQueue.open(storage_client=custom_loop_apify_client)
+
         try:
-            self._rq = nested_event_loop.run_until_complete(open_queue_with_custom_client())
+            self._rq = nested_event_loop.run_until_complete(open_queue())
         except BaseException:
             traceback.print_exc()
             raise
@@ -95,18 +103,13 @@
             raise TypeError('self._rq must be an instance of the RequestQueue class')

         try:
-            result = nested_event_loop.run_until_complete(
-                self._rq.add_request(
-                    apify_request,
-                    use_extended_unique_key=True,
-                )
-            )
+            result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
         except BaseException:
             traceback.print_exc()
             raise

         Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
-        return bool(result['wasAlreadyPresent'])
+        return bool(result.was_already_present)

     def next_request(self: ApifyScheduler) -> Request | None:
         """Fetch the next request from the scheduler.
@@ -127,7 +130,9 @@
             traceback.print_exc()
             raise

-        Actor.log.debug(f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})')
+        Actor.log.debug(
+            f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})'
+        )

         if apify_request is None:
             return None
@@ -145,6 +150,7 @@

         scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
         Actor.log.debug(
-            f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',
+            f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned '
+            f'(scrapy_request={scrapy_request})',
         )
         return scrapy_request
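Note: the scheduler now opens its RequestQueue through ApifyStorageClient instead of the removed open_queue_with_custom_client helper. Plugging it into a Scrapy project is unchanged; a sketch of the relevant setting (the wiring, not this diff, is the assumption here):

    # settings.py (sketch)
    SCHEDULER = 'apify.scrapy.scheduler.ApifyScheduler'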
apify/scrapy/utils.py CHANGED
@@ -10,11 +10,10 @@ try:
     from scrapy.utils.python import to_bytes
 except ImportError as exc:
     raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
+        'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
+        '"pip install apify[scrapy]".'
     ) from exc

-from apify.actor import Actor
-from apify.storages import RequestQueue, StorageClientManager

 nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()

@@ -71,31 +70,3 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
     settings['APIFY_PROXY_SETTINGS'] = proxy_config

     return settings
-
-
-async def open_queue_with_custom_client() -> RequestQueue:
-    """Open a Request Queue with custom Apify Client.
-
-    TODO: add support for custom client to Actor.open_request_queue(), so that
-    we don't have to do this hacky workaround
-    """
-    # Create a new Apify Client with its httpx client in the custom event loop
-    custom_loop_apify_client = Actor.new_client()
-
-    # Set the new Apify Client as the default client, back up the old client
-    old_client = Actor.apify_client
-    StorageClientManager.set_cloud_client(custom_loop_apify_client)
-
-    # Create a new Request Queue in the custom event loop,
-    # replace its Apify client with the custom loop's Apify client
-    rq = await Actor.open_request_queue()
-
-    if Actor.config.is_at_home:
-        rq._request_queue_client = custom_loop_apify_client.request_queue(
-            rq._id,
-            client_key=rq._client_key,
-        )
-
-    # Restore the old Apify Client as the default client
-    StorageClientManager.set_cloud_client(old_client)
-    return rq
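Note: open_queue_with_custom_client is gone from this module; its replacement lives inline in ApifyScheduler.open (see scheduler.py above). apply_apify_settings keeps the keyword-only signature shown in the hunk header; a sketch of calling it:

    from scrapy.settings import Settings
    from apify.scrapy.utils import apply_apify_settings

    # Both arguments are keyword-only; proxy_config ends up in the APIFY_PROXY_SETTINGS setting.
    settings = apply_apify_settings(settings=Settings(), proxy_config={'useApifyProxy': True})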
apify/storages/__init__.py CHANGED
@@ -1,11 +1,3 @@
-from .dataset import Dataset
-from .key_value_store import KeyValueStore
-from .request_queue import RequestQueue
-from .storage_client_manager import StorageClientManager
+from crawlee.storages import Dataset, KeyValueStore, RequestQueue

-__all__ = [
-    'Dataset',
-    'KeyValueStore',
-    'RequestQueue',
-    'StorageClientManager',
-]
+__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue']
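Note: apify.storages now simply re-exports the Crawlee storage classes, so existing imports keep working while StorageClientManager is gone. A minimal usage sketch assuming the Crawlee storage API (async open/push_data), which is not shown in this diff:

    import asyncio

    from apify.storages import Dataset

    async def main() -> None:
        dataset = await Dataset.open()  # crawlee.storages.Dataset under the hood
        await dataset.push_data({'hello': 'world'})

    asyncio.run(main())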
apify/storages/py.typed ADDED
File without changes