apify 1.7.1b1__py3-none-any.whl → 2.2.0b14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of apify might be problematic.
- apify/__init__.py +19 -4
- apify/_actor.py +1030 -0
- apify/_configuration.py +370 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +31 -27
- apify/_models.py +117 -0
- apify/_platform_event_manager.py +231 -0
- apify/_proxy_configuration.py +320 -0
- apify/_utils.py +18 -484
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +68 -0
- apify/apify_storage_client/_dataset_client.py +190 -0
- apify/apify_storage_client/_dataset_collection_client.py +51 -0
- apify/apify_storage_client/_key_value_store_client.py +94 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
- apify/apify_storage_client/_request_queue_client.py +176 -0
- apify/apify_storage_client/_request_queue_collection_client.py +51 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +22 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +29 -27
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +6 -3
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +60 -58
- apify/scrapy/scheduler.py +28 -19
- apify/scrapy/utils.py +10 -32
- apify/storages/__init__.py +4 -10
- apify/storages/_request_list.py +150 -0
- apify/storages/py.typed +0 -0
- apify-2.2.0b14.dist-info/METADATA +211 -0
- apify-2.2.0b14.dist-info/RECORD +38 -0
- {apify-1.7.1b1.dist-info → apify-2.2.0b14.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1351
- apify/config.py +0 -127
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.1b1.dist-info/METADATA +0 -149
- apify-1.7.1b1.dist-info/RECORD +0 -41
- apify-1.7.1b1.dist-info/top_level.txt +0 -1
- {apify-1.7.1b1.dist-info → apify-2.2.0b14.dist-info}/LICENSE +0 -0
apify/log.py
CHANGED
@@ -1,16 +1,9 @@
 from __future__ import annotations

-import json
 import logging
-import textwrap
-import traceback
-from typing import Any

 from apify_shared.utils import ignore_docs
-from colorama import Fore, Style, just_fix_windows_console
-
-just_fix_windows_console()
-
+from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level

 # Name of the logger used throughout the library (resolves to 'apify')
 logger_name = __name__.split('.')[0]

@@ -18,107 +11,31 @@ logger_name = __name__.split('.')[0]
 # Logger used throughout the library
 logger = logging.getLogger(logger_name)

-_LOG_NAME_COLOR = Fore.LIGHTBLACK_EX
-
-_LOG_LEVEL_COLOR = {
-    logging.DEBUG: Fore.BLUE,
-    logging.INFO: Fore.GREEN,
-    logging.WARNING: Fore.YELLOW,
-    logging.ERROR: Fore.RED,
-    logging.CRITICAL: Fore.RED,
-}
-
-_LOG_LEVEL_SHORT_ALIAS = {
-    logging.DEBUG: 'DEBUG',
-    logging.INFO: 'INFO ',
-    logging.WARNING: 'WARN ',
-    logging.ERROR: 'ERROR',
-}
-
-# So that all the log messages have the same alignment
-_LOG_MESSAGE_INDENT = ' ' * 6
-
-
-class ActorLogFormatter(logging.Formatter):
-    """Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.
-
-    It formats the log records so that they:
-    - start with the level (colorized, and padded to 5 chars so that it is nicely aligned)
-    - then have the actual log message, if it's multiline then it's nicely indented
-    - then have the stringified extra log fields
-    - then, if an exception is a part of the log record, prints the formatted exception.
-    """
-
-    # The fields that are added to the log record with `logger.log(..., extra={...})`
-    # are just merged in the log record with the other log record properties, and you can't get them in some nice, isolated way.
-    # So, to get the extra fields, we just compare all the properties present in the log record
-    # with properties present in an empty log record,
-    # and extract all the extra ones not present in the empty log record
-    empty_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)
-
-    def __init__(
-        self: ActorLogFormatter,
-        include_logger_name: bool = False,  # noqa: FBT001, FBT002
-        *args: Any,
-        **kwargs: Any,
-    ) -> None:
-        """Create an instance of the ActorLogFormatter.
-
-        Args:
-            include_logger_name: Include logger name at the beginning of the log line. Defaults to False.
-            args: Arguments passed to the parent class.
-            kwargs: Keyword arguments passed to the parent class.
-        """
-        super().__init__(*args, **kwargs)
-        self.include_logger_name = include_logger_name
-
-    def _get_extra_fields(self: ActorLogFormatter, record: logging.LogRecord) -> dict[str, Any]:
-        extra_fields: dict[str, Any] = {}
-        for key, value in record.__dict__.items():
-            if key not in self.empty_record.__dict__:
-                extra_fields[key] = value
-
-        return extra_fields
-
-    @ignore_docs
-    def format(self: ActorLogFormatter, record: logging.LogRecord) -> str:
-        """Format the log record nicely.
-
-        - then has the stringified extra log fields
-        - then, if an exception is a part of the log record, prints the formatted exception.
-        """
-        logger_name_string = f'{_LOG_NAME_COLOR}[{record.name}]{Style.RESET_ALL} '
+@ignore_docs
+class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 (Inherited from parent class)
+    pass

-        # Colorize the log level, and shorten it to 6 chars tops
-        level_color_code = _LOG_LEVEL_COLOR.get(record.levelno, '')
-        level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname)
-        level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} '

-        if record.exc_info:
-            exc_info = record.exc_info
-            record.exc_info = None
-            exception_string = ''.join(traceback.format_exception(*exc_info)).rstrip()
-            exception_string = '\n' + textwrap.indent(exception_string, _LOG_MESSAGE_INDENT)
+def _configure_logging() -> None:
+    apify_client_logger = logging.getLogger('apify_client')
+    configure_logger(apify_client_logger, remove_old_handlers=True)

-        # Just stringify them to JSON and color them gray
-        extra_string = ''
-        extra = self._get_extra_fields(record)
-        if extra:
-            extra_string = f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}'
+    level = get_configured_log_level()

+    # Keep apify_client logger quiet unless debug logging is requested
+    if level > logging.DEBUG:
+        apify_client_logger.setLevel(logging.INFO)
+    else:
+        apify_client_logger.setLevel(level)

+    # Silence HTTPX logger unless debug logging is requested
+    httpx_logger = logging.getLogger('httpx')
+    if level > logging.DEBUG:
+        httpx_logger.setLevel(logging.WARNING)
+    else:
+        httpx_logger.setLevel(level)

+    # Use configured log level for apify logger
+    apify_logger = logging.getLogger('apify')
+    configure_logger(apify_logger, remove_old_handlers=True)
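Note: the formatter is now a thin alias over crawlee's CrawleeLogFormatter, and _configure_logging() wires up the 'apify', 'apify_client', and 'httpx' loggers. For reference, a minimal sketch (not part of the diff) of attaching the formatter to a handler by hand, e.g. outside a managed Actor run where the SDK does not configure logging for you:

    import logging

    from apify.log import ActorLogFormatter

    # Attach the Apify formatter to a plain stream handler.
    handler = logging.StreamHandler()
    handler.setFormatter(ActorLogFormatter())

    apify_logger = logging.getLogger('apify')
    apify_logger.setLevel(logging.DEBUG)
    apify_logger.addHandler(handler)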
apify/scrapy/__init__.py
CHANGED
@@ -1,3 +1,11 @@
-from .requests import to_apify_request, to_scrapy_request
-from .scheduler import ApifyScheduler
-from .utils import get_basic_auth_header, get_running_event_loop_id
+from apify.scrapy.requests import to_apify_request, to_scrapy_request
+from apify.scrapy.scheduler import ApifyScheduler
+from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id
+
+__all__ = [
+    'ApifyScheduler',
+    'get_basic_auth_header',
+    'get_running_event_loop_id',
+    'to_apify_request',
+    'to_scrapy_request',
+]
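With __all__ defined and the relative imports replaced by absolute ones, downstream projects import the Scrapy integration directly from the subpackage. A small illustrative snippet (the SCHEDULER value is the usual way to plug the scheduler into a Scrapy project and is an assumption, not part of this diff):

    # settings.py of a Scrapy project integrated with the Apify SDK (illustrative)
    SCHEDULER = 'apify.scrapy.scheduler.ApifyScheduler'

    # Elsewhere, the helpers are importable from the package root.
    from apify.scrapy import ApifyScheduler, to_apify_request, to_scrapy_request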
apify/scrapy/middlewares/apify_proxy.py
CHANGED

@@ -1,19 +1,20 @@
 from __future__ import annotations

+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse

 try:
+    if TYPE_CHECKING:
+        from scrapy import Request, Spider
+        from scrapy.crawler import Crawler
     from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from apify
-from apify.proxy_configuration import ProxyConfiguration
+from apify import Actor, ProxyConfiguration
 from apify.scrapy.utils import get_basic_auth_header

@@ -28,7 +29,7 @@ class ApifyHttpProxyMiddleware:
     proxy_settings = {'useApifyProxy': true, 'apifyProxyGroups': []}
     """

-    def __init__(self
+    def __init__(self, proxy_settings: dict) -> None:
         """Create a new instance.

         Args:

@@ -43,29 +44,31 @@ class ApifyHttpProxyMiddleware:
         """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.

         Args:
-            cls: Class type.
             crawler: Scrapy Crawler object.

-        Returns:
-            ApifyHttpProxyMiddleware: Instance of the class.
+        Returns: Instance of the class.
         """
         proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')

         if proxy_settings is None:
-            Actor.log.warning(
+            Actor.log.warning(
+                'ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing '
+                ' in the Actor input.'
+            )
             raise NotConfigured

         use_apify_proxy = proxy_settings.get('useApifyProxy', False)

         if use_apify_proxy is not True:
             Actor.log.warning(
-                'ApifyHttpProxyMiddleware is not going to be used. Actor input field
+                'ApifyHttpProxyMiddleware is not going to be used. Actor input field '
+                '"proxyConfiguration.useApifyProxy" is probably set to False.'
             )
             raise NotConfigured

         return cls(proxy_settings)

-    async def process_request(self
+    async def process_request(self, request: Request, spider: Spider) -> None:
         """Process a Scrapy request by assigning a new proxy.

         Args:

@@ -74,9 +77,6 @@ class ApifyHttpProxyMiddleware:
         Raises:
             ValueError: If username and password are not provided in the proxy URL.
-
-        Returns:
-            None: The request is processed and middleware pipeline can continue.
         """
         Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
         url = await self._get_new_proxy_url()

@@ -91,11 +91,11 @@ class ApifyHttpProxyMiddleware:
         Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}')

     def process_exception(
-        self
+        self,
         request: Request,
         exception: Exception,
         spider: Spider,
-    ) -> None
+    ) -> None:
         """Process an exception that occurs during request processing.

         Args:

@@ -104,27 +104,27 @@ class ApifyHttpProxyMiddleware:
         spider: Scrapy Spider object.

         Returns:
+            Returning None, meaning Scrapy will continue processing this exception, executing any other
+            process_exception() methods of installed middleware, until no middleware is left and the default
+            exception handling kicks in.
         """
         Actor.log.debug(
             f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
         )

         if isinstance(exception, TunnelError):
-            Actor.log.warning(
+            Actor.log.warning(
+                f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", '
+                'reason="{exception}", skipping...'
+            )

-    async def _get_new_proxy_url(self
+    async def _get_new_proxy_url(self) -> ParseResult:
         """Get a new proxy URL.

         Raises:
             NotConfigured: If creation of the proxy configuration fails.

-        Returns:
-            ParseResult: New proxy URL.
+        Returns: New proxy URL.
         """
         # Get proxy configuration, creating it if necessary
         proxy_cfg = (

@@ -136,7 +136,9 @@ class ApifyHttpProxyMiddleware:
         # If the proxy configuration is still not available, raise an error. However, this should not happen due
         # to the checks in the `from_crawler` method.
         if proxy_cfg is None:
-            Actor.log.error(
+            Actor.log.error(
+                'Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.'
+            )
             raise NotConfigured

         # Store the proxy configuration for future use

@@ -144,4 +146,4 @@ class ApifyHttpProxyMiddleware:
         # Get a new proxy URL and return it
         new_url = await proxy_cfg.new_url()
-        return urlparse(new_url)
+        return urlparse(str(new_url))
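The middleware reads its configuration from the APIFY_PROXY_SETTINGS Scrapy setting and refuses to activate (raises NotConfigured) unless useApifyProxy is set. A hedged sketch of enabling it in a project's settings.py — the dotted path assumes the class is re-exported from apify.scrapy.middlewares, and the priority value 950 is illustrative, not mandated by this diff:

    # settings.py (illustrative)
    DOWNLOADER_MIDDLEWARES = {
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
    }

    # Mirrors the proxyConfiguration object from the Actor input.
    APIFY_PROXY_SETTINGS = {'useApifyProxy': True, 'apifyProxyGroups': []}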
apify/scrapy/middlewares/py.typed
ADDED
File without changes
apify/scrapy/pipelines/actor_dataset_push.py
CHANGED

@@ -1,15 +1,18 @@
 from __future__ import annotations

+from typing import TYPE_CHECKING
+
 from itemadapter.adapter import ItemAdapter

 try:
+    if TYPE_CHECKING:
+        from scrapy import Item, Spider
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from apify
+from apify import Actor


 class ActorDatasetPushPipeline:

@@ -19,7 +22,7 @@ class ActorDatasetPushPipeline:
     """

     async def process_item(
-        self
+        self,
         item: Item,
         spider: Spider,
     ) -> Item:
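ActorDatasetPushPipeline pushes each scraped item to the Actor's default dataset via ItemAdapter. A sketch of enabling it — the dotted path assumes the class is re-exported from apify.scrapy.pipelines, and the priority 1000 is illustrative:

    # settings.py (illustrative)
    ITEM_PIPELINES = {
        'apify.scrapy.pipelines.ActorDatasetPushPipeline': 1000,
    }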
apify/scrapy/pipelines/py.typed
ADDED
File without changes
apify/scrapy/py.typed
ADDED
File without changes
apify/scrapy/requests.py
CHANGED
@@ -2,6 +2,9 @@ from __future__ import annotations
 import codecs
 import pickle
+from typing import Any, cast
+
+from apify_shared.utils import ignore_docs

 try:
     from scrapy import Request, Spider

@@ -12,9 +15,12 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from
-from
-from
+from crawlee import Request as CrawleeRequest
+from crawlee._types import HttpHeaders
+from crawlee._utils.crypto import crypto_random_object_id
+from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
+
+from apify import Actor


 def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:

@@ -25,7 +31,8 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
     return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))


+@ignore_docs
+def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None:
     """Convert a Scrapy request to an Apify request.

     Args:

@@ -36,46 +43,50 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
         The converted Apify request if the conversion was successful, otherwise None.
     """
     if not isinstance(scrapy_request, Request):
-        Actor.log.warning(
+        Actor.log.warning(  # type: ignore[unreachable]
+            'Failed to convert to Apify request: Scrapy request must be a Request instance.'
+        )
         return None

     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

     try:
-        apify_request = {
-            'url': scrapy_request.url,
-            'method': scrapy_request.method,
-            'payload': scrapy_request.body,
-            'userData': scrapy_request.meta.get('userData', {}),
-        }
-
-        # Convert Scrapy's headers to a dictionary and store them in the apify_request
-        if isinstance(scrapy_request.headers, Headers):
-            apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict())
-        else:
-            Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}')
-
-        # If the request was produced by the middleware (e.g. retry or redirect), we must compute the unique key here
         if _is_request_produced_by_middleware(scrapy_request):
+            unique_key = compute_unique_key(
                 url=scrapy_request.url,
-                method=scrapy_request.method,
+                method=scrapy_request.method,  # type: ignore[arg-type]  # str vs literal
                 payload=scrapy_request.body,
                 use_extended_unique_key=True,
             )
+        elif scrapy_request.dont_filter:
+            unique_key = crypto_random_object_id(8)
+        elif scrapy_request.meta.get('apify_request_unique_key'):
+            unique_key = scrapy_request.meta['apify_request_unique_key']
         else:
-            apify_request['id'] = scrapy_request.meta['apify_request_id']
+            unique_key = crypto_random_object_id(8)

+        if scrapy_request.meta.get('apify_request_id'):
+            request_id = scrapy_request.meta['apify_request_id']
+        else:
+            request_id = unique_key_to_request_id(unique_key)
+
+        apify_request = CrawleeRequest(
+            url=scrapy_request.url,
+            method=scrapy_request.method,
+            payload=scrapy_request.body,
+            user_data=scrapy_request.meta.get('userData', {}),
+            unique_key=unique_key,
+            id=request_id,
+        )

-        #
+        # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
+        if isinstance(scrapy_request.headers, Headers):
+            apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
+        else:
+            Actor.log.warning(  # type: ignore[unreachable]
+                f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
+            )

         # Serialize the Scrapy Request and store it in the apify_request.
         # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,

@@ -83,7 +94,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
         # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
         scrapy_request_dict = scrapy_request.to_dict(spider=spider)
         scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
-        apify_request['
+        apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded

     except Exception as exc:
         Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')

@@ -93,7 +104,8 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
     return apify_request


+@ignore_docs
+def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
     """Convert an Apify request to a Scrapy request.

     Args:

@@ -101,32 +113,26 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
         spider: The Scrapy spider that the request is associated with.

     Raises:
-        TypeError: If the apify_request is not a
+        TypeError: If the apify_request is not a crawlee request.
         ValueError: If the apify_request does not contain the required keys.

     Returns:
         The converted Scrapy request.
     """
-    if not isinstance(apify_request,
-        raise TypeError('apify_request must be a
-
-    required_keys = ['url', 'method', 'id', 'uniqueKey']
-    missing_keys = [key for key in required_keys if key not in apify_request]
-
-    if missing_keys:
-        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')
+    if not isinstance(cast(Any, apify_request), CrawleeRequest):
+        raise TypeError('apify_request must be a crawlee.Request instance')

     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

     # If the apify_request comes from the Scrapy
-    if '
+    if 'scrapy_request' in apify_request.user_data:
         # Deserialize the Scrapy Request from the apify_request.
         # - This process involves decoding the base64-encoded request data and reconstructing
         #   the Scrapy Request object from its dictionary representation.
         Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

-        scrapy_request_dict_encoded = apify_request['
+        scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
         if not isinstance(scrapy_request_dict_encoded, str):
             raise TypeError('scrapy_request_dict_encoded must be a string')

@@ -142,34 +148,30 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
         # Update the meta field with the meta field from the apify_request
         meta = scrapy_request.meta or {}
-        meta.update({'apify_request_id': apify_request
+        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
+        # scrapy_request.meta is a property, so we have to set it like this
+        scrapy_request._meta = meta  # noqa: SLF001

     # If the apify_request comes directly from the Request Queue, typically start URLs
     else:
         Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

         scrapy_request = Request(
-            url=apify_request
-            method=apify_request
+            url=apify_request.url,
+            method=apify_request.method,
             meta={
-                'apify_request_id': apify_request
-                'apify_request_unique_key': apify_request
+                'apify_request_id': apify_request.id,
+                'apify_request_unique_key': apify_request.unique_key,
             },
         )

     # Add optional 'headers' field
-    if
-        scrapy_request.headers = Headers(apify_request['headers'])
-    else:
-        Actor.log.warning(
-            f'apify_request[headers] is not an instance of the dict class, apify_request[headers] = {apify_request["headers"]}',
-        )
+    if apify_request.headers:
+        scrapy_request.headers |= Headers(apify_request.headers)

     # Add optional 'userData' field
-    if
-        scrapy_request.meta['userData'] = apify_request
+    if apify_request.user_data:
+        scrapy_request.meta['userData'] = apify_request.user_data

     Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
     return scrapy_request
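The converters now round-trip between scrapy.Request and crawlee.Request objects instead of plain dicts. A minimal round-trip sketch based on the signatures above; it assumes a Spider instance is available (e.g. inside a running crawl) and is not part of the diff:

    from scrapy import Request, Spider

    from apify.scrapy.requests import to_apify_request, to_scrapy_request

    def roundtrip(spider: Spider) -> None:
        scrapy_request = Request(url='https://example.com', method='GET')

        # scrapy.Request -> crawlee.Request; returns None if the conversion fails.
        apify_request = to_apify_request(scrapy_request, spider=spider)
        if apify_request is None:
            return

        # crawlee.Request -> scrapy.Request, restoring the pickled original from user_data.
        restored = to_scrapy_request(apify_request, spider=spider)
        assert restored.url == scrapy_request.url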