apify 1.7.3b4__py3-none-any.whl → 2.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- apify/__init__.py +17 -4
- apify/_actor.py +963 -0
- apify/_configuration.py +310 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +29 -27
- apify/_models.py +110 -0
- apify/_platform_event_manager.py +222 -0
- apify/_proxy_configuration.py +316 -0
- apify/_utils.py +0 -497
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +56 -0
- apify/apify_storage_client/_dataset_client.py +188 -0
- apify/apify_storage_client/_dataset_collection_client.py +50 -0
- apify/apify_storage_client/_key_value_store_client.py +98 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +50 -0
- apify/apify_storage_client/_request_queue_client.py +196 -0
- apify/apify_storage_client/_request_queue_collection_client.py +50 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +3 -112
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +21 -21
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +1 -1
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +55 -54
- apify/scrapy/scheduler.py +19 -13
- apify/scrapy/utils.py +2 -31
- apify/storages/__init__.py +2 -10
- apify/storages/py.typed +0 -0
- {apify-1.7.3b4.dist-info → apify-2.0.0a1.dist-info}/METADATA +24 -46
- apify-2.0.0a1.dist-info/RECORD +37 -0
- {apify-1.7.3b4.dist-info → apify-2.0.0a1.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1357
- apify/config.py +0 -130
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.3b4.dist-info/RECORD +0 -41
- apify-1.7.3b4.dist-info/top_level.txt +0 -1
- {apify-1.7.3b4.dist-info → apify-2.0.0a1.dist-info}/LICENSE +0 -0
apify/log.py
CHANGED
@@ -1,16 +1,8 @@
 from __future__ import annotations
 
-import json
 import logging
-import textwrap
-import traceback
-from typing import Any
-
-from apify_shared.utils import ignore_docs
-from colorama import Fore, Style, just_fix_windows_console
-
-just_fix_windows_console()
 
+from crawlee._log_config import CrawleeLogFormatter
 
 # Name of the logger used throughout the library (resolves to 'apify')
 logger_name = __name__.split('.')[0]
@@ -18,107 +10,6 @@ logger_name = __name__.split('.')[0]
 # Logger used throughout the library
 logger = logging.getLogger(logger_name)
 
-_LOG_NAME_COLOR = Fore.LIGHTBLACK_EX
-
-_LOG_LEVEL_COLOR = {
-    logging.DEBUG: Fore.BLUE,
-    logging.INFO: Fore.GREEN,
-    logging.WARNING: Fore.YELLOW,
-    logging.ERROR: Fore.RED,
-    logging.CRITICAL: Fore.RED,
-}
-
-_LOG_LEVEL_SHORT_ALIAS = {
-    logging.DEBUG: 'DEBUG',
-    logging.INFO: 'INFO ',
-    logging.WARNING: 'WARN ',
-    logging.ERROR: 'ERROR',
-}
-
-# So that all the log messages have the same alignment
-_LOG_MESSAGE_INDENT = ' ' * 6
-
-
-class ActorLogFormatter(logging.Formatter):
-    """Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.
-
-    It formats the log records so that they:
-    - start with the level (colorized, and padded to 5 chars so that it is nicely aligned)
-    - then have the actual log message, if it's multiline then it's nicely indented
-    - then have the stringified extra log fields
-    - then, if an exception is a part of the log record, prints the formatted exception.
-    """
-
-    # The fields that are added to the log record with `logger.log(..., extra={...})`
-    # are just merged in the log record with the other log record properties, and you can't get them in some nice, isolated way.
-    # So, to get the extra fields, we just compare all the properties present in the log record
-    # with properties present in an empty log record,
-    # and extract all the extra ones not present in the empty log record
-    empty_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)
-
-    def __init__(
-        self: ActorLogFormatter,
-        include_logger_name: bool = False,  # noqa: FBT001, FBT002
-        *args: Any,
-        **kwargs: Any,
-    ) -> None:
-        """Create an instance of the ActorLogFormatter.
-
-        Args:
-            include_logger_name: Include logger name at the beginning of the log line. Defaults to False.
-            args: Arguments passed to the parent class.
-            kwargs: Keyword arguments passed to the parent class.
-        """
-        super().__init__(*args, **kwargs)
-        self.include_logger_name = include_logger_name
-
-    def _get_extra_fields(self: ActorLogFormatter, record: logging.LogRecord) -> dict[str, Any]:
-        extra_fields: dict[str, Any] = {}
-        for key, value in record.__dict__.items():
-            if key not in self.empty_record.__dict__:
-                extra_fields[key] = value  # noqa: PERF403
-
-        return extra_fields
-
-    @ignore_docs
-    def format(self: ActorLogFormatter, record: logging.LogRecord) -> str:
-        """Format the log record nicely.
-
-        This formats the log record so that it:
-        - starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)
-        - then has the actual log message, if it's multiline then it's nicely indented
-        - then has the stringified extra log fields
-        - then, if an exception is a part of the log record, prints the formatted exception.
-        """
-        logger_name_string = f'{_LOG_NAME_COLOR}[{record.name}]{Style.RESET_ALL} '
-
-        # Colorize the log level, and shorten it to 6 chars tops
-        level_color_code = _LOG_LEVEL_COLOR.get(record.levelno, '')
-        level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname)
-        level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} '
-
-        # Format the exception, if there is some
-        # Basically just print the traceback and indent it a bit
-        exception_string = ''
-        if record.exc_info:
-            exc_info = record.exc_info
-            record.exc_info = None
-            exception_string = ''.join(traceback.format_exception(*exc_info)).rstrip()
-            exception_string = '\n' + textwrap.indent(exception_string, _LOG_MESSAGE_INDENT)
-
-        # Format the extra log record fields, if there were some
-        # Just stringify them to JSON and color them gray
-        extra_string = ''
-        extra = self._get_extra_fields(record)
-        if extra:
-            extra_string = f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}'
-
-        # Format the actual log message, and indent everything but the first line
-        log_string = super().format(record)
-        log_string = textwrap.indent(log_string, _LOG_MESSAGE_INDENT).lstrip()
-
-        if self.include_logger_name:
-            # Include logger name at the beginning of the log line
-            return f'{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}'
 
-        return f'{level_string}{log_string}{extra_string}{exception_string}'
+class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 Inherited from parent class
+    pass
apify/scrapy/__init__.py
CHANGED
@@ -1,3 +1,11 @@
-from .requests import to_apify_request, to_scrapy_request
-from .scheduler import ApifyScheduler
-from .utils import get_basic_auth_header, get_running_event_loop_id
+from apify.scrapy.requests import to_apify_request, to_scrapy_request
+from apify.scrapy.scheduler import ApifyScheduler
+from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id
+
+__all__ = [
+    'to_apify_request',
+    'to_scrapy_request',
+    'ApifyScheduler',
+    'get_basic_auth_header',
+    'get_running_event_loop_id',
+]
apify/scrapy/middlewares/apify_proxy.py
CHANGED
@@ -12,8 +12,7 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc
 
-from apify.actor import Actor
-from apify.proxy_configuration import ProxyConfiguration
+from apify import Actor, ProxyConfiguration
 from apify.scrapy.utils import get_basic_auth_header
 
 
@@ -43,23 +42,25 @@ class ApifyHttpProxyMiddleware:
         """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.
 
         Args:
-            cls: Class type.
             crawler: Scrapy Crawler object.
 
-        Returns:
-            ApifyHttpProxyMiddleware: Instance of the class.
+        Returns: Instance of the class.
         """
         proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')
 
         if proxy_settings is None:
-            Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.')
+            Actor.log.warning(
+                'ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing '
+                ' in the Actor input.'
+            )
             raise NotConfigured
 
         use_apify_proxy = proxy_settings.get('useApifyProxy', False)
 
         if use_apify_proxy is not True:
             Actor.log.warning(
-                'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.'
+                'ApifyHttpProxyMiddleware is not going to be used. Actor input field '
+                '"proxyConfiguration.useApifyProxy" is probably set to False.'
             )
             raise NotConfigured
 
@@ -74,9 +75,6 @@ class ApifyHttpProxyMiddleware:
 
         Raises:
             ValueError: If username and password are not provided in the proxy URL.
-
-        Returns:
-            None: The request is processed and middleware pipeline can continue.
         """
         Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
         url = await self._get_new_proxy_url()
@@ -95,7 +93,7 @@ class ApifyHttpProxyMiddleware:
         request: Request,
         exception: Exception,
         spider: Spider,
-    ) -> None
+    ) -> None:
         """Process an exception that occurs during request processing.
 
         Args:
@@ -104,18 +102,19 @@ class ApifyHttpProxyMiddleware:
             spider: Scrapy Spider object.
 
         Returns:
-
-
+            Returning None, meaning Scrapy will continue processing this exception, executing any other
+            process_exception() methods of installed middleware, until no middleware is left and the default
+            exception handling kicks in.
         """
         Actor.log.debug(
             f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
         )
 
         if isinstance(exception, TunnelError):
-            Actor.log.warning(
-
-
-
+            Actor.log.warning(
+                f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", '
+                'reason="{exception}", skipping...'
+            )
 
     async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
         """Get a new proxy URL.
@@ -123,8 +122,7 @@ class ApifyHttpProxyMiddleware:
         Raises:
             NotConfigured: If creation of the proxy configuration fails.
 
-        Returns:
-            ParseResult: New proxy URL.
+        Returns: New proxy URL.
         """
         # Get proxy configuration, creating it if necessary
         proxy_cfg = (
@@ -136,7 +134,9 @@ class ApifyHttpProxyMiddleware:
         # If the proxy configuration is still not available, raise an error. However, this should not happen due
         # to the checks in the `from_crawler` method.
         if proxy_cfg is None:
-            Actor.log.error('Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.')
+            Actor.log.error(
+                'Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.'
+            )
             raise NotConfigured
 
         # Store the proxy configuration for future use
@@ -144,4 +144,4 @@ class ApifyHttpProxyMiddleware:
 
         # Get a new proxy URL and return it
         new_url = await proxy_cfg.new_url()
-        return urlparse(new_url)
+        return urlparse(str(new_url))
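The middleware keeps reading its configuration from the APIFY_PROXY_SETTINGS Scrapy setting, which mirrors the "proxyConfiguration" object from the Actor input. A hedged sketch of wiring it up by hand (the middleware priority is an assumption; the apply_apify_settings() helper shown later in this diff stores proxy_config under APIFY_PROXY_SETTINGS in the same way):

from scrapy.settings import Settings

settings = Settings()
settings['DOWNLOADER_MIDDLEWARES'] = {
    'apify.scrapy.middlewares.apify_proxy.ApifyHttpProxyMiddleware': 950,  # assumed priority
}
# Mirrors the "proxyConfiguration" Actor input; useApifyProxy=False makes the middleware raise NotConfigured.
settings['APIFY_PROXY_SETTINGS'] = {'useApifyProxy': True}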
apify/scrapy/middlewares/py.typed
ADDED
File without changes

apify/scrapy/pipelines/py.typed
ADDED
File without changes

apify/scrapy/py.typed
ADDED
File without changes
apify/scrapy/requests.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import codecs
 import pickle
+from typing import Any, cast
 
 try:
     from scrapy import Request, Spider
@@ -12,9 +13,11 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc
 
-from
-from
-from
+from crawlee import Request as CrawleeRequest
+from crawlee._utils.crypto import crypto_random_object_id
+from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
+
+from apify import Actor
 
 
 def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
@@ -25,7 +28,7 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
     return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
 
 
-def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
+def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None:
     """Convert a Scrapy request to an Apify request.
 
     Args:
@@ -35,7 +38,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
     Returns:
         The converted Apify request if the conversion was successful, otherwise None.
     """
-    if not isinstance(scrapy_request, Request):
+    if not isinstance(cast(Any, scrapy_request), Request):
         Actor.log.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.')
         return None
 
@@ -43,39 +46,41 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
     Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
 
     try:
-        apify_request = {
-            'url': scrapy_request.url,
-            'method': scrapy_request.method,
-            'payload': scrapy_request.body,
-            'userData': scrapy_request.meta.get('userData', {}),
-        }
-
-        # Convert Scrapy's headers to a dictionary and store them in the apify_request
-        if isinstance(scrapy_request.headers, Headers):
-            apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict())
-        else:
-            Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}')
-
-        # If the request was produced by the middleware (e.g. retry or redirect), we must compute the unique key here
         if _is_request_produced_by_middleware(scrapy_request):
-
+            unique_key = compute_unique_key(
                 url=scrapy_request.url,
                 method=scrapy_request.method,
                 payload=scrapy_request.body,
                 use_extended_unique_key=True,
             )
-
+        elif scrapy_request.dont_filter:
+            unique_key = crypto_random_object_id(8)
+        elif scrapy_request.meta.get('apify_request_unique_key'):
+            unique_key = scrapy_request.meta['apify_request_unique_key']
         else:
-
-            apify_request['id'] = scrapy_request.meta['apify_request_id']
+            unique_key = crypto_random_object_id(8)
 
-
-
+        if scrapy_request.meta.get('apify_request_id'):
+            request_id = scrapy_request.meta['apify_request_id']
+        else:
+            request_id = unique_key_to_request_id(unique_key)
+
+        apify_request = CrawleeRequest(
+            url=scrapy_request.url,
+            method=scrapy_request.method,
+            payload=scrapy_request.body,
+            user_data=scrapy_request.meta.get('userData', {}),
+            unique_key=unique_key,
+            id=request_id,
+        )
 
-        #
-
-
-
+        # Convert Scrapy's headers to a dictionary and store them in the apify_request
+        if isinstance(scrapy_request.headers, Headers):
+            apify_request.headers = dict(scrapy_request.headers.to_unicode_dict())
+        else:
+            Actor.log.warning(
+                f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
+            )
 
         # Serialize the Scrapy Request and store it in the apify_request.
         # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
@@ -83,7 +88,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
         # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
         scrapy_request_dict = scrapy_request.to_dict(spider=spider)
         scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
-        apify_request['
+        apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded
 
     except Exception as exc:
         Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
@@ -93,7 +98,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
     return apify_request
 
 
-def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
+def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
     """Convert an Apify request to a Scrapy request.
 
     Args:
@@ -101,32 +106,26 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
         spider: The Scrapy spider that the request is associated with.
 
     Raises:
-        TypeError: If the apify_request is not a
+        TypeError: If the apify_request is not a crawlee request.
         ValueError: If the apify_request does not contain the required keys.
 
     Returns:
         The converted Scrapy request.
     """
-    if not isinstance(apify_request, dict):
-        raise TypeError('apify_request must be a dictionary')
-
-    required_keys = ['url', 'method', 'id', 'uniqueKey']
-    missing_keys = [key for key in required_keys if key not in apify_request]
-
-    if missing_keys:
-        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')
+    if not isinstance(cast(Any, apify_request), CrawleeRequest):
+        raise TypeError('apify_request must be a crawlee.Request instance')
 
     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
 
     # If the apify_request comes from the Scrapy
-    if '
+    if 'scrapy_request' in apify_request.user_data:
         # Deserialize the Scrapy Request from the apify_request.
         # - This process involves decoding the base64-encoded request data and reconstructing
         #   the Scrapy Request object from its dictionary representation.
         Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
 
-        scrapy_request_dict_encoded = apify_request['
+        scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
         if not isinstance(scrapy_request_dict_encoded, str):
             raise TypeError('scrapy_request_dict_encoded must be a string')
 
@@ -142,34 +141,36 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
 
         # Update the meta field with the meta field from the apify_request
         meta = scrapy_request.meta or {}
-        meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']})
-
+        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
+        # scrapy_request.meta is a property, so we have to set it like this
+        scrapy_request._meta = meta  # noqa: SLF001
 
    # If the apify_request comes directly from the Request Queue, typically start URLs
     else:
         Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')
 
         scrapy_request = Request(
-            url=apify_request['url'],
-            method=apify_request['method'],
+            url=apify_request.url,
+            method=apify_request.method,
             meta={
-                'apify_request_id': apify_request['id'],
-                'apify_request_unique_key': apify_request['uniqueKey'],
+                'apify_request_id': apify_request.id,
+                'apify_request_unique_key': apify_request.unique_key,
             },
         )
 
     # Add optional 'headers' field
-    if
-        if isinstance(apify_request['headers'], dict):
-            scrapy_request.headers = Headers(apify_request['headers'])
+    if apify_request.headers:
+        if isinstance(cast(Any, apify_request.headers), dict):
+            scrapy_request.headers = Headers(apify_request.headers)
         else:
             Actor.log.warning(
-
+                'apify_request[headers] is not an instance of the dict class, '
+                f'apify_request[headers] = {apify_request.headers}',
             )
 
     # Add optional 'userData' field
-    if
-        scrapy_request.meta['userData'] = apify_request['userData']
+    if apify_request.user_data:
+        scrapy_request.meta['userData'] = apify_request.user_data
 
     Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
     return scrapy_request
apify/scrapy/scheduler.py
CHANGED
@@ -2,6 +2,9 @@ from __future__ import annotations
 
 import traceback
 
+from apify._configuration import Configuration
+from apify.apify_storage_client import ApifyStorageClient
+
 try:
     from scrapy import Spider
     from scrapy.core.scheduler import BaseScheduler
@@ -12,10 +15,11 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc
 
-from
-
+from crawlee._utils.crypto import crypto_random_object_id
+
+from apify import Actor
 from apify.scrapy.requests import to_apify_request, to_scrapy_request
-from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client
+from apify.scrapy.utils import nested_event_loop
 from apify.storages import RequestQueue
 
 
@@ -44,8 +48,12 @@ class ApifyScheduler(BaseScheduler):
         """
         self.spider = spider
 
+        async def open_queue() -> RequestQueue:
+            custom_loop_apify_client = ApifyStorageClient(configuration=Configuration.get_global_configuration())
+            return await RequestQueue.open(storage_client=custom_loop_apify_client)
+
         try:
-            self._rq = nested_event_loop.run_until_complete(open_queue_with_custom_client())
+            self._rq = nested_event_loop.run_until_complete(open_queue())
         except BaseException:
             traceback.print_exc()
             raise
@@ -95,18 +103,13 @@ class ApifyScheduler(BaseScheduler):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
         try:
-            result = nested_event_loop.run_until_complete(
-                self._rq.add_request(
-                    apify_request,
-                    use_extended_unique_key=True,
-                )
-            )
+            result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
         except BaseException:
             traceback.print_exc()
             raise
 
         Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
-        return bool(result
+        return bool(result.was_already_present)
 
     def next_request(self: ApifyScheduler) -> Request | None:
         """Fetch the next request from the scheduler.
@@ -127,7 +130,9 @@ class ApifyScheduler(BaseScheduler):
             traceback.print_exc()
             raise
 
-        Actor.log.debug(f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})')
+        Actor.log.debug(
+            f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})'
+        )
 
         if apify_request is None:
             return None
@@ -145,6 +150,7 @@ class ApifyScheduler(BaseScheduler):
 
         scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
         Actor.log.debug(
-            f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',
+            f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned '
+            f'(scrapy_request={scrapy_request})',
         )
         return scrapy_request
apify/scrapy/utils.py
CHANGED
@@ -10,11 +10,10 @@ try:
     from scrapy.utils.python import to_bytes
 except ImportError as exc:
     raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
+        'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
+        '"pip install apify[scrapy]".'
     ) from exc
 
-from apify.actor import Actor
-from apify.storages import RequestQueue, StorageClientManager
 
 nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
 
@@ -71,31 +70,3 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
     settings['APIFY_PROXY_SETTINGS'] = proxy_config
 
     return settings
-
-
-async def open_queue_with_custom_client() -> RequestQueue:
-    """Open a Request Queue with custom Apify Client.
-
-    TODO: add support for custom client to Actor.open_request_queue(), so that
-    we don't have to do this hacky workaround
-    """
-    # Create a new Apify Client with its httpx client in the custom event loop
-    custom_loop_apify_client = Actor.new_client()
-
-    # Set the new Apify Client as the default client, back up the old client
-    old_client = Actor.apify_client
-    StorageClientManager.set_cloud_client(custom_loop_apify_client)
-
-    # Create a new Request Queue in the custom event loop,
-    # replace its Apify client with the custom loop's Apify client
-    rq = await Actor.open_request_queue()
-
-    if Actor.config.is_at_home:
-        rq._request_queue_client = custom_loop_apify_client.request_queue(
-            rq._id,
-            client_key=rq._client_key,
-        )
-
-    # Restore the old Apify Client as the default client
-    StorageClientManager.set_cloud_client(old_client)
-    return rq
apify/storages/__init__.py
CHANGED
@@ -1,11 +1,3 @@
-from .dataset import Dataset
-from .key_value_store import KeyValueStore
-from .request_queue import RequestQueue
-from .storage_client_manager import StorageClientManager
+from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
-__all__ = [
-    'Dataset',
-    'KeyValueStore',
-    'RequestQueue',
-    'StorageClientManager',
-]
+__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue']

apify/storages/py.typed
ADDED
File without changes
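As the apify/storages/__init__.py diff above shows, the SDK no longer ships its own storage implementations; Dataset, KeyValueStore and RequestQueue are plain re-exports from crawlee.storages (and StorageClientManager is gone). A quick sketch of what that means for imports:

from apify.storages import Dataset
from crawlee.storages import Dataset as CrawleeDataset

# The re-export points at the very same class object.
assert Dataset is CrawleeDataset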