apify 1.4.2b3-py3-none-any.whl → 1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of apify might be problematic.
- apify/scrapy/__init__.py +1 -2
- apify/scrapy/middlewares/__init__.py +2 -0
- apify/scrapy/middlewares/apify_proxy.py +145 -0
- apify/scrapy/{middlewares.py → middlewares/apify_retry.py} +3 -3
- apify/scrapy/utils.py +11 -0
- {apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/METADATA +1 -1
- {apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/RECORD +10 -8
- {apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/LICENSE +0 -0
- {apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/WHEEL +0 -0
- {apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/top_level.txt +0 -0
apify/scrapy/__init__.py
CHANGED
@@ -1,4 +1,3 @@
-from .middlewares import ApifyRetryMiddleware
 from .pipelines import ActorDatasetPushPipeline
 from .scheduler import ApifyScheduler
-from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
+from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
apify/scrapy/middlewares/apify_proxy.py
ADDED
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from urllib.parse import ParseResult, urlparse
+
+from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.exceptions import NotConfigured
+
+from ...actor import Actor
+from ...proxy_configuration import ProxyConfiguration
+from ..utils import get_basic_auth_header
+
+if TYPE_CHECKING:
+    from scrapy import Request, Spider
+    from scrapy.crawler import Crawler
+
+
+class ApifyHttpProxyMiddleware:
+    """Apify HTTP proxy middleware for Scrapy.
+
+    This middleware enhances request processing by adding a 'proxy' field to the request's meta and an authentication
+    header. It draws inspiration from the `HttpProxyMiddleware` included by default in Scrapy projects. The proxy URL
+    is sourced from the settings under the `APIFY_PROXY_SETTINGS` key. The value of this key, a dictionary, should be
+    provided by the Actor input. An example of the proxy settings:
+
+        proxy_settings = {'useApifyProxy': true, 'apifyProxyGroups': []}
+    """
+
+    def __init__(self: ApifyHttpProxyMiddleware, proxy_settings: dict) -> None:
+        """Create a new instance.
+
+        Args:
+            proxy_settings: Dictionary containing proxy settings, provided by the Actor input.
+            auth_encoding: Encoding for basic authentication (default is 'latin-1').
+        """
+        self._proxy_settings = proxy_settings
+        self._proxy_cfg_internal: ProxyConfiguration | None = None
+
+    @classmethod
+    def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> ApifyHttpProxyMiddleware:
+        """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.
+
+        Args:
+            cls: Class type.
+            crawler: Scrapy Crawler object.
+
+        Returns:
+            ApifyHttpProxyMiddleware: Instance of the class.
+        """
+        proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')
+
+        if proxy_settings is None:
+            Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.')
+            raise NotConfigured
+
+        use_apify_proxy = proxy_settings.get('useApifyProxy', False)
+
+        if use_apify_proxy is not True:
+            Actor.log.warning(
+                'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.'
+            )
+            raise NotConfigured
+
+        return cls(proxy_settings)
+
+    async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spider: Spider) -> None:
+        """Process a Scrapy request by assigning a new proxy.
+
+        Args:
+            request: Scrapy Request object.
+            spider: Scrapy Spider object.
+
+        Raises:
+            ValueError: If username and password are not provided in the proxy URL.
+
+        Returns:
+            None: The request is processed and middleware pipeline can continue.
+        """
+        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
+        url = await self._get_new_proxy_url()
+
+        if not (url.username and url.password):
+            raise ValueError('Username and password must be provided in the proxy URL.')
+
+        request.meta['proxy'] = url.geturl()
+        basic_auth_header = get_basic_auth_header(url.username, url.password)
+        request.headers[b'Proxy-Authorization'] = basic_auth_header
+
+        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}')
+
+    def process_exception(
+        self: ApifyHttpProxyMiddleware,
+        request: Request,
+        exception: Exception,
+        spider: Spider,
+    ) -> None | Request:
+        """Process an exception that occurs during request processing.
+
+        Args:
+            request: Scrapy Request object.
+            exception: Exception object.
+            spider: Scrapy Spider object.
+
+        Returns:
+            If a TunnelError occurs, return the request object to halt its processing in the middleware pipeline.
+            Return None otherwise to allow the continuation of request processing.
+        """
+        Actor.log.debug(
+            f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
+        )
+
+        if isinstance(exception, TunnelError):
+            Actor.log.warning(f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", reason="{exception}", skipping...')
+            return request
+
+        return None
+
+    async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
+        """Get a new proxy URL.
+
+        Raises:
+            NotConfigured: If creation of the proxy configuration fails.
+
+        Returns:
+            ParseResult: New proxy URL.
+        """
+        # Get proxy configuration, creating it if necessary
+        proxy_cfg = (
+            self._proxy_cfg_internal
+            if isinstance(self._proxy_cfg_internal, ProxyConfiguration)
+            else await Actor.create_proxy_configuration(actor_proxy_input=self._proxy_settings)
+        )
+
+        # If the proxy configuration is still not available, raise an error. However, this should not happen due
+        # to the checks in the `from_crawler` method.
+        if proxy_cfg is None:
+            Actor.log.error('Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.')
+            raise NotConfigured
+
+        # Store the proxy configuration for future use
+        self._proxy_cfg_internal = proxy_cfg
+
+        # Get a new proxy URL and return it
+        new_url = await proxy_cfg.new_url()
+        return urlparse(new_url)
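For orientation, a minimal sketch of how this middleware might be enabled in a Scrapy project's settings.py when the project runs as an Apify Actor. The priority value 950 and the exact shape of the settings dictionary are illustrative assumptions based on the docstring above, not values taken from this diff:

    # settings.py -- illustrative sketch, not part of this release's diff.
    # ApifyHttpProxyMiddleware.from_crawler() reads APIFY_PROXY_SETTINGS (see above)
    # and raises NotConfigured when the proxy is absent or disabled, so the
    # middleware deactivates itself instead of breaking the crawl.
    DOWNLOADER_MIDDLEWARES = {
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,  # assumed priority
    }

    # Mirrors the "proxyConfiguration" object from the Actor input.
    APIFY_PROXY_SETTINGS = {'useApifyProxy': True, 'apifyProxyGroups': []}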
apify/scrapy/{middlewares.py → middlewares/apify_retry.py}
RENAMED
@@ -11,14 +11,14 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc
 
-from ..actor import Actor
-from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
+from ...actor import Actor
+from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
 
 if TYPE_CHECKING:
     from scrapy import Spider
     from scrapy.http import Request, Response
 
-    from ..storages import RequestQueue
+    from ...storages import RequestQueue
 
 
 class ApifyRetryMiddleware(RetryMiddleware):
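The relative imports gain one level (`..` → `...`) because the module moved one directory deeper, from apify/scrapy/middlewares.py into the new apify/scrapy/middlewares/ package. Assuming the new middlewares/__init__.py (+2 lines, contents not shown in this diff) re-exports both middleware classes, the old import path keeps working; a hypothetical sketch:

    # Hypothetical: relies on re-exports in apify/scrapy/middlewares/__init__.py,
    # whose two added lines are not shown in this diff.
    from apify.scrapy.middlewares import ApifyHttpProxyMiddleware, ApifyRetryMiddleware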
apify/scrapy/utils.py
CHANGED
@@ -3,6 +3,10 @@ from __future__ import annotations
 import asyncio
 import codecs
 import pickle
+from base64 import b64encode
+from urllib.parse import unquote
+
+from scrapy.utils.python import to_bytes
 
 try:
     from scrapy import Request, Spider
@@ -19,6 +23,13 @@ from ..storages import RequestQueue, StorageClientManager
 nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
 
 
+def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes:
+    """Generate a basic authentication header for the given username and password."""
+    string = f'{unquote(username)}:{unquote(password)}'
+    user_pass = to_bytes(string, encoding=auth_encoding)
+    return b'Basic ' + b64encode(user_pass)
+
+
 def get_running_event_loop_id() -> int:
     """Get the ID of the currently running event loop.
 
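A quick illustration of what the new helper returns; the credentials are made up. Note the unquote() calls: percent-encoded credentials lifted from a proxy URL are decoded before being base64-encoded:

    from apify.scrapy.utils import get_basic_auth_header

    # base64('user:pass') == 'dXNlcjpwYXNz'
    assert get_basic_auth_header('user', 'pass') == b'Basic dXNlcjpwYXNz'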
{apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/RECORD
RENAMED
@@ -20,19 +20,21 @@ apify/_memory_storage/resource_clients/key_value_store.py,sha256=Fbw2dDsO8WveQBM
 apify/_memory_storage/resource_clients/key_value_store_collection.py,sha256=H_Uc2inqk_q4wn5rFxSD9G2swCWuPt8jJFsR7bCFe7s,1678
 apify/_memory_storage/resource_clients/request_queue.py,sha256=7LS_jrBBJvylFZedZHrgwMPyCsLz8X9-mAvvhOaYzXI,19614
 apify/_memory_storage/resource_clients/request_queue_collection.py,sha256=2WKOoE7FrAJmgyXbxCmowDRdkauYhdA3yGTJw5LeoAg,1651
-apify/scrapy/__init__.py,sha256=…
-apify/scrapy/middlewares.py,sha256=iQltZDis-2cKJCDE87v4pPiMKlW3r9eurPbKqer1GW8,4556
+apify/scrapy/__init__.py,sha256=z8uIRyjdp03tm2mmgRXff3LasUwuInnvkFt_L2Q-cGw,222
 apify/scrapy/pipelines.py,sha256=beBC1JwPU-51vEHL4k3GC1iljY74e5XAor8YBE8lE-I,955
 apify/scrapy/scheduler.py,sha256=A6lKB7Bp5F2uoFy6l_gDb-CNHrVfznNORRnLWhFWOBY,4899
-apify/scrapy/utils.py,sha256=…
+apify/scrapy/utils.py,sha256=DcspnpCY7LVPGpFPrZzzn9nRB4p5gWgK-UOb1ChCwMA,6777
+apify/scrapy/middlewares/__init__.py,sha256=zzosV8BD8SZQIrVKsSaGFGV9rHinNLKm5GPL3ZNxSZQ,96
+apify/scrapy/middlewares/apify_proxy.py,sha256=SAbEtzYB-vGHpS1FaLth0afWeHxLsd4-qqdNLJI5fA4,5740
+apify/scrapy/middlewares/apify_retry.py,sha256=RrUMrXgk9FTydBG99VbD7m1nDtWccMsO_Kf-rNivunI,4559
 apify/storages/__init__.py,sha256=rBdwhyZxUMG6m_7uAb4sl5eg_dxiLvYVas5aRcZ6PIE,268
 apify/storages/base_storage.py,sha256=gV2izLpzM20kW__RgXo6Z_qf7dRPqiI_LLUup1ul3GU,7422
 apify/storages/dataset.py,sha256=fbWyr7IZwM19UruUFrQLl8VEdgmF-FeVHmAHI01r4sQ,23305
 apify/storages/key_value_store.py,sha256=vnCo-BgmVB4x1i_8A3cIypnCSItxHI_mZji_Xv8_fyk,10744
 apify/storages/request_queue.py,sha256=wUpPLXyDAwoVwhEMeeCyVE_QzbQZSRNYZ9-YoIL0GMs,26929
 apify/storages/storage_client_manager.py,sha256=QAGbu47pwFkHa-AFfolNW3W5hvR7zNz2yxK9Sv0wQbA,2457
-apify-1.4.2b3.dist-info/LICENSE,sha256=…
-apify-1.4.2b3.dist-info/METADATA,sha256=…
-apify-1.4.2b3.dist-info/WHEEL,sha256=…
-apify-1.4.2b3.dist-info/top_level.txt,sha256=…
-apify-1.4.2b3.dist-info/RECORD,,
+apify-1.5.0.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+apify-1.5.0.dist-info/METADATA,sha256=HP-3DvhaO_IRYFjnqyGzlIkseaXqGY_yIxU86Du57O0,6233
+apify-1.5.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+apify-1.5.0.dist-info/top_level.txt,sha256=2oFNsHggn5m_rCaaP7xijQg_-Va2ByOSYuvKgACsS5w,6
+apify-1.5.0.dist-info/RECORD,,
{apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/LICENSE
File without changes
{apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/WHEEL
File without changes
{apify-1.4.2b3.dist-info → apify-1.5.0.dist-info}/top_level.txt
File without changes