aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/spider/referer.py
CHANGED
|
@@ -1,6 +1,25 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
2
|
+
Referer Middleware for AioScrapy
|
|
3
|
+
AioScrapy的Referer中间件
|
|
4
|
+
|
|
5
|
+
This middleware populates the 'Referer' HTTP header in requests based on the response
|
|
6
|
+
that generated them. It implements various referrer policies as defined in the W3C
|
|
7
|
+
Referrer Policy specification, allowing control over what information is included
|
|
8
|
+
in the Referer header for privacy and security reasons.
|
|
9
|
+
此中间件根据生成请求的响应填充请求中的'Referer' HTTP头。它实现了W3C Referrer Policy
|
|
10
|
+
规范中定义的各种引用策略,允许出于隐私和安全原因控制Referer头中包含的信息。
|
|
11
|
+
|
|
12
|
+
The middleware supports all standard referrer policies:
|
|
13
|
+
中间件支持所有标准的引用策略:
|
|
14
|
+
- no-referrer
|
|
15
|
+
- no-referrer-when-downgrade
|
|
16
|
+
- same-origin
|
|
17
|
+
- origin
|
|
18
|
+
- strict-origin
|
|
19
|
+
- origin-when-cross-origin
|
|
20
|
+
- strict-origin-when-cross-origin
|
|
21
|
+
- unsafe-url
|
|
22
|
+
- aioscrapy-default (a variant of no-referrer-when-downgrade)
|
|
4
23
|
"""
|
|
5
24
|
import warnings
|
|
6
25
|
from typing import Tuple
|
|
@@ -30,24 +49,102 @@ POLICY_AIOSCRAPY_DEFAULT = "aioscrapy-default"
|
|
|
30
49
|
|
|
31
50
|
|
|
32
51
|
class ReferrerPolicy:
|
|
52
|
+
"""
|
|
53
|
+
Base class for implementing W3C Referrer Policy.
|
|
54
|
+
实现W3C引用策略的基类。
|
|
55
|
+
|
|
56
|
+
This abstract class defines the interface and common functionality for all
|
|
57
|
+
referrer policy implementations. Each subclass implements a specific policy
|
|
58
|
+
from the W3C Referrer Policy specification.
|
|
59
|
+
这个抽象类为所有引用策略实现定义了接口和通用功能。每个子类实现W3C引用策略
|
|
60
|
+
规范中的特定策略。
|
|
33
61
|
|
|
62
|
+
Reference: https://www.w3.org/TR/referrer-policy/
|
|
63
|
+
参考:https://www.w3.org/TR/referrer-policy/
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
# Schemes that should never send a referrer
|
|
67
|
+
# 永远不应该发送引用的方案
|
|
34
68
|
NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES
|
|
69
|
+
|
|
70
|
+
# Policy name (to be defined by subclasses)
|
|
71
|
+
# 策略名称(由子类定义)
|
|
35
72
|
name: str
|
|
36
73
|
|
|
37
74
|
def referrer(self, response_url, request_url):
    """
    Compute the referrer value this policy allows for a request.

    Abstract hook: the base class provides no behaviour of its own and
    always raises, so every concrete policy has to override this method.

    Args:
        response_url: The URL of the response that generated the request.
        request_url: The URL of the request being made.

    Returns:
        str or None: For overriding implementations, the referrer value to
            use, or None if no referrer should be sent.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()
|
|
39
96
|
|
|
40
97
|
def stripped_referrer(self, url):
    """
    Return the stripped form of ``url`` for use as a referrer.

    The URL's scheme is checked against ``NOREFERRER_SCHEMES`` first; only
    URLs whose scheme is not listed there are handed to ``strip_url``.

    Args:
        url: The URL to strip.

    Returns:
        str or None: Whatever ``strip_url(url)`` returns, or None when the
            URL's scheme is one of ``NOREFERRER_SCHEMES``.
    """
    scheme = urlparse(url).scheme
    if scheme in self.NOREFERRER_SCHEMES:
        # Schemes on the no-referrer list never produce a referrer.
        return None
    return self.strip_url(url)
|
|
43
118
|
|
|
44
119
|
def origin_referrer(self, url):
    """
    Return only the origin of ``url`` for use as a referrer.

    The URL's scheme is checked against ``NOREFERRER_SCHEMES`` first; only
    URLs whose scheme is not listed there are handed to ``origin``.

    Args:
        url: The URL to get the origin from.

    Returns:
        str or None: Whatever ``origin(url)`` returns, or None when the
            URL's scheme is one of ``NOREFERRER_SCHEMES``.
    """
    scheme = urlparse(url).scheme
    if scheme in self.NOREFERRER_SCHEMES:
        # Schemes on the no-referrer list never produce a referrer.
        return None
    return self.origin(url)
|
|
47
140
|
|
|
48
141
|
def strip_url(self, url, origin_only=False):
|
|
49
142
|
"""
|
|
50
|
-
|
|
143
|
+
Strip a URL according to the referrer policy specification.
|
|
144
|
+
根据引用策略规范剥离URL。
|
|
145
|
+
|
|
146
|
+
Reference: https://www.w3.org/TR/referrer-policy/#strip-url
|
|
147
|
+
参考:https://www.w3.org/TR/referrer-policy/#strip-url
|
|
51
148
|
|
|
52
149
|
If url is null, return no referrer.
|
|
53
150
|
If url's scheme is a local scheme, then return no referrer.
|
|
@@ -58,6 +155,16 @@ class ReferrerPolicy:
|
|
|
58
155
|
Set url's path to null.
|
|
59
156
|
Set url's query to null.
|
|
60
157
|
Return url.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
url: The URL to strip.
|
|
161
|
+
要剥离的URL。
|
|
162
|
+
origin_only: Whether to strip the URL to just its origin.
|
|
163
|
+
是否将URL剥离为仅其源。
|
|
164
|
+
|
|
165
|
+
Returns:
|
|
166
|
+
str or None: The stripped URL, or None if the URL is empty.
|
|
167
|
+
剥离后的URL,如果URL为空,则为None。
|
|
61
168
|
"""
|
|
62
169
|
if not url:
|
|
63
170
|
return None
|
|
@@ -68,10 +175,45 @@ class ReferrerPolicy:
|
|
|
68
175
|
origin_only=origin_only)
|
|
69
176
|
|
|
70
177
|
def origin(self, url):
    """
    Return the serialized origin (scheme, host, port) of a URL.

    Thin wrapper around ``strip_url`` that always requests origin-only
    stripping.

    Args:
        url: The URL to get the origin from.

    Returns:
        str or None: The value produced by
            ``strip_url(url, origin_only=True)``.
    """
    serialized_origin = self.strip_url(url, origin_only=True)
    return serialized_origin
|
|
73
195
|
|
|
74
196
|
def potentially_trustworthy(self, url):
|
|
197
|
+
"""
|
|
198
|
+
Determine if a URL is potentially trustworthy.
|
|
199
|
+
确定URL是否可能值得信任。
|
|
200
|
+
|
|
201
|
+
This is a simplified implementation that considers HTTPS and FTPS URLs
|
|
202
|
+
as potentially trustworthy, and data URLs as not trustworthy.
|
|
203
|
+
这是一个简化的实现,将HTTPS和FTPS URL视为可能值得信任,将数据URL视为不值得信任。
|
|
204
|
+
|
|
205
|
+
Note: this does not follow the full algorithm from:
|
|
206
|
+
注意:这不遵循以下完整算法:
|
|
207
|
+
https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
|
|
208
|
+
|
|
209
|
+
Args:
|
|
210
|
+
url: The URL to check.
|
|
211
|
+
要检查的URL。
|
|
212
|
+
|
|
213
|
+
Returns:
|
|
214
|
+
bool: True if the URL is potentially trustworthy, False otherwise.
|
|
215
|
+
如果URL可能值得信任,则为True,否则为False。
|
|
216
|
+
"""
|
|
75
217
|
# Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
|
|
76
218
|
parsed_url = urlparse(url)
|
|
77
219
|
if parsed_url.scheme in ('data',):
|
|
@@ -79,20 +221,57 @@ class ReferrerPolicy:
|
|
|
79
221
|
return self.tls_protected(url)
|
|
80
222
|
|
|
81
223
|
def tls_protected(self, url):
    """
    Tell whether a URL uses a TLS-protected scheme.

    Only the parsed scheme is inspected; 'https' and 'ftps' count as
    TLS-protected.

    Args:
        url: The URL to check.

    Returns:
        bool: True if the URL's scheme is 'https' or 'ftps', False otherwise.
    """
    scheme = urlparse(url).scheme
    return scheme == 'https' or scheme == 'ftps'
|
|
83
237
|
|
|
84
238
|
|
|
85
239
|
class NoReferrerPolicy(ReferrerPolicy):
    """
    Implementation of the "no-referrer" referrer policy.

    Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer

    The simplest policy is "no-referrer", which specifies that no referrer
    information is to be sent along with requests made from a particular
    request client to any origin. The header will be omitted entirely.
    """

    # Policy name under which this class is identified.
    name: str = POLICY_NO_REFERRER

    def referrer(self, response_url, request_url):
        """
        Return the referrer value under the no-referrer policy.

        Both arguments are ignored: this policy never sends a referrer.

        Args:
            response_url: The URL of the response that generated the request.
            request_url: The URL of the request being made.

        Returns:
            None: Always, meaning no Referer header should be sent.
        """
        return None
|
|
97
276
|
|
|
98
277
|
|
|
@@ -270,16 +449,42 @@ _policy_classes[''] = NoReferrerWhenDowngradePolicy
|
|
|
270
449
|
|
|
271
450
|
def _load_policy_class(policy, warning_only=False):
|
|
272
451
|
"""
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
452
|
+
Load a referrer policy class by name or path.
|
|
453
|
+
通过名称或路径加载引用策略类。
|
|
454
|
+
|
|
455
|
+
This function attempts to load a referrer policy class either by importing it
|
|
456
|
+
from a path or by looking it up in the standard policy classes dictionary.
|
|
457
|
+
此函数尝试通过从路径导入或在标准策略类字典中查找来加载引用策略类。
|
|
458
|
+
|
|
459
|
+
Args:
|
|
460
|
+
policy: A string representing either a path to a policy class or a standard
|
|
461
|
+
policy name from the W3C Referrer Policy specification.
|
|
462
|
+
表示策略类路径或W3C引用策略规范中的标准策略名称的字符串。
|
|
463
|
+
warning_only: If True, warnings will be issued instead of raising exceptions
|
|
464
|
+
when a policy cannot be loaded.
|
|
465
|
+
如果为True,当无法加载策略时将发出警告而不是引发异常。
|
|
466
|
+
|
|
467
|
+
Returns:
|
|
468
|
+
A referrer policy class, or None if the policy could not be loaded and
|
|
469
|
+
warning_only is True.
|
|
470
|
+
引用策略类,如果无法加载策略且warning_only为True,则为None。
|
|
471
|
+
|
|
472
|
+
Raises:
|
|
473
|
+
RuntimeError: If the policy could not be loaded and warning_only is False.
|
|
474
|
+
如果无法加载策略且warning_only为False,则引发RuntimeError。
|
|
276
475
|
"""
|
|
277
476
|
try:
|
|
477
|
+
# Try to load the policy as a Python object (e.g., 'mymodule.MyPolicy')
|
|
478
|
+
# 尝试将策略作为Python对象加载(例如,'mymodule.MyPolicy')
|
|
278
479
|
return load_object(policy)
|
|
279
480
|
except ValueError:
|
|
280
481
|
try:
|
|
482
|
+
# Try to load the policy as a standard policy name
|
|
483
|
+
# 尝试将策略作为标准策略名称加载
|
|
281
484
|
return _policy_classes[policy.lower()]
|
|
282
485
|
except KeyError:
|
|
486
|
+
# Policy could not be loaded
|
|
487
|
+
# 无法加载策略
|
|
283
488
|
msg = f"Could not load referrer policy {policy!r}"
|
|
284
489
|
if not warning_only:
|
|
285
490
|
raise RuntimeError(msg)
|
|
@@ -289,76 +494,246 @@ def _load_policy_class(policy, warning_only=False):
|
|
|
289
494
|
|
|
290
495
|
|
|
291
496
|
class RefererMiddleware:
    """
    Middleware that populates the 'Referer' HTTP header in requests.

    For each request produced from a response, a referrer policy is selected
    (request meta, then the response's Referrer-Policy header, then the
    configured default) and asked for the referrer value to send. Redirected
    requests are re-checked when they are scheduled.
    """

    def __init__(self, settings=None):
        """
        Initialize the middleware and pick the default policy class.

        Args:
            settings: The settings object. When given, the class named by its
                'REFERRER_POLICY' entry replaces ``DefaultReferrerPolicy`` as
                the default. If None, ``DefaultReferrerPolicy`` is kept.
        """
        self.default_policy = DefaultReferrerPolicy
        if settings is not None:
            self.default_policy = _load_policy_class(
                settings.get('REFERRER_POLICY'))

    @classmethod
    def from_crawler(cls, crawler):
        """
        Build the middleware from a crawler.

        Args:
            crawler: The crawler that will use this middleware.

        Returns:
            RefererMiddleware: A new instance configured from
                ``crawler.settings`` and connected to the
                ``request_scheduled`` signal.

        Raises:
            NotConfigured: If 'REFERER_ENABLED' is false in the crawler
                settings.
        """
        if not crawler.settings.getbool('REFERER_ENABLED'):
            raise NotConfigured
        mw = cls(crawler.settings)

        # Note: this hook is a bit of a hack to intercept redirections
        crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)

        return mw

    def policy(self, resp_or_url, request):
        """
        Determine the Referrer-Policy to use for a request.

        Precedence:

        - If a valid policy is set in Request meta ('referrer_policy'),
          it is used.
        - If the policy is set in meta but cannot be loaded (e.g. a typo),
          the default policy is used.
        - If the policy is not set in Request meta, but the parent response
          carries a 'Referrer-Policy' header, that header is used if valid.
        - Otherwise, the default policy is used.

        Args:
            resp_or_url: The parent Response object or a URL string. Response
                headers are only consulted when this is a Response.
            request: The Request object being processed.

        Returns:
            An instance of the selected referrer policy class.
        """
        policy_name = request.meta.get('referrer_policy')
        if policy_name is None:
            if isinstance(resp_or_url, Response):
                policy_header = resp_or_url.headers.get('Referrer-Policy')
                if policy_header is not None:
                    # Header values may arrive as bytes or as str.
                    policy_name = to_unicode(policy_header.decode('latin1') if isinstance(policy_header, bytes) else policy_header)
        if policy_name is None:
            return self.default_policy()

        # warning_only=True: an unknown policy name must not abort the crawl.
        cls = _load_policy_class(policy_name, warning_only=True)
        return cls() if cls else self.default_policy()

    async def process_spider_output(self, response, result, spider):
        """
        Set the 'Referer' header on requests found in the spider output.

        Args:
            response: The response being processed.
            result: The asynchronous iterable of items produced for
                ``response``. A falsy value (e.g. None) is treated as an
                empty result.
            spider: The spider that generated the result (unused here).

        Returns:
            An async generator yielding every item of ``result``; Request
            items have had their 'Referer' header filled in when the policy
            provides a value and the header was not already set.
        """
        def _set_referer(r):
            """Fill in the 'Referer' header of ``r`` if it is a Request; return ``r``."""
            if isinstance(r, Request):
                referrer = self.policy(response, r).referrer(response.url, r.url)
                if referrer is not None:
                    # setdefault: an explicitly set Referer is never overwritten.
                    r.headers.setdefault('Referer', referrer)
            return r

        async def _iter_results():
            # A tuple fallback (``result or ()``) cannot be consumed by
            # ``async for`` and raises TypeError; a falsy result simply
            # yields nothing instead.
            if not result:
                return
            async for r in result:
                yield _set_referer(r)

        return _iter_results()

    def request_scheduled(self, request, spider):
        """
        Patch the 'Referer' header of redirected requests when scheduled.

        Only requests whose meta contains a non-empty 'redirect_urls' and
        that already carry a 'Referer' header are touched. The header is
        recomputed with the applicable policy and updated or removed when
        the result differs from the current value.

        Args:
            request: The request being scheduled.
            spider: The spider that generated the request (unused here).
        """
        redirected_urls = request.meta.get('redirect_urls', [])
        if redirected_urls:
            request_referrer = request.headers.get('Referer')
            # we don't patch the referrer value if there is none
            if request_referrer is not None:
                # the request's referrer header value acts as a surrogate
                # for the parent response URL
                #
                # Note: if the 3xx response contained a Referrer-Policy header,
                #       the information is not available using this hook
                parent_url = safe_url_string(request_referrer)
                policy_referrer = self.policy(parent_url, request).referrer(
                    parent_url, request.url)
                if policy_referrer != request_referrer:
                    if policy_referrer is None:
                        # The policy forbids a referrer for the new target.
                        request.headers.pop('Referer')
                    else:
                        request.headers['Referer'] = policy_referrer
|
|
364
739
|
|