aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,25 @@
  """
- RefererMiddleware: populates Request referer field, based on the Response which
- originated it.
+ Referer Middleware for AioScrapy
+ AioScrapy的Referer中间件
+
+ This middleware populates the 'Referer' HTTP header in requests based on the response
+ that generated them. It implements various referrer policies as defined in the W3C
+ Referrer Policy specification, allowing control over what information is included
+ in the Referer header for privacy and security reasons.
+ 此中间件根据生成请求的响应填充请求中的'Referer' HTTP头。它实现了W3C Referer Policy
+ 规范中定义的各种引用策略,允许出于隐私和安全原因控制Referer头中包含的信息。
+
+ The middleware supports all standard referrer policies:
+ 中间件支持所有标准的引用策略:
+ - no-referrer
+ - no-referrer-when-downgrade
+ - same-origin
+ - origin
+ - strict-origin
+ - origin-when-cross-origin
+ - strict-origin-when-cross-origin
+ - unsafe-url
+ - aioscrapy-default (a variant of no-referrer-when-downgrade)
  """
  import warnings
  from typing import Tuple
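
Note on configuration: the policy in force is chosen either project-wide through the REFERRER_POLICY setting or per request through the 'referrer_policy' key in Request.meta; both names appear in the middleware code later in this diff. A minimal sketch, with the Request import path assumed from the file list above rather than confirmed by this diff:

# settings.py — one policy for the whole crawl
REFERRER_POLICY = 'same-origin'

# Per-request override, read by RefererMiddleware.policy() below.
from aioscrapy.http import Request  # assumed import path

req = Request(
    'https://example.com/next',
    meta={'referrer_policy': 'no-referrer'},
)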
@@ -30,24 +49,102 @@ POLICY_AIOSCRAPY_DEFAULT = "aioscrapy-default"


  class ReferrerPolicy:
+ """
+ Base class for implementing W3C Referrer Policy.
+ 实现W3C引用策略的基类。
+
+ This abstract class defines the interface and common functionality for all
+ referrer policy implementations. Each subclass implements a specific policy
+ from the W3C Referrer Policy specification.
+ 这个抽象类为所有引用策略实现定义了接口和通用功能。每个子类实现W3C引用策略
+ 规范中的特定策略。

+ Reference: https://www.w3.org/TR/referrer-policy/
+ 参考:https://www.w3.org/TR/referrer-policy/
+ """
+
+ # Schemes that should never send a referrer
+ # 永远不应该发送引用的方案
  NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES
+
+ # Policy name (to be defined by subclasses)
+ # 策略名称(由子类定义)
  name: str

  def referrer(self, response_url, request_url):
+ """
+ Determine the referrer value based on the policy.
+ 根据策略确定引用值。
+
+ This method must be implemented by subclasses to determine what referrer
+ value (if any) should be sent for a request, based on the response URL
+ that generated the request and the request URL.
+ 此方法必须由子类实现,以根据生成请求的响应URL和请求URL确定应为请求
+ 发送什么引用值(如果有)。
+
+ Args:
+ response_url: The URL of the response that generated the request.
+ 生成请求的响应的URL。
+ request_url: The URL of the request being made.
+ 正在发出的请求的URL。
+
+ Returns:
+ str or None: The referrer value to use, or None if no referrer should be sent.
+ 要使用的引用值,如果不应发送引用,则为None。
+ """
  raise NotImplementedError()

  def stripped_referrer(self, url):
+ """
+ Return a stripped version of the URL suitable for use as a referrer.
+ 返回适合用作引用的URL的剥离版本。
+
+ This method strips sensitive information from a URL according to the
+ referrer policy specification.
+ 此方法根据引用策略规范从URL中剥离敏感信息。
+
+ Args:
+ url: The URL to strip.
+ 要剥离的URL。
+
+ Returns:
+ str or None: The stripped URL, or None if the URL uses a scheme that
+ should never send a referrer.
+ 剥离后的URL,如果URL使用的方案永远不应该发送引用,则为None。
+ """
  if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
  return self.strip_url(url)
+ return None

  def origin_referrer(self, url):
+ """
+ Return only the origin portion of a URL for use as a referrer.
+ 仅返回URL的源部分以用作引用。
+
+ This method returns just the scheme, host, and port of a URL, which is
+ useful for policies that only send the origin as the referrer.
+ 此方法仅返回URL的方案、主机和端口,这对于仅发送源作为引用的策略很有用。
+
+ Args:
+ url: The URL to get the origin from.
+ 要获取源的URL。
+
+ Returns:
+ str or None: The origin of the URL, or None if the URL uses a scheme that
+ should never send a referrer.
+ URL的源,如果URL使用的方案永远不应该发送引用,则为None。
+ """
  if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
  return self.origin(url)
+ return None

  def strip_url(self, url, origin_only=False):
  """
- https://www.w3.org/TR/referrer-policy/#strip-url
+ Strip a URL according to the referrer policy specification.
+ 根据引用策略规范剥离URL。
+
+ Reference: https://www.w3.org/TR/referrer-policy/#strip-url
+ 参考:https://www.w3.org/TR/referrer-policy/#strip-url

  If url is null, return no referrer.
  If url's scheme is a local scheme, then return no referrer.
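
The contract above — a name attribute plus a referrer() implementation, with stripped_referrer() and origin_referrer() as helpers — is all a custom policy needs. A hedged sketch; the class and policy name here are illustrative, not part of the package:

from aioscrapy.libs.spider.referer import ReferrerPolicy  # module per file list above

class OriginForCrossOriginPolicy(ReferrerPolicy):
    # Hypothetical policy: full (stripped) referrer within one origin,
    # bare origin for cross-origin requests.
    name: str = 'origin-for-cross-origin'

    def referrer(self, response_url, request_url):
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)
        return self.origin_referrer(response_url)

Such a class can then be referenced by its dotted path in REFERRER_POLICY, which _load_policy_class() (further down in this diff) resolves via load_object().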
@@ -58,6 +155,16 @@ class ReferrerPolicy:
  Set url's path to null.
  Set url's query to null.
  Return url.
+
+ Args:
+ url: The URL to strip.
+ 要剥离的URL。
+ origin_only: Whether to strip the URL to just its origin.
+ 是否将URL剥离为仅其源。
+
+ Returns:
+ str or None: The stripped URL, or None if the URL is empty.
+ 剥离后的URL,如果URL为空,则为None。
  """
  if not url:
  return None
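
The strip-url steps documented above can be illustrated with the standard library alone; this is a simplified stand-in, not the package's actual implementation:

from urllib.parse import urlsplit, urlunsplit

def strip_url_demo(url, origin_only=False):
    # Empty URL -> no referrer; always drop credentials and the fragment;
    # with origin_only, drop the path and query as well.
    if not url:
        return None
    parts = urlsplit(url)
    netloc = parts.hostname or ''
    if parts.port:
        netloc += f':{parts.port}'
    path, query = ('', '') if origin_only else (parts.path, parts.query)
    return urlunsplit((parts.scheme, netloc, path, query, ''))

assert strip_url_demo('https://user:pw@example.com/a?q=1#f') == 'https://example.com/a?q=1'
assert strip_url_demo('https://example.com/a?q=1', origin_only=True) == 'https://example.com'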
@@ -68,10 +175,45 @@ class ReferrerPolicy:
  origin_only=origin_only)

  def origin(self, url):
- """Return serialized origin (scheme, host, path) for a request or response URL."""
+ """
+ Return serialized origin (scheme, host, port) for a URL.
+ 返回URL的序列化源(方案、主机、端口)。
+
+ The origin of a URL is just its scheme, host, and port, without path,
+ query, or fragment.
+ URL的源只是其方案、主机和端口,没有路径、查询或片段。
+
+ Args:
+ url: The URL to get the origin from.
+ 要获取源的URL。
+
+ Returns:
+ str or None: The origin of the URL, or None if the URL is empty.
+ URL的源,如果URL为空,则为None。
+ """
  return self.strip_url(url, origin_only=True)

  def potentially_trustworthy(self, url):
+ """
+ Determine if a URL is potentially trustworthy.
+ 确定URL是否可能值得信任。
+
+ This is a simplified implementation that considers HTTPS and FTPS URLs
+ as potentially trustworthy, and data URLs as not trustworthy.
+ 这是一个简化的实现,将HTTPS和FTPS URL视为可能值得信任,将数据URL视为不值得信任。
+
+ Note: this does not follow the full algorithm from:
+ 注意:这不遵循以下完整算法:
+ https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
+
+ Args:
+ url: The URL to check.
+ 要检查的URL。
+
+ Returns:
+ bool: True if the URL is potentially trustworthy, False otherwise.
+ 如果URL可能值得信任,则为True,否则为False。
+ """
  # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
  parsed_url = urlparse(url)
  if parsed_url.scheme in ('data',):
@@ -79,20 +221,57 @@ class ReferrerPolicy:
  return self.tls_protected(url)

  def tls_protected(self, url):
+ """
+ Determine if a URL is protected by TLS (HTTPS or FTPS).
+ 确定URL是否受TLS(HTTPS或FTPS)保护。
+
+ Args:
+ url: The URL to check.
+ 要检查的URL。
+
+ Returns:
+ bool: True if the URL uses HTTPS or FTPS, False otherwise.
+ 如果URL使用HTTPS或FTPS,则为True,否则为False。
+ """
  return urlparse(url).scheme in ('https', 'ftps')


  class NoReferrerPolicy(ReferrerPolicy):
  """
- https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer
+ Implementation of the "no-referrer" referrer policy.
+ "no-referrer"引用策略的实现。
+
+ Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer
+ 参考:https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer

  The simplest policy is "no-referrer", which specifies that no referrer information
  is to be sent along with requests made from a particular request client to any origin.
  The header will be omitted entirely.
+ 最简单的策略是"no-referrer",它指定不随特定请求客户端向任何源发出的请求
+ 发送任何引用信息。头将完全省略。
  """
+ # Policy name
+ # 策略名称
  name: str = POLICY_NO_REFERRER

  def referrer(self, response_url, request_url):
+ """
+ Determine the referrer value based on the no-referrer policy.
+ 根据no-referrer策略确定引用值。
+
+ This policy always returns None, meaning no Referer header should be sent.
+ 此策略始终返回None,表示不应发送Referer头。
+
+ Args:
+ response_url: The URL of the response that generated the request.
+ 生成请求的响应的URL。
+ request_url: The URL of the request being made.
+ 正在发出的请求的URL。
+
+ Returns:
+ None: Always returns None, indicating no referrer should be sent.
+ 始终返回None,表示不应发送引用。
+ """
  return None
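
tls_protected() is what the downgrade-sensitive policies hinge on: no-referrer-when-downgrade (and the package's aioscrapy-default variant) withholds the referrer only when moving from a TLS-protected origin to an unprotected one. A simplified sketch of that check, not the package's exact code:

from urllib.parse import urlparse

def tls_protected(url):
    # Same scheme test as ReferrerPolicy.tls_protected above.
    return urlparse(url).scheme in ('https', 'ftps')

def is_downgrade(response_url, request_url):
    # A downgrade would leak a TLS-protected origin to a plaintext target.
    return tls_protected(response_url) and not tls_protected(request_url)

assert not is_downgrade('https://a.example/x', 'https://b.example/')
assert is_downgrade('https://a.example/x', 'http://b.example/')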


@@ -270,16 +449,42 @@ _policy_classes[''] = NoReferrerWhenDowngradePolicy

  def _load_policy_class(policy, warning_only=False):
  """
- Expect a string for the path to the policy class,
- otherwise try to interpret the string as a standard value
- from https://www.w3.org/TR/referrer-policy/#referrer-policies
+ Load a referrer policy class by name or path.
+ 通过名称或路径加载引用策略类。
+
+ This function attempts to load a referrer policy class either by importing it
+ from a path or by looking it up in the standard policy classes dictionary.
+ 此函数尝试通过从路径导入或在标准策略类字典中查找来加载引用策略类。
+
+ Args:
+ policy: A string representing either a path to a policy class or a standard
+ policy name from the W3C Referrer Policy specification.
+ 表示策略类路径或W3C引用策略规范中的标准策略名称的字符串。
+ warning_only: If True, warnings will be issued instead of raising exceptions
+ when a policy cannot be loaded.
+ 如果为True,当无法加载策略时将发出警告而不是引发异常。
+
+ Returns:
+ A referrer policy class, or None if the policy could not be loaded and
+ warning_only is True.
+ 引用策略类,如果无法加载策略且warning_only为True,则为None。
+
+ Raises:
+ RuntimeError: If the policy could not be loaded and warning_only is False.
+ 如果无法加载策略且warning_only为False,则引发RuntimeError。
  """
  try:
+ # Try to load the policy as a Python object (e.g., 'mymodule.MyPolicy')
+ # 尝试将策略作为Python对象加载(例如,'mymodule.MyPolicy')
  return load_object(policy)
  except ValueError:
  try:
+ # Try to load the policy as a standard policy name
+ # 尝试将策略作为标准策略名称加载
  return _policy_classes[policy.lower()]
  except KeyError:
+ # Policy could not be loaded
+ # 无法加载策略
  msg = f"Could not load referrer policy {policy!r}"
  if not warning_only:
  raise RuntimeError(msg)
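
Resolution order, per the docstring: load_object() on a dotted import path first, then the _policy_classes lookup by lowercased standard name. Expected use looks roughly like this (the dotted-path module is hypothetical):

from aioscrapy.libs.spider.referer import _load_policy_class

cls = _load_policy_class('same-origin')                    # standard W3C name
# cls = _load_policy_class('myproject.policies.MyPolicy')  # dotted path
print(cls().name)  # 'same-origin'

# With warning_only=True, an unknown policy warns and returns None
# instead of raising RuntimeError.
assert _load_policy_class('no-such-policy', warning_only=True) is None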
@@ -289,76 +494,246 @@ def _load_policy_class(policy, warning_only=False):


  class RefererMiddleware:
+ """
+ Middleware for populating the 'Referer' HTTP header in requests.
+ 用于填充请求中的'Referer' HTTP头的中间件。
+
+ This middleware sets the 'Referer' HTTP header in requests based on the response
+ that generated them, following the W3C Referrer Policy specification. It allows
+ control over what information is included in the Referer header for privacy and
+ security reasons.
+ 此中间件根据生成请求的响应设置请求中的'Referer' HTTP头,遵循W3C引用策略规范。
+ 它允许出于隐私和安全原因控制Referer头中包含的信息。
+ """

  def __init__(self, settings=None):
+ """
+ Initialize the RefererMiddleware.
+ 初始化RefererMiddleware。
+
+ Args:
+ settings: The AioScrapy settings object.
+ AioScrapy设置对象。
+ If None, the default policy will be used.
+ 如果为None,将使用默认策略。
+ """
+ # Set the default policy
+ # 设置默认策略
  self.default_policy = DefaultReferrerPolicy
+
+ # If settings are provided, load the policy from settings
+ # 如果提供了设置,从设置加载策略
  if settings is not None:
  self.default_policy = _load_policy_class(
  settings.get('REFERRER_POLICY'))

  @classmethod
  def from_crawler(cls, crawler):
+ """
+ Create a RefererMiddleware instance from a crawler.
+ 从爬虫创建RefererMiddleware实例。
+
+ This is the factory method used by AioScrapy to create the middleware.
+ 这是AioScrapy用于创建中间件的工厂方法。
+
+ Args:
+ crawler: The crawler that will use this middleware.
+ 将使用此中间件的爬虫。
+
+ Returns:
+ RefererMiddleware: A new RefererMiddleware instance.
+ 一个新的RefererMiddleware实例。
+
+ Raises:
+ NotConfigured: If REFERER_ENABLED is False in the crawler settings.
+ 如果爬虫设置中的REFERER_ENABLED为False。
+ """
+ # Check if the middleware is enabled
+ # 检查中间件是否已启用
  if not crawler.settings.getbool('REFERER_ENABLED'):
  raise NotConfigured
+
+ # Create a new instance with the crawler's settings
+ # 使用爬虫的设置创建一个新实例
  mw = cls(crawler.settings)

+ # Connect the request_scheduled method to the request_scheduled signal
+ # to handle redirections
+ # 将request_scheduled方法连接到request_scheduled信号以处理重定向
  # Note: this hook is a bit of a hack to intercept redirections
+ # 注意:这个钩子有点像一个黑客,用于拦截重定向
  crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)

+ # Return the new instance
+ # 返回新实例
  return mw
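
As the NotConfigured check above implies, the middleware can be switched off wholesale:

# settings.py
REFERER_ENABLED = False  # from_crawler raises NotConfigured; the middleware is never installed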

  def policy(self, resp_or_url, request):
  """
- Determine Referrer-Policy to use from a parent Response (or URL),
- and a Request to be sent.
-
- - if a valid policy is set in Request meta, it is used.
- - if the policy is set in meta but is wrong (e.g. a typo error),
- the policy from settings is used
- - if the policy is not set in Request meta,
- but there is a Referrer-policy header in the parent response,
- it is used if valid
- - otherwise, the policy from settings is used.
+ Determine the Referrer-Policy to use for a request.
+ 确定用于请求的引用策略。
+
+ This method determines which referrer policy to use based on the following
+ precedence rules:
+ 此方法根据以下优先级规则确定要使用的引用策略:
+
+ - If a valid policy is set in Request meta, it is used.
+ 如果在Request meta中设置了有效的策略,则使用它。
+ - If the policy is set in meta but is wrong (e.g. a typo error),
+ the policy from settings is used.
+ 如果在meta中设置了策略但是错误的(例如,拼写错误),
+ 则使用设置中的策略。
+ - If the policy is not set in Request meta,
+ but there is a Referrer-Policy header in the parent response,
+ it is used if valid.
+ 如果在Request meta中未设置策略,
+ 但在父响应中有Referrer-Policy头,
+ 如果有效,则使用它。
+ - Otherwise, the policy from settings is used.
+ 否则,使用设置中的策略。
+
+ Args:
+ resp_or_url: The parent Response object or URL string.
+ 父Response对象或URL字符串。
+ request: The Request object being processed.
+ 正在处理的Request对象。
+
+ Returns:
+ ReferrerPolicy: An instance of the appropriate referrer policy class.
+ 适当的引用策略类的实例。
  """
+ # Try to get the policy name from the request meta
+ # 尝试从请求元数据获取策略名称
  policy_name = request.meta.get('referrer_policy')
+
+ # If no policy in meta, try to get it from the response headers
+ # 如果元数据中没有策略,尝试从响应头获取
  if policy_name is None:
  if isinstance(resp_or_url, Response):
  policy_header = resp_or_url.headers.get('Referrer-Policy')
  if policy_header is not None:
  policy_name = to_unicode(policy_header.decode('latin1') if isinstance(policy_header, bytes) else policy_header)
+
+ # If no policy was found, use the default
+ # 如果未找到策略,使用默认值
  if policy_name is None:
  return self.default_policy()

+ # Try to load the policy class
+ # 尝试加载策略类
  cls = _load_policy_class(policy_name, warning_only=True)
+
+ # Return an instance of the policy class, or the default if loading failed
+ # 返回策略类的实例,如果加载失败,则返回默认值
  return cls() if cls else self.default_policy()

  async def process_spider_output(self, response, result, spider):
+ """
+ Process the spider output to set the 'Referer' header in requests.
+ 处理爬虫输出以在请求中设置'Referer'头。
+
+ This method processes each request yielded by the spider and sets the
+ 'Referer' header based on the appropriate referrer policy.
+ 此方法处理爬虫产生的每个请求,并根据适当的引用策略设置'Referer'头。
+
+ Args:
+ response: The response being processed.
+ 正在处理的响应。
+ result: The result returned by the spider.
+ 爬虫返回的结果。
+ spider: The spider that generated the result.
+ 生成结果的爬虫。
+
+ Returns:
+ An async generator yielding processed requests and other items.
+ 一个产生处理后的请求和其他项目的异步生成器。
+ """
  def _set_referer(r):
+ """
+ Set the 'Referer' header for a request if it's a Request object.
+ 如果是Request对象,则为请求设置'Referer'头。
+
+ Args:
+ r: The item to process.
+ 要处理的项目。
+
+ Returns:
+ The processed item.
+ 处理后的项目。
+ """
+ # Only process Request objects
+ # 只处理Request对象
  if isinstance(r, Request):
+ # Get the referrer value based on the policy
+ # 根据策略获取引用值
  referrer = self.policy(response, r).referrer(response.url, r.url)
+
+ # If a referrer value was returned, set it in the request headers
+ # 如果返回了引用值,则在请求头中设置它
  if referrer is not None:
  r.headers.setdefault('Referer', referrer)
+
+ # Return the item, possibly modified
+ # 返回可能已修改的项目
  return r
+
+ # Process each item in the result
+ # 处理结果中的每个项目
  return (_set_referer(r) async for r in result or ())

  def request_scheduled(self, request, spider):
- # check redirected request to patch "Referer" header if necessary
+ """
+ Handle scheduled requests to patch the 'Referer' header if necessary.
+ 处理计划的请求,以在必要时修补'Referer'头。
+
+ This method is called when a request is scheduled. It handles redirected
+ requests by updating the 'Referer' header according to the appropriate
+ referrer policy.
+ 当请求被计划时调用此方法。它通过根据适当的引用策略更新'Referer'头来处理
+ 重定向的请求。
+
+ Args:
+ request: The request being scheduled.
+ 正在计划的请求。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+ """
+ # Check if this is a redirected request
+ # 检查这是否是重定向的请求
  redirected_urls = request.meta.get('redirect_urls', [])
  if redirected_urls:
+ # Get the current 'Referer' header value
+ # 获取当前的'Referer'头值
  request_referrer = request.headers.get('Referer')
- # we don't patch the referrer value if there is none
+
+ # We don't patch the referrer value if there is none
+ # 如果没有引用值,我们不会修补它
  if request_referrer is not None:
- # the request's referrer header value acts as a surrogate
+ # The request's referrer header value acts as a surrogate
  # for the parent response URL
+ # 请求的引用头值作为父响应URL的替代品
  #
  # Note: if the 3xx response contained a Referrer-Policy header,
  # the information is not available using this hook
+ # 注意:如果3xx响应包含Referrer-Policy头,
+ # 使用此钩子无法获取信息
  parent_url = safe_url_string(request_referrer)
+
+ # Get the referrer value based on the policy
+ # 根据策略获取引用值
  policy_referrer = self.policy(parent_url, request).referrer(
  parent_url, request.url)
+
+ # If the policy referrer is different from the current referrer,
+ # update the header
+ # 如果策略引用与当前引用不同,则更新头
  if policy_referrer != request_referrer:
  if policy_referrer is None:
+ # Remove the 'Referer' header if the policy says not to send one
+ # 如果策略说不发送引用,则删除'Referer'头
  request.headers.pop('Referer')
  else:
+ # Update the 'Referer' header with the policy value
+ # 使用策略值更新'Referer'头
  request.headers['Referer'] = policy_referrer
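
Putting the precedence rules of policy() together, a quick sketch (the Request import path is assumed, as before):

from aioscrapy.http import Request  # assumed import path
from aioscrapy.libs.spider.referer import RefererMiddleware

mw = RefererMiddleware()  # no settings passed: DefaultReferrerPolicy applies

# Request.meta outranks both the Referrer-Policy response header and the settings default.
req = Request('https://example.com/', meta={'referrer_policy': 'no-referrer'})
print(mw.policy('https://example.com/prev', req).name)  # 'no-referrer'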