aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/downloader/retry.py

@@ -1,13 +1,30 @@
  """
+ Retry Middleware for AioScrapy
+ AioScrapy的重试中间件
+
  An extension to retry failed requests that are potentially caused by temporary
  problems such as a connection timeout or HTTP 500 error.
+ 一个扩展,用于重试可能由临时问题(如连接超时或HTTP 500错误)导致的失败请求。
+
+ You can change the behavior of this middleware by modifying the scraping settings:
+ 您可以通过修改抓取设置来更改此中间件的行为:
+
+ RETRY_ENABLED - whether to enable the retry middleware (default: True)
+ RETRY_ENABLED - 是否启用重试中间件(默认:True)
+
+ RETRY_TIMES - how many times to retry a failed page (default: 2)
+ RETRY_TIMES - 重试失败页面的次数(默认:2)

- You can change the behaviour of this middleware by modifing the scraping settings:
- RETRY_TIMES - how many times to retry a failed page
- RETRY_HTTP_CODES - which HTTP response codes to retry
+ RETRY_HTTP_CODES - which HTTP response codes to retry (default: [500, 502, 503, 504, 522, 524, 408, 429])
+ RETRY_HTTP_CODES - 要重试的HTTP响应代码(默认:[500, 502, 503, 504, 522, 524, 408, 429])

- Failed pages are collected on the scraping process and rescheduled at the end,
- once the spider has finished crawling all regular (non failed) pages.
+ RETRY_PRIORITY_ADJUST - adjust retry request priority (default: -1)
+ RETRY_PRIORITY_ADJUST - 调整重试请求优先级(默认:-1)
+
+ Failed pages are collected during the scraping process and rescheduled,
+ allowing the spider to continue crawling other pages while retrying failed ones.
+ 失败的页面在抓取过程中被收集并重新安排,
+ 允许爬虫在重试失败页面的同时继续抓取其他页面。
  """
  from typing import Optional, Union
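The expanded module docstring above documents all four retry settings and their defaults. A minimal sketch of how a project could tune them in its own settings module follows; apart from the documented default list of HTTP codes, the values are illustrative choices, not package recommendations.

    # Hedged sketch: overriding the retry settings documented above in a
    # project's settings.py. Only RETRY_HTTP_CODES mirrors the documented
    # default; RETRY_TIMES here is an illustrative override.
    RETRY_ENABLED = True        # default: True
    RETRY_TIMES = 5             # default: 2
    RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
    RETRY_PRIORITY_ADJUST = -1  # default: -1; retries are scheduled at lower priority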
@@ -24,6 +41,8 @@ from aioscrapy.spiders import Spider
  from aioscrapy.utils.log import logger as retry_logger
  from aioscrapy.utils.python import global_object_name

+ # Tuple of exception types that should trigger a retry
+ # 应触发重试的异常类型元组
  NEED_RETRY_ERROR = (TimeoutError, ConnectionRefusedError, IOError, ProxyException, DownloadError, EndOfStream)

@@ -38,7 +57,50 @@ def get_retry_request(
  stats_base_key: str = 'retry',
  ):
  """
- 使用了scrapy的retry,将日志等级改为info
+ Create a new request object to retry the specified failed request.
+ 创建一个新的请求对象来重试指定的失败请求。
+
+ This function is based on Scrapy's retry functionality but uses INFO level
+ logging instead of DEBUG level for retry attempts. It creates a copy of the
+ original request with updated retry count and adjusted priority.
+ 此函数基于Scrapy的重试功能,但对重试尝试使用INFO级别的日志记录而不是DEBUG级别。
+ 它创建原始请求的副本,更新重试计数并调整优先级。
+
+ Args:
+ request: The original Request object that failed.
+ 失败的原始Request对象。
+ spider: The spider instance that generated the request.
+ 生成请求的爬虫实例。
+ reason: The reason for the retry, either a string or an exception.
+ 重试的原因,可以是字符串或异常。
+ Defaults to 'unspecified'.
+ 默认为'unspecified'。
+ max_retry_times: Maximum number of times to retry this request.
+ 重试此请求的最大次数。
+ If None, uses the value from request.meta['max_retry_times']
+ or the RETRY_TIMES setting.
+ 如果为None,则使用request.meta['max_retry_times']
+ 或RETRY_TIMES设置中的值。
+ priority_adjust: Amount to adjust the request priority.
+ 调整请求优先级的数量。
+ If None, uses the value from request.meta['priority_adjust']
+ or the RETRY_PRIORITY_ADJUST setting.
+ 如果为None,则使用request.meta['priority_adjust']
+ 或RETRY_PRIORITY_ADJUST设置中的值。
+ logger: The logger to use for logging retry attempts.
+ 用于记录重试尝试的日志记录器。
+ Defaults to the retry_logger.
+ 默认为retry_logger。
+ stats_base_key: The base key to use for recording retry statistics.
+ 用于记录重试统计信息的基本键。
+ Defaults to 'retry'.
+ 默认为'retry'。
+
+ Returns:
+ Request: A new Request object with updated retry count and priority,
+ or None if max_retry_times has been reached.
+ 具有更新的重试计数和优先级的新Request对象,
+ 如果已达到max_retry_times,则为None。
  """
  settings = spider.crawler.settings
  stats = spider.crawler.stats
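Based on the parameters documented above, the sketch below shows one way a spider callback could call get_retry_request directly. The import path follows the file being patched; the async callback style, the response.text/response.request attributes, and the keyword names are assumptions drawn from this docstring and Scrapy convention rather than a confirmed public API.

    # Hedged sketch: manually retrying a response that signals a soft failure.
    # Assumes get_retry_request keeps the parameter names documented above.
    from aioscrapy.libs.downloader.retry import get_retry_request
    from aioscrapy.spiders import Spider

    class ExampleSpider(Spider):
        name = 'example'
        start_urls = ['https://example.com/']

        async def parse(self, response):
            if 'captcha' in response.text:
                # Returns a copy of the original request with an incremented
                # retry count and adjusted priority, or None once
                # max_retry_times is exhausted.
                retry_req = get_retry_request(
                    response.request,
                    spider=self,
                    reason='captcha page returned',
                    max_retry_times=3,
                )
                if retry_req is not None:
                    yield retry_req
                return
            yield {'url': response.url}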
@@ -78,9 +140,35 @@ def get_retry_request(


  class RetryMiddleware:
+ """
+ Middleware to retry failed requests.
+ 重试失败请求的中间件。
+
+ This middleware retries requests that have failed due to temporary issues
+ such as connection problems or certain HTTP error codes. It works by
+ intercepting responses with error status codes and exceptions, then
+ creating new retry requests with updated retry counts.
+ 此中间件重试由于临时问题(如连接问题或某些HTTP错误代码)而失败的请求。
+ 它通过拦截具有错误状态代码和异常的响应,然后创建具有更新的重试计数的新重试请求来工作。
+ """
+
+ # List of exceptions that should trigger a retry
+ # 应触发重试的异常列表
  EXCEPTIONS_TO_RETRY = NEED_RETRY_ERROR

  def __init__(self, settings):
+ """
+ Initialize the RetryMiddleware.
+ 初始化RetryMiddleware。
+
+ Args:
+ settings: The AioScrapy settings object.
+ AioScrapy设置对象。
+
+ Raises:
+ NotConfigured: If RETRY_ENABLED is False.
+ 如果RETRY_ENABLED为False。
+ """
  if not settings.getbool('RETRY_ENABLED'):
  raise NotConfigured
  self.max_retry_times = settings.getint('RETRY_TIMES')
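Since EXCEPTIONS_TO_RETRY is a plain class attribute, a project can widen the retried exception set by subclassing. The sketch below is an assumption-heavy illustration: TemporaryBlockError and the myproject path are hypothetical, and the DOWNLOADER_MIDDLEWARES registration (including disabling the built-in entry) follows Scrapy's convention, which aio-scrapy appears to mirror; the priority value is arbitrary.

    # Hedged sketch: retrying on an additional, project-specific exception by
    # subclassing the middleware shown above.
    from aioscrapy.libs.downloader.retry import NEED_RETRY_ERROR, RetryMiddleware

    class TemporaryBlockError(Exception):
        """Hypothetical error raised elsewhere in the project when a target blocks us."""

    class MyRetryMiddleware(RetryMiddleware):
        # Extend the built-in tuple with the project-specific exception.
        EXCEPTIONS_TO_RETRY = NEED_RETRY_ERROR + (TemporaryBlockError,)

    # settings.py of a hypothetical project. Registration follows the
    # Scrapy-style DOWNLOADER_MIDDLEWARES convention (assumed here); the
    # built-in class path matches the patched file.
    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.MyRetryMiddleware': 550,
        'aioscrapy.libs.downloader.retry.RetryMiddleware': None,
    }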
@@ -89,26 +177,124 @@ class RetryMiddleware:

  @classmethod
  def from_crawler(cls, crawler):
+ """
+ Create a RetryMiddleware instance from a crawler.
+ 从爬虫创建RetryMiddleware实例。
+
+ This is the factory method used by AioScrapy to create middleware instances.
+ 这是AioScrapy用于创建中间件实例的工厂方法。
+
+ Args:
+ crawler: The crawler that will use this middleware.
+ 将使用此中间件的爬虫。
+
+ Returns:
+ RetryMiddleware: A new RetryMiddleware instance.
+ 一个新的RetryMiddleware实例。
+ """
  return cls(crawler.settings)

  def process_response(self, request, response, spider):
+ """
+ Process a response to check if it needs to be retried.
+ 处理响应以检查是否需要重试。
+
+ This method checks if the response status code is in the list of
+ status codes that should be retried. If so, it creates a retry request.
+ 此方法检查响应状态代码是否在应重试的状态代码列表中。
+ 如果是,则创建重试请求。
+
+ Args:
+ request: The original request that generated the response.
+ 生成响应的原始请求。
+ response: The response to process.
+ 要处理的响应。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+
+ Returns:
+ Response or Request: The original response or a new retry request.
+ 原始响应或新的重试请求。
+ """
+ # Don't retry if the request has dont_retry set to True
+ # 如果请求的dont_retry设置为True,则不重试
  if request.meta.get('dont_retry', False):
  return response
+
+ # Retry if the status code is in the list of codes to retry
+ # 如果状态代码在要重试的代码列表中,则重试
  if response.status in self.retry_http_codes:
  reason = f"Retry response status code: {response.status}"
  return self._retry(request, reason, spider) or response
+
+ # Otherwise, return the response as is
+ # 否则,按原样返回响应
  return response

  def process_exception(self, request, exception, spider):
+ """
+ Process an exception to check if the request should be retried.
+ 处理异常以检查是否应重试请求。
+
+ This method checks if the exception is in the list of exceptions
+ that should trigger a retry. If so, it creates a retry request.
+ 此方法检查异常是否在应触发重试的异常列表中。
+ 如果是,则创建重试请求。
+
+ Args:
+ request: The request that caused the exception.
+ 导致异常的请求。
+ exception: The exception that was raised.
+ 引发的异常。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+
+ Returns:
+ Request or None: A new retry request or None if the request should not be retried.
+ 新的重试请求,如果不应重试请求,则为None。
+ """
+ # Retry if the exception is in the list of exceptions to retry
+ # and the request doesn't have dont_retry set to True
+ # 如果异常在要重试的异常列表中,并且请求的dont_retry未设置为True,则重试
  if (
  isinstance(exception, self.EXCEPTIONS_TO_RETRY)
  and not request.meta.get('dont_retry', False)
  ):
  return self._retry(request, exception, spider)

+ # Otherwise, return None to let the exception be processed by other middleware
+ # 否则,返回None以让异常被其他中间件处理
+ return None
+
  def _retry(self, request, reason, spider):
+ """
+ Create a retry request for the given request.
+ 为给定请求创建重试请求。
+
+ This internal method gets the retry parameters from the request metadata
+ or middleware settings, then calls get_retry_request to create a new request.
+ 此内部方法从请求元数据或中间件设置获取重试参数,
+ 然后调用get_retry_request创建新请求。
+
+ Args:
+ request: The original request to retry.
+ 要重试的原始请求。
+ reason: The reason for the retry (string or exception).
+ 重试的原因(字符串或异常)。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+
+ Returns:
+ Request or None: A new retry request or None if max retries has been reached.
+ 新的重试请求,如果已达到最大重试次数,则为None。
+ """
+ # Get retry parameters from request metadata or middleware settings
+ # 从请求元数据或中间件设置获取重试参数
  max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
  priority_adjust = request.meta.get('priority_adjust', self.priority_adjust)
+
+ # Create and return a retry request
+ # 创建并返回重试请求
  return get_retry_request(
  request,
  reason=reason,
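The meta keys read above (dont_retry, max_retry_times, priority_adjust) allow per-request control over retrying. A hedged sketch follows; the Request import path is an assumption based on the package layout, and the URLs are placeholders.

    # Hedged sketch: per-request retry control via the meta keys the
    # middleware reads in process_response() and _retry().
    from aioscrapy.http import Request      # import path assumed from the package layout
    from aioscrapy.spiders import Spider

    class FlakySpider(Spider):
        name = 'flaky'
        start_urls = ['https://example.com/']

        async def parse(self, response):
            # Allow up to 5 retries for this request and push retries further back.
            yield Request(
                'https://example.com/flaky-endpoint',
                callback=self.parse_item,
                meta={'max_retry_times': 5, 'priority_adjust': -2},
            )
            # Opt this request out of retrying entirely.
            yield Request(
                'https://example.com/one-shot',
                callback=self.parse_item,
                meta={'dont_retry': True},
            )

        async def parse_item(self, response):
            yield {'status': response.status, 'url': response.url}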
aioscrapy/libs/downloader/stats.py

@@ -1,31 +1,173 @@
+ """
+ Downloader Statistics Middleware for AioScrapy
+ AioScrapy的下载器统计中间件
+
+ This module provides a middleware that collects statistics about the downloader
+ component, including request and response counts, bytes transferred, HTTP methods,
+ response status codes, and exceptions.
+ 此模块提供了一个中间件,用于收集有关下载器组件的统计信息,
+ 包括请求和响应计数、传输的字节数、HTTP方法、响应状态码和异常。
+ """
+
  from aioscrapy.exceptions import NotConfigured
  from aioscrapy.utils.request import request_httprepr
  from aioscrapy.utils.python import global_object_name


  class DownloaderStats:
+ """
+ Middleware to collect statistics about the downloader component.
+ 用于收集下载器组件统计信息的中间件。
+
+ This middleware collects various statistics about the downloader component,
+ such as the number of requests and responses, bytes transferred, HTTP methods used,
+ response status codes, and exceptions encountered. These statistics are stored
+ in the crawler's stats collector and can be used for monitoring and debugging.
+ 此中间件收集有关下载器组件的各种统计信息,例如请求和响应的数量、
+ 传输的字节数、使用的HTTP方法、响应状态码和遇到的异常。
+ 这些统计信息存储在爬虫的统计收集器中,可用于监控和调试。
+ """

  def __init__(self, stats):
+ """
+ Initialize the DownloaderStats middleware.
+ 初始化DownloaderStats中间件。
+
+ Args:
+ stats: The stats collector instance to use for storing statistics.
+ 用于存储统计信息的统计收集器实例。
+ """
  self.stats = stats

  @classmethod
  def from_crawler(cls, crawler):
+ """
+ Create a DownloaderStats instance from a crawler.
+ 从爬虫创建DownloaderStats实例。
+
+ This is the factory method used by AioScrapy to create middleware instances.
+ It checks if the DOWNLOADER_STATS setting is enabled before creating the middleware.
+ 这是AioScrapy用于创建中间件实例的工厂方法。
+ 它在创建中间件之前检查DOWNLOADER_STATS设置是否启用。
+
+ Args:
+ crawler: The crawler that will use this middleware.
+ 将使用此中间件的爬虫。
+
+ Returns:
+ DownloaderStats: A new DownloaderStats instance.
+ 一个新的DownloaderStats实例。
+
+ Raises:
+ NotConfigured: If DOWNLOADER_STATS setting is disabled.
+ 如果DOWNLOADER_STATS设置被禁用。
+ """
  if not crawler.settings.getbool('DOWNLOADER_STATS'):
  raise NotConfigured
  return cls(crawler.stats)

  def process_request(self, request, spider):
+ """
+ Process a request to collect request statistics.
+ 处理请求以收集请求统计信息。
+
+ This method is called for every request that passes through the middleware.
+ It increments counters for the total number of requests, requests by HTTP method,
+ and the total number of bytes in requests.
+ 此方法在每个通过中间件的请求上调用。
+ 它增加总请求数、按HTTP方法的请求数和请求中的总字节数的计数器。
+
+ Args:
+ request: The request being processed.
+ 正在处理的请求。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+
+ Returns:
+ None: This method returns None to continue processing the request.
+ 此方法返回None以继续处理请求。
+ """
+ # Increment the total request count
+ # 增加总请求计数
  self.stats.inc_value('downloader/request_count', spider=spider)
+
+ # Increment the count for this specific HTTP method
+ # 增加此特定HTTP方法的计数
  self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
+
+ # Add the request size to the total bytes counter
+ # 将请求大小添加到总字节计数器
  self.stats.inc_value('downloader/request_bytes', len(request_httprepr(request)), spider=spider)

  def process_response(self, request, response, spider):
+ """
+ Process a response to collect response statistics.
+ 处理响应以收集响应统计信息。
+
+ This method is called for every response that passes through the middleware.
+ It increments counters for the total number of responses, responses by status code,
+ and the total number of bytes in responses.
+ 此方法在每个通过中间件的响应上调用。
+ 它增加总响应数、按状态码的响应数和响应中的总字节数的计数器。
+
+ Args:
+ request: The request that generated this response.
+ 生成此响应的请求。
+ response: The response being processed.
+ 正在处理的响应。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+
+ Returns:
+ Response: The response object, unchanged.
+ 响应对象,未更改。
+ """
+ # Increment the total response count
+ # 增加总响应计数
  self.stats.inc_value('downloader/response_count', spider=spider)
+
+ # Increment the count for this specific status code
+ # 增加此特定状态码的计数
  self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
+
+ # Add the response size to the total bytes counter
+ # 将响应大小添加到总字节计数器
  self.stats.inc_value('downloader/response_bytes', len(response.body), spider=spider)
+
+ # Return the response unchanged
+ # 返回未更改的响应
  return response

  def process_exception(self, request, exception, spider):
+ """
+ Process an exception to collect exception statistics.
+ 处理异常以收集异常统计信息。
+
+ This method is called when an exception occurs during request processing.
+ It increments counters for the total number of exceptions and exceptions by type.
+ 当请求处理期间发生异常时调用此方法。
+ 它增加总异常数和按类型的异常数的计数器。
+
+ Args:
+ request: The request that caused the exception.
+ 导致异常的请求。
+ exception: The exception that was raised.
+ 引发的异常。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+
+ Returns:
+ None: This method returns None to continue processing the exception.
+ 此方法返回None以继续处理异常。
+ """
+ # Get the full class name of the exception
+ # 获取异常的完整类名
  ex_class = global_object_name(exception.__class__)
+
+ # Increment the total exception count
+ # 增加总异常计数
  self.stats.inc_value('downloader/exception_count', spider=spider)
+
+ # Increment the count for this specific exception type
+ # 增加此特定异常类型的计数
  self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)
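The counters above are written under the downloader/ prefix of the crawler's stats collector (spider.crawler.stats, the same object the retry code uses). A hedged sketch of inspecting them follows; the get_value() call and the closed() hook are assumed to follow Scrapy's conventions, which aioscrapy's statscollectors.py appears to port.

    # Hedged sketch: reading the counters DownloaderStats records. The stats
    # API (get_value) and the closed() hook are assumed to mirror Scrapy.
    from aioscrapy.spiders import Spider

    class StatsAwareSpider(Spider):
        name = 'stats_aware'
        start_urls = ['https://example.com/']

        async def parse(self, response):
            yield {'url': response.url}

        def closed(self, reason):
            stats = self.crawler.stats
            print('requests sent:  ', stats.get_value('downloader/request_count'))
            print('200 responses:  ', stats.get_value('downloader/response_status_count/200'))
            print('bytes received: ', stats.get_value('downloader/response_bytes'))
            print('exceptions:     ', stats.get_value('downloader/exception_count'))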
aioscrapy/libs/downloader/useragent.py

@@ -1,23 +1,114 @@
- """Set User-Agent header per spider or use a default value from settings"""
+ """
+ User-Agent Middleware
+ 用户代理中间件
+
+ This middleware sets the User-Agent header for all requests, using either a
+ spider-specific user_agent attribute, or a default value from the USER_AGENT setting.
+ 此中间件为所有请求设置User-Agent头,使用爬虫特定的user_agent属性,
+ 或来自USER_AGENT设置的默认值。
+
+ The User-Agent header is important for identifying your crawler to websites and
+ can affect how websites respond to your requests.
+ User-Agent头对于向网站标识您的爬虫很重要,可能会影响网站对您的请求的响应方式。
+ """

  from aioscrapy import signals


  class UserAgentMiddleware:
- """This middleware allows spiders to override the user_agent"""
+ """
+ Middleware for setting the User-Agent header on requests.
+ 用于在请求上设置User-Agent头的中间件。
+
+ This middleware allows spiders to override the default User-Agent by specifying
+ a user_agent attribute. If no spider-specific User-Agent is defined, it uses
+ the default value from the USER_AGENT setting.
+ 此中间件允许爬虫通过指定user_agent属性来覆盖默认的User-Agent。
+ 如果未定义爬虫特定的User-Agent,则使用USER_AGENT设置中的默认值。
+ """

  def __init__(self, user_agent='Scrapy'):
+ """
+ Initialize the UserAgentMiddleware.
+ 初始化UserAgentMiddleware。
+
+ Args:
+ user_agent: The default User-Agent string to use.
+ 要使用的默认User-Agent字符串。
+ Defaults to 'Scrapy'.
+ 默认为'Scrapy'。
+ """
+ # Store the default user agent
+ # 存储默认用户代理
  self.user_agent = user_agent

  @classmethod
  def from_crawler(cls, crawler):
+ """
+ Create a UserAgentMiddleware instance from a crawler.
+ 从爬虫创建UserAgentMiddleware实例。
+
+ This is the factory method used by AioScrapy to create the middleware.
+ 这是AioScrapy用于创建中间件的工厂方法。
+
+ Args:
+ crawler: The crawler that will use this middleware.
+ 将使用此中间件的爬虫。
+
+ Returns:
+ UserAgentMiddleware: A new UserAgentMiddleware instance.
+ 一个新的UserAgentMiddleware实例。
+ """
+ # Create a new instance with the user agent from settings
+ # 使用来自设置的用户代理创建一个新实例
  o = cls(crawler.settings['USER_AGENT'])
+
+ # Connect to the spider_opened signal
+ # 连接到spider_opened信号
  crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+
+ # Return the new instance
+ # 返回新实例
  return o

  def spider_opened(self, spider):
+ """
+ Handle the spider_opened signal.
+ 处理spider_opened信号。
+
+ This method is called when a spider is opened. It updates the user agent
+ with the spider's user_agent attribute if it exists.
+ 当爬虫打开时调用此方法。如果存在,它会使用爬虫的user_agent属性更新用户代理。
+
+ Args:
+ spider: The spider that was opened.
+ 被打开的爬虫。
+ """
+ # Update the user agent with the spider's user_agent attribute if it exists
+ # 如果存在,则使用爬虫的user_agent属性更新用户代理
  self.user_agent = getattr(spider, 'user_agent', self.user_agent)

  def process_request(self, request, spider):
+ """
+ Process a request before it is sent to the downloader.
+ 在请求发送到下载器之前处理它。
+
+ This method sets the User-Agent header in the request if it's not already set
+ and if a user agent is configured.
+ 如果尚未设置User-Agent头且已配置用户代理,此方法会在请求中设置它。
+
+ Args:
+ request: The request being processed.
+ 正在处理的请求。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+
+ Returns:
+ None: This method does not return a response or a deferred.
+ 此方法不返回响应或延迟对象。
+ """
+ # Set the User-Agent header in the request if it's not already set
+ # and if a user agent is configured
+ # 如果尚未设置User-Agent头且已配置用户代理,则在请求中设置它
  if self.user_agent:
  request.headers.setdefault('User-Agent', self.user_agent)
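As the spider_opened handler shows, a spider-level user_agent attribute takes precedence over the USER_AGENT setting, and because process_request uses setdefault(), an explicit per-request header is never overwritten. A short hedged sketch; the Request import path is assumed from the package layout and the user-agent strings and URLs are placeholders.

    # Hedged sketch: the three ways a User-Agent ends up on a request under
    # the middleware above (per-request header > spider attribute > setting).
    from aioscrapy.http import Request      # import path assumed from the package layout
    from aioscrapy.spiders import Spider

    class UAExampleSpider(Spider):
        name = 'ua_example'
        # Overrides the USER_AGENT setting for every request of this spider.
        user_agent = 'MyCrawler/1.0 (+https://example.com/bot)'
        start_urls = ['https://example.com/']

        async def parse(self, response):
            # setdefault() leaves this explicit header untouched.
            yield Request(
                'https://example.com/special',
                headers={'User-Agent': 'SpecialClient/2.0'},
                callback=self.parse_special,
            )

        async def parse_special(self, response):
            yield {'url': response.url}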