aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/core/downloader/handlers/__init__.py
@@ -1,4 +1,11 @@
-"""Download handlers for different schemes"""
+"""
+Download handlers for different URL schemes.
+不同URL方案的下载处理程序。
+
+This module provides the base classes and manager for download handlers,
+which are responsible for handling different URL schemes (http, https, ftp, etc.).
+此模块提供了下载处理程序的基类和管理器,负责处理不同的URL方案(http、https、ftp等)。
+"""
 
 from abc import abstractmethod
 from typing import Optional
@@ -13,73 +20,248 @@ from aioscrapy.utils.python import without_none_values
 
 
 class BaseDownloadHandler:
+    """
+    Base class for download handlers.
+    下载处理程序的基类。
+
+    Download handlers are responsible for handling requests with specific URL schemes
+    (http, https, ftp, etc.). Each scheme has its own handler implementation.
+    下载处理程序负责处理具有特定URL方案的请求(http、https、ftp等)。每个方案都有自己的处理程序实现。
+    """
+
     @abstractmethod
-    async def download_request(self, requests: Request, spider: Spider):
+    async def download_request(self, request: Request, spider: Spider):
+        """
+        Download the given request and return a response.
+        下载给定的请求并返回响应。
+
+        Args:
+            request: The request to download.
+                要下载的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+
+        Returns:
+            A response object.
+            响应对象。
+        """
         raise NotImplementedError()
 
     @abstractmethod
     async def close(self):
+        """
+        Close the handler and release its resources.
+        关闭处理程序并释放其资源。
+
+        This method is called when the spider is closed.
+        当爬虫关闭时调用此方法。
+        """
         pass
 
 
 class DownloadHandlerManager:
+    """
+    Manager for download handlers.
+    下载处理程序的管理器。
+
+    This class manages download handlers for different URL schemes.
+    It lazily loads handlers when they are first needed and keeps track
+    of which schemes are supported.
+    此类管理不同URL方案的下载处理程序。它在首次需要时懒加载处理程序,并跟踪支持哪些方案。
+    """
 
     def __init__(self, crawler):
+        """
+        Initialize the download handler manager.
+        初始化下载处理程序管理器。
+
+        Args:
+            crawler: The crawler instance that this manager belongs to.
+                此管理器所属的爬虫实例。
+        """
         self._crawler = crawler
 
-        # stores acceptable schemes on instancing
+        # Load scheme handlers configuration from settings
+        # 从设置加载方案处理程序配置
+        # First try DOWNLOAD_HANDLERS_MAP[DOWNLOAD_HANDLERS_TYPE], then fall back to DOWNLOAD_HANDLERS
+        # 首先尝试DOWNLOAD_HANDLERS_MAP[DOWNLOAD_HANDLERS_TYPE],然后回退到DOWNLOAD_HANDLERS
         self._schemes: dict = without_none_values(
             crawler.settings.get('DOWNLOAD_HANDLERS_MAP', {}).get(crawler.settings.get('DOWNLOAD_HANDLERS_TYPE')) or
             crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
         )
+
+        # Dictionary of scheme -> handler instance
+        # 方案 -> 处理程序实例的字典
         self._handlers: dict = {}  # stores instanced handlers for schemes
+
+        # Dictionary of scheme -> error message for failed handlers
+        # 方案 -> 失败处理程序的错误消息的字典
         self._notconfigured: dict = {}  # remembers failed handlers
+
+        # Connect to engine_stopped signal to close handlers
+        # 连接到engine_stopped信号以关闭处理程序
         crawler.signals.connect(self._close, signals.engine_stopped)
 
     @classmethod
     def from_crawler(cls, crawler) -> "DownloadHandlerManager":
+        """
+        Create a download handler manager from a crawler.
+        从爬虫创建下载处理程序管理器。
+
+        This is a factory method that creates a new download handler manager
+        instance with the given crawler.
+        这是一个工厂方法,使用给定的爬虫创建一个新的下载处理程序管理器实例。
+
+        Args:
+            crawler: The crawler instance that will use this manager.
+                将使用此管理器的爬虫实例。
+
+        Returns:
+            DownloadHandlerManager: A new download handler manager instance.
+            一个新的下载处理程序管理器实例。
+        """
         return cls(crawler)
 
     async def _get_handler(self, scheme: str) -> Optional[BaseDownloadHandler]:
-        """Lazy-load the downloadhandler for a scheme
-        only on the first request for that scheme.
         """
+        Lazy-load the download handler for a scheme.
+        懒加载方案的下载处理程序。
+
+        This method only loads the handler on the first request for that scheme.
+        此方法仅在首次请求该方案时加载处理程序。
+
+        Args:
+            scheme: The URL scheme to get a handler for (e.g., 'http', 'https', 'ftp').
+                要获取处理程序的URL方案(例如,'http'、'https'、'ftp')。
+
+        Returns:
+            BaseDownloadHandler: The handler for the scheme, or None if no handler
+                is available or could be loaded.
+                方案的处理程序,如果没有可用或无法加载的处理程序,则为None。
+        """
+        # Return cached handler if available
+        # 如果可用,返回缓存的处理程序
         if scheme in self._handlers:
             return self._handlers[scheme]
+
+        # Return None if we already know this scheme is not configured
+        # 如果我们已经知道此方案未配置,则返回None
         if scheme in self._notconfigured:
             return None
+
+        # Return None if no handler is defined for this scheme
+        # 如果没有为此方案定义处理程序,则返回None
         if scheme not in self._schemes:
             self._notconfigured[scheme] = 'no handler available for that scheme'
             return None
 
+        # Load the handler for this scheme
+        # 加载此方案的处理程序
         return await self._load_handler(scheme)
 
     async def _load_handler(self, scheme: str) -> Optional[BaseDownloadHandler]:
+        """
+        Load a download handler for a scheme.
+        加载方案的下载处理程序。
+
+        This method attempts to load the handler class specified in the settings
+        for the given scheme.
+        此方法尝试加载设置中为给定方案指定的处理程序类。
+
+        Args:
+            scheme: The URL scheme to load a handler for.
+                要加载处理程序的URL方案。
+
+        Returns:
+            BaseDownloadHandler: The loaded handler, or None if the handler
+                could not be loaded.
+                加载的处理程序,如果无法加载处理程序,则为None。
+        """
+        # Get the handler class path from settings
+        # 从设置获取处理程序类路径
         path: str = self._schemes[scheme]
+
         try:
+            # Load the handler class
+            # 加载处理程序类
             dh: BaseDownloadHandler = await load_instance(
                 path,
                 settings=self._crawler.settings,
             )
         except NotConfigured as ex:
+            # Handler explicitly raised NotConfigured
+            # 处理程序明确引发NotConfigured
             self._notconfigured[scheme] = str(ex)
             return None
         except Exception as ex:
+            # Any other exception during loading
+            # 加载期间的任何其他异常
             logger.exception(f'Loading "{path}" for scheme "{scheme}"')
             self._notconfigured[scheme] = str(ex)
             return None
         else:
+            # Successfully loaded the handler
+            # 成功加载处理程序
             self._handlers[scheme] = dh
             return dh
 
     async def download_request(self, request: Request, spider: Spider) -> HtmlResponse:
+        """
+        Download a request using the appropriate handler for its URL scheme.
+        使用适合其URL方案的处理程序下载请求。
+
+        This method determines the URL scheme of the request, gets the appropriate
+        handler, and delegates the download to that handler.
+        此方法确定请求的URL方案,获取适当的处理程序,并将下载委托给该处理程序。
+
+        Args:
+            request: The request to download.
+                要下载的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+
+        Returns:
+            HtmlResponse: The response from the handler.
+            来自处理程序的响应。
+
+        Raises:
+            NotSupported: If no handler is available for the request's URL scheme.
+                如果请求的URL方案没有可用的处理程序。
+        """
+        # Extract the scheme from the URL (http, https, ftp, etc.)
+        # 从URL提取方案(http、https、ftp等)
         scheme = urlparse_cached(request).scheme
+
+        # Get the handler for this scheme
+        # 获取此方案的处理程序
         handler: BaseDownloadHandler = await self._get_handler(scheme)
+
+        # Raise an exception if no handler is available
+        # 如果没有可用的处理程序,则引发异常
         if not handler:
             raise NotSupported("Unsupported URL scheme '%s': %s" %
                                (scheme, self._notconfigured[scheme]))
+
+        # Delegate the download to the handler
+        # 将下载委托给处理程序
         return await handler.download_request(request, spider)
 
     async def _close(self, *_a, **_kw) -> None:
+        """
+        Close all download handlers.
+        关闭所有下载处理程序。
+
+        This method is called when the engine is stopped. It closes all
+        download handlers that have been loaded.
+        当引擎停止时调用此方法。它关闭所有已加载的下载处理程序。
+
+        Args:
+            *_a: Variable positional arguments from the signal (not used).
+                来自信号的可变位置参数(未使用)。
+            **_kw: Variable keyword arguments from the signal (not used).
+                来自信号的可变关键字参数(未使用)。
+        """
+        # Close each handler
+        # 关闭每个处理程序
         for dh in self._handlers.values():
             await dh.close()
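Taken together, the new docstrings document the handler contract: a handler implements `download_request(request, spider)` and `close()`, and is attached to a URL scheme through the DOWNLOAD_HANDLERS setting (or a DOWNLOAD_HANDLERS_MAP entry selected by DOWNLOAD_HANDLERS_TYPE). A minimal sketch of a custom handler, assuming the import paths shown in this diff and a Scrapy-style `HtmlResponse(url, status=..., body=...)` constructor (the `body` keyword is an assumption, not confirmed by this diff):

    from aioscrapy.core.downloader.handlers import BaseDownloadHandler
    from aioscrapy.http import HtmlResponse  # assumed export, per files 33/38-42 above


    class EchoDownloadHandler(BaseDownloadHandler):
        """Toy handler that answers every request with a canned body."""

        @classmethod
        def from_settings(cls, settings):
            # Mirrors AioHttpDownloadHandler.from_settings in the next file.
            return cls()

        async def download_request(self, request, spider):
            # A real handler would perform network I/O here.
            return HtmlResponse(request.url, status=200, body=b"<html>echo</html>")

        async def close(self):
            # Invoked via the engine_stopped signal when the crawl shuts down.
            pass


    # settings.py (hypothetical project module path):
    # DOWNLOAD_HANDLERS = {"http": "myproject.handlers.EchoDownloadHandler"}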
aioscrapy/core/downloader/handlers/aiohttp.py
@@ -1,3 +1,13 @@
+"""
+
+Download handler implementation using aiohttp.
+使用aiohttp的下载处理程序实现。
+
+This module provides a download handler that uses aiohttp to perform HTTP/HTTPS requests.
+It supports features like browser impersonation, proxies, and cookies.
+此模块提供了一个使用aiohttp执行HTTP/HTTPS请求的下载处理程序。
+它支持浏览器模拟、代理和Cookie等功能。
+"""
 import asyncio
 import re
 import ssl
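For orientation, wiring this handler up only takes the settings keys read by DownloadHandlerManager above; the class path below comes from this diff's file layout:

    # settings.py -- point http/https at this handler explicitly:
    DOWNLOAD_HANDLERS = {
        "http": "aioscrapy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler",
        "https": "aioscrapy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler",
    }
    # Alternatively, the manager first consults
    # DOWNLOAD_HANDLERS_MAP[DOWNLOAD_HANDLERS_TYPE], so a one-line
    # DOWNLOAD_HANDLERS_TYPE = "aiohttp" would work if the default map defines
    # that key (the map's contents are not shown in this diff -- an assumption).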
@@ -15,31 +25,168 @@ from aioscrapy.utils.log import logger
 
 
 class AioHttpDownloadHandler(BaseDownloadHandler):
-    session: Optional[aiohttp.ClientSession] = None
+    """
+    Download handler that uses aiohttp to download HTTP/HTTPS requests.
+    使用aiohttp下载HTTP/HTTPS请求的下载处理程序。
+
+    This handler implements the BaseDownloadHandler interface using the aiohttp
+    library to perform HTTP/HTTPS requests.
+    此处理程序使用aiohttp库执行HTTP/HTTPS请求,实现了BaseDownloadHandler接口。
+    """
+
+    session: Optional[aiohttp.ClientSession] = None  # Shared session when USE_SESSION is True
+    # 当USE_SESSION为True时的共享会话
 
     def __init__(self, settings: Settings):
+        """
+        Initialize the AioHttpDownloadHandler.
+        初始化AioHttpDownloadHandler。
+
+        Args:
+            settings: The settings object containing configuration for the handler.
+                包含处理程序配置的设置对象。
+        """
         self.settings = settings
-        self.aiohttp_client_session_args: dict = settings.getdict('AIOHTTP_CLIENT_SESSION_ARGS')
+
+        # Arguments to pass to aiohttp.ClientSession constructor
+        # 传递给aiohttp.ClientSession构造函数的参数
+        self.aiohttp_args: dict = settings.getdict('AIOHTTP_ARGS')
+
+        # SSL verification setting
+        # SSL验证设置
         self.verify_ssl: Optional[bool] = settings.get("VERIFY_SSL")
+
+        # SSL protocol version (e.g., ssl.PROTOCOL_TLSv1_2)
+        # SSL协议版本(例如,ssl.PROTOCOL_TLSv1_2)
         self.ssl_protocol = settings.get("SSL_PROTOCOL")  # ssl.PROTOCOL_TLSv1_2
+
+        # Whether to use a persistent session for all requests
+        # 是否对所有请求使用持久会话
         self.use_session: bool = settings.getbool("USE_SESSION", False)
 
     @classmethod
     def from_settings(cls, settings: Settings):
+        """
+        Create a download handler from settings.
+        从设置创建下载处理程序。
+
+        This is a factory method that creates a new AioHttpDownloadHandler
+        instance with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的AioHttpDownloadHandler实例。
+
+        Args:
+            settings: The settings to use for the handler.
+                用于处理程序的设置。
+
+        Returns:
+            AioHttpDownloadHandler: A new download handler instance.
+            一个新的下载处理程序实例。
+        """
         return cls(settings)
 
     def get_session(self, *args, **kwargs) -> aiohttp.ClientSession:
+        """
+        Get or create a shared aiohttp ClientSession.
+        获取或创建共享的aiohttp ClientSession。
+
+        This method returns the existing session if one exists, or creates
+        a new one if none exists yet. This is used when USE_SESSION is True
+        to reuse the same session for multiple requests.
+        如果会话已存在,此方法返回现有会话;如果尚不存在,则创建一个新会话。
+        当USE_SESSION为True时使用此方法,为多个请求重用相同的会话。
+
+        Args:
+            *args: Positional arguments to pass to aiohttp.ClientSession constructor.
+                传递给aiohttp.ClientSession构造函数的位置参数。
+            **kwargs: Keyword arguments to pass to aiohttp.ClientSession constructor.
+                传递给aiohttp.ClientSession构造函数的关键字参数。
+
+        Returns:
+            aiohttp.ClientSession: The shared client session.
+            共享的客户端会话。
+        """
         if self.session is None:
             self.session = aiohttp.ClientSession(*args, **kwargs)
         return self.session
 
-    async def download_request(self, request: Request, _) -> HtmlResponse:
+    async def download_request(self, request: Request, spider) -> HtmlResponse:
+        """
+        Download a request using aiohttp.
+        使用aiohttp下载请求。
+
+        This method implements the BaseDownloadHandler.download_request interface.
+        It wraps the actual download logic in _download_request and handles
+        aiohttp-specific exceptions.
+        此方法实现了BaseDownloadHandler.download_request接口。
+        它将实际的下载逻辑包装在_download_request中,并处理aiohttp特定的异常。
+
+        Args:
+            request: The request to download.
+                要下载的请求。
+            spider: The spider making the request. This parameter is required by the
+                BaseDownloadHandler interface but is not used in this implementation.
+                发出请求的爬虫。此参数是BaseDownloadHandler接口所需的,但在此实现中未使用。
+                It is included to maintain compatibility with the interface and to allow
+                subclasses to use it if needed.
+                包含它是为了保持与接口的兼容性,并允许子类在需要时使用它。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                来自服务器的响应。
+
+        Raises:
+            DownloadError: If an aiohttp ClientError occurs during the download.
+                如果在下载过程中发生aiohttp ClientError。
+        """
         try:
+            # The spider parameter is intentionally unused in this implementation
+            # 在此实现中有意不使用spider参数
             return await self._download_request(request)
         except ClientError as e:
+            # Wrap aiohttp-specific exceptions in a generic DownloadError
+            # 将aiohttp特定的异常包装在通用的DownloadError中
             raise DownloadError(real_error=e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
+        """
+        Perform the actual download of a request using aiohttp.
+        使用aiohttp执行请求的实际下载。
+
+        This method handles the details of configuring and performing the HTTP request,
+        including SSL settings, proxies, cookies, and session management. It supports
+        various request options through request.meta:
+        此方法处理配置和执行HTTP请求的详细信息,包括SSL设置、代理、Cookie和会话管理。
+        它通过request.meta支持各种请求选项:
+
+        - verify_ssl: Whether to verify SSL certificates
+          是否验证SSL证书
+        - download_timeout: Timeout for the request in seconds
+          请求超时时间(秒)
+        - dont_redirect: Whether to disable following redirects
+          是否禁用跟随重定向
+        - TLS_CIPHERS: Custom SSL cipher suite to use
+          要使用的自定义SSL密码套件
+        - ssl_protocol: SSL protocol version to use
+          要使用的SSL协议版本
+        - proxy: Proxy URL to use for the request
+          用于请求的代理URL
+
+        Args:
+            request: The request to download.
+                要下载的请求。
+                This includes the URL, method, headers, body, cookies, and
+                meta information for configuring the request.
+                这包括URL、方法、标头、正文、Cookie和用于配置请求的元信息。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                来自服务器的响应。
+                This includes the status code, headers, body, cookies,
+                and encoding of the response.
+                这包括响应的状态码、标头、正文、Cookie和编码。
+        """
+        # Prepare request parameters
+        # 准备请求参数
         kwargs = {
             'verify_ssl': request.meta.get('verify_ssl', self.verify_ssl),
             'timeout': request.meta.get('download_timeout', 180),
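One behavioral change hides in this hunk beside the documentation: the session-arguments setting is renamed from AIOHTTP_CLIENT_SESSION_ARGS to AIOHTTP_ARGS. Projects upgrading from 2.1.4 need to rename the key; its value is still passed straight through to `aiohttp.ClientSession` (`trust_env` below is merely an illustrative, valid ClientSession keyword):

    # 2.1.4
    AIOHTTP_CLIENT_SESSION_ARGS = {"trust_env": True}

    # 2.1.7
    AIOHTTP_ARGS = {"trust_env": True}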
@@ -50,9 +197,13 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
             'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20),
         }
 
+        # Set headers from request or default settings
+        # 从请求或默认设置设置标头
         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
         kwargs['headers'] = headers
 
+        # Configure SSL context if needed
+        # 如果需要,配置SSL上下文
         ssl_ciphers: str = request.meta.get('TLS_CIPHERS')
         ssl_protocol = request.meta.get('ssl_protocol', self.ssl_protocol)
         if ssl_ciphers or ssl_protocol:
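The lines that actually build `context` fall between this hunk and the next, so they are not part of this diff. For orientation only, a plausible stdlib-`ssl` construction consistent with the surrounding code (hypothetical, not the package's actual lines):

    import ssl

    def make_ssl_context(ssl_protocol=None, ssl_ciphers=None):
        # Hypothetical sketch of the elided step: choose the protocol,
        # then optionally narrow the cipher suite.
        context = ssl.SSLContext(ssl_protocol or ssl.PROTOCOL_TLS_CLIENT)
        if ssl_ciphers:
            context.set_ciphers(ssl_ciphers)  # e.g. "ECDHE+AESGCM"
        return context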
@@ -65,28 +216,38 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
             kwargs['ssl'] = context
             kwargs['verify_ssl'] = True
 
+        # Configure proxy if specified
+        # 如果指定,配置代理
         proxy: str = request.meta.get("proxy")
         if proxy:
             kwargs["proxy"] = proxy
             logger.debug(f"使用代理{proxy}抓取: {request.url}")
 
+        # Perform the request using either a persistent session or a new session
+        # 使用持久会话或新会话执行请求
         if self.use_session:
             # Not recommended to use session, The abnormal phenomena will occurs when using tunnel proxy
-            session = self.get_session(**self.aiohttp_client_session_args)
+            # 不建议使用会话,使用隧道代理时会出现异常现象
+            session = self.get_session(**self.aiohttp_args)
             async with session.request(request.method, request.url, **kwargs) as response:
                 content: bytes = await response.read()
-
         else:
-            async with aiohttp.ClientSession(**self.aiohttp_client_session_args) as session:
+            # Create a new session for each request (recommended)
+            # 为每个请求创建一个新会话(推荐)
+            async with aiohttp.ClientSession(**self.aiohttp_args) as session:
                 async with session.request(request.method, request.url, **kwargs) as response:
                     content: bytes = await response.read()
 
+        # Process cookies from response
+        # 处理响应中的Cookie
         r_cookies = response.cookies.output() or None
         if r_cookies:
             r_cookies = {
                 cookie[0]: cookie[1] for cookie in re.findall(r'Set-Cookie: (.*?)=(.*?); Domain', r_cookies, re.S)
             }
 
+        # Create and return the response object
+        # 创建并返回响应对象
         return HtmlResponse(
             str(response.url),
             status=response.status,
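The cookie handling above serializes `response.cookies` (an `http.cookies.SimpleCookie`) to its `Set-Cookie:` string form and regex-parses the name/value pairs back out; note that the pattern only captures cookies carrying a `Domain` attribute. A self-contained demonstration of the same pattern:

    import re
    from http.cookies import SimpleCookie

    jar = SimpleCookie()
    jar["sid"] = "abc123"
    jar["sid"]["domain"] = "example.com"

    raw = jar.output()  # 'Set-Cookie: sid=abc123; Domain=example.com'
    pairs = dict(re.findall(r'Set-Cookie: (.*?)=(.*?); Domain', raw, re.S))
    print(pairs)  # {'sid': 'abc123'}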
@@ -97,9 +258,32 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
         )
 
     async def close(self):
+        """
+        Close the download handler and release its resources.
+        关闭下载处理程序并释放其资源。
+
+        This method closes the shared session if one exists and waits for
+        the underlying SSL connections to close properly. It follows the
+        recommended graceful shutdown procedure for aiohttp sessions.
+        此方法关闭共享会话(如果存在),并等待底层SSL连接正确关闭。
+        它遵循aiohttp会话的推荐优雅关闭程序。
+
+        The 250ms sleep after closing the session is recommended by the aiohttp
+        documentation to allow the underlying SSL connections to be properly closed.
+        Without this delay, SSL connections might be terminated abruptly, which
+        can cause issues with some servers.
+        关闭会话后的250毫秒睡眠是aiohttp文档推荐的,以允许底层SSL连接正确关闭。
+        没有这个延迟,SSL连接可能会突然终止,这可能会导致某些服务器出现问题。
+
+        See: https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown
+        参见:https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown
+        """
         if self.session is not None:
+            # Close the shared session
+            # 关闭共享会话
             await self.session.close()
 
             # Wait 250 ms for the underlying SSL connections to close
+            # 等待250毫秒让底层SSL连接关闭
             # https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown
             await asyncio.sleep(0.250)
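This close sequence is the graceful-shutdown pattern from aiohttp's own documentation, and it works the same way outside the handler:

    import asyncio

    import aiohttp

    async def main():
        session = aiohttp.ClientSession()
        try:
            async with session.get("https://example.com") as resp:
                await resp.read()
        finally:
            await session.close()
            # Give the underlying SSL transports time to close cleanly:
            # https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown
            await asyncio.sleep(0.250)

    asyncio.run(main())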