aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
```diff
--- a/aioscrapy/core/downloader/handlers/__init__.py
+++ b/aioscrapy/core/downloader/handlers/__init__.py
@@ -1,4 +1,11 @@
-"""
+"""
+Download handlers for different URL schemes.
+不同URL方案的下载处理程序。
+
+This module provides the base classes and manager for download handlers,
+which are responsible for handling different URL schemes (http, https, ftp, etc.).
+此模块提供了下载处理程序的基类和管理器,负责处理不同的URL方案(http、https、ftp等)。
+"""
 
 from abc import abstractmethod
 from typing import Optional
@@ -13,73 +20,248 @@ from aioscrapy.utils.python import without_none_values
 
 
 class BaseDownloadHandler:
+    """
+    Base class for download handlers.
+    下载处理程序的基类。
+
+    Download handlers are responsible for handling requests with specific URL schemes
+    (http, https, ftp, etc.). Each scheme has its own handler implementation.
+    下载处理程序负责处理具有特定URL方案的请求(http、https、ftp等)。每个方案都有自己的处理程序实现。
+    """
+
     @abstractmethod
-    async def download_request(self,
+    async def download_request(self, request: Request, spider: Spider):
+        """
+        Download the given request and return a response.
+        下载给定的请求并返回响应。
+
+        Args:
+            request: The request to download.
+                要下载的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+
+        Returns:
+            A response object.
+            响应对象。
+        """
         raise NotImplementedError()
 
     @abstractmethod
     async def close(self):
+        """
+        Close the handler and release its resources.
+        关闭处理程序并释放其资源。
+
+        This method is called when the spider is closed.
+        当爬虫关闭时调用此方法。
+        """
         pass
 
 
 class DownloadHandlerManager:
+    """
+    Manager for download handlers.
+    下载处理程序的管理器。
+
+    This class manages download handlers for different URL schemes.
+    It lazily loads handlers when they are first needed and keeps track
+    of which schemes are supported.
+    此类管理不同URL方案的下载处理程序。它在首次需要时懒加载处理程序,并跟踪支持哪些方案。
+    """
 
     def __init__(self, crawler):
+        """
+        Initialize the download handler manager.
+        初始化下载处理程序管理器。
+
+        Args:
+            crawler: The crawler instance that this manager belongs to.
+                此管理器所属的爬虫实例。
+        """
         self._crawler = crawler
 
-        #
+        # Load scheme handlers configuration from settings
+        # 从设置加载方案处理程序配置
+        # First try DOWNLOAD_HANDLERS_MAP[DOWNLOAD_HANDLERS_TYPE], then fall back to DOWNLOAD_HANDLERS
+        # 首先尝试DOWNLOAD_HANDLERS_MAP[DOWNLOAD_HANDLERS_TYPE],然后回退到DOWNLOAD_HANDLERS
         self._schemes: dict = without_none_values(
             crawler.settings.get('DOWNLOAD_HANDLERS_MAP', {}).get(crawler.settings.get('DOWNLOAD_HANDLERS_TYPE')) or
             crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
         )
+
+        # Dictionary of scheme -> handler instance
+        # 方案 -> 处理程序实例的字典
         self._handlers: dict = {}  # stores instanced handlers for schemes
+
+        # Dictionary of scheme -> error message for failed handlers
+        # 方案 -> 失败处理程序的错误消息的字典
         self._notconfigured: dict = {}  # remembers failed handlers
+
+        # Connect to engine_stopped signal to close handlers
+        # 连接到engine_stopped信号以关闭处理程序
         crawler.signals.connect(self._close, signals.engine_stopped)
 
     @classmethod
     def from_crawler(cls, crawler) -> "DownloadHandlerManager":
+        """
+        Create a download handler manager from a crawler.
+        从爬虫创建下载处理程序管理器。
+
+        This is a factory method that creates a new download handler manager
+        instance with the given crawler.
+        这是一个工厂方法,使用给定的爬虫创建一个新的下载处理程序管理器实例。
+
+        Args:
+            crawler: The crawler instance that will use this manager.
+                将使用此管理器的爬虫实例。
+
+        Returns:
+            DownloadHandlerManager: A new download handler manager instance.
+            一个新的下载处理程序管理器实例。
+        """
         return cls(crawler)
 
     async def _get_handler(self, scheme: str) -> Optional[BaseDownloadHandler]:
-        """Lazy-load the downloadhandler for a scheme
-        only on the first request for that scheme.
         """
+        Lazy-load the download handler for a scheme.
+        懒加载方案的下载处理程序。
+
+        This method only loads the handler on the first request for that scheme.
+        此方法仅在首次请求该方案时加载处理程序。
+
+        Args:
+            scheme: The URL scheme to get a handler for (e.g., 'http', 'https', 'ftp').
+                要获取处理程序的URL方案(例如,'http'、'https'、'ftp')。
+
+        Returns:
+            BaseDownloadHandler: The handler for the scheme, or None if no handler
+                is available or could be loaded.
+                方案的处理程序,如果没有可用或无法加载的处理程序,则为None。
+        """
+        # Return cached handler if available
+        # 如果可用,返回缓存的处理程序
         if scheme in self._handlers:
             return self._handlers[scheme]
+
+        # Return None if we already know this scheme is not configured
+        # 如果我们已经知道此方案未配置,则返回None
         if scheme in self._notconfigured:
             return None
+
+        # Return None if no handler is defined for this scheme
+        # 如果没有为此方案定义处理程序,则返回None
         if scheme not in self._schemes:
             self._notconfigured[scheme] = 'no handler available for that scheme'
             return None
 
+        # Load the handler for this scheme
+        # 加载此方案的处理程序
         return await self._load_handler(scheme)
 
     async def _load_handler(self, scheme: str) -> Optional[BaseDownloadHandler]:
+        """
+        Load a download handler for a scheme.
+        加载方案的下载处理程序。
+
+        This method attempts to load the handler class specified in the settings
+        for the given scheme.
+        此方法尝试加载设置中为给定方案指定的处理程序类。
+
+        Args:
+            scheme: The URL scheme to load a handler for.
+                要加载处理程序的URL方案。
+
+        Returns:
+            BaseDownloadHandler: The loaded handler, or None if the handler
+                could not be loaded.
+                加载的处理程序,如果无法加载处理程序,则为None。
+        """
+        # Get the handler class path from settings
+        # 从设置获取处理程序类路径
        path: str = self._schemes[scheme]
+
         try:
+            # Load the handler class
+            # 加载处理程序类
             dh: BaseDownloadHandler = await load_instance(
                 path,
                 settings=self._crawler.settings,
             )
         except NotConfigured as ex:
+            # Handler explicitly raised NotConfigured
+            # 处理程序明确引发NotConfigured
             self._notconfigured[scheme] = str(ex)
             return None
         except Exception as ex:
+            # Any other exception during loading
+            # 加载期间的任何其他异常
             logger.exception(f'Loading "{path}" for scheme "{scheme}"')
             self._notconfigured[scheme] = str(ex)
             return None
         else:
+            # Successfully loaded the handler
+            # 成功加载处理程序
             self._handlers[scheme] = dh
             return dh
 
     async def download_request(self, request: Request, spider: Spider) -> HtmlResponse:
+        """
+        Download a request using the appropriate handler for its URL scheme.
+        使用适合其URL方案的处理程序下载请求。
+
+        This method determines the URL scheme of the request, gets the appropriate
+        handler, and delegates the download to that handler.
+        此方法确定请求的URL方案,获取适当的处理程序,并将下载委托给该处理程序。
+
+        Args:
+            request: The request to download.
+                要下载的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+
+        Returns:
+            HtmlResponse: The response from the handler.
+            来自处理程序的响应。
+
+        Raises:
+            NotSupported: If no handler is available for the request's URL scheme.
+                如果请求的URL方案没有可用的处理程序。
+        """
+        # Extract the scheme from the URL (http, https, ftp, etc.)
+        # 从URL提取方案(http、https、ftp等)
         scheme = urlparse_cached(request).scheme
+
+        # Get the handler for this scheme
+        # 获取此方案的处理程序
         handler: BaseDownloadHandler = await self._get_handler(scheme)
+
+        # Raise an exception if no handler is available
+        # 如果没有可用的处理程序,则引发异常
         if not handler:
             raise NotSupported("Unsupported URL scheme '%s': %s" %
                                (scheme, self._notconfigured[scheme]))
+
+        # Delegate the download to the handler
+        # 将下载委托给处理程序
         return await handler.download_request(request, spider)
 
     async def _close(self, *_a, **_kw) -> None:
+        """
+        Close all download handlers.
+        关闭所有下载处理程序。
+
+        This method is called when the engine is stopped. It closes all
+        download handlers that have been loaded.
+        当引擎停止时调用此方法。它关闭所有已加载的下载处理程序。
+
+        Args:
+            *_a: Variable positional arguments from the signal (not used).
+                来自信号的可变位置参数(未使用)。
+            **_kw: Variable keyword arguments from the signal (not used).
+                来自信号的可变关键字参数(未使用)。
+        """
+        # Close each handler
+        # 关闭每个处理程序
         for dh in self._handlers.values():
             await dh.close()
```
```diff
--- a/aioscrapy/core/downloader/handlers/aiohttp.py
+++ b/aioscrapy/core/downloader/handlers/aiohttp.py
@@ -1,3 +1,13 @@
+"""
+
+Download handler implementation using aiohttp.
+使用aiohttp的下载处理程序实现。
+
+This module provides a download handler that uses aiohttp to perform HTTP/HTTPS requests.
+It supports features like browser impersonation, proxies, and cookies.
+此模块提供了一个使用aiohttp执行HTTP/HTTPS请求的下载处理程序。
+它支持浏览器模拟、代理和Cookie等功能。
+"""
 import asyncio
 import re
 import ssl
@@ -15,31 +25,168 @@ from aioscrapy.utils.log import logger
 
 
 class AioHttpDownloadHandler(BaseDownloadHandler):
-
+    """
+    Download handler that uses aiohttp to download HTTP/HTTPS requests.
+    使用aiohttp下载HTTP/HTTPS请求的下载处理程序。
+
+    This handler implements the BaseDownloadHandler interface using the aiohttp
+    library to perform HTTP/HTTPS requests.
+    此处理程序使用aiohttp库执行HTTP/HTTPS请求,实现了BaseDownloadHandler接口。
+    """
+
+    session: Optional[aiohttp.ClientSession] = None  # Shared session when USE_SESSION is True
+    # 当USE_SESSION为True时的共享会话
 
     def __init__(self, settings: Settings):
+        """
+        Initialize the AioHttpDownloadHandler.
+        初始化AioHttpDownloadHandler。
+
+        Args:
+            settings: The settings object containing configuration for the handler.
+                包含处理程序配置的设置对象。
+        """
         self.settings = settings
-
+
+        # Arguments to pass to aiohttp.ClientSession constructor
+        # 传递给aiohttp.ClientSession构造函数的参数
+        self.aiohttp_args: dict = settings.getdict('AIOHTTP_ARGS')
+
+        # SSL verification setting
+        # SSL验证设置
         self.verify_ssl: Optional[bool] = settings.get("VERIFY_SSL")
+
+        # SSL protocol version (e.g., ssl.PROTOCOL_TLSv1_2)
+        # SSL协议版本(例如,ssl.PROTOCOL_TLSv1_2)
         self.ssl_protocol = settings.get("SSL_PROTOCOL")  # ssl.PROTOCOL_TLSv1_2
+
+        # Whether to use a persistent session for all requests
+        # 是否对所有请求使用持久会话
         self.use_session: bool = settings.getbool("USE_SESSION", False)
 
     @classmethod
     def from_settings(cls, settings: Settings):
+        """
+        Create a download handler from settings.
+        从设置创建下载处理程序。
+
+        This is a factory method that creates a new AioHttpDownloadHandler
+        instance with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的AioHttpDownloadHandler实例。
+
+        Args:
+            settings: The settings to use for the handler.
+                用于处理程序的设置。
+
+        Returns:
+            AioHttpDownloadHandler: A new download handler instance.
+            一个新的下载处理程序实例。
+        """
         return cls(settings)
 
     def get_session(self, *args, **kwargs) -> aiohttp.ClientSession:
+        """
+        Get or create a shared aiohttp ClientSession.
+        获取或创建共享的aiohttp ClientSession。
+
+        This method returns the existing session if one exists, or creates
+        a new one if none exists yet. This is used when USE_SESSION is True
+        to reuse the same session for multiple requests.
+        如果会话已存在,此方法返回现有会话;如果尚不存在,则创建一个新会话。
+        当USE_SESSION为True时使用此方法,为多个请求重用相同的会话。
+
+        Args:
+            *args: Positional arguments to pass to aiohttp.ClientSession constructor.
+                传递给aiohttp.ClientSession构造函数的位置参数。
+            **kwargs: Keyword arguments to pass to aiohttp.ClientSession constructor.
+                传递给aiohttp.ClientSession构造函数的关键字参数。
+
+        Returns:
+            aiohttp.ClientSession: The shared client session.
+            共享的客户端会话。
+        """
         if self.session is None:
             self.session = aiohttp.ClientSession(*args, **kwargs)
         return self.session
 
-    async def download_request(self, request: Request,
+    async def download_request(self, request: Request, spider) -> HtmlResponse:
+        """
+        Download a request using aiohttp.
+        使用aiohttp下载请求。
+
+        This method implements the BaseDownloadHandler.download_request interface.
+        It wraps the actual download logic in _download_request and handles
+        aiohttp-specific exceptions.
+        此方法实现了BaseDownloadHandler.download_request接口。
+        它将实际的下载逻辑包装在_download_request中,并处理aiohttp特定的异常。
+
+        Args:
+            request: The request to download.
+                要下载的请求。
+            spider: The spider making the request. This parameter is required by the
+                BaseDownloadHandler interface but is not used in this implementation.
+                发出请求的爬虫。此参数是BaseDownloadHandler接口所需的,但在此实现中未使用。
+                It is included to maintain compatibility with the interface and to allow
+                subclasses to use it if needed.
+                包含它是为了保持与接口的兼容性,并允许子类在需要时使用它。
+
+        Returns:
+            HtmlResponse: The response from the server.
+            来自服务器的响应。
+
+        Raises:
+            DownloadError: If an aiohttp ClientError occurs during the download.
+                如果在下载过程中发生aiohttp ClientError。
+        """
         try:
+            # The spider parameter is intentionally unused in this implementation
+            # 在此实现中有意不使用spider参数
             return await self._download_request(request)
         except ClientError as e:
+            # Wrap aiohttp-specific exceptions in a generic DownloadError
+            # 将aiohttp特定的异常包装在通用的DownloadError中
             raise DownloadError(real_error=e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
+        """
+        Perform the actual download of a request using aiohttp.
+        使用aiohttp执行请求的实际下载。
+
+        This method handles the details of configuring and performing the HTTP request,
+        including SSL settings, proxies, cookies, and session management. It supports
+        various request options through request.meta:
+        此方法处理配置和执行HTTP请求的详细信息,包括SSL设置、代理、Cookie和会话管理。
+        它通过request.meta支持各种请求选项:
+
+        - verify_ssl: Whether to verify SSL certificates
+          是否验证SSL证书
+        - download_timeout: Timeout for the request in seconds
+          请求超时时间(秒)
+        - dont_redirect: Whether to disable following redirects
+          是否禁用跟随重定向
+        - TLS_CIPHERS: Custom SSL cipher suite to use
+          要使用的自定义SSL密码套件
+        - ssl_protocol: SSL protocol version to use
+          要使用的SSL协议版本
+        - proxy: Proxy URL to use for the request
+          用于请求的代理URL
+
+        Args:
+            request: The request to download.
+                要下载的请求。
+                This includes the URL, method, headers, body, cookies, and
+                meta information for configuring the request.
+                这包括URL、方法、标头、正文、Cookie和用于配置请求的元信息。
+
+        Returns:
+            HtmlResponse: The response from the server.
+            来自服务器的响应。
+                This includes the status code, headers, body, cookies,
+                and encoding of the response.
+                这包括响应的状态码、标头、正文、Cookie和编码。
+        """
+        # Prepare request parameters
+        # 准备请求参数
         kwargs = {
             'verify_ssl': request.meta.get('verify_ssl', self.verify_ssl),
             'timeout': request.meta.get('download_timeout', 180),
@@ -50,9 +197,13 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
             'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20),
         }
 
+        # Set headers from request or default settings
+        # 从请求或默认设置设置标头
         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
         kwargs['headers'] = headers
 
+        # Configure SSL context if needed
+        # 如果需要,配置SSL上下文
         ssl_ciphers: str = request.meta.get('TLS_CIPHERS')
         ssl_protocol = request.meta.get('ssl_protocol', self.ssl_protocol)
         if ssl_ciphers or ssl_protocol:
@@ -65,28 +216,38 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
             kwargs['ssl'] = context
             kwargs['verify_ssl'] = True
 
+        # Configure proxy if specified
+        # 如果指定,配置代理
         proxy: str = request.meta.get("proxy")
         if proxy:
             kwargs["proxy"] = proxy
             logger.debug(f"使用代理{proxy}抓取: {request.url}")
 
+        # Perform the request using either a persistent session or a new session
+        # 使用持久会话或新会话执行请求
         if self.use_session:
             # Not recommended to use session, The abnormal phenomena will occurs when using tunnel proxy
-
+            # 不建议使用会话,使用隧道代理时会出现异常现象
+            session = self.get_session(**self.aiohttp_args)
             async with session.request(request.method, request.url, **kwargs) as response:
                 content: bytes = await response.read()
-
         else:
-
+            # Create a new session for each request (recommended)
+            # 为每个请求创建一个新会话(推荐)
+            async with aiohttp.ClientSession(**self.aiohttp_args) as session:
                 async with session.request(request.method, request.url, **kwargs) as response:
                     content: bytes = await response.read()
 
+        # Process cookies from response
+        # 处理响应中的Cookie
         r_cookies = response.cookies.output() or None
         if r_cookies:
             r_cookies = {
                 cookie[0]: cookie[1] for cookie in re.findall(r'Set-Cookie: (.*?)=(.*?); Domain', r_cookies, re.S)
             }
 
+        # Create and return the response object
+        # 创建并返回响应对象
         return HtmlResponse(
             str(response.url),
             status=response.status,
```
```diff
@@ -97,9 +258,32 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
         )
 
     async def close(self):
+        """
+        Close the download handler and release its resources.
+        关闭下载处理程序并释放其资源。
+
+        This method closes the shared session if one exists and waits for
+        the underlying SSL connections to close properly. It follows the
+        recommended graceful shutdown procedure for aiohttp sessions.
+        此方法关闭共享会话(如果存在),并等待底层SSL连接正确关闭。
+        它遵循aiohttp会话的推荐优雅关闭程序。
+
+        The 250ms sleep after closing the session is recommended by the aiohttp
+        documentation to allow the underlying SSL connections to be properly closed.
+        Without this delay, SSL connections might be terminated abruptly, which
+        can cause issues with some servers.
+        关闭会话后的250毫秒睡眠是aiohttp文档推荐的,以允许底层SSL连接正确关闭。
+        没有这个延迟,SSL连接可能会突然终止,这可能会导致某些服务器出现问题。
+
+        See: https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown
+        参见:https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown
+        """
         if self.session is not None:
+            # Close the shared session
+            # 关闭共享会话
             await self.session.close()
 
             # Wait 250 ms for the underlying SSL connections to close
+            # 等待250毫秒让底层SSL连接关闭
             # https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown
             await asyncio.sleep(0.250)
```