aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
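Most of the churn in this release is documentation: the 2.1.7 wheel adds bilingual (English/Chinese) docstrings and inline comments throughout the codebase. The three download-handler diffs reproduced below also share one behavioral pattern: each backend-specific exception (CurlError, HttpxError, PyHttpxError) is wrapped in a generic DownloadError, so downstream retry logic sees a single exception type. A minimal self-contained sketch of that control flow follows; the stand-in classes only mirror the shape visible in the diffs, the real ones live in aioscrapy:

```python
import asyncio

class DownloadError(Exception):
    """Stand-in for aioscrapy's DownloadError, which carries the original error."""
    def __init__(self, real_error: Exception):
        super().__init__(str(real_error))
        self.real_error = real_error

class BackendError(Exception):
    """Stand-in for CurlError / HttpxError / PyHttpxError."""

async def _download_request():
    raise BackendError("connection reset by peer")

async def download_request():
    try:
        return await _download_request()
    except BackendError as e:
        # Same wrapping used by all three handlers in the diffs below
        raise DownloadError(real_error=e) from e

try:
    asyncio.run(download_request())
except DownloadError as e:
    print(f"download failed, caused by: {e.real_error!r}")
```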
aioscrapy/core/downloader/handlers/curl_cffi.py

```diff
@@ -1,3 +1,13 @@
+"""
+Download handler implementation using curl_cffi.
+使用curl_cffi的下载处理程序实现。
+
+This module provides a download handler that uses curl_cffi to perform HTTP/HTTPS requests.
+It supports features like browser impersonation, proxies, and cookies.
+此模块提供了一个使用curl_cffi执行HTTP/HTTPS请求的下载处理程序。
+它支持浏览器模拟、代理和Cookie等功能。
+"""
+
 from curl_cffi.curl import CurlError
 from curl_cffi.requests import AsyncSession
 
@@ -10,50 +20,151 @@ from aioscrapy.utils.log import logger
 
 
 class CurlCffiDownloadHandler(BaseDownloadHandler):
+    """
+    Download handler that uses curl_cffi to perform HTTP/HTTPS requests.
+    使用curl_cffi执行HTTP/HTTPS请求的下载处理程序。
+
+    This handler implements the BaseDownloadHandler interface using the curl_cffi
+    library, which provides high-performance HTTP requests with browser fingerprinting
+    capabilities.
+    此处理程序使用curl_cffi库实现BaseDownloadHandler接口,该库提供具有浏览器指纹
+    功能的高性能HTTP请求。
+    """
 
     def __init__(self, settings):
+        """
+        Initialize the CurlCffiDownloadHandler.
+        初始化CurlCffiDownloadHandler。
+
+        Args:
+            settings: The settings object containing configuration for the handler.
+                      包含处理程序配置的设置对象。
+        """
         self.settings: Settings = settings
-
+
+        # Arguments to pass to curl_cffi AsyncSession constructor
+        # 传递给curl_cffi AsyncSession构造函数的参数
+        self.curl_cffi_args: dict = self.settings.get('CURL_CFFI_ARGS', {})
+
+        # SSL verification setting
+        # SSL验证设置
         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
 
     @classmethod
     def from_settings(cls, settings: Settings):
+        """
+        Create a download handler from settings.
+        从设置创建下载处理程序。
+
+        This is a factory method that creates a new CurlCffiDownloadHandler
+        instance with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的CurlCffiDownloadHandler实例。
+
+        Args:
+            settings: The settings to use for the handler.
+                      用于处理程序的设置。
+
+        Returns:
+            CurlCffiDownloadHandler: A new download handler instance.
+                                     一个新的下载处理程序实例。
+        """
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        """
+        Download a request using curl_cffi.
+        使用curl_cffi下载请求。
+
+        This method implements the BaseDownloadHandler.download_request interface.
+        It wraps the actual download logic in _download_request and handles
+        curl_cffi-specific exceptions.
+        此方法实现了BaseDownloadHandler.download_request接口。
+        它将实际的下载逻辑包装在_download_request中,并处理curl_cffi特定的异常。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+            _: The spider (not used in this implementation).
+               爬虫(在此实现中未使用)。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+
+        Raises:
+            DownloadError: If a CurlError occurs during the download.
+                           如果在下载过程中发生CurlError。
+        """
         try:
             return await self._download_request(request)
         except CurlError as e:
+            # Wrap curl_cffi-specific exceptions in a generic DownloadError
+            # 将curl_cffi特定的异常包装在通用的DownloadError中
             raise DownloadError(real_error=e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
+        """
+        Internal method to perform the actual download using curl_cffi.
+        使用curl_cffi执行实际下载的内部方法。
+
+        This method configures and uses a curl_cffi.AsyncSession to perform the request,
+        handling SSL settings, proxies, cookies, browser impersonation, and other request parameters.
+        此方法配置并使用curl_cffi.AsyncSession执行请求,处理SSL设置、代理、Cookie、
+        浏览器模拟和其他请求参数。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+        """
+        # Configure request parameters
+        # 配置请求参数
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
             'verify': request.meta.get('verify_ssl', self.verify_ssl),
             'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
                 'dont_redirect') is None else request.meta.get('dont_redirect'),
-            'impersonate': request.meta.get('impersonate'),
+            'impersonate': request.meta.get('impersonate'),  # Browser fingerprinting feature
+            # 浏览器指纹功能
         }
+
+        # Handle request body data
+        # 处理请求体数据
         post_data = request.body or None
         if isinstance(post_data, dict):
-            kwargs['json'] = post_data
+            kwargs['json'] = post_data  # Send as JSON
+            # 作为JSON发送
         else:
-            kwargs['data'] = post_data
+            kwargs['data'] = post_data  # Send as form data or raw bytes
+            # 作为表单数据或原始字节发送
 
+        # Set request headers
+        # 设置请求头
         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
         kwargs['headers'] = headers
 
+        # Configure proxy if specified
+        # 如果指定,配置代理
         proxy = request.meta.get("proxy")
         if proxy:
             kwargs["proxies"] = {'http': proxy, 'https': proxy}
             logger.debug(f"use proxy {proxy}: {request.url}")
 
-
+        # Configure curl_cffi session
+        # 配置curl_cffi会话
+        session_args = self.curl_cffi_args.copy()
 
+        # Perform the request
+        # 执行请求
         async with AsyncSession(**session_args) as session:
             response = await session.request(request.method, request.url, **kwargs)
 
+            # Convert curl_cffi response to HtmlResponse
+            # 将curl_cffi响应转换为HtmlResponse
             return HtmlResponse(
                 str(response.url),
                 status=response.status_code,
@@ -64,4 +175,14 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
         )
 
     async def close(self):
+        """
+        Close the download handler and release resources.
+        关闭下载处理程序并释放资源。
+
+        This method is called when the spider is closing. In this implementation,
+        there are no persistent resources to clean up since curl_cffi.AsyncSession
+        is created and closed for each request.
+        当爬虫关闭时调用此方法。在此实现中,没有需要清理的持久资源,
+        因为curl_cffi.AsyncSession是为每个请求创建和关闭的。
+        """
         pass
```
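To see what the new docstrings describe in action, here is a standalone sketch of the request path this handler wraps, using the curl_cffi calls that appear verbatim in the diff (AsyncSession, session.request). The URL, timeout, proxy, and impersonation profile are illustrative values, not aioscrapy defaults:

```python
import asyncio
from curl_cffi.requests import AsyncSession

async def fetch(url: str, proxy: str | None = None) -> str:
    # Mirrors the kwargs the handler builds from settings and request.meta
    kwargs = {
        'timeout': 30,                # DOWNLOAD_TIMEOUT
        'verify': True,               # VERIFY_SSL / meta['verify_ssl']
        'allow_redirects': True,      # REDIRECT_ENABLED / meta['dont_redirect']
        'impersonate': 'chrome110',   # meta['impersonate']: browser fingerprint profile
    }
    if proxy:
        # Same shape the handler uses for meta['proxy']
        kwargs['proxies'] = {'http': proxy, 'https': proxy}
    async with AsyncSession() as session:
        response = await session.request('GET', url, **kwargs)
        return response.text

print(asyncio.run(fetch('https://example.com'))[:200])
```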
aioscrapy/core/downloader/handlers/httpx.py

```diff
@@ -1,3 +1,13 @@
+"""
+Download handler implementation using httpx.
+使用httpx的下载处理程序实现。
+
+This module provides a download handler that uses httpx to perform HTTP/HTTPS requests.
+It supports HTTP/2, SSL customization, proxies, and cookies.
+此模块提供了一个使用httpx执行HTTP/HTTPS请求的下载处理程序。
+它支持HTTP/2、SSL自定义、代理和Cookie。
+"""
+
 import ssl
 
 import httpx
@@ -12,65 +22,175 @@ from aioscrapy.utils.log import logger
 
 
 class HttpxDownloadHandler(BaseDownloadHandler):
+    """
+    Download handler that uses httpx to perform HTTP/HTTPS requests.
+    使用httpx执行HTTP/HTTPS请求的下载处理程序。
+
+    This handler implements the BaseDownloadHandler interface using the httpx
+    library, which provides modern HTTP client features including HTTP/2 support,
+    connection pooling, and async capabilities.
+    此处理程序使用httpx库实现BaseDownloadHandler接口,该库提供现代HTTP客户端功能,
+    包括HTTP/2支持、连接池和异步功能。
+    """
 
     def __init__(self, settings):
+        """
+        Initialize the HttpxDownloadHandler.
+        初始化HttpxDownloadHandler。
+
+        Args:
+            settings: The settings object containing configuration for the handler.
+                      包含处理程序配置的设置对象。
+        """
         self.settings: Settings = settings
-
+
+        # Arguments to pass to httpx AsyncClient constructor
+        # 传递给httpx AsyncClient构造函数的参数
+        self.httpx_args: dict = self.settings.get('HTTPX_ARGS', {})
+
+        # SSL verification setting
+        # SSL验证设置
         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
-
+
+        # SSL protocol version to use (e.g., ssl.PROTOCOL_TLSv1_2)
+        # 要使用的SSL协议版本(例如,ssl.PROTOCOL_TLSv1_2)
+        self.ssl_protocol = self.settings.get("SSL_PROTOCOL")
+
+        # Fix for non-standard HTTP headers in responses
+        # 修复响应中的非标准HTTP头
         if self.settings.getbool("FIX_HTTPX_HEADER", True):
-            # Fixed non-standard response's header 修复不标准的响应头
             import h11
             import re
             h11._readers.header_field_re = re.compile(b"(?P<field_name>.*?):[ \t](?P<field_value>.*?)")
 
     @classmethod
     def from_settings(cls, settings: Settings):
+        """
+        Create a download handler from settings.
+        从设置创建下载处理程序。
+
+        This is a factory method that creates a new HttpxDownloadHandler
+        instance with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的HttpxDownloadHandler实例。
+
+        Args:
+            settings: The settings to use for the handler.
+                      用于处理程序的设置。
+
+        Returns:
+            HttpxDownloadHandler: A new download handler instance.
+                                  一个新的下载处理程序实例。
+        """
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        """
+        Download a request using httpx.
+        使用httpx下载请求。
+
+        This method implements the BaseDownloadHandler.download_request interface.
+        It wraps the actual download logic in _download_request and handles
+        httpx-specific exceptions.
+        此方法实现了BaseDownloadHandler.download_request接口。
+        它将实际的下载逻辑包装在_download_request中,并处理httpx特定的异常。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+            _: The spider (not used in this implementation).
+               爬虫(在此实现中未使用)。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+
+        Raises:
+            DownloadError: If an HttpxError occurs during the download.
+                           如果在下载过程中发生HttpxError。
+        """
         try:
             return await self._download_request(request)
         except HttpxError as e:
+            # Wrap httpx-specific exceptions in a generic DownloadError
+            # 将httpx特定的异常包装在通用的DownloadError中
             raise DownloadError(real_error=e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
+        """
+        Internal method to perform the actual download using httpx.
+        使用httpx执行实际下载的内部方法。
+
+        This method configures and uses an httpx.AsyncClient to perform the request,
+        handling SSL settings, proxies, cookies, and other request parameters.
+        此方法配置并使用httpx.AsyncClient执行请求,处理SSL设置、代理、Cookie和其他请求参数。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+        """
+        # Configure request parameters
+        # 配置请求参数
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
             'data': request.body or None
         }
+
+        # Set request headers
+        # 设置请求头
         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
         kwargs['headers'] = headers
 
-
-
+        # Configure httpx client session
+        # 配置httpx客户端会话
+        session_args = self.httpx_args.copy()
+        session_args.setdefault('http2', True)  # Enable HTTP/2 by default
+        # 默认启用HTTP/2
         session_args.update({
             'verify': request.meta.get('verify_ssl', self.verify_ssl),
             'follow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
                 'dont_redirect') is None else request.meta.get('dont_redirect'),
             'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20),
         })
+
+        # Configure SSL settings if specified
+        # 如果指定,配置SSL设置
         ssl_ciphers = request.meta.get('TLS_CIPHERS')
         ssl_protocol = request.meta.get('ssl_protocol', self.ssl_protocol)
         if ssl_ciphers or ssl_protocol:
             if ssl_protocol:
+                # Create SSL context with specific protocol
+                # 使用特定协议创建SSL上下文
                 context = ssl.SSLContext(protocol=ssl_protocol)
             else:
+                # Use default SSL context
+                # 使用默认SSL上下文
                 context = ssl.create_default_context()
 
+            # Set SSL ciphers if specified
+            # 如果指定,设置SSL密码
             ssl_ciphers and context.set_ciphers(ssl_ciphers)
             session_args['verify'] = context
 
+        # Configure proxy if specified
+        # 如果指定,配置代理
         proxy = request.meta.get("proxy")
         if proxy:
             session_args["proxies"] = proxy
             logger.debug(f"使用代理{proxy}抓取: {request.url}")
 
+        # Perform the request
+        # 执行请求
         async with httpx.AsyncClient(**session_args) as session:
             response = await session.request(request.method, request.url, **kwargs)
             content = response.read()
 
+            # Convert httpx response to HtmlResponse
+            # 将httpx响应转换为HtmlResponse
             return HtmlResponse(
                 str(response.url),
                 status=response.status_code,
@@ -81,4 +201,14 @@ class HttpxDownloadHandler(BaseDownloadHandler):
         )
 
     async def close(self):
+        """
+        Close the download handler and release resources.
+        关闭下载处理程序并释放资源。
+
+        This method is called when the spider is closing. In this implementation,
+        there are no persistent resources to clean up since httpx.AsyncClient
+        is created and closed for each request.
+        当爬虫关闭时调用此方法。在此实现中,没有需要清理的持久资源,
+        因为httpx.AsyncClient是为每个请求创建和关闭的。
+        """
         pass
```
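As a companion to the diff above, a standalone sketch of the client setup this handler performs: HTTP/2 on by default via HTTPX_ARGS, redirects governed by REDIRECT_ENABLED and REDIRECT_MAX_TIMES, and verify accepting either a bool or an ssl.SSLContext. The cipher string, URL, and timeout are illustrative, and http2=True requires the httpx[http2] extra:

```python
import asyncio
import ssl
import httpx

async def fetch(url: str) -> str:
    # Stands in for the meta['TLS_CIPHERS'] / SSL_PROTOCOL branch of the handler
    context = ssl.create_default_context()
    context.set_ciphers('DEFAULT')    # illustrative cipher string

    async with httpx.AsyncClient(
        http2=True,               # handler default via session_args.setdefault('http2', True)
        verify=context,           # a bool or SSLContext, per VERIFY_SSL / meta['verify_ssl']
        follow_redirects=True,    # REDIRECT_ENABLED / meta['dont_redirect']
        max_redirects=20,         # REDIRECT_MAX_TIMES
    ) as client:
        response = await client.request('GET', url, timeout=30)
        return response.text

print(asyncio.run(fetch('https://example.com'))[:200])
```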
aioscrapy/core/downloader/handlers/pyhttpx.py

```diff
@@ -1,3 +1,13 @@
+"""
+Download handler implementation using pyhttpx.
+使用pyhttpx的下载处理程序实现。
+
+This module provides a download handler that uses pyhttpx to perform HTTP/HTTPS requests.
+It supports HTTP/2, proxies, and cookies, and runs synchronous pyhttpx in a thread pool.
+此模块提供了一个使用pyhttpx执行HTTP/HTTPS请求的下载处理程序。
+它支持HTTP/2、代理和Cookie,并在线程池中运行同步的pyhttpx。
+"""
+
 import asyncio
 
 import pyhttpx
@@ -12,24 +22,113 @@ from aioscrapy.utils.log import logger
 
 
 class PyhttpxDownloadHandler(BaseDownloadHandler):
+    """
+    Download handler that uses pyhttpx to perform HTTP/HTTPS requests.
+    使用pyhttpx执行HTTP/HTTPS请求的下载处理程序。
+
+    This handler implements the BaseDownloadHandler interface using the pyhttpx
+    library, which provides HTTP client features including HTTP/2 support.
+    Since pyhttpx is synchronous, this handler runs it in a thread pool.
+    此处理程序使用pyhttpx库实现BaseDownloadHandler接口,该库提供包括HTTP/2支持的HTTP客户端功能。
+    由于pyhttpx是同步的,此处理程序在线程池中运行它。
+    """
 
     def __init__(self, settings):
+        """
+        Initialize the PyhttpxDownloadHandler.
+        初始化PyhttpxDownloadHandler。
+
+        Args:
+            settings: The settings object containing configuration for the handler.
+                      包含处理程序配置的设置对象。
+        """
         self.settings: Settings = settings
-
+
+        # Arguments to pass to pyhttpx HttpSession constructor
+        # 传递给pyhttpx HttpSession构造函数的参数
+        self.pyhttpx_args: dict = self.settings.get('PYHTTPX_ARGS', {})
+
+        # SSL verification setting
+        # SSL验证设置
         self.verify_ssl = self.settings.get("VERIFY_SSL", True)
+
+        # Get the current event loop for running pyhttpx in a thread pool
+        # 获取当前事件循环,用于在线程池中运行pyhttpx
         self.loop = asyncio.get_running_loop()
 
     @classmethod
     def from_settings(cls, settings: Settings):
+        """
+        Create a download handler from settings.
+        从设置创建下载处理程序。
+
+        This is a factory method that creates a new PyhttpxDownloadHandler
+        instance with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的PyhttpxDownloadHandler实例。
+
+        Args:
+            settings: The settings to use for the handler.
+                      用于处理程序的设置。
+
+        Returns:
+            PyhttpxDownloadHandler: A new download handler instance.
+                                    一个新的下载处理程序实例。
+        """
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        """
+        Download a request using pyhttpx.
+        使用pyhttpx下载请求。
+
+        This method implements the BaseDownloadHandler.download_request interface.
+        It wraps the actual download logic in _download_request and handles
+        pyhttpx-specific exceptions.
+        此方法实现了BaseDownloadHandler.download_request接口。
+        它将实际的下载逻辑包装在_download_request中,并处理pyhttpx特定的异常。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+            _: The spider (not used in this implementation).
+               爬虫(在此实现中未使用)。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+
+        Raises:
+            DownloadError: If a PyHttpxError occurs during the download.
+                           如果在下载过程中发生PyHttpxError。
+        """
         try:
             return await self._download_request(request)
         except PyHttpxError as e:
+            # Wrap pyhttpx-specific exceptions in a generic DownloadError
+            # 将pyhttpx特定的异常包装在通用的DownloadError中
             raise DownloadError(real_error=e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
+        """
+        Internal method to perform the actual download using pyhttpx.
+        使用pyhttpx执行实际下载的内部方法。
+
+        This method configures and uses a pyhttpx.HttpSession to perform the request,
+        handling SSL settings, proxies, cookies, and other request parameters.
+        Since pyhttpx is synchronous, it runs in a thread pool using asyncio.to_thread.
+        此方法配置并使用pyhttpx.HttpSession执行请求,处理SSL设置、代理、Cookie和其他请求参数。
+        由于pyhttpx是同步的,它使用asyncio.to_thread在线程池中运行。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+        """
+        # Configure request parameters
+        # 配置请求参数
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
@@ -37,24 +136,47 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
             'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
                 'dont_redirect') is None else request.meta.get('dont_redirect')
         }
+
+        # Handle request body data
+        # 处理请求体数据
         post_data = request.body or None
         if isinstance(post_data, dict):
-            kwargs['json'] = post_data
+            kwargs['json'] = post_data  # Send as JSON
+            # 作为JSON发送
         else:
-            kwargs['data'] = post_data
+            kwargs['data'] = post_data  # Send as form data or raw bytes
+            # 作为表单数据或原始字节发送
 
+        # Set request headers
+        # 设置请求头
         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
         kwargs['headers'] = headers
 
+        # Configure proxy if specified
+        # 如果指定,配置代理
         proxy = request.meta.get("proxy")
         if proxy:
             kwargs["proxies"] = {'https': proxy}
             logger.debug(f"use proxy {proxy}: {request.url}")
 
-
-
+        # Configure pyhttpx session
+        # 配置pyhttpx会话
+        session_args = self.pyhttpx_args.copy()
+        session_args.setdefault('http2', True)  # Enable HTTP/2 by default
+        # 默认启用HTTP/2
+
+        if ja3 := request.meta.get("ja3"):
+            session_args['ja3'] = ja3
+
+        # Execute the request in a thread pool since pyhttpx is synchronous
+        # 由于pyhttpx是同步的,在线程池中执行请求
         with pyhttpx.HttpSession(**session_args) as session:
+            # Run the synchronous pyhttpx request in a thread pool
+            # 在线程池中运行同步的pyhttpx请求
             response = await asyncio.to_thread(session.request, request.method, request.url, **kwargs)
+
+            # Convert pyhttpx response to HtmlResponse
+            # 将pyhttpx响应转换为HtmlResponse
             return HtmlResponse(
                 request.url,
                 status=response.status_code,
@@ -65,4 +187,14 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
         )
 
     async def close(self):
+        """
+        Close the download handler and release resources.
+        关闭下载处理程序并释放资源。
+
+        This method is called when the spider is closing. In this implementation,
+        there are no persistent resources to clean up since pyhttpx.HttpSession
+        is created and closed for each request.
+        当爬虫关闭时调用此方法。在此实现中,没有需要清理的持久资源,
+        因为pyhttpx.HttpSession是为每个请求创建和关闭的。
+        """
         pass
```
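Since pyhttpx is synchronous, the handler's key move is pushing session.request onto a worker thread with asyncio.to_thread so the event loop is not blocked. A standalone sketch of that pattern using the HttpSession arguments seen in the diff (http2, ja3); the JA3 string, URL, and timeout are illustrative:

```python
import asyncio
import pyhttpx

async def fetch(url: str, ja3: str | None = None) -> int:
    session_args = {'http2': True}    # handler default
    if ja3:
        session_args['ja3'] = ja3     # meta['ja3']: custom TLS fingerprint
    with pyhttpx.HttpSession(**session_args) as session:
        # The blocking call runs in a worker thread, keeping the event loop free
        response = await asyncio.to_thread(
            session.request, 'GET', url,
            timeout=30, allow_redirects=True,
        )
        return response.status_code

print(asyncio.run(fetch('https://example.com')))
```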