aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
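Most of the added lines in this release are bilingual (English/Chinese) docstrings and comments. The hunks reproduced below, from the curl_cffi, httpx, and pyhttpx download handlers, also rename their session-argument settings. A hedged sketch of the rename a 2.1.4 project's settings module would pick up (names taken from the hunks below; whether the old names are still honored is not shown in this diff):

    # Setting renames between 2.1.4 and 2.1.7, per the hunks below:
    #   CURL_CFFI_CLIENT_SESSION_ARGS -> CURL_CFFI_ARGS
    #   HTTPX_CLIENT_SESSION_ARGS     -> HTTPX_ARGS
    #   PYHTTPX_CLIENT_ARGS           -> PYHTTPX_ARGS
    CURL_CFFI_ARGS = {}   # forwarded to curl_cffi.requests.AsyncSession(**...)
    HTTPX_ARGS = {}       # forwarded to httpx.AsyncClient(**...)
    PYHTTPX_ARGS = {}     # forwarded to pyhttpx.HttpSession(**...)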
aioscrapy/core/downloader/handlers/curl_cffi.py

@@ -1,3 +1,13 @@
+"""
+Download handler implementation using curl_cffi.
+使用curl_cffi的下载处理程序实现。
+
+This module provides a download handler that uses curl_cffi to perform HTTP/HTTPS requests.
+It supports features like browser impersonation, proxies, and cookies.
+此模块提供了一个使用curl_cffi执行HTTP/HTTPS请求的下载处理程序。
+它支持浏览器模拟、代理和Cookie等功能。
+"""
+
 from curl_cffi.curl import CurlError
 from curl_cffi.requests import AsyncSession
 
@@ -10,50 +20,151 @@ from aioscrapy.utils.log import logger
 
 
 class CurlCffiDownloadHandler(BaseDownloadHandler):
+    """
+    Download handler that uses curl_cffi to perform HTTP/HTTPS requests.
+    使用curl_cffi执行HTTP/HTTPS请求的下载处理程序。
+
+    This handler implements the BaseDownloadHandler interface using the curl_cffi
+    library, which provides high-performance HTTP requests with browser fingerprinting
+    capabilities.
+    此处理程序使用curl_cffi库实现BaseDownloadHandler接口,该库提供具有浏览器指纹
+    功能的高性能HTTP请求。
+    """
 
     def __init__(self, settings):
+        """
+        Initialize the CurlCffiDownloadHandler.
+        初始化CurlCffiDownloadHandler。
+
+        Args:
+            settings: The settings object containing configuration for the handler.
+                      包含处理程序配置的设置对象。
+        """
         self.settings: Settings = settings
-        self.httpx_client_session_args: dict = self.settings.get('CURL_CFFI_CLIENT_SESSION_ARGS', {})
+
+        # Arguments to pass to curl_cffi AsyncSession constructor
+        # 传递给curl_cffi AsyncSession构造函数的参数
+        self.curl_cffi_args: dict = self.settings.get('CURL_CFFI_ARGS', {})
+
+        # SSL verification setting
+        # SSL验证设置
         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
 
     @classmethod
     def from_settings(cls, settings: Settings):
+        """
+        Create a download handler from settings.
+        从设置创建下载处理程序。
+
+        This is a factory method that creates a new CurlCffiDownloadHandler
+        instance with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的CurlCffiDownloadHandler实例。
+
+        Args:
+            settings: The settings to use for the handler.
+                      用于处理程序的设置。
+
+        Returns:
+            CurlCffiDownloadHandler: A new download handler instance.
+                                     一个新的下载处理程序实例。
+        """
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        """
+        Download a request using curl_cffi.
+        使用curl_cffi下载请求。
+
+        This method implements the BaseDownloadHandler.download_request interface.
+        It wraps the actual download logic in _download_request and handles
+        curl_cffi-specific exceptions.
+        此方法实现了BaseDownloadHandler.download_request接口。
+        它将实际的下载逻辑包装在_download_request中,并处理curl_cffi特定的异常。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+            _: The spider (not used in this implementation).
+               爬虫(在此实现中未使用)。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+
+        Raises:
+            DownloadError: If a CurlError occurs during the download.
+                           如果在下载过程中发生CurlError。
+        """
         try:
             return await self._download_request(request)
         except CurlError as e:
+            # Wrap curl_cffi-specific exceptions in a generic DownloadError
+            # 将curl_cffi特定的异常包装在通用的DownloadError中
             raise DownloadError(real_error=e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
+        """
+        Internal method to perform the actual download using curl_cffi.
+        使用curl_cffi执行实际下载的内部方法。
+
+        This method configures and uses a curl_cffi.AsyncSession to perform the request,
+        handling SSL settings, proxies, cookies, browser impersonation, and other request parameters.
+        此方法配置并使用curl_cffi.AsyncSession执行请求,处理SSL设置、代理、Cookie、
+        浏览器模拟和其他请求参数。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+        """
+        # Configure request parameters
+        # 配置请求参数
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
             'verify': request.meta.get('verify_ssl', self.verify_ssl),
             'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
                 'dont_redirect') is None else request.meta.get('dont_redirect'),
-            'impersonate': request.meta.get('impersonate'),
+            'impersonate': request.meta.get('impersonate'),  # Browser fingerprinting feature
+                                                             # 浏览器指纹功能
         }
+
+        # Handle request body data
+        # 处理请求体数据
         post_data = request.body or None
         if isinstance(post_data, dict):
-            kwargs['json'] = post_data
+            kwargs['json'] = post_data  # Send as JSON
+                                        # 作为JSON发送
         else:
-            kwargs['data'] = post_data
+            kwargs['data'] = post_data  # Send as form data or raw bytes
+                                        # 作为表单数据或原始字节发送
 
+        # Set request headers
+        # 设置请求头
         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
         kwargs['headers'] = headers
 
+        # Configure proxy if specified
+        # 如果指定,配置代理
         proxy = request.meta.get("proxy")
         if proxy:
             kwargs["proxies"] = {'http': proxy, 'https': proxy}
             logger.debug(f"use proxy {proxy}: {request.url}")
 
-        session_args = self.httpx_client_session_args.copy()
+        # Configure curl_cffi session
+        # 配置curl_cffi会话
+        session_args = self.curl_cffi_args.copy()
 
+        # Perform the request
+        # 执行请求
         async with AsyncSession(**session_args) as session:
            response = await session.request(request.method, request.url, **kwargs)
 
+        # Convert curl_cffi response to HtmlResponse
+        # 将curl_cffi响应转换为HtmlResponse
         return HtmlResponse(
             str(response.url),
             status=response.status_code,
@@ -64,4 +175,14 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
         )
 
     async def close(self):
+        """
+        Close the download handler and release resources.
+        关闭下载处理程序并释放资源。
+
+        This method is called when the spider is closing. In this implementation,
+        there are no persistent resources to clean up since curl_cffi.AsyncSession
+        is created and closed for each request.
+        当爬虫关闭时调用此方法。在此实现中,没有需要清理的持久资源,
+        因为curl_cffi.AsyncSession是为每个请求创建和关闭的。
+        """
         pass
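Beyond the docstrings, the functional change in this file is the rename: the setting CURL_CFFI_CLIENT_SESSION_ARGS becomes CURL_CFFI_ARGS, and the copy-pasted attribute name httpx_client_session_args is corrected to curl_cffi_args. A minimal usage sketch of the meta keys this handler reads follows; the import paths, the spider API (mirroring scrapy), the DOWNLOAD_HANDLERS registration, and the impersonation target are assumptions, not shown in this diff:

    # Sketch only: exercising CurlCffiDownloadHandler's request.meta keys.
    # Assumes aioscrapy exposes Spider at the package root and Request under
    # aioscrapy.http, and that the handler is wired up via DOWNLOAD_HANDLERS.
    from aioscrapy import Spider
    from aioscrapy.http import Request

    class CurlCffiDemo(Spider):
        name = 'curl_cffi_demo'
        custom_settings = {
            'DOWNLOAD_HANDLERS': {  # hypothetical registration
                'http': 'aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler',
                'https': 'aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler',
            },
            'CURL_CFFI_ARGS': {},   # new name in 2.1.7; forwarded to AsyncSession(**session_args)
            'DOWNLOAD_TIMEOUT': 30,
        }

        async def start_requests(self):
            yield Request(
                'https://example.com',
                meta={
                    'impersonate': 'chrome110',        # illustrative curl_cffi fingerprint target
                    'proxy': 'http://127.0.0.1:8888',  # applied to both http and https
                    'verify_ssl': True,                # per-request override of VERIFY_SSL
                    'dont_redirect': False,            # passed straight through as allow_redirects, despite the name
                },
            )

        async def parse(self, response):
            ...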
aioscrapy/core/downloader/handlers/httpx.py

@@ -1,3 +1,13 @@
+"""
+Download handler implementation using httpx.
+使用httpx的下载处理程序实现。
+
+This module provides a download handler that uses httpx to perform HTTP/HTTPS requests.
+It supports HTTP/2, SSL customization, proxies, and cookies.
+此模块提供了一个使用httpx执行HTTP/HTTPS请求的下载处理程序。
+它支持HTTP/2、SSL自定义、代理和Cookie。
+"""
+
 import ssl
 
 import httpx
@@ -12,65 +22,175 @@ from aioscrapy.utils.log import logger
 
 
 class HttpxDownloadHandler(BaseDownloadHandler):
+    """
+    Download handler that uses httpx to perform HTTP/HTTPS requests.
+    使用httpx执行HTTP/HTTPS请求的下载处理程序。
+
+    This handler implements the BaseDownloadHandler interface using the httpx
+    library, which provides modern HTTP client features including HTTP/2 support,
+    connection pooling, and async capabilities.
+    此处理程序使用httpx库实现BaseDownloadHandler接口,该库提供现代HTTP客户端功能,
+    包括HTTP/2支持、连接池和异步功能。
+    """
 
     def __init__(self, settings):
+        """
+        Initialize the HttpxDownloadHandler.
+        初始化HttpxDownloadHandler。
+
+        Args:
+            settings: The settings object containing configuration for the handler.
+                      包含处理程序配置的设置对象。
+        """
         self.settings: Settings = settings
-        self.httpx_client_session_args: dict = self.settings.get('HTTPX_CLIENT_SESSION_ARGS', {})
+
+        # Arguments to pass to httpx AsyncClient constructor
+        # 传递给httpx AsyncClient构造函数的参数
+        self.httpx_args: dict = self.settings.get('HTTPX_ARGS', {})
+
+        # SSL verification setting
+        # SSL验证设置
         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
-        self.ssl_protocol = self.settings.get("SSL_PROTOCOL")  # ssl.PROTOCOL_TLSv1_2
+
+        # SSL protocol version to use (e.g., ssl.PROTOCOL_TLSv1_2)
+        # 要使用的SSL协议版本(例如,ssl.PROTOCOL_TLSv1_2)
+        self.ssl_protocol = self.settings.get("SSL_PROTOCOL")
+
+        # Fix for non-standard HTTP headers in responses
+        # 修复响应中的非标准HTTP头
         if self.settings.getbool("FIX_HTTPX_HEADER", True):
-            # Fixed non-standard response's header 修复不标准的响应头
             import h11
             import re
             h11._readers.header_field_re = re.compile(b"(?P<field_name>.*?):[ \t](?P<field_value>.*?)")
 
     @classmethod
     def from_settings(cls, settings: Settings):
+        """
+        Create a download handler from settings.
+        从设置创建下载处理程序。
+
+        This is a factory method that creates a new HttpxDownloadHandler
+        instance with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的HttpxDownloadHandler实例。
+
+        Args:
+            settings: The settings to use for the handler.
+                      用于处理程序的设置。
+
+        Returns:
+            HttpxDownloadHandler: A new download handler instance.
+                                  一个新的下载处理程序实例。
+        """
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        """
+        Download a request using httpx.
+        使用httpx下载请求。
+
+        This method implements the BaseDownloadHandler.download_request interface.
+        It wraps the actual download logic in _download_request and handles
+        httpx-specific exceptions.
+        此方法实现了BaseDownloadHandler.download_request接口。
+        它将实际的下载逻辑包装在_download_request中,并处理httpx特定的异常。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+            _: The spider (not used in this implementation).
+               爬虫(在此实现中未使用)。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+
+        Raises:
+            DownloadError: If an HttpxError occurs during the download.
+                           如果在下载过程中发生HttpxError。
+        """
         try:
             return await self._download_request(request)
         except HttpxError as e:
+            # Wrap httpx-specific exceptions in a generic DownloadError
+            # 将httpx特定的异常包装在通用的DownloadError中
             raise DownloadError(real_error=e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
+        """
+        Internal method to perform the actual download using httpx.
+        使用httpx执行实际下载的内部方法。
+
+        This method configures and uses an httpx.AsyncClient to perform the request,
+        handling SSL settings, proxies, cookies, and other request parameters.
+        此方法配置并使用httpx.AsyncClient执行请求,处理SSL设置、代理、Cookie和其他请求参数。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+        """
+        # Configure request parameters
+        # 配置请求参数
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
             'data': request.body or None
         }
+
+        # Set request headers
+        # 设置请求头
         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
         kwargs['headers'] = headers
 
-        session_args = self.httpx_client_session_args.copy()
-        session_args.setdefault('http2', True)
+        # Configure httpx client session
+        # 配置httpx客户端会话
+        session_args = self.httpx_args.copy()
+        session_args.setdefault('http2', True)  # Enable HTTP/2 by default
+                                                # 默认启用HTTP/2
         session_args.update({
             'verify': request.meta.get('verify_ssl', self.verify_ssl),
             'follow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
                 'dont_redirect') is None else request.meta.get('dont_redirect'),
             'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20),
         })
+
+        # Configure SSL settings if specified
+        # 如果指定,配置SSL设置
         ssl_ciphers = request.meta.get('TLS_CIPHERS')
         ssl_protocol = request.meta.get('ssl_protocol', self.ssl_protocol)
         if ssl_ciphers or ssl_protocol:
             if ssl_protocol:
+                # Create SSL context with specific protocol
+                # 使用特定协议创建SSL上下文
                 context = ssl.SSLContext(protocol=ssl_protocol)
             else:
+                # Use default SSL context
+                # 使用默认SSL上下文
                 context = ssl.create_default_context()
 
+            # Set SSL ciphers if specified
+            # 如果指定,设置SSL密码
             ssl_ciphers and context.set_ciphers(ssl_ciphers)
             session_args['verify'] = context
 
+        # Configure proxy if specified
+        # 如果指定,配置代理
         proxy = request.meta.get("proxy")
         if proxy:
             session_args["proxies"] = proxy
             logger.debug(f"使用代理{proxy}抓取: {request.url}")
 
+        # Perform the request
+        # 执行请求
         async with httpx.AsyncClient(**session_args) as session:
             response = await session.request(request.method, request.url, **kwargs)
             content = response.read()
 
+        # Convert httpx response to HtmlResponse
+        # 将httpx响应转换为HtmlResponse
         return HtmlResponse(
             str(response.url),
             status=response.status_code,
@@ -81,4 +201,14 @@ class HttpxDownloadHandler(BaseDownloadHandler):
         )
 
     async def close(self):
+        """
+        Close the download handler and release resources.
+        关闭下载处理程序并释放资源。
+
+        This method is called when the spider is closing. In this implementation,
+        there are no persistent resources to clean up since httpx.AsyncClient
+        is created and closed for each request.
+        当爬虫关闭时调用此方法。在此实现中,没有需要清理的持久资源,
+        因为httpx.AsyncClient是为每个请求创建和关闭的。
+        """
         pass
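The httpx handler gets the same treatment: HTTPX_CLIENT_SESSION_ARGS becomes HTTPX_ARGS, HTTP/2 stays on by default, and the SSL context is built from the SSL_PROTOCOL setting plus the per-request TLS_CIPHERS and ssl_protocol meta keys. A minimal sketch of those knobs, with the same assumed import paths and spider API as the previous example; the cipher string is illustrative:

    import ssl

    from aioscrapy import Spider
    from aioscrapy.http import Request

    class HttpxDemo(Spider):
        name = 'httpx_demo'
        custom_settings = {
            'HTTPX_ARGS': {'http2': False},        # setdefault('http2', True) keeps an explicit value
            'SSL_PROTOCOL': ssl.PROTOCOL_TLSv1_2,  # wrapped in ssl.SSLContext(protocol=...) per request
            'FIX_HTTPX_HEADER': True,              # keeps the relaxed h11 header regex from the hunk above
            'REDIRECT_MAX_TIMES': 20,              # mapped to httpx's max_redirects
        }

        async def start_requests(self):
            yield Request(
                'https://example.com',
                meta={
                    'TLS_CIPHERS': 'ECDHE-RSA-AES128-GCM-SHA256',  # illustrative; fed to context.set_ciphers()
                    'ssl_protocol': ssl.PROTOCOL_TLSv1_2,          # per-request override of SSL_PROTOCOL
                    'proxy': 'http://127.0.0.1:8888',              # assigned to session_args["proxies"]
                },
            )

        async def parse(self, response):
            ...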
aioscrapy/core/downloader/handlers/pyhttpx.py

@@ -1,3 +1,13 @@
+"""
+Download handler implementation using pyhttpx.
+使用pyhttpx的下载处理程序实现。
+
+This module provides a download handler that uses pyhttpx to perform HTTP/HTTPS requests.
+It supports HTTP/2, proxies, and cookies, and runs synchronous pyhttpx in a thread pool.
+此模块提供了一个使用pyhttpx执行HTTP/HTTPS请求的下载处理程序。
+它支持HTTP/2、代理和Cookie,并在线程池中运行同步的pyhttpx。
+"""
+
 import asyncio
 
 import pyhttpx
@@ -12,24 +22,113 @@ from aioscrapy.utils.log import logger
 
 
 class PyhttpxDownloadHandler(BaseDownloadHandler):
+    """
+    Download handler that uses pyhttpx to perform HTTP/HTTPS requests.
+    使用pyhttpx执行HTTP/HTTPS请求的下载处理程序。
+
+    This handler implements the BaseDownloadHandler interface using the pyhttpx
+    library, which provides HTTP client features including HTTP/2 support.
+    Since pyhttpx is synchronous, this handler runs it in a thread pool.
+    此处理程序使用pyhttpx库实现BaseDownloadHandler接口,该库提供包括HTTP/2支持的HTTP客户端功能。
+    由于pyhttpx是同步的,此处理程序在线程池中运行它。
+    """
 
     def __init__(self, settings):
+        """
+        Initialize the PyhttpxDownloadHandler.
+        初始化PyhttpxDownloadHandler。
+
+        Args:
+            settings: The settings object containing configuration for the handler.
+                      包含处理程序配置的设置对象。
+        """
         self.settings: Settings = settings
-        self.pyhttpx_client_args: dict = self.settings.get('PYHTTPX_CLIENT_ARGS', {})
+
+        # Arguments to pass to pyhttpx HttpSession constructor
+        # 传递给pyhttpx HttpSession构造函数的参数
+        self.pyhttpx_args: dict = self.settings.get('PYHTTPX_ARGS', {})
+
+        # SSL verification setting
+        # SSL验证设置
         self.verify_ssl = self.settings.get("VERIFY_SSL", True)
+
+        # Get the current event loop for running pyhttpx in a thread pool
+        # 获取当前事件循环,用于在线程池中运行pyhttpx
         self.loop = asyncio.get_running_loop()
 
     @classmethod
     def from_settings(cls, settings: Settings):
+        """
+        Create a download handler from settings.
+        从设置创建下载处理程序。
+
+        This is a factory method that creates a new PyhttpxDownloadHandler
+        instance with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的PyhttpxDownloadHandler实例。
+
+        Args:
+            settings: The settings to use for the handler.
+                      用于处理程序的设置。
+
+        Returns:
+            PyhttpxDownloadHandler: A new download handler instance.
+                                    一个新的下载处理程序实例。
+        """
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        """
+        Download a request using pyhttpx.
+        使用pyhttpx下载请求。
+
+        This method implements the BaseDownloadHandler.download_request interface.
+        It wraps the actual download logic in _download_request and handles
+        pyhttpx-specific exceptions.
+        此方法实现了BaseDownloadHandler.download_request接口。
+        它将实际的下载逻辑包装在_download_request中,并处理pyhttpx特定的异常。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+            _: The spider (not used in this implementation).
+               爬虫(在此实现中未使用)。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+
+        Raises:
+            DownloadError: If a PyHttpxError occurs during the download.
+                           如果在下载过程中发生PyHttpxError。
+        """
         try:
             return await self._download_request(request)
         except PyHttpxError as e:
+            # Wrap pyhttpx-specific exceptions in a generic DownloadError
+            # 将pyhttpx特定的异常包装在通用的DownloadError中
             raise DownloadError(real_error=e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
+        """
+        Internal method to perform the actual download using pyhttpx.
+        使用pyhttpx执行实际下载的内部方法。
+
+        This method configures and uses a pyhttpx.HttpSession to perform the request,
+        handling SSL settings, proxies, cookies, and other request parameters.
+        Since pyhttpx is synchronous, it runs in a thread pool using asyncio.to_thread.
+        此方法配置并使用pyhttpx.HttpSession执行请求,处理SSL设置、代理、Cookie和其他请求参数。
+        由于pyhttpx是同步的,它使用asyncio.to_thread在线程池中运行。
+
+        Args:
+            request: The request to download.
+                     要下载的请求。
+
+        Returns:
+            HtmlResponse: The response from the server.
+                          来自服务器的响应。
+        """
+        # Configure request parameters
+        # 配置请求参数
        kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
@@ -37,24 +136,47 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
             'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
                 'dont_redirect') is None else request.meta.get('dont_redirect')
         }
+
+        # Handle request body data
+        # 处理请求体数据
         post_data = request.body or None
         if isinstance(post_data, dict):
-            kwargs['json'] = post_data
+            kwargs['json'] = post_data  # Send as JSON
+                                        # 作为JSON发送
         else:
-            kwargs['data'] = post_data
+            kwargs['data'] = post_data  # Send as form data or raw bytes
+                                        # 作为表单数据或原始字节发送
 
+        # Set request headers
+        # 设置请求头
         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
         kwargs['headers'] = headers
 
+        # Configure proxy if specified
+        # 如果指定,配置代理
         proxy = request.meta.get("proxy")
         if proxy:
             kwargs["proxies"] = {'https': proxy}
             logger.debug(f"use proxy {proxy}: {request.url}")
 
-        session_args = self.pyhttpx_client_args.copy()
-        session_args.setdefault('http2', True)
+        # Configure pyhttpx session
+        # 配置pyhttpx会话
+        session_args = self.pyhttpx_args.copy()
+        session_args.setdefault('http2', True)  # Enable HTTP/2 by default
+                                                # 默认启用HTTP/2
+
+        if ja3 := request.meta.get("ja3"):
+            session_args['ja3'] = ja3
+
+        # Execute the request in a thread pool since pyhttpx is synchronous
+        # 由于pyhttpx是同步的,在线程池中执行请求
         with pyhttpx.HttpSession(**session_args) as session:
+            # Run the synchronous pyhttpx request in a thread pool
+            # 在线程池中运行同步的pyhttpx请求
             response = await asyncio.to_thread(session.request, request.method, request.url, **kwargs)
+
+        # Convert pyhttpx response to HtmlResponse
+        # 将pyhttpx响应转换为HtmlResponse
         return HtmlResponse(
             request.url,
             status=response.status_code,
@@ -65,4 +187,14 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
         )
 
     async def close(self):
+        """
+        Close the download handler and release resources.
+        关闭下载处理程序并释放资源。
+
+        This method is called when the spider is closing. In this implementation,
+        there are no persistent resources to clean up since pyhttpx.HttpSession
+        is created and closed for each request.
+        当爬虫关闭时调用此方法。在此实现中,没有需要清理的持久资源,
+        因为pyhttpx.HttpSession是为每个请求创建和关闭的。
+        """
         pass
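The pyhttpx handler follows the same pattern (PYHTTPX_CLIENT_ARGS becomes PYHTTPX_ARGS) and adds one genuinely new capability: a ja3 meta key forwarded into pyhttpx.HttpSession for TLS-fingerprint customization. A minimal sketch under the same assumed import paths; the JA3 value is illustrative, not taken from this diff:

    from aioscrapy import Spider
    from aioscrapy.http import Request

    class PyhttpxDemo(Spider):
        name = 'pyhttpx_demo'
        custom_settings = {
            'PYHTTPX_ARGS': {},  # renamed in 2.1.7; forwarded to pyhttpx.HttpSession(**session_args)
        }

        async def start_requests(self):
            # JA3 field order: TLSVersion,Ciphers,Extensions,EllipticCurves,ECPointFormats
            ja3 = '771,4865-4866-4867-49195,0-23-65281-10-11,29-23-24,0'  # illustrative value
            yield Request(
                'https://example.com',
                meta={
                    'ja3': ja3,                        # new in 2.1.7: becomes session_args['ja3']
                    'proxy': 'http://127.0.0.1:8888',  # note: this handler applies it to https only
                },
            )

        async def parse(self, response):
            ...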