aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0

aioscrapy/core/downloader/handlers/requests.py
@@ -1,3 +1,13 @@
+ """
+ Download handler implementation using requests.
+ 使用requests的下载处理程序实现。
+
+ This module provides a download handler that uses the requests library to perform HTTP/HTTPS requests.
+ It runs synchronous requests in a thread pool to make it compatible with the async framework.
+ 此模块提供了一个使用requests库执行HTTP/HTTPS请求的下载处理程序。
+ 它在线程池中运行同步请求,使其与异步框架兼容。
+ """
+
  import asyncio

  import requests
@@ -12,22 +22,106 @@ from aioscrapy.utils.log import logger


  class RequestsDownloadHandler(BaseDownloadHandler):
+ """
+ Download handler that uses requests to perform HTTP/HTTPS requests.
+ 使用requests执行HTTP/HTTPS请求的下载处理程序。
+
+ This handler implements the BaseDownloadHandler interface using the requests
+ library, which is a popular synchronous HTTP client for Python. Since requests
+ is synchronous, this handler runs it in a thread pool to make it compatible
+ with the async framework.
+ 此处理程序使用requests库实现BaseDownloadHandler接口,requests是Python中流行的
+ 同步HTTP客户端。由于requests是同步的,此处理程序在线程池中运行它,使其与异步框架兼容。
+ """

  def __init__(self, settings):
+ """
+ Initialize the RequestsDownloadHandler.
+ 初始化RequestsDownloadHandler。
+
+ Args:
+ settings: The settings object containing configuration for the handler.
+ 包含处理程序配置的设置对象。
+ """
  self.settings: Settings = settings
+
+ # SSL verification setting
+ # SSL验证设置
  self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)

  @classmethod
  def from_settings(cls, settings: Settings):
+ """
+ Create a download handler from settings.
+ 从设置创建下载处理程序。
+
+ This is a factory method that creates a new RequestsDownloadHandler
+ instance with the given settings.
+ 这是一个工厂方法,使用给定的设置创建一个新的RequestsDownloadHandler实例。
+
+ Args:
+ settings: The settings to use for the handler.
+ 用于处理程序的设置。
+
+ Returns:
+ RequestsDownloadHandler: A new download handler instance.
+ 一个新的下载处理程序实例。
+ """
  return cls(settings)

  async def download_request(self, request: Request, _) -> HtmlResponse:
+ """
+ Download a request using requests.
+ 使用requests下载请求。
+
+ This method implements the BaseDownloadHandler.download_request interface.
+ It wraps the actual download logic in _download_request and handles
+ requests-specific exceptions.
+ 此方法实现了BaseDownloadHandler.download_request接口。
+ 它将实际的下载逻辑包装在_download_request中,并处理requests特定的异常。
+
+ Args:
+ request: The request to download.
+ 要下载的请求。
+ _: The spider (not used in this implementation).
+ 爬虫(在此实现中未使用)。
+
+ Returns:
+ HtmlResponse: The response from the server.
+ 来自服务器的响应。
+
+ Raises:
+ DownloadError: If a RequestsError occurs during the download.
+ 如果在下载过程中发生RequestsError。
+ """
  try:
  return await self._download_request(request)
  except RequestsError as e:
+ # Wrap requests-specific exceptions in a generic DownloadError
+ # 将requests特定的异常包装在通用的DownloadError中
  raise DownloadError(real_error=e) from e

  async def _download_request(self, request: Request) -> HtmlResponse:
+ """
+ Internal method to perform the actual download using requests.
+ 使用requests执行实际下载的内部方法。
+
+ This method configures and uses the requests library to perform the request,
+ handling SSL settings, proxies, cookies, and other request parameters.
+ Since requests is synchronous, it runs in a thread pool using asyncio.to_thread.
+ 此方法配置并使用requests库执行请求,处理SSL设置、代理、Cookie和其他请求参数。
+ 由于requests是同步的,它使用asyncio.to_thread在线程池中运行。
+
+ Args:
+ request: The request to download.
+ 要下载的请求。
+
+ Returns:
+ HtmlResponse: The response from the server.
+ 来自服务器的响应。
+ """
+ # Configure request parameters
+ # 配置请求参数
  kwargs = {
  'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
  'cookies': dict(request.cookies),
@@ -35,21 +129,35 @@ class RequestsDownloadHandler(BaseDownloadHandler):
  'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
  'dont_redirect') is None else request.meta.get('dont_redirect')
  }
+
+ # Handle request body data
+ # 处理请求体数据
  post_data = request.body or None
  if isinstance(post_data, dict):
- kwargs['json'] = post_data
+ kwargs['json'] = post_data # Send as JSON
+ # 作为JSON发送
  else:
- kwargs['data'] = post_data
+ kwargs['data'] = post_data # Send as form data or raw bytes
+ # 作为表单数据或原始字节发送

+ # Set request headers
+ # 设置请求头
  headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
  kwargs['headers'] = headers

+ # Configure proxy if specified
+ # 如果指定,配置代理
  proxy = request.meta.get("proxy")
  if proxy:
  kwargs["proxies"] = {'http': proxy, 'https': proxy}
  logger.debug(f"use proxy {proxy}: {request.url}")

+ # Execute the request in a thread pool since requests is synchronous
+ # 由于requests是同步的,在线程池中执行请求
  response = await asyncio.to_thread(requests.request, request.method, request.url, **kwargs)
+
+ # Convert requests response to HtmlResponse
+ # 将requests响应转换为HtmlResponse
  return HtmlResponse(
  response.url,
  status=response.status_code,
@@ -60,4 +168,14 @@ class RequestsDownloadHandler(BaseDownloadHandler):
  )

  async def close(self):
+ """
+ Close the download handler and release resources.
+ 关闭下载处理程序并释放资源。
+
+ This method is called when the spider is closing. In this implementation,
+ there are no persistent resources to clean up since the requests library
+ doesn't maintain persistent connections between calls in this usage pattern.
+ 当爬虫关闭时调用此方法。在此实现中,没有需要清理的持久资源,
+ 因为在这种使用模式下,requests库不会在调用之间维护持久连接。
+ """
  pass
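
Note on the hunk above: RequestsDownloadHandler only consults a handful of settings and request.meta keys, all visible in _download_request. A minimal sketch of the values it reads; how the handler itself is registered is outside this diff.

# Sketch only: settings and meta keys read by RequestsDownloadHandler above.
custom_settings = {
    "VERIFY_SSL": True,                                       # read in __init__
    "DOWNLOAD_TIMEOUT": 30,                                   # passed to requests as timeout
    "REDIRECT_ENABLED": True,                                  # default for allow_redirects
    "DEFAULT_REQUEST_HEADERS": {"User-Agent": "Mozilla/5.0"},  # fallback when request.headers is empty
}
meta = {
    "proxy": "http://127.0.0.1:8888",  # expanded to {'http': proxy, 'https': proxy}
    "dont_redirect": None,             # when not None, passed through as allow_redirects
}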

aioscrapy/core/downloader/handlers/webdriver/__init__.py
@@ -0,0 +1,2 @@
+ from .playwright import PlaywrightDownloadHandler, PlaywrightDriver
+ from .drissionpage import DrissionPageDownloadHandler, DrissionPageDriver
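
The two-line __init__.py above makes the new webdriver package the single import point for both browser-based handlers, replacing the removed handlers/playwright package (entries 106-108 in the file list). A sketch of the resulting import surface:

# Sketch only: imports exposed by the new webdriver package.
from aioscrapy.core.downloader.handlers.webdriver import (
    PlaywrightDownloadHandler,
    DrissionPageDownloadHandler,
)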

aioscrapy/core/downloader/handlers/webdriver/drissionpage.py
@@ -0,0 +1,493 @@
+ """
+ Download handler implementation using DrissionPage.
+ 使用DrissionPage的下载处理程序实现。
+
+ This module provides a download handler that uses DrissionPage to perform browser-based HTTP requests.
+ It supports full browser automation, JavaScript execution, and event handling.
+ 此模块提供了一个使用DrissionPage执行基于浏览器的HTTP请求的下载处理程序。
+ 它支持完整的浏览器自动化、JavaScript执行和事件处理。
+ """
+
+ import asyncio
+ from typing import Dict, Optional, Tuple, Any
+ from urllib.parse import urlparse
+
+ from DrissionPage.errors import BaseError
+ from DrissionPage import ChromiumPage, ChromiumOptions
+
+ from aioscrapy import Request, Spider
+ from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+ from aioscrapy.exceptions import DownloadError, NotSupported
+ from aioscrapy.http import WebDriverResponse
+ from aioscrapy.settings import Settings
+ from .driverpool import WebDriverPool, WebDriverBase
+
+
+ class DrissionPageDriver(WebDriverBase):
+ """
+ A wrapper around DrissionPage's browser automation API.
+ 对DrissionPage浏览器自动化API的包装。
+
+ This class provides a simplified interface for working with DrissionPage browsers,
+ handling initialization, proxy configuration, and browser lifecycle management.
+ 此类提供了一个简化的接口来使用DrissionPage浏览器,处理初始化、代理配置和浏览器生命周期管理。
+ """
+ _port = 0
+
+ @classmethod
+ def port(cls):
+ """
+ Generate a unique port number for browser instances.
+ 为浏览器实例生成唯一的端口号。
+
+ This method increments a class-level counter to ensure each browser instance
+ gets a unique debugging port, which prevents port conflicts when running
+ multiple browser instances simultaneously.
+ 此方法递增类级别计数器,以确保每个浏览器实例获得唯一的调试端口,
+ 这可以防止同时运行多个浏览器实例时发生端口冲突。
+
+ Returns:
+ int: A unique port number.
+ 一个唯一的端口号。
+ """
+ cls._port += 1
+ return cls._port
+
+ def __init__(
+ self,
+ *,
+ proxy: Optional[str] = None,
+ user_agent: str = None,
+ headless: bool = False,
+ arguments=None,
+ max_uses: Optional[int] = None,
+ **kwargs # Additional arguments (not used directly)
+ # 其他参数(不直接使用)
+ ):
+ """
+ Initialize the DrissionPageDriver.
+ 初始化DrissionPageDriver。
+
+ Args:
+ proxy: Optional proxy URL to use for browser connections.
+ 用于浏览器连接的可选代理URL。
+ user_agent: Optional user agent string to use.
+ 要使用的可选用户代理字符串。
+ headless: Whether to run the browser in headless mode (without GUI).
+ 是否在无头模式下运行浏览器(无GUI)。
+ arguments: Additional command-line arguments to pass to the browser.
+ 传递给浏览器的其他命令行参数。
+ max_uses: Optional count of uses after which the browser should be recycled.
+ 浏览器应该被回收的使用次数的可选计数。
+ **kwargs: Additional arguments passed to the parent class or for future extensions.
+ 传递给父类的其他参数或用于未来扩展。
+ These arguments are intentionally not used directly in this implementation
+ but are included for compatibility with the WebDriverBase interface.
+ 这些参数在此实现中有意不直接使用,但包含它们是为了与WebDriverBase接口兼容。
+ """
+ # Browser configuration
+ # 浏览器配置
+ self.proxy = proxy # Proxy URL 代理URL
+ self.max_uses = max_uses # Counter for browser recycling 浏览器回收计数器
+ self.user_agent = user_agent # User agent string 用户代理字符串
+ self.headless = headless
+ self.arguments = arguments # Apply additional browser arguments 浏览器启动参数
+
+ # DrissionPage components (initialized in setup())
+ # DrissionPage组件(在setup()中初始化)
+ self.page: Optional[ChromiumPage] = None # Browser page 浏览器页面
+ self.url = None # Current URL (used for cookie management) 当前URL(用于Cookie管理)
+
+ async def setup(self):
+ """
+ Initialize the DrissionPage browser and page.
+ 初始化DrissionPage浏览器和页面。
+
+ This method creates a ChromiumOptions instance with the specified configuration,
+ then initializes a ChromiumPage with these options. It applies all configuration
+ options such as proxy settings, window size, and user agent.
+ 此方法创建具有指定配置的ChromiumOptions实例,然后使用这些选项初始化ChromiumPage。
+ 它应用所有配置选项,如代理设置、窗口大小和用户代理。
+
+ Returns:
+ None
+ """
+ # Run the browser initialization in a separate thread to avoid blocking the event loop
+ # 在单独的线程中运行浏览器初始化,以避免阻塞事件循环
+ await asyncio.to_thread(self._setup_sync)
+
+ def _setup_sync(self):
+ """
+ Synchronous implementation of browser setup.
+ 浏览器设置的同步实现。
+
+ This method is called by setup() in a separate thread to perform the actual
+ browser initialization without blocking the event loop.
+ 此方法由setup()在单独的线程中调用,以执行实际的浏览器初始化,而不会阻塞事件循环。
+
+ Returns:
+ None
+ """
+ # Create ChromiumOptions with the specified configuration
+ # 使用指定的配置创建ChromiumOptions
+ co = ChromiumOptions()
+
+ co.set_local_port(9221+self.port())
+
+ # Apply additional browser arguments
+ # 应用其他浏览器参数
+ if self.arguments:
+ for arg in self.arguments:
+ if isinstance(arg, str):
+ co.set_argument(arg)
+ elif isinstance(arg, (list, tuple)):
+ co.set_argument(*arg)
+ else:
+ raise BaseError(f"arguments error: {arg}")
+
+ co.headless(self.headless)
+
+ # Apply proxy settings if specified
+ # 如果指定了代理设置,则应用它们
+ if self.proxy:
+ proxy_url = urlparse(self.proxy)
+ proxy_server = f"{proxy_url.scheme}://{proxy_url.netloc}"
+ co.set_proxy(proxy_server)
+
+ # Apply user agent if specified
+ # 如果指定了用户代理,则应用它
+ if self.user_agent:
+ co.set_user_agent(self.user_agent)
+
+ # Create the ChromiumPage with the configured options
+ # 使用配置的选项创建ChromiumPage
+ self.page = ChromiumPage(co)
+
+ async def quit(self):
+ """
+ Close the browser and clean up resources.
+ 关闭浏览器并清理资源。
+
+ This method closes the browser and releases all associated resources.
+ 此方法关闭浏览器并释放所有相关资源。
+
+ Returns:
+ None
+ """
+ # Run the browser cleanup in a separate thread to avoid blocking the event loop
+ # 在单独的线程中运行浏览器清理,以避免阻塞事件循环
+ if self.page:
+ await asyncio.to_thread(self._quit_sync)
+
+ def _quit_sync(self):
+ """
+ Synchronous implementation of browser cleanup.
+ 浏览器清理的同步实现。
+
+ This method is called by quit() in a separate thread to perform the actual
+ browser cleanup without blocking the event loop.
+ 此方法由quit()在单独的线程中调用,以执行实际的浏览器清理,而不会阻塞事件循环。
+
+ Returns:
+ None
+ """
+ if self.page:
+ self.page.quit()
+
+ async def get_cookies(self) -> Dict[str, str]:
+ """
+ Get all cookies from the browser.
+ 从浏览器获取所有Cookie。
+
+ This method retrieves all cookies from the current browser session
+ and returns them as a dictionary of name-value pairs.
+ 此方法从当前浏览器会话检索所有Cookie,并将它们作为名称-值对的字典返回。
+
+ Returns:
+ dict: A dictionary of cookie name-value pairs.
+ Cookie名称-值对的字典。
+ """
+ # Run the cookie retrieval in a separate thread to avoid blocking the event loop
+ # 在单独的线程中运行Cookie检索,以避免阻塞事件循环
+ cookies = await asyncio.to_thread(self._get_cookies_sync)
+ return cookies
+
+ def _get_cookies_sync(self) -> Dict[str, str]:
+ """
+ Synchronous implementation of cookie retrieval.
+ Cookie检索的同步实现。
+
+ This method is called by get_cookies() in a separate thread to perform the actual
+ cookie retrieval without blocking the event loop.
+ 此方法由get_cookies()在单独的线程中调用,以执行实际的Cookie检索,而不会阻塞事件循环。
+
+ Returns:
+ dict: A dictionary of cookie name-value pairs.
+ Cookie名称-值对的字典。
+ """
+ # Convert the list of cookie objects to a name-value dictionary
+ # 将Cookie对象列表转换为名称-值字典
+ cookies = {}
+ if self.page:
+ for cookie in self.page.cookies(all_domains=True):
+ cookies[cookie.get('name')] = cookie.get('value')
+ return cookies
+
+ async def set_cookies(self, cookies: Dict[str, str]):
+ """
+ Set cookies in the browser.
+ 在浏览器中设置Cookie。
+
+ This method adds the provided cookies to the browser,
+ associating them with the current URL.
+ 此方法将提供的Cookie添加到浏览器中,将它们与当前URL关联。
+
+ Args:
+ cookies: A dictionary of cookie name-value pairs to set.
+ 要设置的Cookie名称-值对的字典。
+
+ Returns:
+ None
+ """
+ # Run the cookie setting in a separate thread to avoid blocking the event loop
+ # 在单独的线程中运行Cookie设置,以避免阻塞事件循环
+ await asyncio.to_thread(self._set_cookies_sync, cookies)
+
+ def _set_cookies_sync(self, cookies: Dict[str, str]):
+ """
+ Synchronous implementation of cookie setting.
+ Cookie设置的同步实现。
+
+ This method is called by set_cookies() in a separate thread to perform the actual
+ cookie setting without blocking the event loop.
+ 此方法由set_cookies()在单独的线程中调用,以执行实际的Cookie设置,而不会阻塞事件循环。
+
+ Args:
+ cookies: A dictionary of cookie name-value pairs to set.
+ 要设置的Cookie名称-值对的字典。
+
+ Returns:
+ None
+ """
+ if self.page:
+ self.page.set.cookies(cookies)
+
+
+ class DrissionPageDownloadHandler(BaseDownloadHandler):
+ """
+ Download handler that uses DrissionPage to perform browser-based HTTP requests.
+ 使用DrissionPage执行基于浏览器的HTTP请求的下载处理程序。
+
+ This handler implements the BaseDownloadHandler interface using DrissionPage,
+ which provides a high-level API to control browsers. It supports full browser
+ automation, JavaScript execution, and event handling.
+ 此处理程序使用DrissionPage实现BaseDownloadHandler接口,DrissionPage提供了控制浏览器的
+ 高级API。它支持完整的浏览器自动化、JavaScript执行和事件处理。
+ """
+
+ def __init__(self, settings: Settings):
+ """
+ Initialize the DrissionPageHandler.
+ 初始化DrissionPageHandler。
+
+ Args:
+ settings: The settings object containing configuration for the handler.
+ 包含处理程序配置的设置对象。
+ """
+ self.settings = settings
+
+ # Get DrissionPage client arguments from settings
+ # 从设置中获取DrissionPage客户端参数
+ client_args = settings.getdict('DP_ARGS', {})
+
+ # Configure the pool size for browser instances
+ # 配置浏览器实例的池大小
+ pool_size = client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
+
+ # Initialize the WebDriver pool
+ # 初始化WebDriver池
+ self._webdriver_pool = WebDriverPool(DrissionPageDriver, pool_size=pool_size, **client_args)
+
+ @classmethod
+ def from_settings(cls, settings: Settings):
+ """
+ Create a download handler from settings.
+ 从设置创建下载处理程序。
+
+ This is a factory method that creates a new DrissionPageHandler
+ instance with the given settings.
+ 这是一个工厂方法,使用给定的设置创建一个新的DrissionPageHandler实例。
+
+ Args:
+ settings: The settings to use for the handler.
+ 用于处理程序的设置。
+
+ Returns:
+ DrissionPageHandler: A new download handler instance.
+ 一个新的下载处理程序实例。
+ """
+ return cls(settings)
+
+ async def download_request(self, request: Request, spider: Spider) -> WebDriverResponse:
+ """
+ Download a request using DrissionPage.
+ 使用DrissionPage下载请求。
+
+ This method implements the BaseDownloadHandler.download_request interface.
+ It wraps the actual download logic in _download_request and handles
+ DrissionPage-specific exceptions.
+ 此方法实现了BaseDownloadHandler.download_request接口。
+ 它将实际的下载逻辑包装在_download_request中,并处理DrissionPage特定的异常。
+
+ Args:
+ request: The request to download.
+ 要下载的请求。
+ spider: The spider that initiated the request.
+ 发起请求的爬虫。
+
+ Returns:
+ WebDriverResponse: The response from the browser with DrissionPage driver attached.
+ 附加了DrissionPage驱动程序的浏览器响应。
+ This response contains the page content, cookies, and a reference
+ to the browser instance for further interaction.
+ 此响应包含页面内容、Cookie和对浏览器实例的引用,以便进一步交互。
+
+ Raises:
+ DownloadError: If a DrissionPage error or any other exception occurs during the download.
+ 如果在下载过程中发生DrissionPage错误或任何其他异常。
+ """
+ try:
+ return await self._download_request(request, spider)
+ except BaseError as e:
+ # Wrap DrissionPage-specific exceptions in a generic DownloadError
+ # 将DrissionPage特定的异常包装在通用的DownloadError中
+ raise DownloadError(real_error=e) from e
+
+ async def _download_request(self, request: Request, spider) -> WebDriverResponse:
+ """
+ Internal method to perform the actual download using DrissionPage.
+ 使用DrissionPage执行实际下载的内部方法。
+
+ This method configures and uses a DrissionPage browser to perform the request,
+ handling cookies, user agent, proxies, and event listeners. It also supports
+ custom browser actions defined in the spider.
+ 此方法配置并使用DrissionPage浏览器执行请求,处理Cookie、用户代理、代理和事件监听器。
+ 它还支持在爬虫中定义的自定义浏览器操作。
+
+ Args:
+ request: The request to download.
+ 要下载的请求。
+ spider: The spider that initiated the request.
+ 发起请求的爬虫。
+ This can be used to access spider-specific settings and methods,
+ particularly the process_action method if defined.
+ 这可用于访问爬虫特定的设置和方法,特别是process_action方法(如果已定义)。
+
+ Returns:
+ WebDriverResponse: The response from the browser with DrissionPage driver attached.
+ 附加了DrissionPage驱动程序的浏览器响应。
+ This response contains the page content, cookies, and a reference
+ to the browser instance for further interaction.
+ 此响应包含页面内容、Cookie和对浏览器实例的引用,以便进一步交互。
+
+ Raises:
+ NotSupported: If the spider's process_action method is defined as an async function.
+ 如果爬虫的process_action方法被定义为异步函数。
+ Exception: If any other error occurs during the browser automation.
+ 如果在浏览器自动化过程中发生任何其他错误。
+ """
+ # Extract request parameters
+ # 提取请求参数
+ cookies = dict(request.cookies)
+ timeout = request.meta.get('download_timeout', 30) # In seconds
+ # 以秒为单位
+ user_agent = request.headers.get("User-Agent")
+ proxy: str = request.meta.get("proxy")
+ url = request.url
+
+ # Dictionary to store custom data
+ # 存储自定义数据的字典
+ cache_response = {}
+
+ # Configure browser options
+ # 配置浏览器选项
+ kwargs = dict()
+ if proxy:
+ kwargs['proxy'] = proxy
+ if user_agent:
+ kwargs['user_agent'] = user_agent
+
+ # Get a browser instance from the pool
+ # 从池中获取浏览器实例
+ driver: DrissionPageDriver = await self._webdriver_pool.get(**kwargs)
+
+ try:
+ # Set cookies if provided
+ # 如果提供了Cookie,则设置Cookie
+ if cookies:
+ driver.url = url
+ await driver.set_cookies(cookies)
+
+ driver.page.listen.start('gitee.com/explore')
+ # Navigate to the URL
+ # 导航到URL
+ await asyncio.to_thread(driver.page.get, url, timeout=timeout)
+
+ # Execute custom actions if defined in the spider
+ # 如果在爬虫中定义了自定义操作,则执行
+ if process_action_fn := getattr(spider, 'process_action', None):
+ if asyncio.iscoroutinefunction(process_action_fn):
+ raise NotSupported(f'process_action can not use async')
+
+ action_result = await asyncio.to_thread(process_action_fn, driver, request)
+ if action_result:
+ cache_response[action_result[0]] = action_result[1]
+
+ def get_html(d):
+ """
+ Get the HTML content of the current page.
+ 获取当前页面的HTML内容。
+
+ This is a helper function to get the HTML content from the driver
+ in a way that can be run in a separate thread.
+ 这是一个辅助函数,用于以可以在单独线程中运行的方式从驱动程序获取HTML内容。
+
+ Args:
+ d: The DrissionPageDriver instance.
+ DrissionPageDriver实例。
+
+ Returns:
+ str: The HTML content of the current page.
+ 当前页面的HTML内容。
+ """
+ return d.page.html
+
+ # Create and return the final response
+ # 创建并返回最终响应
+ return WebDriverResponse(
+ url=driver.page.url,
+ status=200,
+ text=await asyncio.to_thread(get_html, driver),
+ cookies=await driver.get_cookies(),
+ cache_response=cache_response,
+ driver=driver,
+ driver_pool=self._webdriver_pool
+ )
+ except Exception as e:
+ # Remove the driver from the pool on error
+ # 出错时从池中移除驱动程序
+ await self._webdriver_pool.remove(driver)
+ raise e
+
+ async def close(self):
+ """
+ Close the download handler and release resources.
+ 关闭下载处理程序并释放资源。
+
+ This method is called when the spider is closing. It closes all browser
+ instances in the pool and releases associated resources.
+ 当爬虫关闭时调用此方法。它关闭池中的所有浏览器实例并释放相关资源。
+ """
+ # Close all browser instances in the pool
+ # 关闭池中的所有浏览器实例
+ await self._webdriver_pool.close()
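
Taken together, the handler above reads its driver options from a DP_ARGS settings dict (pool_size is popped, the rest is forwarded to DrissionPageDriver) and calls an optional synchronous process_action(driver, request) hook on the spider, whose (key, value) return is stored in the response's cache_response. A hedged sketch of a spider using both, assuming the Scrapy-style custom_settings mechanism; how requests are routed to this handler is not shown in this diff.

# Sketch only, based on the code above; names other than the DP_ARGS keys and
# process_action are illustrative.
from aioscrapy import Spider


class ExampleSpider(Spider):
    name = "example"
    custom_settings = {
        "DP_ARGS": {
            "pool_size": 2,    # popped; defaults to CONCURRENT_REQUESTS
            "headless": True,  # forwarded to DrissionPageDriver
            "max_uses": 50,    # forwarded to DrissionPageDriver
        },
    }

    # Must be a plain function: async hooks raise NotSupported; it runs via asyncio.to_thread.
    def process_action(self, driver, request):
        # driver.page is a DrissionPage ChromiumPage; the returned pair is
        # stored as cache_response["raw_html"] on the WebDriverResponse.
        return "raw_html", driver.page.html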