aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/core/downloader/handlers/webdriver/playwright.py
@@ -0,0 +1,498 @@
+ """
+ Download handler implementation using Playwright.
+
+ This module provides a download handler that uses Playwright to perform browser-based HTTP requests.
+ It supports full browser automation, JavaScript execution, and event handling.
+ """
+ import os
+ from functools import wraps
+ from typing import Dict, Optional, Tuple, Literal
+ from urllib.parse import urlparse, urlunparse
+
+
+ try:
+     from playwright._impl._errors import Error
+ except ImportError:
+     from playwright._impl._api_types import Error
+
+ from playwright.async_api._generated import Response as EventResponse
+ from playwright.async_api import Page, BrowserContext, ViewportSize, ProxySettings
+ from playwright.async_api import Playwright, Browser
+ from playwright.async_api import async_playwright
+
+ from aioscrapy import Request, Spider
+ from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+ from aioscrapy.exceptions import DownloadError
+ from aioscrapy.http import WebDriverResponse
+ from aioscrapy.settings import Settings
+ from aioscrapy.utils.tools import call_helper
+ from .driverpool import WebDriverPool, WebDriverBase
+
+
+ class PlaywrightDriver(WebDriverBase):
+     """
+     A wrapper around Playwright's browser automation API.
+
+     This class provides a simplified interface for working with Playwright browsers,
+     handling initialization, proxy configuration, and browser lifecycle management.
+     """
+
+     def __init__(
+             self,
+             *,
+             driver_type: Literal["chromium", "firefox", "webkit"] = "chromium",
+             proxy: Optional[str] = None,
+             browser_args: Optional[Dict] = None,
+             context_args: Optional[Dict] = None,
+             window_size: Optional[Tuple[int, int]] = None,
+             user_agent: Optional[str] = None,
+             max_uses: Optional[int] = None,
+             **kwargs  # Additional arguments (not used directly)
+     ):
+         """
+         Initialize the PlaywrightDriver.
+
+         Args:
+             driver_type: The type of browser to use ("chromium", "firefox", or "webkit").
+             proxy: Optional proxy URL to use for browser connections.
+             browser_args: Optional arguments to pass to browser.launch().
+             context_args: Optional arguments to pass to browser.new_context().
+             window_size: Optional tuple of (width, height) for the browser window size.
+             user_agent: Optional user agent string to use.
+             max_uses: Optional count of uses after which the browser should be recycled.
+             **kwargs: Additional arguments (not used directly).
+         """
+         # Browser configuration
+         self.driver_type = driver_type  # Type of browser to use
+         self.proxy = proxy and self.format_context_proxy(proxy)  # Formatted proxy settings
+         self.viewport = window_size and ViewportSize(width=window_size[0], height=window_size[1])  # Browser viewport size
+         self.browser_args = browser_args or {}  # Arguments for browser.launch()
+         self.context_args = context_args or {}  # Arguments for browser.new_context()
+         self.user_agent = user_agent  # User agent string
+
+         # Playwright components (initialized in setup())
+         self.driver: Optional[Playwright] = None  # Playwright instance
+         self.browser: Optional[Browser] = None  # Browser instance
+         self.context: Optional[BrowserContext] = None  # Browser context
+         self.page: Optional[Page] = None  # Browser page
+         self.url = None  # Current URL (used for cookie management)
+         self.max_uses = max_uses  # Use count threshold for browser recycling
+
+     async def setup(self):
+         """
+         Initialize the Playwright browser and page.
+
+         This method starts Playwright, launches the browser, creates a browser context,
+         and opens a new page. It applies all configuration options such as proxy settings,
+         viewport size, and user agent.
+
+         Returns:
+             None
+         """
+         # Create copies of argument dictionaries to avoid modifying the originals
+         browser_args = self.browser_args.copy()
+         context_args = self.context_args.copy()
+
+         # Add --no-sandbox argument for Chrome if not specified
+         if browser_args.get('args') is None:
+             browser_args.update({'args': ["--no-sandbox"]})
+
+         # Ensure storage state directory exists if specified
+         if context_args.get("storage_state") is not None:
+             storage_state_path = context_args.get("storage_state")
+             os.makedirs(os.path.dirname(storage_state_path), exist_ok=True)
+
+         # Apply proxy settings if specified
+         if self.proxy:
+             browser_args.update({'proxy': self.proxy})
+             context_args.update({'proxy': self.proxy})
+
+         # Apply viewport settings if specified
+         if self.viewport:
+             context_args.update({"viewport": self.viewport})
+             context_args.update({"screen": self.viewport})
+
+         # Apply user agent if specified
+         if self.user_agent:
+             context_args.update({'user_agent': self.user_agent})
+
+         # Start Playwright and launch browser
+         self.driver = await async_playwright().start()
+         self.browser: Browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
+
+         # Create browser context and page
+         self.context = await self.browser.new_context(**context_args)
+         self.page = await self.context.new_page()
+
+     @staticmethod
+     def format_context_proxy(proxy) -> ProxySettings:
+         """
+         Format a proxy URL into Playwright's ProxySettings object.
+
+         This method parses a proxy URL (e.g., http://user:pass@host:port) and converts
+         it into a ProxySettings object that Playwright can use.
+
+         Args:
+             proxy: The proxy URL string.
+
+         Returns:
+             ProxySettings: A Playwright ProxySettings object with server, username, and password.
+         """
+         # Parse the proxy URL
+         parsed_url = urlparse(proxy)
+
+         # Create and return a ProxySettings object
+         return ProxySettings(
+             # Remove username:password from the server URL
+             server=urlunparse(parsed_url._replace(netloc=parsed_url.netloc.split('@')[-1])),
+             username=parsed_url.username,
+             password=parsed_url.password,
+         )
+
+     async def quit(self):
+         """
+         Close the browser and clean up resources.
+
+         This method closes the page, browser context, and browser, then stops the
+         Playwright instance, releasing all associated resources.
+
+         Returns:
+             None
+         """
+         # Close the page first
+         await self.page.close()
+
+         try:
+             # Try to close the browser context
+             await self.context.close()
+         except Exception:
+             # Ignore errors when closing the context
+             pass
+         finally:
+             # Always close the browser and stop Playwright
+             await self.browser.close()
+             await self.driver.stop()
+
+     async def get_cookies(self):
+         """
+         Get all cookies from the browser context.
+
+         This method retrieves all cookies from the current browser context
+         and returns them as a dictionary of name-value pairs.
+
+         Returns:
+             dict: A dictionary of cookie name-value pairs.
+         """
+         # Convert the list of cookie objects to a name-value dictionary
+         return {
+             cookie["name"]: cookie["value"]
+             for cookie in await self.page.context.cookies()
+         }
+
+     async def set_cookies(self, cookies: dict):
+         """
+         Set cookies in the browser context.
+
+         This method adds the provided cookies to the browser context,
+         associating them with the current URL.
+
+         Args:
+             cookies: A dictionary of cookie name-value pairs to set.
+
+         Returns:
+             None
+         """
+         # Convert the dictionary to the format expected by Playwright
+         await self.page.context.add_cookies([
+             {
+                 "name": key,
+                 "value": value,
+                 # Use the stored URL or current page URL
+                 "url": self.url or self.page.url
+             }
+             for key, value in cookies.items()
+         ])
+
+
+ class PlaywrightDownloadHandler(BaseDownloadHandler):
+     """
+     Download handler that uses Playwright to perform browser-based HTTP requests.
+
+     This handler implements the BaseDownloadHandler interface using Playwright,
+     which provides a high-level API to control browsers. It supports full browser
+     automation, JavaScript execution, and event handling.
+     """
+
+     def __init__(self, settings: Settings):
+         """
+         Initialize the PlaywrightDownloadHandler.
+
+         Args:
+             settings: The settings object containing configuration for the handler.
+         """
+         self.settings = settings
+
+         # Get Playwright client arguments from settings
+         playwright_client_args = settings.getdict('PLAYWRIGHT_ARGS')
+
+         # Set the default page load event to wait for
+         self.wait_until = playwright_client_args.get('wait_until', 'domcontentloaded')
+
+         # Configure the pool size for browser instances
+         pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
+
+         # Initialize the WebDriver pool
+         self._webdriver_pool = WebDriverPool(PlaywrightDriver, pool_size=pool_size, **playwright_client_args)
+
+     @classmethod
+     def from_settings(cls, settings: Settings):
+         """
+         Create a download handler from settings.
+
+         This is a factory method that creates a new PlaywrightDownloadHandler
+         instance with the given settings.
+
+         Args:
+             settings: The settings to use for the handler.
+
+         Returns:
+             PlaywrightDownloadHandler: A new download handler instance.
+         """
+         return cls(settings)
+
+     async def download_request(self, request: Request, spider: Spider) -> WebDriverResponse:
+         """
+         Download a request using Playwright.
+
+         This method implements the BaseDownloadHandler.download_request interface.
+         It wraps the actual download logic in _download_request and handles
+         Playwright-specific exceptions.
+
+         Args:
+             request: The request to download.
+             spider: The spider that initiated the request.
+
+         Returns:
+             WebDriverResponse: The response from the browser.
+
+         Raises:
+             DownloadError: If a Playwright error or any other exception occurs during the download.
+         """
+         try:
+             return await self._download_request(request, spider)
+         except Error as e:
+             # Wrap Playwright-specific exceptions in a generic DownloadError
+             raise DownloadError(real_error=e) from e
+         except Exception as e:
+             # Wrap any other exceptions in a generic DownloadError
+             raise DownloadError(real_error=e) from e
+
+     async def _download_request(self, request: Request, spider) -> WebDriverResponse:
+         """
+         Internal method to perform the actual download using Playwright.
+
+         This method configures and uses a Playwright browser to perform the request,
+         handling cookies, user agent, proxies, and event listeners. It also supports
+         custom browser actions defined in the spider.
+
+         Args:
+             request: The request to download.
+             spider: The spider that initiated the request.
+
+         Returns:
+             WebDriverResponse: The response from the browser.
+
+         Raises:
+             Exception: If any error occurs during the browser automation.
+         """
+         # Extract request parameters
+         cookies = dict(request.cookies)
+         timeout = request.meta.get('download_timeout', 30) * 1000  # Convert to milliseconds
+         user_agent = request.headers.get("User-Agent")
+         proxy: str = request.meta.get("proxy")
+         url = request.url
+
+         # Dictionary to store responses from event listeners
+         cache_response = {}
+
+         # Wrapper for event handlers to capture their return values
+         def on_event_wrap_handler(func):
+             @wraps(func)
+             async def inner(response):
+                 ret = await func(response)
+                 if ret:
+                     cache_response[ret[0]] = ret[1]
+
+             return inner
+
+         # Configure browser options
+         kwargs = dict()
+         if proxy:
+             kwargs['proxy'] = proxy
+         if user_agent:
+             kwargs['user_agent'] = user_agent
+
+         # Get a browser instance from the pool
+         driver: PlaywrightDriver = await self._webdriver_pool.get(**kwargs)
+
+         # Set up event listeners from spider methods named on_event_<event>
+         driver.page._events = dict()
+         for name in dir(spider):
+             if not name.startswith('on_event_'):
+                 continue
+             driver.page.on(name.replace('on_event_', ''), on_event_wrap_handler(getattr(spider, name)))
+
+         try:
+             # Set cookies if provided
+             if cookies:
+                 driver.url = url
+                 await driver.set_cookies(cookies)
+
+             # Navigate to the URL
+             await driver.page.goto(url, wait_until=request.meta.get('wait_until', self.wait_until), timeout=timeout)
+
+             # Execute custom actions if defined in the spider
+             if process_action_fn := getattr(spider, 'process_action', None):
+                 action_result = await call_helper(process_action_fn, driver, request)
+                 if action_result:
+                     cache_response[action_result[0]] = action_result[1]
+
+             # Process any event responses
+             for cache_key in list(cache_response.keys()):
+                 if isinstance(cache_response[cache_key], EventResponse):
+                     cache_ret = cache_response[cache_key]
+                     # Convert the Playwright response to a WebDriverResponse
+                     cache_response[cache_key] = WebDriverResponse(
+                         url=cache_ret.url,
+                         request=request,
+                         intercept_request=dict(
+                             url=cache_ret.request.url,
+                             headers=cache_ret.request.headers,
+                             data=cache_ret.request.post_data,
+                         ),
+                         headers=cache_ret.headers,
+                         body=await cache_ret.body(),
+                         status=cache_ret.status,
+                     )
+
+             # Create and return the final response
+             return WebDriverResponse(
+                 url=driver.page.url,
+                 status=200,
+                 text=await driver.page.content(),
+                 cookies=await driver.get_cookies(),
+                 cache_response=cache_response,
+                 driver=driver,
+                 driver_pool=self._webdriver_pool,
+             )
+         except Exception as e:
+             # Remove the driver from the pool on error
+             await self._webdriver_pool.remove(driver)
+             raise e
+
+     async def close(self):
+         """
+         Close the download handler and release resources.
+
+         This method is called when the spider is closing. It closes all browser
+         instances in the pool and releases associated resources.
+         """
+         # Close all browser instances in the pool
+         await self._webdriver_pool.close()
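
The handler above is driven entirely by the PLAYWRIGHT_ARGS setting: wait_until is read as the default page-load event (overridable per request via meta['wait_until']), pool_size is popped off (falling back to CONCURRENT_REQUESTS), and the remaining keys are forwarded to PlaywrightDriver. A minimal configuration sketch follows; the DOWNLOAD_HANDLERS registration is an assumption based on Scrapy-style handler resolution and is not shown in this diff.

# settings.py -- a sketch, not taken from the aio-scrapy distribution
PLAYWRIGHT_ARGS = {
    "driver_type": "chromium",            # "chromium", "firefox", or "webkit"
    "wait_until": "domcontentloaded",     # default page-load event
    "pool_size": 4,                       # popped before the rest reaches PlaywrightDriver
    "window_size": (1920, 1080),          # becomes both "viewport" and "screen" in context_args
    "max_uses": 50,                       # recycle a browser after this many uses
    "browser_args": {"headless": True},   # forwarded to browser.launch()
    "context_args": {"locale": "en-US"},  # forwarded to browser.new_context()
}

# Assumed Scrapy-style registration; verify the setting name against your aioscrapy version.
DOWNLOAD_HANDLERS = {
    "http": "aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler",
    "https": "aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler",
}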
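_download_request wires any spider method named on_event_<event> to the corresponding Playwright page event and stores each handler's optional (key, value) return in cache_response on the final WebDriverResponse; a process_action(driver, request) coroutine, if present, runs after page.goto(). A hedged sketch of a spider using both hooks (the spider name, URLs, and the cache_response attribute access are illustrative assumptions):

from aioscrapy import Spider

class ExampleSpider(Spider):
    name = "example"  # hypothetical spider
    start_urls = ["https://example.com"]

    async def on_event_response(self, response):
        # Registered by the handler as page.on("response", ...).
        # Returning a (key, value) pair stores it in cache_response;
        # Playwright Response objects are converted to WebDriverResponse.
        if "/api/" in response.url:
            return "api_capture", response

    async def process_action(self, driver, request):
        # Optional browser actions executed after page.goto().
        await driver.page.wait_for_timeout(500)

    async def parse(self, response):
        captured = response.cache_response.get("api_capture")  # assumed attribute
        if captured is not None:
            yield {"api_url": captured.url, "status": captured.status}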
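format_context_proxy splits the credentials out of an authenticated proxy URL because Playwright expects server, username, and password as separate ProxySettings fields. A quick illustration of the transformation (the proxy values are hypothetical):

from aioscrapy.core.downloader.handlers.webdriver.playwright import PlaywrightDriver

proxy_settings = PlaywrightDriver.format_context_proxy("http://user:secret@127.0.0.1:8080")
# server   -> "http://127.0.0.1:8080"   (the "user:secret@" prefix is stripped from the netloc)
# username -> "user"
# password -> "secret"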