aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/core/downloader/handlers/webdriver/playwright.py (new file)

@@ -0,0 +1,498 @@
"""
Download handler implementation using Playwright.

This module provides a download handler that uses Playwright to perform browser-based HTTP requests.
It supports full browser automation, JavaScript execution, and event handling.
"""
import os
from functools import wraps
from typing import Dict, Optional, Tuple, Literal
from urllib.parse import urlparse, urlunparse

# The Error class moved between Playwright releases; support both locations.
try:
    from playwright._impl._errors import Error
except ImportError:
    from playwright._impl._api_types import Error

from playwright.async_api._generated import Response as EventResponse
from playwright.async_api import Page, BrowserContext, ViewportSize, ProxySettings
from playwright.async_api import Playwright, Browser
from playwright.async_api import async_playwright

from aioscrapy import Request, Spider
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.exceptions import DownloadError
from aioscrapy.http import WebDriverResponse
from aioscrapy.settings import Settings
from aioscrapy.utils.tools import call_helper
from .driverpool import WebDriverPool, WebDriverBase


class PlaywrightDriver(WebDriverBase):
    """
    A wrapper around Playwright's browser automation API.

    This class provides a simplified interface for working with Playwright browsers,
    handling initialization, proxy configuration, and browser lifecycle management.
    """

    def __init__(
            self,
            *,
            driver_type: Literal["chromium", "firefox", "webkit"] = "chromium",
            proxy: Optional[str] = None,
            browser_args: Optional[Dict] = None,
            context_args: Optional[Dict] = None,
            window_size: Optional[Tuple[int, int]] = None,
            user_agent: Optional[str] = None,
            max_uses: Optional[int] = None,
            **kwargs  # Additional arguments (not used directly)
    ):
        """
        Initialize the PlaywrightDriver.

        Args:
            driver_type: The type of browser to use ("chromium", "firefox", or "webkit").
            proxy: Optional proxy URL to use for browser connections.
            browser_args: Optional arguments to pass to browser.launch().
            context_args: Optional arguments to pass to browser.new_context().
            window_size: Optional (width, height) tuple for the browser window size.
            user_agent: Optional user agent string to use.
            max_uses: Optional number of uses after which the browser should be recycled.
            **kwargs: Additional arguments (not used directly).
        """
        # Browser configuration
        self.driver_type = driver_type  # Type of browser to use
        self.proxy = proxy and self.format_context_proxy(proxy)  # Formatted proxy settings
        self.viewport = window_size and ViewportSize(width=window_size[0], height=window_size[1])  # Viewport size
        self.browser_args = browser_args or {}  # Arguments for browser.launch()
        self.context_args = context_args or {}  # Arguments for browser.new_context()
        self.user_agent = user_agent  # User agent string

        # Playwright components (initialized in setup())
        self.driver: Optional[Playwright] = None  # Playwright instance
        self.browser: Optional[Browser] = None  # Browser instance
        self.context: Optional[BrowserContext] = None  # Browser context
        self.page: Optional[Page] = None  # Browser page
        self.url = None  # Current URL (used for cookie management)
        self.max_uses = max_uses  # Counter for browser recycling

    async def setup(self):
        """
        Initialize the Playwright browser and page.

        This method starts Playwright, launches the browser, creates a browser context,
        and opens a new page. It applies all configuration options such as proxy settings,
        viewport size, and user agent.
        """
        # Create copies of the argument dictionaries to avoid modifying the originals
        browser_args = self.browser_args.copy()
        context_args = self.context_args.copy()

        # Add the --no-sandbox argument if no launch args were specified
        if browser_args.get('args') is None:
            browser_args.update({'args': ["--no-sandbox"]})

        # Ensure the storage state directory exists if one was specified
        if context_args.get("storage_state") is not None:
            storage_state_path = context_args.get("storage_state")
            os.makedirs(os.path.dirname(storage_state_path), exist_ok=True)

        # Apply proxy settings if specified
        if self.proxy:
            browser_args.update({'proxy': self.proxy})
            context_args.update({'proxy': self.proxy})

        # Apply viewport settings if specified
        if self.viewport:
            context_args.update({"viewport": self.viewport})
            context_args.update({"screen": self.viewport})

        # Apply the user agent if specified
        if self.user_agent:
            context_args.update({'user_agent': self.user_agent})

        # Start Playwright and launch the browser
        self.driver = await async_playwright().start()
        self.browser: Browser = await getattr(self.driver, self.driver_type).launch(**browser_args)

        # Create the browser context and page
        self.context = await self.browser.new_context(**context_args)
        self.page = await self.context.new_page()

    @staticmethod
    def format_context_proxy(proxy) -> ProxySettings:
        """
        Format a proxy URL into Playwright's ProxySettings object.

        This method parses a proxy URL (e.g. http://user:pass@host:port) and converts
        it into a ProxySettings object that Playwright can use.

        Args:
            proxy: The proxy URL string.

        Returns:
            ProxySettings: A Playwright ProxySettings object with server, username, and password.
        """
        # Parse the proxy URL
        parsed_url = urlparse(proxy)

        # Create and return a ProxySettings object
        return ProxySettings(
            # Remove username:password from the server URL
            server=urlunparse(parsed_url._replace(netloc=parsed_url.netloc.split('@')[-1])),
            username=parsed_url.username,
            password=parsed_url.password,
        )

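For reference, a minimal sketch of what `format_context_proxy` yields for an authenticated proxy URL. The address and credentials below are illustrative only; since Playwright's `ProxySettings` is a `TypedDict`, the result prints as a plain dict:

```python
# PlaywrightDriver is the class defined above; the proxy URL is hypothetical.
settings = PlaywrightDriver.format_context_proxy("http://alice:secret@127.0.0.1:8888")
print(settings)
# {'server': 'http://127.0.0.1:8888', 'username': 'alice', 'password': 'secret'}
```
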
    async def quit(self):
        """
        Close the browser and clean up resources.

        This method closes the page, the browser context, and the browser, then stops
        the Playwright instance, releasing all associated resources.
        """
        # Close the page first
        await self.page.close()

        try:
            # Try to close the browser context
            await self.context.close()
        except Exception:
            # Ignore errors when closing the context
            pass
        finally:
            # Always close the browser and stop Playwright
            await self.browser.close()
            await self.driver.stop()

    async def get_cookies(self):
        """
        Get all cookies from the browser context.

        This method retrieves all cookies from the current browser context
        and returns them as a dictionary of name-value pairs.

        Returns:
            dict: A dictionary of cookie name-value pairs.
        """
        # Convert the list of cookie objects to a name-value dictionary
        return {
            cookie["name"]: cookie["value"]
            for cookie in await self.page.context.cookies()
        }

    async def set_cookies(self, cookies: dict):
        """
        Set cookies in the browser context.

        This method adds the provided cookies to the browser context,
        associating them with the current URL.

        Args:
            cookies: A dictionary of cookie name-value pairs to set.
        """
        # Convert the dictionary to the format expected by Playwright
        await self.page.context.add_cookies([
            {
                "name": key,
                "value": value,
                # Use the stored URL or the current page URL
                "url": self.url or self.page.url
            }
            for key, value in cookies.items()
        ])

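Taken together, the lifecycle methods above can be exercised directly, outside the handler. A minimal sketch, assuming Playwright and a Chromium build are installed; the URL is illustrative:

```python
import asyncio

async def demo():
    driver = PlaywrightDriver(driver_type="chromium", window_size=(1280, 800))
    await driver.setup()                      # start Playwright, launch browser, open page
    try:
        # driver.page is a regular Playwright Page, so its full API is available
        await driver.page.goto("https://example.com")
        print(await driver.get_cookies())     # {} until the site sets cookies
    finally:
        await driver.quit()                   # close page/context/browser, stop Playwright

asyncio.run(demo())
```
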
class PlaywrightDownloadHandler(BaseDownloadHandler):
    """
    Download handler that uses Playwright to perform browser-based HTTP requests.

    This handler implements the BaseDownloadHandler interface using Playwright,
    which provides a high-level API to control browsers. It supports full browser
    automation, JavaScript execution, and event handling.
    """

    def __init__(self, settings: Settings):
        """
        Initialize the PlaywrightDownloadHandler.

        Args:
            settings: The settings object containing configuration for the handler.
        """
        self.settings = settings

        # Get the Playwright client arguments from settings
        playwright_client_args = settings.getdict('PLAYWRIGHT_ARGS')

        # Set the default page-load event to wait for
        self.wait_until = playwright_client_args.get('wait_until', 'domcontentloaded')

        # Configure the pool size for browser instances
        pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))

        # Initialize the WebDriver pool
        self._webdriver_pool = WebDriverPool(PlaywrightDriver, pool_size=pool_size, **playwright_client_args)

    @classmethod
    def from_settings(cls, settings: Settings):
        """
        Create a download handler from settings.

        This is a factory method that creates a new PlaywrightDownloadHandler
        instance with the given settings.

        Args:
            settings: The settings to use for the handler.

        Returns:
            PlaywrightDownloadHandler: A new download handler instance.
        """
        return cls(settings)

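Only `PLAYWRIGHT_ARGS`, `wait_until`, `pool_size`, and `CONCURRENT_REQUESTS` appear in this file; how the handler gets registered is not shown in this diff. A hedged settings sketch, where the `DOWNLOAD_HANDLERS` wiring is an assumption:

```python
# Settings sketch; the DOWNLOAD_HANDLERS paths are assumed, not confirmed by this diff.
DOWNLOAD_HANDLERS = {
    "http": "aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler",
    "https": "aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler",
}

PLAYWRIGHT_ARGS = {
    "driver_type": "chromium",           # forwarded to PlaywrightDriver
    "wait_until": "domcontentloaded",    # default page-load event read in __init__
    "pool_size": 4,                      # falls back to CONCURRENT_REQUESTS when absent
    "window_size": (1280, 800),          # becomes the context viewport
    "browser_args": {"headless": True},  # passed through to browser.launch()
}
```
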
    async def download_request(self, request: Request, spider: Spider) -> WebDriverResponse:
        """
        Download a request using Playwright.

        This method implements the BaseDownloadHandler.download_request interface.
        It wraps the actual download logic in _download_request and handles
        Playwright-specific exceptions.

        Args:
            request: The request to download.
            spider: The spider that initiated the request.

        Returns:
            WebDriverResponse: The response from the browser.

        Raises:
            DownloadError: If a Playwright error or any other exception occurs during the download.
        """
        try:
            return await self._download_request(request, spider)
        except Error as e:
            # Wrap Playwright-specific exceptions in a generic DownloadError
            raise DownloadError(real_error=e) from e
        except Exception as e:
            # Wrap any other exceptions in a generic DownloadError
            raise DownloadError(real_error=e) from e

    async def _download_request(self, request: Request, spider) -> WebDriverResponse:
        """
        Internal method that performs the actual download using Playwright.

        This method configures and uses a Playwright browser to perform the request,
        handling cookies, user agent, proxies, and event listeners. It also supports
        custom browser actions defined on the spider.

        Args:
            request: The request to download.
            spider: The spider that initiated the request.

        Returns:
            WebDriverResponse: The response from the browser.

        Raises:
            Exception: If any error occurs during the browser automation.
        """
        # Extract request parameters
        cookies = dict(request.cookies)
        timeout = request.meta.get('download_timeout', 30) * 1000  # Convert to milliseconds
        user_agent = request.headers.get("User-Agent")
        proxy: str = request.meta.get("proxy")
        url = request.url

        # Dictionary to store responses captured by event listeners
        cache_response = {}

        # Wrap event handlers so their return values can be captured
        def on_event_wrap_handler(func):
            @wraps(func)
            async def inner(response):
                ret = await func(response)
                if ret:
                    cache_response[ret[0]] = ret[1]

            return inner

        # Configure browser options
        kwargs = dict()
        if proxy:
            kwargs['proxy'] = proxy
        if user_agent:
            kwargs['user_agent'] = user_agent

        # Get a browser instance from the pool
        driver: PlaywrightDriver = await self._webdriver_pool.get(**kwargs)

        # Register event listeners from spider methods named on_event_<event>
        driver.page._events = dict()
        for name in dir(spider):
            if not name.startswith('on_event_'):
                continue
            driver.page.on(name.replace('on_event_', ''), on_event_wrap_handler(getattr(spider, name)))

        try:
            # Set cookies if provided
            if cookies:
                driver.url = url
                await driver.set_cookies(cookies)

            # Navigate to the URL
            await driver.page.goto(url, wait_until=request.meta.get('wait_until', self.wait_until), timeout=timeout)

            # Execute custom actions if defined on the spider
            if process_action_fn := getattr(spider, 'process_action', None):
                action_result = await call_helper(process_action_fn, driver, request)
                if action_result:
                    cache_response[action_result[0]] = action_result[1]

            # Convert any captured Playwright responses to WebDriverResponse objects
            for cache_key in list(cache_response.keys()):
                if isinstance(cache_response[cache_key], EventResponse):
                    cache_ret = cache_response[cache_key]
                    cache_response[cache_key] = WebDriverResponse(
                        url=cache_ret.url,
                        request=request,
                        intercept_request=dict(
                            url=cache_ret.request.url,
                            headers=cache_ret.request.headers,
                            data=cache_ret.request.post_data,
                        ),
                        headers=cache_ret.headers,
                        body=await cache_ret.body(),
                        status=cache_ret.status,
                    )

            # Create and return the final response
            return WebDriverResponse(
                url=driver.page.url,
                status=200,
                text=await driver.page.content(),
                cookies=await driver.get_cookies(),
                cache_response=cache_response,
                driver=driver,
                driver_pool=self._webdriver_pool,
            )
        except Exception as e:
            # Remove the driver from the pool on error
            await self._webdriver_pool.remove(driver)
            raise e

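The `on_event_*` discovery loop and the `process_action` hook above imply a spider contract like the following. A sketch with hypothetical selectors and URLs; returning a `(key, value)` tuple from either hook stores the value in `cache_response`:

```python
from aioscrapy import Spider

class ExampleSpider(Spider):
    name = "example"

    # Registered by the handler as page.on("response", ...) via the name suffix.
    async def on_event_response(self, response):
        if "/api/items" in response.url:    # hypothetical XHR worth capturing
            return "items_api", response    # converted to WebDriverResponse above

    # Invoked after page.goto(); receives the pooled PlaywrightDriver and the Request.
    async def process_action(self, driver, request):
        await driver.page.click("#load-more")      # hypothetical selector
        await driver.page.wait_for_timeout(1000)   # give the XHR time to fire
```
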
    async def close(self):
        """
        Close the download handler and release resources.

        This method is called when the spider is closing. It closes all browser
        instances in the pool and releases the associated resources.
        """
        # Close all browser instances in the pool
        await self._webdriver_pool.close()