aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
WebDriver pool implementation for Playwright/DrissionPage browsers.
|
|
4
|
+
Playwright/DrissionPage浏览器的WebDriver池实现。
|
|
5
|
+
|
|
6
|
+
This module provides a pool of Playwright browser instances that can be reused
|
|
7
|
+
across multiple requests, improving performance by avoiding the overhead of
|
|
8
|
+
creating a new browser for each request.
|
|
9
|
+
此模块提供了一个Playwright浏览器实例池,可以在多个请求之间重用,
|
|
10
|
+
通过避免为每个请求创建新浏览器的开销来提高性能。
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from asyncio import Lock
|
|
14
|
+
from asyncio.queues import Queue
|
|
15
|
+
|
|
16
|
+
from aioscrapy.utils.tools import singleton
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class WebDriverBase:
    """Abstract interface for browser drivers managed by WebDriverPool.

    Concrete drivers (Playwright / DrissionPage wrappers) must implement
    asynchronous startup and shutdown.
    """

    async def setup(self):
        # Start and initialize the underlying browser; subclasses must override.
        raise NotImplementedError

    async def quit(self):
        # Close the underlying browser and release its resources; subclasses must override.
        raise NotImplementedError
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@singleton
class WebDriverPool:
    """
    A pool of WebDriver instances for Playwright/DrissionPage browsers.

    This class manages a pool of browser instances that can be reused across
    multiple requests, improving performance by avoiding the overhead of
    creating a new browser for each request. It handles creation, retrieval,
    release, and cleanup of browser instances.

    The @singleton decorator ensures only one pool exists per process.
    """

    def __init__(
            self,
            driver_cls: WebDriverBase,
            use_pool: bool = True,
            pool_size: int = 1,
            **kwargs
    ):
        """
        Initialize the WebDriverPool.

        Args:
            driver_cls: The WebDriver class to instantiate.
            use_pool: Whether to use pooling (True) or create a new browser
                for each request (False).
            pool_size: Maximum number of browser instances to keep in the pool.
            **kwargs: Default arguments passed to the WebDriver constructor.
        """
        self.use_pool = use_pool      # Whether to reuse browser instances
        self.pool_size = pool_size    # Maximum number of browsers in the pool
        self.driver_cls = driver_cls  # WebDriver class to instantiate
        self.kwargs = kwargs          # Default arguments for driver creation

        # Queue of idle browser instances waiting to be handed out.
        self.queue = Queue(maxsize=pool_size)
        # Serializes pool bookkeeping (create-vs-checkout decisions).
        self.lock = Lock()
        # Number of live browser instances (idle + checked out).
        self.driver_count = 0

    @property
    def is_full(self) -> bool:
        """bool: True if the pool already holds its maximum number of drivers."""
        return self.driver_count >= self.pool_size

    async def create_driver(self, **kw):
        """
        Create and initialize a new WebDriver instance.

        Request-specific keyword arguments override the defaults supplied at
        pool construction time.

        Args:
            **kw: Overrides for the default WebDriver constructor arguments.

        Returns:
            A new, initialized WebDriver instance.
        """
        # Merge default arguments with request-specific overrides.
        kwargs = self.kwargs.copy()
        kwargs.update(kw)

        driver = self.driver_cls(**kwargs)
        # Start the browser before handing the driver out.
        await driver.setup()
        return driver

    async def get(self, **kwargs):
        """
        Get a WebDriver instance from the pool.

        Returns an idle browser from the pool, or creates a new one when the
        pool is not yet full (or when pooling is disabled). Drivers that have
        reached their maximum usage count are destroyed and replaced with a
        fresh instance, which guards against drivers that degrade as their
        usage count grows.

        Args:
            **kwargs: Arguments passed to the WebDriver constructor if a new
                instance must be created.

        Returns:
            A WebDriver instance ready for use.
        """
        async with self.lock:
            # Pooling disabled: always hand out a brand-new browser.
            if not self.use_pool:
                return await self.create_driver(**kwargs)

            if not self.is_full:
                # Pool has capacity: create a new browser and count it.
                driver = await self.create_driver(**kwargs)
                self.driver_count += 1
            else:
                # Pool is full: wait for an idle browser from the queue.
                driver = await self.queue.get()

            # Recycle drivers that have exhausted their usage budget.
            # BUGFIX: the previous implementation recursed into self.get()
            # here while still holding self.lock; asyncio.Lock is not
            # reentrant, so the recursive call deadlocked the first time a
            # driver was recycled. Replace the driver inline instead.
            if driver.max_uses is not None:
                driver.max_uses -= 1
                if driver.max_uses <= 0:
                    await self.remove(driver)
                    driver = await self.create_driver(**kwargs)
                    self.driver_count += 1

            return driver

    async def release(self, driver):
        """
        Release a WebDriver instance back to the pool.

        If pooling is disabled, the browser is closed. Otherwise, it is
        returned to the pool for reuse.

        Args:
            driver: The WebDriver instance to release.
        """
        if not self.use_pool:
            await driver.quit()
            return

        await self.queue.put(driver)

    async def remove(self, driver):
        """
        Remove a WebDriver instance from the pool.

        Closes the browser and decrements the live-driver count.

        Args:
            driver: The WebDriver instance to remove.
        """
        await driver.quit()
        self.driver_count -= 1

    async def close(self):
        """
        Close all idle WebDriver instances in the pool.

        Called when the spider is closing. Note that only browsers currently
        sitting idle in the queue are closed; drivers that are checked out
        must be released or removed by their holders.
        """
        while not self.queue.empty():
            driver = await self.queue.get()
            await driver.quit()
            self.driver_count -= 1