aio-scrapy: aio_scrapy-2.1.4-py3-none-any.whl → aio_scrapy-2.1.7-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
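The renamed and removed entries at the bottom of the listing (the playwright handler package moving under webdriver/, and the misspelled execl.py pipeline reappearing as excel.py) are easy to verify by listing the two wheels yourself. The sketch below is a minimal way to do that with only the Python standard library; wheels are ordinary zip archives. The two filenames at the end are assumptions about where the downloaded wheels were saved, and this is an illustration, not the tool that generated this page.

import zipfile

def wheel_manifest(path: str) -> dict[str, int]:
    """Map each file inside a wheel (a zip archive) to its uncompressed size."""
    with zipfile.ZipFile(path) as wheel:
        return {info.filename: info.file_size for info in wheel.infolist()}

def compare_wheels(old_path: str, new_path: str) -> None:
    """Print added, removed, and size-changed files between two wheels."""
    old, new = wheel_manifest(old_path), wheel_manifest(new_path)
    for name in sorted(old.keys() | new.keys()):
        if name not in old:
            print(f"added    {name}")
        elif name not in new:
            print(f"removed  {name}")
        elif old[name] != new[name]:
            print(f"changed  {name} ({old[name]} -> {new[name]} bytes)")

# Placeholder paths; substitute wherever you saved the two wheels.
compare_wheels("aio_scrapy-2.1.4-py3-none-any.whl",
               "aio_scrapy-2.1.7-py3-none-any.whl")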
--- aioscrapy/libs/extensions/throttle.py (2.1.4)
+++ aioscrapy/libs/extensions/throttle.py (2.1.7)
@@ -1,51 +1,237 @@
+"""
+Auto Throttle Extension
+自动限速扩展
+
+This extension automatically adjusts the download delay between requests based on
+the response latency, helping to avoid overloading servers and improving crawling
+efficiency. It dynamically increases or decreases the delay to maintain a target
+level of concurrency.
+此扩展根据响应延迟自动调整请求之间的下载延迟,有助于避免服务器过载并提高爬取效率。
+它动态地增加或减少延迟以维持目标并发级别。
+
+The extension works by measuring the latency of responses and adjusting the delay
+to try to maintain a specified number of concurrent requests to each domain.
+该扩展通过测量响应的延迟并调整延迟来尝试维持对每个域的指定数量的并发请求。
+"""
 from aioscrapy import signals
 from aioscrapy.exceptions import NotConfigured
 from aioscrapy.utils.log import logger
 
 
 class AutoThrottle:
+    """
+    Extension for automatically adjusting download delays based on response latency.
+    基于响应延迟自动调整下载延迟的扩展。
+
+    This extension dynamically adjusts the download delay between requests to maintain
+    a target level of concurrency. It helps to avoid overloading servers while
+    maximizing the crawling speed.
+    此扩展动态调整请求之间的下载延迟以维持目标并发级别。它有助于避免服务器过载,
+    同时最大化爬取速度。
+    """
 
     def __init__(self, crawler):
+        """
+        Initialize the AutoThrottle extension.
+        初始化AutoThrottle扩展。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                     将使用此扩展的爬虫。
+
+        Raises:
+            NotConfigured: If AUTOTHROTTLE_ENABLED is not set to True in the settings.
+                           如果在设置中未将AUTOTHROTTLE_ENABLED设置为True。
+        """
+        # Store the crawler
+        # 存储爬虫
         self.crawler = crawler
+
+        # Check if the extension is enabled
+        # 检查扩展是否已启用
         if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
             raise NotConfigured
 
+        # Get debug setting
+        # 获取调试设置
         self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
+
+        # Get target concurrency setting
+        # 获取目标并发设置
         self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
+
+        # Connect to signals
+        # 连接到信号
         crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create an AutoThrottle instance from a crawler.
+        从爬虫创建AutoThrottle实例。
+
+        This is the factory method used by Scrapy to create the extension.
+        这是Scrapy用于创建扩展的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                     将使用此扩展的爬虫。
+
+        Returns:
+            AutoThrottle: A new AutoThrottle instance.
+                          一个新的AutoThrottle实例。
+        """
+        # Create and return a new instance
+        # 创建并返回一个新实例
         return cls(crawler)
 
     def _spider_opened(self, spider):
+        """
+        Handle the spider_opened signal.
+        处理spider_opened信号。
+
+        This method is called when a spider is opened. It initializes the minimum,
+        maximum, and starting download delays.
+        当爬虫打开时调用此方法。它初始化最小、最大和起始下载延迟。
+
+        Args:
+            spider: The spider that was opened.
+                    被打开的爬虫。
+        """
+        # Calculate minimum delay
+        # 计算最小延迟
         self.mindelay = self._min_delay(spider)
+
+        # Calculate maximum delay
+        # 计算最大延迟
         self.maxdelay = self._max_delay(spider)
+
+        # Set initial download delay for the spider
+        # 为爬虫设置初始下载延迟
         spider.download_delay = self._start_delay(spider)
 
     def _min_delay(self, spider):
+        """
+        Get the minimum download delay.
+        获取最小下载延迟。
+
+        This method returns the minimum download delay, which is either the spider's
+        download_delay attribute or the DOWNLOAD_DELAY setting.
+        此方法返回最小下载延迟,即爬虫的download_delay属性或DOWNLOAD_DELAY设置。
+
+        Args:
+            spider: The spider to get the minimum delay for.
+                    要获取最小延迟的爬虫。
+
+        Returns:
+            float: The minimum download delay in seconds.
+                   最小下载延迟(以秒为单位)。
+        """
+        # Get settings
+        # 获取设置
         s = self.crawler.settings
+
+        # Return spider's download_delay attribute or DOWNLOAD_DELAY setting
+        # 返回爬虫的download_delay属性或DOWNLOAD_DELAY设置
         return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))
 
     def _max_delay(self, spider):
+        """
+        Get the maximum download delay.
+        获取最大下载延迟。
+
+        This method returns the maximum download delay from the AUTOTHROTTLE_MAX_DELAY setting.
+        此方法从AUTOTHROTTLE_MAX_DELAY设置返回最大下载延迟。
+
+        Args:
+            spider: The spider to get the maximum delay for.
+                    要获取最大延迟的爬虫。
+
+        Returns:
+            float: The maximum download delay in seconds.
+                   最大下载延迟(以秒为单位)。
+        """
+        # Return AUTOTHROTTLE_MAX_DELAY setting
+        # 返回AUTOTHROTTLE_MAX_DELAY设置
         return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')
 
     def _start_delay(self, spider):
+        """
+        Get the initial download delay.
+        获取初始下载延迟。
+
+        This method returns the initial download delay, which is the maximum of
+        the minimum delay and the AUTOTHROTTLE_START_DELAY setting.
+        此方法返回初始下载延迟,即最小延迟和AUTOTHROTTLE_START_DELAY设置的最大值。
+
+        Args:
+            spider: The spider to get the start delay for.
+                    要获取起始延迟的爬虫。
+
+        Returns:
+            float: The initial download delay in seconds.
+                   初始下载延迟(以秒为单位)。
+        """
+        # Return the maximum of minimum delay and AUTOTHROTTLE_START_DELAY setting
+        # 返回最小延迟和AUTOTHROTTLE_START_DELAY设置的最大值
         return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))
 
     def _response_downloaded(self, response, request, spider):
+        """
+        Handle the response_downloaded signal.
+        处理response_downloaded信号。
+
+        This method is called when a response is downloaded. It adjusts the download
+        delay based on the response latency and logs debug information if enabled.
+        当下载响应时调用此方法。它根据响应延迟调整下载延迟,并在启用时记录调试信息。
+
+        Args:
+            response: The downloaded response.
+                      下载的响应。
+            request: The request that generated the response.
+                     生成响应的请求。
+            spider: The spider that made the request.
+                    发出请求的爬虫。
+        """
+        # Get the download slot for the request
+        # 获取请求的下载槽
         key, slot = self._get_slot(request, spider)
+
+        # Get the download latency from the request metadata
+        # 从请求元数据获取下载延迟
         latency = request.meta.get('download_latency')
+
+        # If latency or slot is not available, do nothing
+        # 如果延迟或槽不可用,则不执行任何操作
         if latency is None or slot is None:
             return
 
+        # Store the old delay for logging
+        # 存储旧延迟以供记录
         olddelay = slot.delay
+
+        # Adjust the delay based on the latency and response
+        # 根据延迟和响应调整延迟
         self._adjust_delay(slot, latency, response)
+
+        # Log debug information if enabled
+        # 如果启用,则记录调试信息
         if self.debug:
+            # Calculate the delay difference
+            # 计算延迟差异
             diff = slot.delay - olddelay
+
+            # Get the response size
+            # 获取响应大小
             size = len(response.body)
+
+            # Get the number of concurrent requests
+            # 获取并发请求数
             conc = len(slot.transferring)
+
+            # Log the debug information
+            # 记录调试信息
             logger.info(
                 "slot: %(slot)s | conc:%(concurrency)2d | "
                 "delay:%(delay)5d ms (%(delaydiff)+d) | "
@@ -57,32 +243,80 @@ class AutoThrottle:
             )
 
     def _get_slot(self, request, spider):
+        """
+        Get the download slot for a request.
+        获取请求的下载槽。
+
+        This method returns the download slot key and the slot object for a request.
+        此方法返回请求的下载槽键和槽对象。
+
+        Args:
+            request: The request to get the slot for.
+                     要获取槽的请求。
+            spider: The spider that made the request.
+                    发出请求的爬虫。
+
+        Returns:
+            tuple: A tuple containing the slot key and the slot object.
+                   包含槽键和槽对象的元组。
+        """
+        # Get the download slot key from the request metadata
+        # 从请求元数据获取下载槽键
         key = request.meta.get('download_slot')
+
+        # Return the key and the corresponding slot object
+        # 返回键和相应的槽对象
         return key, self.crawler.engine.downloader.slots.get(key)
 
     def _adjust_delay(self, slot, latency, response):
-        """
+        """
+        Adjust the download delay based on the response latency.
+        根据响应延迟调整下载延迟。
+
+        This method implements the delay adjustment policy. It calculates a new
+        download delay based on the response latency and the target concurrency,
+        and updates the slot's delay if appropriate.
+        此方法实现延迟调整策略。它根据响应延迟和目标并发计算新的下载延迟,
+        并在适当时更新槽的延迟。
 
+        Args:
+            slot: The download slot to adjust the delay for.
+                  要调整延迟的下载槽。
+            latency: The download latency of the response.
+                     响应的下载延迟。
+            response: The downloaded response.
+                      下载的响应。
+        """
         # If a server needs `latency` seconds to respond then
         # we should send a request each `latency/N` seconds
         # to have N requests processed in parallel
+        # 如果服务器需要`latency`秒来响应,那么我们应该每`latency/N`秒发送一个请求,
+        # 以便并行处理N个请求
        target_delay = latency / self.target_concurrency
 
         # Adjust the delay to make it closer to target_delay
+        # 调整延迟使其更接近target_delay
         new_delay = (slot.delay + target_delay) / 2.0
 
         # If target delay is bigger than old delay, then use it instead of mean.
         # It works better with problematic sites.
+        # 如果目标延迟大于旧延迟,则使用它而不是平均值。
+        # 这对于有问题的站点效果更好。
         new_delay = max(target_delay, new_delay)
 
         # Make sure self.mindelay <= new_delay <= self.max_delay
+        # 确保self.mindelay <= new_delay <= self.max_delay
         new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
 
         # Dont adjust delay if response status != 200 and new delay is smaller
         # than old one, as error pages (and redirections) are usually small and
         # so tend to reduce latency, thus provoking a positive feedback by
         # reducing delay instead of increase.
+        # 如果响应状态 != 200且新延迟小于旧延迟,则不调整延迟,因为错误页面(和重定向)
+        # 通常很小,因此倾向于减少延迟,从而引发正反馈,减少延迟而不是增加。
         if response.status != 200 and new_delay <= slot.delay:
             return
 
+        # Update the slot's delay
+        # 更新槽的延迟
         slot.delay = new_delay
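Taken together, the comments added in this diff spell out the whole adjustment policy, and it is small enough to check by hand. The sketch below restates the arithmetic of _adjust_delay as a pure function with hypothetical numbers in the usage line; it mirrors the diffed code above rather than any separate aioscrapy API.

def adjust_delay(current_delay: float, latency: float, status: int,
                 target_concurrency: float, min_delay: float,
                 max_delay: float) -> float:
    """Standalone restatement of the _adjust_delay policy shown in the diff."""
    # A server that takes `latency` seconds per response can sustain N
    # parallel requests if we issue one request every latency/N seconds.
    target_delay = latency / target_concurrency

    # Move halfway toward the target, but never below it, so slow
    # (problematic) sites push the delay up quickly.
    new_delay = max(target_delay, (current_delay + target_delay) / 2.0)

    # Clamp to the configured window.
    new_delay = min(max(min_delay, new_delay), max_delay)

    # Non-200 responses are usually small and fast; letting them lower the
    # delay would create a feedback loop, so only increases are applied.
    if status != 200 and new_delay <= current_delay:
        return current_delay
    return new_delay

# Worked example with hypothetical numbers: a 2.0 s latency and a target
# concurrency of 4 give target_delay = 0.5 s; from a current delay of 5.0 s
# the mean is 2.75 s, which max() and the clamp leave unchanged.
print(adjust_delay(5.0, 2.0, 200, target_concurrency=4.0,
                   min_delay=0.0, max_delay=60.0))  # -> 2.75

All of the knobs the constructor reads are ordinary settings, so enabling the extension in a project comes down to setting AUTOTHROTTLE_ENABLED = True and tuning AUTOTHROTTLE_TARGET_CONCURRENCY, AUTOTHROTTLE_START_DELAY, and AUTOTHROTTLE_MAX_DELAY; AUTOTHROTTLE_DEBUG = True turns on the per-response log line built in _response_downloaded.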