aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/core/downloader/__init__.py

@@ -1,3 +1,38 @@
+"""
+Downloader Module
+下载器模块
+
+This module provides the core downloader functionality for AioScrapy. The downloader
+is responsible for fetching web pages and other resources from the internet, managing
+concurrency, handling delays between requests, and processing the results through
+middleware.
+此模块为AioScrapy提供核心下载功能。下载器负责从互联网获取网页和其他资源,
+管理并发,处理请求之间的延迟,并通过中间件处理结果。
+
+The main components are:
+主要组件包括:
+
+1. BaseDownloader: Abstract base class defining the downloader interface
+   定义下载器接口的抽象基类
+2. Downloader: Default implementation of the downloader
+   下载器的默认实现
+3. Slot: Class for managing per-domain or per-IP concurrency and delays
+   用于管理每个域名或每个IP的并发和延迟的类
+
+The downloader respects various settings like:
+下载器遵循各种设置,如:
+
+- CONCURRENT_REQUESTS: Global concurrency limit
+  全局并发限制
+- CONCURRENT_REQUESTS_PER_DOMAIN: Per-domain concurrency limit
+  每个域名的并发限制
+- CONCURRENT_REQUESTS_PER_IP: Per-IP concurrency limit
+  每个IP的并发限制
+- DOWNLOAD_DELAY: Delay between requests
+  请求之间的延迟
+- RANDOMIZE_DOWNLOAD_DELAY: Whether to randomize delays
+  是否随机化延迟
+"""
 import asyncio
 import random
 from abc import abstractmethod
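The new module docstring enumerates the throttling settings the downloader honours. As a quick orientation, here is a minimal sketch of how those settings might appear in a project's settings.py; the names come from the docstring above, while the values are purely illustrative rather than the package defaults:

```python
# settings.py (illustrative values, not aio-scrapy defaults)
CONCURRENT_REQUESTS = 16              # global concurrency limit
CONCURRENT_REQUESTS_PER_DOMAIN = 8    # per-domain slot concurrency
CONCURRENT_REQUESTS_PER_IP = 0        # per-IP slot concurrency; used instead of the domain limit when non-zero
DOWNLOAD_DELAY = 0.5                  # minimum delay (seconds) between requests in the same slot
RANDOMIZE_DOWNLOAD_DELAY = True       # spread each delay between 0.5x and 1.5x of DOWNLOAD_DELAY
```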
@@ -21,11 +56,48 @@ from aioscrapy.utils.tools import call_helper, create_task


 class BaseDownloaderMeta(type):
+    """
+    Metaclass for BaseDownloader that implements the virtual subclass pattern.
+    BaseDownloader的元类,实现虚拟子类模式。
+
+    This metaclass allows classes to be recognized as BaseDownloader subclasses
+    if they implement the required interface, even if they don't explicitly inherit from it.
+    该元类允许类被识别为BaseDownloader的子类,如果它们实现了所需的接口,即使它们没有显式地继承它。
+    """

     def __instancecheck__(cls, instance):
+        """
+        Check if an instance is an instance of this class.
+        检查实例是否是此类的实例。
+
+        Args:
+            instance: The instance to check.
+                要检查的实例。
+
+        Returns:
+            bool: True if the instance implements the required interface.
+                如果实例实现了所需的接口,则为True。
+        """
         return cls.__subclasscheck__(type(instance))

     def __subclasscheck__(cls, subclass):
+        """
+        Check if a class is a subclass of this class.
+        检查类是否是此类的子类。
+
+        A class is considered a subclass if it implements the required methods:
+        如果类实现了所需的方法,则被视为子类:
+        - fetch: For downloading requests
+        - needs_backout: For checking if the downloader is at capacity
+
+        Args:
+            subclass: The class to check.
+                要检查的类。
+
+        Returns:
+            bool: True if the class implements the required interface.
+                如果类实现了所需的接口,则为True。
+        """
         return (
             hasattr(subclass, "fetch") and callable(subclass.fetch)
             and hasattr(subclass, "needs_backout") and callable(subclass.needs_backout)
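The docstrings added above describe the virtual-subclass check. Condensing the code shown in this hunk into a self-contained snippet makes the behaviour concrete: any class exposing callable `fetch` and `needs_backout` attributes is accepted by `isinstance`/`issubclass` checks against `BaseDownloader`, with no inheritance required.

```python
# Self-contained restatement of the duck-typing check from the hunk above.
class BaseDownloaderMeta(type):
    def __instancecheck__(cls, instance):
        return cls.__subclasscheck__(type(instance))

    def __subclasscheck__(cls, subclass):
        return (
            hasattr(subclass, "fetch") and callable(subclass.fetch)
            and hasattr(subclass, "needs_backout") and callable(subclass.needs_backout)
        )


class BaseDownloader(metaclass=BaseDownloaderMeta):
    pass


class DuckTypedDownloader:  # note: does not inherit from BaseDownloader
    async def fetch(self, request):
        ...

    def needs_backout(self):
        return False


assert issubclass(DuckTypedDownloader, BaseDownloader)
assert isinstance(DuckTypedDownloader(), BaseDownloader)
```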
@@ -33,20 +105,79 @@ class BaseDownloaderMeta(type):


 class BaseDownloader(metaclass=BaseDownloaderMeta):
+    """
+    Abstract base class for downloaders.
+    下载器的抽象基类。
+
+    This class defines the interface that all downloaders must implement.
+    此类定义了所有下载器必须实现的接口。
+    """

     @classmethod
     async def from_crawler(cls, crawler) -> "BaseDownloader":
+        """
+        Create a downloader instance from a crawler.
+        从爬虫创建下载器实例。
+
+        This is a factory method that creates a downloader instance from a crawler.
+        In the base class, the crawler parameter is not used, but subclasses can
+        override this method to use crawler settings or other attributes to
+        configure the downloader.
+        这是一个工厂方法,从爬虫创建下载器实例。在基类中,crawler参数未被使用,
+        但子类可以覆盖此方法以使用爬虫设置或其他属性来配置下载器。
+
+        Args:
+            crawler: The crawler instance that will use this downloader.
+                将使用此下载器的爬虫实例。
+                This parameter is not used in the base implementation but is
+                provided for subclasses to use.
+                此参数在基本实现中未使用,但提供给子类使用。
+
+        Returns:
+            BaseDownloader: A new downloader instance.
+                一个新的下载器实例。
+        """
+        # The crawler parameter is intentionally unused in the base implementation
+        # 在基本实现中有意不使用crawler参数
         return cls()

     async def close(self) -> None:
+        """
+        Close the downloader and release its resources.
+        关闭下载器并释放其资源。
+
+        This method is called when the spider is closed.
+        当爬虫关闭时调用此方法。
+        """
         pass

     @abstractmethod
     async def fetch(self, request: Request) -> None:
+        """
+        Fetch a request.
+        获取请求。
+
+        This method should download the given request and call the appropriate
+        callback with the result.
+        此方法应下载给定的请求并使用结果调用适当的回调。
+
+        Args:
+            request: The request to fetch.
+                要获取的请求。
+        """
         raise NotImplementedError()

     @abstractmethod
     def needs_backout(self) -> bool:
+        """
+        Check if the downloader needs to back out (stop accepting new requests).
+        检查下载器是否需要退出(停止接受新请求)。
+
+        Returns:
+            bool: True if the downloader is at capacity and should not accept
+                new requests, False otherwise.
+                如果下载器已达到容量并且不应接受新请求,则为True,否则为False。
+        """
         raise NotImplementedError()


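For reference, a hypothetical custom downloader satisfying the abstract interface documented above could look like the sketch below. The class is not part of aio-scrapy; it only imports `BaseDownloader` from the module being diffed here.

```python
from aioscrapy.core.downloader import BaseDownloader


class NoopDownloader(BaseDownloader):
    """Illustrative downloader that accepts requests but downloads nothing."""

    def __init__(self):
        self.active = set()

    async def fetch(self, request) -> None:
        # A real implementation would download `request` and hand the result
        # back to the engine; this sketch simply ignores it.
        pass

    def needs_backout(self) -> bool:
        # Never report back-pressure in this toy example.
        return False
```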
@@ -54,33 +185,96 @@ DownloaderTV = TypeVar("DownloaderTV", bound="Downloader")


 class Slot:
-    """
+    """
+    Downloader slot for managing per-domain or per-IP concurrency and delays.
+    用于管理每个域名或每个IP的并发和延迟的下载器槽。
+
+    Each domain or IP has its own slot to control:
+    每个域名或IP都有自己的槽来控制:
+    - Concurrency: How many requests can be processed simultaneously
+      并发:可以同时处理多少请求
+    - Delay: How long to wait between requests
+      延迟:请求之间等待多长时间
+    """

     def __init__(self, concurrency: int, delay: float, randomize_delay: bool) -> None:
+        """
+        Initialize a new downloader slot.
+        初始化一个新的下载器槽。
+
+        Args:
+            concurrency: Maximum number of concurrent requests for this slot.
+                此槽的最大并发请求数。
+            delay: Minimum delay between requests in seconds.
+                请求之间的最小延迟(秒)。
+            randomize_delay: Whether to randomize the delay between requests.
+                是否随机化请求之间的延迟。
+        """
         self.concurrency = concurrency
         self.delay = delay
         self.randomize_delay = randomize_delay

-        self.active: Set[Request] = set()
-
-        self.
-
-        self.
-
-
+        self.active: Set[Request] = set()  # All requests being processed by this slot
+        # 此槽正在处理的所有请求
+        self.transferring: Set[Request] = set()  # Requests being downloaded
+        # 正在下载的请求
+        self.queue: Deque[Request] = deque()  # Requests queued for download
+        # 排队等待下载的请求
+        self.lastseen: float = 0  # Timestamp of last request processed
+        # 上次处理请求的时间戳
+        self.delay_lock: bool = False  # Lock to prevent concurrent delay processing
+        # 锁定以防止并发延迟处理
+
+    def free_transfer_slots(self) -> int:
+        """
+        Calculate how many more requests can be processed concurrently.
+        计算可以同时处理多少个更多的请求。
+
+        Returns:
+            int: Number of available transfer slots.
+                可用传输槽的数量。
+        """
         return self.concurrency - len(self.transferring)

     def download_delay(self) -> float:
+        """
+        Get the delay to use between requests.
+        获取请求之间使用的延迟。
+
+        If randomize_delay is True, the delay will be randomized between
+        0.5 and 1.5 times the configured delay.
+        如果randomize_delay为True,延迟将在配置的延迟的0.5到1.5倍之间随机化。
+
+        Returns:
+            float: The delay in seconds.
+                延迟(秒)。
+        """
         if self.randomize_delay:
             return random.uniform(0.5 * self.delay, 1.5 * self.delay)
         return self.delay

-    def __repr__(self):
+    def __repr__(self) -> str:
+        """
+        Return a string representation of the slot for debugging.
+        返回用于调试的槽的字符串表示。
+
+        Returns:
+            str: A string representation of the slot.
+                槽的字符串表示。
+        """
         cls_name = self.__class__.__name__
         return "%s(concurrency=%r, delay=%0.2f, randomize_delay=%r)" % (
             cls_name, self.concurrency, self.delay, self.randomize_delay)

-    def __str__(self):
+    def __str__(self) -> str:
+        """
+        Return a detailed string representation of the slot.
+        返回槽的详细字符串表示。
+
+        Returns:
+            str: A detailed string representation of the slot.
+                槽的详细字符串表示。
+        """
         return (
             "<downloader.Slot concurrency=%r delay=%0.2f randomize_delay=%r "
             "len(active)=%d len(queue)=%d len(transferring)=%d lastseen=%s>" % (
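The `Slot` documented above is small enough to restate in isolation. The condensed sketch below keeps only the bookkeeping fields and the two helpers shown in the diffed code, and demonstrates the 0.5x–1.5x delay spread:

```python
import random
from collections import deque


class Slot:
    """Condensed restatement of the diffed Slot (bookkeeping only)."""

    def __init__(self, concurrency: int, delay: float, randomize_delay: bool) -> None:
        self.concurrency = concurrency
        self.delay = delay
        self.randomize_delay = randomize_delay
        self.active = set()        # every request owned by this slot
        self.transferring = set()  # requests currently being downloaded
        self.queue = deque()       # requests waiting for a free transfer slot
        self.lastseen = 0.0        # timestamp of the last processed request

    def free_transfer_slots(self) -> int:
        return self.concurrency - len(self.transferring)

    def download_delay(self) -> float:
        if self.randomize_delay:
            return random.uniform(0.5 * self.delay, 1.5 * self.delay)
        return self.delay


slot = Slot(concurrency=8, delay=1.0, randomize_delay=True)
print(slot.free_transfer_slots())  # 8 while nothing is transferring
print(slot.download_delay())       # a value between 0.5 and 1.5 seconds
```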
@@ -92,10 +286,37 @@ class Slot:


 def _get_concurrency_delay(concurrency: int, spider: Spider, settings: Settings) -> Tuple[int, float]:
+    """
+    Get the concurrency and delay settings for a spider.
+    获取爬虫的并发和延迟设置。
+
+    This function determines the appropriate concurrency and delay values
+    by checking both the settings and spider attributes.
+    此函数通过检查设置和爬虫属性来确定适当的并发和延迟值。
+
+    Spider-specific settings take precedence over global settings.
+    爬虫特定的设置优先于全局设置。
+
+    Args:
+        concurrency: Default concurrency value from settings.
+            来自设置的默认并发值。
+        spider: The spider instance.
+            爬虫实例。
+        settings: The settings object.
+            设置对象。
+
+    Returns:
+        Tuple[int, float]: A tuple containing (concurrency, delay).
+            包含(并发,延迟)的元组。
+    """
+    # Get delay from settings, then override with spider attribute if available
+    # 从设置获取延迟,然后如果可用,用爬虫属性覆盖
     delay = settings.getfloat('DOWNLOAD_DELAY')
     if hasattr(spider, 'download_delay'):
         delay = spider.download_delay

+    # Get concurrency from settings, then override with spider attribute if available
+    # 从设置获取并发,然后如果可用,用爬虫属性覆盖
     if hasattr(spider, 'max_concurrent_requests'):
         concurrency = spider.max_concurrent_requests

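As the docstring notes, per-spider attributes win over the project-wide settings. A hypothetical spider showing the two attributes `_get_concurrency_delay` looks for (the attribute names come from the diffed code; the import path is assumed from the package layout listed at the top of this diff):

```python
from aioscrapy.spiders import Spider  # assumed import path


class SlowSpider(Spider):
    name = "slow"
    download_delay = 2.0           # overrides DOWNLOAD_DELAY for this spider's slots
    max_concurrent_requests = 4    # overrides the per-slot concurrency for this spider
```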
@@ -103,7 +324,17 @@ def _get_concurrency_delay(concurrency: int, spider: Spider, settings: Settings)


 class Downloader(BaseDownloader):
-
+    """
+    Default implementation of the downloader.
+    下载器的默认实现。
+
+    This class handles downloading requests, managing concurrency and delays,
+    and processing the results through middleware.
+    此类处理下载请求、管理并发和延迟,并通过中间件处理结果。
+    """
+
+    DOWNLOAD_SLOT: str = 'download_slot'  # Meta key for custom download slot
+    # 自定义下载槽的元键

     def __init__(
             self,
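The new `DOWNLOAD_SLOT` class attribute names the `request.meta` key that `_get_slot_key` (shown in the next hunk) checks first. A request can therefore be routed into its own slot as sketched below; the URL and slot name are placeholders, and the import path is assumed from the package layout:

```python
from aioscrapy.http import Request  # assumed import path

req = Request(
    "https://example.com/page",                # placeholder URL
    meta={"download_slot": "my-custom-pool"},  # forces this request into a dedicated Slot
)
```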
@@ -114,140 +345,414 @@ class Downloader(BaseDownloader):
             proxy: Optional[AbsProxy] = None,
             dupefilter: Optional[DupeFilterBase] = None,
     ):
+        """
+        Initialize the downloader.
+        初始化下载器。
+
+        Args:
+            crawler: The crawler instance that this downloader belongs to.
+                此下载器所属的爬虫实例。
+            handler: The download handler manager.
+                下载处理程序管理器。
+            middleware: The downloader middleware manager.
+                下载器中间件管理器。
+            proxy: Optional proxy handler for managing proxies.
+                可选的代理处理程序,用于管理代理。
+            dupefilter: Optional duplicate filter for avoiding duplicate requests.
+                可选的重复过滤器,用于避免重复请求。
+        """
+        # Components from crawler
+        # 来自爬虫的组件
         self.settings: Settings = crawler.settings
         self.signals: SignalManager = crawler.signals
         self.spider: Spider = crawler.spider
         self.spider.proxy = proxy
         self._call_engine: Callable = crawler.engine.handle_downloader_output

+        # External components
+        # 外部组件
         self.middleware = middleware
         self.handler = handler
         self.proxy = proxy
         self.dupefilter = dupefilter

+        # Concurrency and delay settings
+        # 并发和延迟设置
         self.total_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS')
         self.get_requests_count: int = self.settings.getint('GET_REQUESTS_COUNT') or self.total_concurrency
         self.domain_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
         self.ip_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
         self.randomize_delay: bool = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')

-
-
+        # State
+        # 状态
+        self.active: Set[Request] = set()  # All active requests
+        # 所有活动请求
+        self.slots: dict = {}  # Domain/IP -> Slot mapping
+        # 域名/IP -> 槽映射
         self.running: bool = True
+
+        # Start slot garbage collector
+        # 启动槽垃圾收集器
         create_task(self._slot_gc(60))

     @classmethod
     async def from_crawler(cls, crawler) -> "Downloader":
+        """
+        Create a downloader instance from a crawler.
+        从爬虫创建下载器实例。
+
+        This factory method creates and initializes a downloader with all the
+        necessary components from the crawler.
+        此工厂方法创建并初始化具有爬虫中所有必要组件的下载器。
+
+        Args:
+            crawler: The crawler instance that will use this downloader.
+                将使用此下载器的爬虫实例。
+
+        Returns:
+            Downloader: A new downloader instance.
+                一个新的下载器实例。
+        """
+        # Initialize dupefilter if configured
+        # 如果已配置,则初始化重复过滤器
         df = crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(crawler.settings['DUPEFILTER_CLASS'],
                                                                               crawler=crawler)
+        # Bind dupefilter to spider for access in spider callbacks
+        # 将重复过滤器绑定到爬虫,以便在爬虫回调中访问
         crawler.spider.dupefilter = df  # 将指纹绑定到Spider 在解析成功的时候 调用DUPEFILTER_CLASS的success方法
+
+        # Initialize proxy handler if configured
+        # 如果已配置,则初始化代理处理程序
+        proxy_handler = crawler.settings.get("PROXY_HANDLER") and await load_instance(
+            crawler.settings["PROXY_HANDLER"],
+            crawler=crawler
+        )
+
         return cls(
             crawler,
             await call_helper(DownloadHandlerManager.from_crawler, crawler),
             await call_helper(DownloaderMiddlewareManager.from_crawler, crawler),
-            proxy=
-            crawler=crawler),
+            proxy=proxy_handler,
             dupefilter=df
         )

     async def fetch(self, request: Request) -> None:
+        """
+        Fetch a request.
+        获取请求。
+
+        This method adds the request to the appropriate download slot and
+        starts processing the queue if possible.
+        此方法将请求添加到适当的下载槽,并在可能的情况下开始处理队列。
+
+        Args:
+            request: The request to fetch.
+                要获取的请求。
+        """
+        # Add to global active requests set
+        # 添加到全局活动请求集
         self.active.add(request)
+
+        # Get the appropriate slot for this request
+        # 获取此请求的适当槽
         key, slot = self._get_slot(request, self.spider)
         request.meta[self.DOWNLOAD_SLOT] = key

+        # Add to slot's active and queue sets
+        # 添加到槽的活动和队列集
         slot.active.add(request)
         slot.queue.append(request)
+
+        # Start processing the queue
+        # 开始处理队列
         await self._process_queue(slot)

     async def _process_queue(self, slot: Slot) -> None:
+        """
+        Process the request queue for a slot.
+        处理槽的请求队列。
+
+        This method handles the download delay between requests and starts
+        downloading requests when slots are available.
+        此方法处理请求之间的下载延迟,并在槽可用时开始下载请求。
+
+        Args:
+            slot: The slot whose queue should be processed.
+                应处理其队列的槽。
+        """
+        # If the slot is already waiting for a delay, don't process again
+        # 如果槽已经在等待延迟,则不要再次处理
         if slot.delay_lock:
             return

         now = time()
         delay = slot.download_delay()
+
+        # Handle download delay between requests
+        # 处理请求之间的下载延迟
         if delay:
             penalty = delay - now + slot.lastseen
             if penalty > 0:
+                # Need to wait before processing next request
+                # 需要等待才能处理下一个请求
                 slot.delay_lock = True
                 await asyncio.sleep(penalty)
                 slot.delay_lock = False
+                # Schedule another processing after the delay
+                # 延迟后安排另一次处理
                 create_task(self._process_queue(slot))
                 return

+        # Process as many queued requests as possible
+        # 尽可能多地处理排队的请求
         while slot.queue and slot.free_transfer_slots() > 0:
             request = slot.queue.popleft()
             slot.transferring.add(request)
             create_task(self._download(slot, request))
+            # If there's a delay, only process one request at a time
+            # 如果有延迟,一次只处理一个请求
             if delay:
                 break

     async def _download(self, slot: Slot, request: Request) -> None:
+        """
+        Download a request and process the result.
+        下载请求并处理结果。
+
+        This method handles the entire download process including:
+        此方法处理整个下载过程,包括:
+
+        1. Duplicate filtering
+           重复过滤
+        2. Middleware processing
+           中间件处理
+        3. Actual downloading
+           实际下载
+        4. Proxy handling
+           代理处理
+        5. Response processing
+           响应处理
+        6. Exception handling
+           异常处理
+        7. Cleanup and callback
+           清理和回调
+
+        Args:
+            slot: The slot that the request belongs to.
+                请求所属的槽。
+            request: The request to download.
+                要下载的请求。
+        """
         result = None
         try:
+            # Check if request is a duplicate
+            # 检查请求是否重复
             if self.dupefilter and not request.dont_filter and await self.dupefilter.request_seen(request):
                 self.dupefilter.log(request, self.spider)
                 return
+
+            # Update last seen timestamp
+            # 更新上次看到的时间戳
             slot.lastseen = time()
+
+            # Process request through middleware
+            # 通过中间件处理请求
             result = await self.middleware.process_request(self.spider, request)
+
+            # If middleware didn't return a response, download the request
+            # 如果中间件没有返回响应,则下载请求
             if result is None:
+                # Add proxy if available
+                # 如果可用,添加代理
                 self.proxy and await self.proxy.add_proxy(request)
                 result = await self.handler.download_request(request, self.spider)
         except BaseException as exc:
+            # Handle exceptions
+            # 处理异常
             self.proxy and self.proxy.check(request, exception=exc)
             result = await self.middleware.process_exception(self.spider, request, exc)
         else:
+            # Process successful response
+            # 处理成功的响应
             if isinstance(result, Response):
                 try:
+                    # Check proxy status with response
+                    # 使用响应检查代理状态
                     self.proxy and self.proxy.check(request, response=result)
                     result = await self.middleware.process_response(self.spider, request, result)
                 except BaseException as exc:
                     result = exc
         finally:
+            # Cleanup: remove request from all tracking collections
+            # 清理:从所有跟踪集合中删除请求
             slot.transferring.remove(request)
             slot.active.remove(request)
             self.active.remove(request)

+            # Send signal if we got a response
+            # 如果我们得到响应,发送信号
             if isinstance(result, Response):
                 await self.signals.send_catch_log(signal=signals.response_downloaded,
                                                   response=result,
                                                   request=request,
                                                   spider=self.spider)
-
+
+            # Update dupefilter with request status
+            # 使用请求状态更新重复过滤器
             self.dupefilter and \
                 not request.dont_filter and \
                 await self.dupefilter.done(request, done_type="request_ok" if isinstance(result, Response) else "request_err")

+            # Send result to engine and process next request
+            # 将结果发送到引擎并处理下一个请求
             await self._call_engine(result, request)
             await self._process_queue(slot)

     async def close(self) -> None:
+        """
+        Close the downloader and release its resources.
+        关闭下载器并释放其资源。
+
+        This method stops the downloader from accepting new requests and
+        closes the dupefilter if one is being used.
+        此方法停止下载器接受新请求,并在使用重复过滤器时关闭它。
+        """
+        # Stop accepting new requests
+        # 停止接受新请求
         self.running = False
+
+        # Close the dupefilter if one exists
+        # 如果存在重复过滤器,则关闭它
         self.dupefilter and await self.dupefilter.close()

     async def _slot_gc(self, age=60):
+        """
+        Garbage collector for download slots.
+        下载槽的垃圾收集器。
+
+        This method periodically checks for inactive slots and removes them
+        to free up memory.
+        此方法定期检查不活动的槽并删除它们以释放内存。
+
+        Args:
+            age: The minimum age in seconds for a slot to be considered for removal.
+                槽被考虑删除的最小年龄(秒)。
+        """
         while self.running:
+            # Wait for the specified age before checking
+            # 在检查之前等待指定的年龄
             await asyncio.sleep(age)
+
+            # Iterate through a copy of the slots to avoid modification during iteration
+            # 遍历槽的副本以避免在迭代期间修改
             for key, slot in list(self.slots.items()):
+                # Log slot state for debugging
+                # 记录槽状态以进行调试
                 logger.debug(slot)
+
+                # Remove slots that have been inactive for at least 'age' seconds
+                # 删除至少'age'秒不活动的槽
                 if not slot.active and slot.lastseen + slot.delay < (time() - age):
                     self.slots.pop(key)

-    def needs_backout(self):
+    def needs_backout(self) -> bool:
+        """
+        Check if the downloader needs to stop accepting new requests.
+        检查下载器是否停止接受新请求。
+
+        This method checks if the downloader has reached its maximum concurrency
+        limit and should not accept new requests.
+        此方法检查下载器是否已达到其最大并发限制,并且不应接受新请求。
+
+        Returns:
+            bool: True if the downloader is at capacity and should not accept
+                new requests, False otherwise.
+                如果下载器已达到容量并且不应接受新请求,则为True,否则为False。
+        """
         return len(self.active) >= self.total_concurrency

     def _get_slot(self, request, spider):
+        """
+        Get or create a download slot for a request.
+        获取或创建请求的下载槽。
+
+        This method determines which slot a request should use based on its
+        domain, IP, or custom slot key, and creates the slot if it doesn't exist.
+        此方法根据请求的域名、IP或自定义槽键确定请求应使用哪个槽,如果槽不存在则创建它。
+
+        Args:
+            request: The request to get a slot for.
+                要获取槽的请求。
+            spider: The spider making the request.
+                发出请求的爬虫。
+
+        Returns:
+            Tuple[str, Slot]: A tuple containing the slot key and the slot object.
+                包含槽键和槽对象的元组。
+        """
+        # Get the slot key for this request
+        # 获取此请求的槽键
         key = self._get_slot_key(request, spider)
+
+        # Create the slot if it doesn't exist
+        # 如果槽不存在,则创建它
         if key not in self.slots:
+            # Determine concurrency based on settings
+            # 根据设置确定并发
             conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
+
+            # Get spider-specific concurrency and delay
+            # 获取爬虫特定的并发和延迟
             conc, delay = _get_concurrency_delay(conc, spider, self.settings)
+
+            # Create a new slot with the determined settings
+            # 使用确定的设置创建新槽
             self.slots[key] = Slot(conc, delay, self.randomize_delay)
+
         return key, self.slots[key]

     def _get_slot_key(self, request, spider):
+        """
+        Get the key for determining which download slot to use for a request.
+        获取用于确定请求使用哪个下载槽的键。
+
+        The slot key is determined in the following order:
+        槽键按以下顺序确定:
+
+        1. Custom slot from request.meta['download_slot'] if present
+           如果存在,则从request.meta['download_slot']获取自定义槽
+        2. Proxy address if IP concurrency is enabled
+           如果启用了IP并发,则使用代理地址
+        3. Request hostname
+           请求主机名
+
+        Args:
+            request: The request to get a slot key for.
+                要获取槽键的请求。
+            spider: The spider making the request (not used in this implementation
+                but kept for interface consistency).
+                发出请求的爬虫(在此实现中未使用,但保留以保持接口一致性)。
+                This parameter is included to maintain a consistent interface with
+                other methods that might need the spider instance, and to allow
+                subclasses to use it if needed.
+                包含此参数是为了保持与可能需要爬虫实例的其他方法的一致接口,
+                并允许子类在需要时使用它。
+
+        Returns:
+            str: The slot key for the request.
+                请求的槽键。
+        """
+        # Check for custom slot in request meta
+        # 检查请求元数据中的自定义槽
         if self.DOWNLOAD_SLOT in request.meta:
             return request.meta[self.DOWNLOAD_SLOT]

+        # Use proxy as key if IP concurrency is enabled
+        # 如果启用了IP并发,则使用代理作为键
         if self.ip_concurrency:
             return request.meta.get("proxy", '')
+        # Otherwise use hostname
+        # 否则使用主机名
         else:
             return urlparse_cached(request).hostname or ''