aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/core/scheduler.py
CHANGED
@@ -1,3 +1,30 @@
+"""
+Scheduler Module
+调度器模块
+
+This module provides the scheduler components for AioScrapy. The scheduler is responsible
+for managing the request queue, deciding which requests should be processed next, and
+ensuring that requests are properly prioritized and deduplicated.
+此模块为AioScrapy提供调度器组件。调度器负责管理请求队列,决定下一步应处理哪些请求,
+并确保请求被正确地优先级排序和去重。
+
+The main components are:
+主要组件包括:
+
+1. BaseSchedulerMeta: Metaclass that defines the required interface for schedulers
+   定义调度器所需接口的元类
+2. BaseScheduler: Abstract base class that all schedulers must inherit from
+   所有调度器必须继承的抽象基类
+3. Scheduler: Default implementation of the scheduler with support for persistent
+   queues and in-memory caching
+   支持持久化队列和内存缓存的默认调度器实现
+
+Schedulers work with queue implementations to store and retrieve requests efficiently,
+and can be configured to persist requests between runs or to use in-memory caching
+for faster access.
+调度器与队列实现一起工作,以高效地存储和检索请求,并且可以配置为在运行之间持久化请求
+或使用内存缓存以便更快地访问。
+"""
 from abc import abstractmethod
 from typing import Optional, Type, TypeVar, List

@@ -11,13 +38,49 @@ from aioscrapy.utils.log import logger

 class BaseSchedulerMeta(type):
     """
-    Metaclass to check scheduler classes against the necessary interface
+    Metaclass to check scheduler classes against the necessary interface.
+    用于检查调度器类是否实现了必要接口的元类。
+
+    This metaclass ensures that any class claiming to be a scheduler
+    implements all the required methods.
+    此元类确保任何声称是调度器的类都实现了所有必需的方法。
     """

     def __instancecheck__(cls, instance):
+        """
+        Check if an instance is an instance of this class.
+        检查实例是否是此类的实例。
+
+        Args:
+            instance: The instance to check.
+                要检查的实例。
+
+        Returns:
+            bool: True if the instance implements the required interface.
+                如果实例实现了所需的接口,则为True。
+        """
         return cls.__subclasscheck__(type(instance))

     def __subclasscheck__(cls, subclass):
+        """
+        Check if a class is a subclass of this class.
+        检查类是否是此类的子类。
+
+        A class is considered a subclass if it implements the required methods:
+        如果类实现了所需的方法,则被视为子类:
+        - has_pending_requests: Check if there are pending requests
+        - enqueue_request: Add a request to the queue
+        - enqueue_request_batch: Add multiple requests to the queue
+        - next_request: Get the next request from the queue
+
+        Args:
+            subclass: The class to check.
+                要检查的类。
+
+        Returns:
+            bool: True if the class implements the required interface.
+                如果类实现了所需的接口,则为True。
+        """
         return (
             hasattr(subclass, "has_pending_requests") and callable(subclass.has_pending_requests)
             and hasattr(subclass, "enqueue_request") and callable(subclass.enqueue_request)
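Because BaseSchedulerMeta overrides __instancecheck__ and __subclasscheck__, the scheduler check is duck-typed rather than inheritance-based. A tiny illustration, not part of the package (it only assumes the module path shown in this diff):

from aioscrapy.core.scheduler import BaseScheduler

class DuckScheduler:
    # Any class exposing the four required callables passes the check,
    # even though it never inherits from BaseScheduler.
    async def has_pending_requests(self): return False
    async def enqueue_request(self, request): return True
    async def enqueue_request_batch(self, requests): return True
    async def next_request(self): return None

print(isinstance(DuckScheduler(), BaseScheduler))  # True: interface check, not inheritance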
@@ -27,68 +90,149 @@ class BaseSchedulerMeta(type):


 class BaseScheduler(metaclass=BaseSchedulerMeta):
+    """
+    Base class for schedulers.
+    调度器的基类。
+
+    This class defines the interface that all schedulers must implement.
+    此类定义了所有调度器必须实现的接口。
+    """

     @classmethod
     async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "BaseScheduler":
         """
-        Factory method
+        Factory method to create a scheduler from a crawler.
+        从爬虫创建调度器的工厂方法。
+
+        This method receives the current crawler object and returns a new
+        scheduler instance. In the base class, the crawler parameter is not used,
+        but subclasses can override this method to use crawler settings or other
+        attributes to configure the scheduler.
+        此方法接收当前的爬虫对象并返回一个新的调度器实例。在基类中,crawler参数未被使用,
+        但子类可以覆盖此方法以使用爬虫设置或其他属性来配置调度器。
+
+        Args:
+            crawler: The crawler instance that will use this scheduler.
+                将使用此调度器的爬虫实例。
+                This parameter is not used in the base implementation but is
+                provided for subclasses to use.
+                此参数在基本实现中未使用,但提供给子类使用。
+
+        Returns:
+            BaseScheduler: A new scheduler instance.
+                一个新的调度器实例。
         """
+        # The crawler parameter is intentionally unused in the base implementation
+        # 在基本实现中有意不使用crawler参数
+        # pylint: disable=unused-argument
         return cls()

     async def close(self, reason: str) -> None:
         """
-
-
+        Close the scheduler.
+        关闭调度器。

-
-
+        Called when the spider is closed by the engine. It receives the reason why the crawl
+        finished as argument and it's useful to execute cleaning code. In the base class,
+        this method does nothing, but subclasses can override it to perform cleanup operations.
+        当爬虫被引擎关闭时调用。它接收爬取完成的原因作为参数,对于执行清理代码很有用。
+        在基类中,此方法不执行任何操作,但子类可以覆盖它以执行清理操作。
+
+        Args:
+            reason: A string which describes the reason why the spider was closed.
+                描述爬虫关闭原因的字符串。
+                Common values include 'finished', 'cancelled', or 'shutdown'.
+                常见值包括'finished'(完成)、'cancelled'(取消)或'shutdown'(关闭)。
+                This parameter is not used in the base implementation but is
+                provided for subclasses to use.
+                此参数在基本实现中未使用,但提供给子类使用。
         """
+        # The reason parameter is intentionally unused in the base implementation
+        # 在基本实现中有意不使用reason参数
+        # pylint: disable=unused-argument
         pass

     @abstractmethod
     async def has_pending_requests(self) -> bool:
         """
-
+        Check if the scheduler has pending requests.
+        检查调度器是否有待处理的请求。
+
+        Returns:
+            bool: True if the scheduler has enqueued requests, False otherwise.
+                如果调度器有排队的请求,则为True,否则为False。
         """
         raise NotImplementedError()

     @abstractmethod
     async def enqueue_request_batch(self, requests: List[aioscrapy.Request]) -> bool:
         """
-        Process a batch requests received by the engine.
+        Process a batch of requests received by the engine.
+        处理引擎接收到的一批请求。
+
+        This method adds multiple requests to the scheduler's queue at once.
+        此方法一次将多个请求添加到调度器的队列中。
+
+        Args:
+            requests: A list of requests to enqueue.
+                要排队的请求列表。

-
+        Returns:
+            bool: True if the requests are stored correctly, False otherwise.
+                如果请求正确存储,则为True,否则为False。

-
-
-
-
+        Notes:
+            If False is returned, the engine will fire a request_dropped signal,
+            and will not make further attempts to schedule the requests at a later time.
+            如果返回False,引擎将触发request_dropped信号,并且不会在稍后尝试调度请求。
         """
         raise NotImplementedError()

     @abstractmethod
     async def enqueue_request(self, request: aioscrapy.Request) -> bool:
         """
-        Process a request received by the engine.
+        Process a single request received by the engine.
+        处理引擎接收到的单个请求。

-
+        This method adds a request to the scheduler's queue.
+        此方法将请求添加到调度器的队列中。

-
-
-
-
+        Args:
+            request: The request to enqueue.
+                要排队的请求。
+
+        Returns:
+            bool: True if the request is stored correctly, False otherwise.
+                如果请求正确存储,则为True,否则为False。
+
+        Notes:
+            If False is returned, the engine will fire a request_dropped signal,
+            and will not make further attempts to schedule the request at a later time.
+            如果返回False,引擎将触发request_dropped信号,并且不会在稍后尝试调度请求。
         """
         raise NotImplementedError()

     @abstractmethod
     async def next_request(self) -> Optional[aioscrapy.Request]:
         """
-
-
-
-
-
-
+        Get the next request to be processed.
+        获取要处理的下一个请求。
+
+        This method returns the next request from the scheduler's queue,
+        or None if there are no requests ready to be processed.
+        此方法从调度器的队列中返回下一个请求,如果没有准备好处理的请求,则返回None。
+
+        Returns:
+            Optional[Request]: The next request to be processed, or None if there
+                are no requests ready at the moment.
+                要处理的下一个请求,如果当前没有准备好的请求,则为None。
+
+        Notes:
+            Returning None implies that no request from the scheduler will be sent
+            to the downloader in the current cycle. The engine will continue
+            calling next_request until has_pending_requests is False.
+            返回None意味着在当前周期中不会将调度器中的请求发送到下载器。
+            引擎将继续调用next_request,直到has_pending_requests为False。
         """
         raise NotImplementedError()

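For reference, a minimal in-memory scheduler satisfying this interface might look roughly as follows. This is an illustrative sketch, not code from the package: the class name and deque-based storage are invented here, and it only assumes the BaseScheduler signatures shown in the hunk above.

from collections import deque
from typing import List, Optional

import aioscrapy
from aioscrapy.core.scheduler import BaseScheduler


class InMemoryScheduler(BaseScheduler):
    def __init__(self) -> None:
        # Simple FIFO storage; real schedulers use the pluggable queue classes.
        self._pending: deque = deque()

    async def has_pending_requests(self) -> bool:
        return len(self._pending) > 0

    async def enqueue_request(self, request: aioscrapy.Request) -> bool:
        self._pending.append(request)
        return True

    async def enqueue_request_batch(self, requests: List[aioscrapy.Request]) -> bool:
        self._pending.extend(requests)
        return True

    async def next_request(self) -> Optional[aioscrapy.Request]:
        # Return None when nothing is ready, as the docstring above describes.
        return self._pending.popleft() if self._pending else None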
@@ -97,27 +241,74 @@ SchedulerTV = TypeVar("SchedulerTV", bound="Scheduler")


 class Scheduler(BaseScheduler):
+    """
+    Default scheduler implementation.
+    默认的调度器实现。
+
+    This scheduler manages requests using a queue implementation and optionally
+    a cache queue for faster access to requests.
+    此调度器使用队列实现来管理请求,并可选择使用缓存队列以更快地访问请求。
+    """

     def __init__(
         self,
         queue: AbsQueue,
         spider: aioscrapy.Spider,
-        stats
+        stats: Optional[StatsCollector] = None,
         persist: bool = True,
         cache_queue: Optional[AbsQueue] = None
     ):
-
-
-
+        """
+        Initialize the scheduler.
+        初始化调度器。
+
+        Args:
+            queue: The main queue for storing requests.
+                存储请求的主队列。
+            spider: The spider that will use this scheduler.
+                将使用此调度器的爬虫。
+            stats: Optional stats collector for tracking metrics.
+                用于跟踪指标的可选统计收集器。
+            persist: Whether to persist the queue between runs.
+                是否在运行之间持久化队列。
+            cache_queue: Optional in-memory cache queue for faster access.
+                用于更快访问的可选内存缓存队列。
+        """
+        self.queue = queue  # Main queue (e.g., Redis queue)
+        # 主队列(例如,Redis队列)
+        self.cache_queue = cache_queue  # Optional in-memory cache queue
+        # 可选的内存缓存队列
         self.spider = spider
         self.stats = stats
-        self.persist = persist
+        self.persist = persist  # Whether to persist the queue between runs
+        # 是否在运行之间持久化队列

     @classmethod
     async def from_crawler(cls: Type[SchedulerTV], crawler: "aioscrapy.Crawler") -> SchedulerTV:
+        """
+        Create a scheduler from a crawler.
+        从爬虫创建调度器。
+
+        This factory method creates a scheduler instance with the appropriate
+        queue implementation and settings from the crawler.
+        此工厂方法使用来自爬虫的适当队列实现和设置创建调度器实例。
+
+        Args:
+            crawler: The crawler instance that will use this scheduler.
+                将使用此调度器的爬虫实例。
+
+        Returns:
+            Scheduler: A new scheduler instance.
+                一个新的调度器实例。
+        """
+        # Initialize cache queue if enabled in settings
+        # 如果在设置中启用,则初始化缓存队列
         cache_queue = None
         if crawler.settings.getbool('USE_SCHEDULER_QUEUE_CACHE', False):
             cache_queue = await load_instance('aioscrapy.queue.memory.SpiderPriorityQueue', spider=crawler.spider)
+
+        # Create scheduler instance with the main queue and settings
+        # 使用主队列和设置创建调度器实例
         instance = cls(
             await load_instance(crawler.settings['SCHEDULER_QUEUE_CLASS'], spider=crawler.spider),
             crawler.spider,
@@ -126,34 +317,93 @@ class Scheduler(BaseScheduler):
             cache_queue=cache_queue
         )

+        # Flush the queue if requested in settings
+        # 如果在设置中请求,则刷新队列
         if crawler.settings.getbool('SCHEDULER_FLUSH_ON_START', False):
             await instance.flush()

+        # Log the number of pending requests if any
+        # 如果有,记录待处理请求的数量
         count = await call_helper(instance.queue.len)
         count and logger.info("Resuming crawl (%d requests scheduled)" % count)

         return instance

     async def close(self, reason: str) -> None:
+        """
+        Close the scheduler.
+        关闭调度器。
+
+        This method is called when the spider is closed. It handles cleanup
+        operations, including flushing the queue if persistence is disabled,
+        or moving cached requests back to the main queue if persistence is enabled.
+        当爬虫关闭时调用此方法。它处理清理操作,包括如果禁用持久性则刷新队列,
+        或者如果启用持久性则将缓存的请求移回主队列。
+
+        Args:
+            reason: The reason why the spider was closed.
+                爬虫关闭的原因。
+                Common values include 'finished', 'cancelled', or 'shutdown'.
+                常见值包括'finished'(完成)、'cancelled'(取消)或'shutdown'(关闭)。
+                This parameter is not used in the current implementation but might
+                be used in future versions or subclasses to customize cleanup behavior.
+                此参数在当前实现中未使用,但可能在未来版本或子类中用于自定义清理行为。
+        """
+        # The reason parameter is not used in the current implementation
+        # 当前实现中未使用reason参数
+        # pylint: disable=unused-argument

+        # If persistence is disabled, clear the queue
+        # 如果禁用持久性,则清除队列
         if not self.persist:
             await self.flush()
             return

-        #
+        # If persistence is enabled and we have a cache queue,
+        # move all cached requests back to the main queue
+        # 如果启用持久性并且我们有缓存队列,则将所有缓存的请求移回主队列
         if self.cache_queue is not None:
+            # Process in batches of 2000 to avoid memory issues
+            # 以2000个批次处理,以避免内存问题
             while True:
                 temp = []
                 async for request in self.cache_queue.pop(2000):
                     temp.append(request)
+                # Push the batch to the main queue if not empty
+                # 如果不为空,则将批次推送到主队列
                 temp and await self.queue.push_batch(temp)
+                # Break if we got less than a full batch (end of queue)
+                # 如果我们得到的批次不足(队列结束),则中断
                 if len(temp) < 2000:
                     break

     async def flush(self) -> None:
+        """
+        Clear the scheduler's queue.
+        清除调度器的队列。
+
+        This method removes all pending requests from the queue.
+        此方法从队列中删除所有待处理的请求。
+        """
         await call_helper(self.queue.clear)

     async def enqueue_request_batch(self, requests: List[aioscrapy.Request]) -> bool:
+        """
+        Add multiple requests to the queue at once.
+        一次将多个请求添加到队列中。
+
+        This method adds a batch of requests directly to the main queue
+        and updates the stats if enabled.
+        此方法将一批请求直接添加到主队列,并在启用时更新统计信息。
+
+        Args:
+            requests: A list of requests to enqueue.
+                要排队的请求列表。
+
+        Returns:
+            bool: Always returns True, indicating the requests were accepted.
+                始终返回True,表示请求被接受。
+        """
         await call_helper(self.queue.push_batch, requests)
         if self.stats:
             self.stats.inc_value(self.queue.inc_key, count=len(requests), spider=self.spider)
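As a rough example of how the options read in from_crawler above might be supplied, a hypothetical settings fragment is sketched below. Only the option names and the queue path visible in this diff are used; the values are arbitrary, and the default for SCHEDULER_QUEUE_CLASS is not shown in this diff.

# Hypothetical project settings wiring up the scheduler shown above.
SCHEDULER_QUEUE_CLASS = 'aioscrapy.queue.memory.SpiderPriorityQueue'  # main queue backend (in-memory path seen in the diff)
USE_SCHEDULER_QUEUE_CACHE = True   # also front the main queue with an in-memory cache queue
SCHEDULER_FLUSH_ON_START = False   # True would clear any persisted requests when the crawl starts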
@@ -161,7 +411,22 @@

     async def enqueue_request(self, request: aioscrapy.Request) -> bool:
         """
-
+        Add a single request to the queue.
+        将单个请求添加到队列中。
+
+        If a cache queue is enabled (USE_SCHEDULER_QUEUE_CACHE), the request
+        is added to the cache queue first for faster access. Otherwise, it's
+        added directly to the main queue.
+        如果启用了缓存队列(USE_SCHEDULER_QUEUE_CACHE),则优先将请求添加到缓存队列中
+        以便更快地访问。否则,它将直接添加到主队列中。
+
+        Args:
+            request: The request to enqueue.
+                要排队的请求。
+
+        Returns:
+            bool: Always returns True, indicating the request was accepted.
+                始终返回True,表示请求被接受。
         """
         if self.cache_queue is not None:
             await call_helper(self.cache_queue.push, request)
@@ -171,11 +436,38 @@
             self.stats.inc_value(self.queue.inc_key, spider=self.spider)
         return True

-    async def next_request(self, count: int = 1)
+    async def next_request(self, count: int = 1):
         """
-
+        Get the next request(s) to be processed.
+        获取要处理的下一个请求。
+
+        This method is an async generator that yields requests from the queue.
+        If a cache queue is enabled (USE_SCHEDULER_QUEUE_CACHE), it first tries
+        to get requests from the cache queue, then falls back to the main queue.
+        此方法是一个异步生成器,从队列中产生请求。
+        如果启用了缓存队列(USE_SCHEDULER_QUEUE_CACHE),它首先尝试从缓存队列中获取请求,
+        然后回退到主队列。
+
+        Note: This implementation differs from the BaseScheduler.next_request abstract
+        method, which returns a single request or None. This implementation is an
+        async generator that can yield multiple requests, making it more efficient
+        for batch processing.
+        注意:此实现与BaseScheduler.next_request抽象方法不同,后者返回单个请求或None。
+        此实现是一个异步生成器,可以产生多个请求,使其更适合批处理。
+
+        Args:
+            count: Maximum number of requests to return.
+                要返回的最大请求数。
+                Defaults to 1.
+                默认为1。
+
+        Yields:
+            Request: The next request(s) to be processed.
+                要处理的下一个请求。
         """
         flag = False
+        # First try to get requests from the cache queue if available
+        # 如果可用,首先尝试从缓存队列获取请求
         if self.cache_queue is not None:
             async for request in self.cache_queue.pop(count):
                 if request and self.stats:
@@ -183,14 +475,36 @@
                     yield request
                     flag = True

+            # If we got requests from the cache queue, we're done
+            # 如果我们从缓存队列获取了请求,我们就完成了
             if flag:
                 return

+        # Otherwise, get requests from the main queue
+        # 否则,从主队列获取请求
         async for request in self.queue.pop(count):
             if request and self.stats:
                 self.stats.inc_value(self.queue.inc_key, spider=self.spider)
             yield request

     async def has_pending_requests(self) -> bool:
-
-
+        """
+        Check if the scheduler has pending requests.
+        检查调度器是否有待处理的请求。
+
+        This method checks both the main queue and the cache queue (if enabled)
+        to determine if there are any pending requests.
+        此方法检查主队列和缓存队列(如果启用)以确定是否有任何待处理的请求。
+
+        Returns:
+            bool: True if there are pending requests, False otherwise.
+                如果有待处理的请求,则为True,否则为False。
+        """
+        # If no cache queue, just check the main queue
+        # 如果没有缓存队列,只检查主队列
+        if self.cache_queue is None:
+            return await call_helper(self.queue.len) > 0
+        # Otherwise, check both queues
+        # 否则,检查两个队列
+        else:
+            return (await call_helper(self.queue.len) + await call_helper(self.cache_queue.len)) > 0