aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,30 @@
+"""
+Scheduler Module
+调度器模块
+
+This module provides the scheduler components for AioScrapy. The scheduler is responsible
+for managing the request queue, deciding which requests should be processed next, and
+ensuring that requests are properly prioritized and deduplicated.
+此模块为AioScrapy提供调度器组件。调度器负责管理请求队列,决定下一步应处理哪些请求,
+并确保请求被正确地优先级排序和去重。
+
+The main components are:
+主要组件包括:
+
+1. BaseSchedulerMeta: Metaclass that defines the required interface for schedulers
+   定义调度器所需接口的元类
+2. BaseScheduler: Abstract base class that all schedulers must inherit from
+   所有调度器必须继承的抽象基类
+3. Scheduler: Default implementation of the scheduler with support for persistent
+   queues and in-memory caching
+   支持持久化队列和内存缓存的默认调度器实现
+
+Schedulers work with queue implementations to store and retrieve requests efficiently,
+and can be configured to persist requests between runs or to use in-memory caching
+for faster access.
+调度器与队列实现一起工作,以高效地存储和检索请求,并且可以配置为在运行之间持久化请求
+或使用内存缓存以便更快地访问。
+"""
 from abc import abstractmethod
 from typing import Optional, Type, TypeVar, List
 
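The new module docstring describes a scheduler that can either persist requests between runs or put a fast in-memory cache queue in front of the main queue. A minimal sketch of how the settings referenced later in this diff (SCHEDULER_QUEUE_CLASS, USE_SCHEDULER_QUEUE_CACHE, SCHEDULER_FLUSH_ON_START) might be combined in a project's settings.py; the values are illustrative, and using the in-memory priority queue as the main queue class is an assumption, not something this diff prescribes:

    # settings.py (sketch; setting names are taken from this diff, values are illustrative)
    SCHEDULER_QUEUE_CLASS = 'aioscrapy.queue.memory.SpiderPriorityQueue'  # main request queue implementation
    USE_SCHEDULER_QUEUE_CACHE = True    # also keep an in-memory cache queue in front of the main queue
    SCHEDULER_FLUSH_ON_START = False    # set True to clear previously persisted requests at startup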
@@ -11,13 +38,49 @@ from aioscrapy.utils.log import logger
 
 class BaseSchedulerMeta(type):
     """
-    Metaclass to check scheduler classes against the necessary interface
+    Metaclass to check scheduler classes against the necessary interface.
+    用于检查调度器类是否实现了必要接口的元类。
+
+    This metaclass ensures that any class claiming to be a scheduler
+    implements all the required methods.
+    此元类确保任何声称是调度器的类都实现了所有必需的方法。
     """
 
     def __instancecheck__(cls, instance):
+        """
+        Check if an instance is an instance of this class.
+        检查实例是否是此类的实例。
+
+        Args:
+            instance: The instance to check.
+                要检查的实例。
+
+        Returns:
+            bool: True if the instance implements the required interface.
+                如果实例实现了所需的接口,则为True。
+        """
         return cls.__subclasscheck__(type(instance))
 
     def __subclasscheck__(cls, subclass):
+        """
+        Check if a class is a subclass of this class.
+        检查类是否是此类的子类。
+
+        A class is considered a subclass if it implements the required methods:
+        如果类实现了所需的方法,则被视为子类:
+        - has_pending_requests: Check if there are pending requests
+        - enqueue_request: Add a request to the queue
+        - enqueue_request_batch: Add multiple requests to the queue
+        - next_request: Get the next request from the queue
+
+        Args:
+            subclass: The class to check.
+                要检查的类。
+
+        Returns:
+            bool: True if the class implements the required interface.
+                如果类实现了所需的接口,则为True。
+        """
         return (
             hasattr(subclass, "has_pending_requests") and callable(subclass.has_pending_requests)
             and hasattr(subclass, "enqueue_request") and callable(subclass.enqueue_request)
@@ -27,68 +90,149 @@ class BaseSchedulerMeta(type):
 
 
 class BaseScheduler(metaclass=BaseSchedulerMeta):
+    """
+    Base class for schedulers.
+    调度器的基类。
+
+    This class defines the interface that all schedulers must implement.
+    此类定义了所有调度器必须实现的接口。
+    """
 
     @classmethod
     async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "BaseScheduler":
         """
-        Factory method which receives the current :class:`~aioscrapy.crawler.Crawler` object as argument.
+        Factory method to create a scheduler from a crawler.
+        从爬虫创建调度器的工厂方法。
+
+        This method receives the current crawler object and returns a new
+        scheduler instance. In the base class, the crawler parameter is not used,
+        but subclasses can override this method to use crawler settings or other
+        attributes to configure the scheduler.
+        此方法接收当前的爬虫对象并返回一个新的调度器实例。在基类中,crawler参数未被使用,
+        但子类可以覆盖此方法以使用爬虫设置或其他属性来配置调度器。
+
+        Args:
+            crawler: The crawler instance that will use this scheduler.
+                将使用此调度器的爬虫实例。
+                This parameter is not used in the base implementation but is
+                provided for subclasses to use.
+                此参数在基本实现中未使用,但提供给子类使用。
+
+        Returns:
+            BaseScheduler: A new scheduler instance.
+                一个新的调度器实例。
         """
+        # The crawler parameter is intentionally unused in the base implementation
+        # 在基本实现中有意不使用crawler参数
+        # pylint: disable=unused-argument
         return cls()
 
     async def close(self, reason: str) -> None:
         """
-        Called when the spider is closed by the engine. It receives the reason why the crawl
-        finished as argument and it's useful to execute cleaning code.
+        Close the scheduler.
+        关闭调度器。
 
-        :param reason: a string which describes the reason why the spider was closed
-        :type reason: :class:`str`
+        Called when the spider is closed by the engine. It receives the reason why the crawl
+        finished as argument and it's useful to execute cleaning code. In the base class,
+        this method does nothing, but subclasses can override it to perform cleanup operations.
+        当爬虫被引擎关闭时调用。它接收爬取完成的原因作为参数,对于执行清理代码很有用。
+        在基类中,此方法不执行任何操作,但子类可以覆盖它以执行清理操作。
+
+        Args:
+            reason: A string which describes the reason why the spider was closed.
+                描述爬虫关闭原因的字符串。
+                Common values include 'finished', 'cancelled', or 'shutdown'.
+                常见值包括'finished'(完成)、'cancelled'(取消)或'shutdown'(关闭)。
+                This parameter is not used in the base implementation but is
+                provided for subclasses to use.
+                此参数在基本实现中未使用,但提供给子类使用。
         """
+        # The reason parameter is intentionally unused in the base implementation
+        # 在基本实现中有意不使用reason参数
+        # pylint: disable=unused-argument
         pass
 
     @abstractmethod
     async def has_pending_requests(self) -> bool:
         """
-        ``True`` if the scheduler has enqueued requests, ``False`` otherwise
+        Check if the scheduler has pending requests.
+        检查调度器是否有待处理的请求。
+
+        Returns:
+            bool: True if the scheduler has enqueued requests, False otherwise.
+                如果调度器有排队的请求,则为True,否则为False。
         """
         raise NotImplementedError()
 
     @abstractmethod
     async def enqueue_request_batch(self, requests: List[aioscrapy.Request]) -> bool:
         """
-        Process a batch requests received by the engine.
+        Process a batch of requests received by the engine.
+        处理引擎接收到的一批请求。
+
+        This method adds multiple requests to the scheduler's queue at once.
+        此方法一次将多个请求添加到调度器的队列中。
+
+        Args:
+            requests: A list of requests to enqueue.
+                要排队的请求列表。
 
-        Return ``True`` if the request is stored correctly, ``False`` otherwise.
+        Returns:
+            bool: True if the requests are stored correctly, False otherwise.
+                如果请求正确存储,则为True,否则为False。
 
-        If ``False``, the engine will fire a ``request_dropped`` signal, and
-        will not make further attempts to schedule the request at a later time.
-        For reference, the default Scrapy scheduler returns ``False`` when the
-        request is rejected by the dupefilter.
+        Notes:
+            If False is returned, the engine will fire a request_dropped signal,
+            and will not make further attempts to schedule the requests at a later time.
+            如果返回False,引擎将触发request_dropped信号,并且不会在稍后尝试调度请求。
         """
         raise NotImplementedError()
 
     @abstractmethod
     async def enqueue_request(self, request: aioscrapy.Request) -> bool:
         """
-        Process a request received by the engine.
+        Process a single request received by the engine.
+        处理引擎接收到的单个请求。
 
-        Return ``True`` if the request is stored correctly, ``False`` otherwise.
+        This method adds a request to the scheduler's queue.
+        此方法将请求添加到调度器的队列中。
 
-        If ``False``, the engine will fire a ``request_dropped`` signal, and
-        will not make further attempts to schedule the request at a later time.
-        For reference, the default Scrapy scheduler returns ``False`` when the
-        request is rejected by the dupefilter.
+        Args:
+            request: The request to enqueue.
+                要排队的请求。
+
+        Returns:
+            bool: True if the request is stored correctly, False otherwise.
+                如果请求正确存储,则为True,否则为False。
+
+        Notes:
+            If False is returned, the engine will fire a request_dropped signal,
+            and will not make further attempts to schedule the request at a later time.
+            如果返回False,引擎将触发request_dropped信号,并且不会在稍后尝试调度请求。
         """
         raise NotImplementedError()
 
     @abstractmethod
     async def next_request(self) -> Optional[aioscrapy.Request]:
         """
-        Return the next :class:`~scrapy.http.Request` to be processed, or ``None``
-        to indicate that there are no requests to be considered ready at the moment.
-
-        Returning ``None`` implies that no request from the scheduler will be sent
-        to the downloader in the current reactor cycle. The engine will continue
-        calling ``next_request`` until ``has_pending_requests`` is ``False``.
+        Get the next request to be processed.
+        获取要处理的下一个请求。
+
+        This method returns the next request from the scheduler's queue,
+        or None if there are no requests ready to be processed.
+        此方法从调度器的队列中返回下一个请求,如果没有准备好处理的请求,则返回None。
+
+        Returns:
+            Optional[Request]: The next request to be processed, or None if there
+                are no requests ready at the moment.
+                要处理的下一个请求,如果当前没有准备好的请求,则为None。
+
+        Notes:
+            Returning None implies that no request from the scheduler will be sent
+            to the downloader in the current cycle. The engine will continue
+            calling next_request until has_pending_requests is False.
+            返回None意味着在当前周期中不会将调度器中的请求发送到下载器。
+            引擎将继续调用next_request,直到has_pending_requests为False。
         """
         raise NotImplementedError()
 
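BaseSchedulerMeta makes isinstance checks against BaseScheduler structural rather than inheritance-based: any class exposing callable has_pending_requests, enqueue_request, enqueue_request_batch and next_request methods passes. A minimal sketch of a scheduler that satisfies that check without subclassing anything; the deque-backed class and the string requests are purely illustrative:

    import asyncio
    from collections import deque
    from typing import List, Optional

    from aioscrapy.core.scheduler import BaseScheduler  # the base class shown in this diff


    class InMemoryScheduler:
        """Illustrative scheduler: no inheritance, only the four required methods."""

        def __init__(self) -> None:
            self._pending: deque = deque()

        async def has_pending_requests(self) -> bool:
            return len(self._pending) > 0

        async def enqueue_request(self, request) -> bool:
            self._pending.append(request)
            return True

        async def enqueue_request_batch(self, requests: List) -> bool:
            self._pending.extend(requests)
            return True

        async def next_request(self) -> Optional[object]:
            return self._pending.popleft() if self._pending else None


    async def main() -> None:
        scheduler = InMemoryScheduler()
        # True: BaseSchedulerMeta.__subclasscheck__ only looks for the four callables above.
        print(isinstance(scheduler, BaseScheduler))
        await scheduler.enqueue_request_batch(["req-1", "req-2"])
        print(await scheduler.has_pending_requests())  # True
        print(await scheduler.next_request())          # req-1


    asyncio.run(main())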
@@ -97,27 +241,74 @@ SchedulerTV = TypeVar("SchedulerTV", bound="Scheduler")
 
 
 class Scheduler(BaseScheduler):
+    """
+    Default scheduler implementation.
+    默认的调度器实现。
+
+    This scheduler manages requests using a queue implementation and optionally
+    a cache queue for faster access to requests.
+    此调度器使用队列实现来管理请求,并可选择使用缓存队列以更快地访问请求。
+    """
 
     def __init__(
         self,
         queue: AbsQueue,
         spider: aioscrapy.Spider,
-        stats=Optional[StatsCollector],
+        stats: Optional[StatsCollector] = None,
         persist: bool = True,
         cache_queue: Optional[AbsQueue] = None
     ):
-
-        self.queue = queue
-        self.cache_queue = cache_queue
+        """
+        Initialize the scheduler.
+        初始化调度器。
+
+        Args:
+            queue: The main queue for storing requests.
+                存储请求的主队列。
+            spider: The spider that will use this scheduler.
+                将使用此调度器的爬虫。
+            stats: Optional stats collector for tracking metrics.
+                用于跟踪指标的可选统计收集器。
+            persist: Whether to persist the queue between runs.
+                是否在运行之间持久化队列。
+            cache_queue: Optional in-memory cache queue for faster access.
+                用于更快访问的可选内存缓存队列。
+        """
+        self.queue = queue  # Main queue (e.g., Redis queue)
+        # 主队列(例如,Redis队列)
+        self.cache_queue = cache_queue  # Optional in-memory cache queue
+        # 可选的内存缓存队列
         self.spider = spider
         self.stats = stats
-        self.persist = persist
+        self.persist = persist  # Whether to persist the queue between runs
+        # 是否在运行之间持久化队列
 
     @classmethod
     async def from_crawler(cls: Type[SchedulerTV], crawler: "aioscrapy.Crawler") -> SchedulerTV:
+        """
+        Create a scheduler from a crawler.
+        从爬虫创建调度器。
+
+        This factory method creates a scheduler instance with the appropriate
+        queue implementation and settings from the crawler.
+        此工厂方法使用来自爬虫的适当队列实现和设置创建调度器实例。
+
+        Args:
+            crawler: The crawler instance that will use this scheduler.
+                将使用此调度器的爬虫实例。
+
+        Returns:
+            Scheduler: A new scheduler instance.
+                一个新的调度器实例。
+        """
+        # Initialize cache queue if enabled in settings
+        # 如果在设置中启用,则初始化缓存队列
        cache_queue = None
         if crawler.settings.getbool('USE_SCHEDULER_QUEUE_CACHE', False):
             cache_queue = await load_instance('aioscrapy.queue.memory.SpiderPriorityQueue', spider=crawler.spider)
+
+        # Create scheduler instance with the main queue and settings
+        # 使用主队列和设置创建调度器实例
         instance = cls(
             await load_instance(crawler.settings['SCHEDULER_QUEUE_CLASS'], spider=crawler.spider),
             crawler.spider,
@@ -126,34 +317,93 @@ class Scheduler(BaseScheduler):
             cache_queue=cache_queue
         )
 
+        # Flush the queue if requested in settings
+        # 如果在设置中请求,则刷新队列
         if crawler.settings.getbool('SCHEDULER_FLUSH_ON_START', False):
             await instance.flush()
 
+        # Log the number of pending requests if any
+        # 如果有,记录待处理请求的数量
         count = await call_helper(instance.queue.len)
         count and logger.info("Resuming crawl (%d requests scheduled)" % count)
 
         return instance
 
     async def close(self, reason: str) -> None:
+        """
+        Close the scheduler.
+        关闭调度器。
+
+        This method is called when the spider is closed. It handles cleanup
+        operations, including flushing the queue if persistence is disabled,
+        or moving cached requests back to the main queue if persistence is enabled.
+        当爬虫关闭时调用此方法。它处理清理操作,包括如果禁用持久性则刷新队列,
+        或者如果启用持久性则将缓存的请求移回主队列。
+
+        Args:
+            reason: The reason why the spider was closed.
+                爬虫关闭的原因。
+                Common values include 'finished', 'cancelled', or 'shutdown'.
+                常见值包括'finished'(完成)、'cancelled'(取消)或'shutdown'(关闭)。
+                This parameter is not used in the current implementation but might
+                be used in future versions or subclasses to customize cleanup behavior.
+                此参数在当前实现中未使用,但可能在未来版本或子类中用于自定义清理行为。
+        """
+        # The reason parameter is not used in the current implementation
+        # 当前实现中未使用reason参数
+        # pylint: disable=unused-argument
 
+        # If persistence is disabled, clear the queue
+        # 如果禁用持久性,则清除队列
         if not self.persist:
             await self.flush()
             return
 
-        # 如果持久化,将缓存中的任务放回到redis等分布式队列中
+        # If persistence is enabled and we have a cache queue,
+        # move all cached requests back to the main queue
+        # 如果启用持久性并且我们有缓存队列,则将所有缓存的请求移回主队列
         if self.cache_queue is not None:
+            # Process in batches of 2000 to avoid memory issues
+            # 以2000个批次处理,以避免内存问题
             while True:
                 temp = []
                 async for request in self.cache_queue.pop(2000):
                     temp.append(request)
+                # Push the batch to the main queue if not empty
+                # 如果不为空,则将批次推送到主队列
                 temp and await self.queue.push_batch(temp)
+                # Break if we got less than a full batch (end of queue)
+                # 如果我们得到的批次不足(队列结束),则中断
                 if len(temp) < 2000:
                     break
 
     async def flush(self) -> None:
+        """
+        Clear the scheduler's queue.
+        清除调度器的队列。
+
+        This method removes all pending requests from the queue.
+        此方法从队列中删除所有待处理的请求。
+        """
         await call_helper(self.queue.clear)
 
     async def enqueue_request_batch(self, requests: List[aioscrapy.Request]) -> bool:
+        """
+        Add multiple requests to the queue at once.
+        一次将多个请求添加到队列中。
+
+        This method adds a batch of requests directly to the main queue
+        and updates the stats if enabled.
+        此方法将一批请求直接添加到主队列,并在启用时更新统计信息。
+
+        Args:
+            requests: A list of requests to enqueue.
+                要排队的请求列表。
+
+        Returns:
+            bool: Always returns True, indicating the requests were accepted.
+                始终返回True,表示请求被接受。
+        """
         await call_helper(self.queue.push_batch, requests)
         if self.stats:
             self.stats.inc_value(self.queue.inc_key, count=len(requests), spider=self.spider)
@@ -161,7 +411,22 @@ class Scheduler(BaseScheduler):
 
     async def enqueue_request(self, request: aioscrapy.Request) -> bool:
         """
-        如果启用了缓存队列(USE_SCHEDULER_QUEUE_CACHE),则优先将任务放到缓存队列中
+        Add a single request to the queue.
+        将单个请求添加到队列中。
+
+        If a cache queue is enabled (USE_SCHEDULER_QUEUE_CACHE), the request
+        is added to the cache queue first for faster access. Otherwise, it's
+        added directly to the main queue.
+        如果启用了缓存队列(USE_SCHEDULER_QUEUE_CACHE),则优先将请求添加到缓存队列中
+        以便更快地访问。否则,它将直接添加到主队列中。
+
+        Args:
+            request: The request to enqueue.
+                要排队的请求。
+
+        Returns:
+            bool: Always returns True, indicating the request was accepted.
+                始终返回True,表示请求被接受。
         """
         if self.cache_queue is not None:
             await call_helper(self.cache_queue.push, request)
@@ -171,11 +436,38 @@ class Scheduler(BaseScheduler):
             self.stats.inc_value(self.queue.inc_key, spider=self.spider)
         return True
 
-    async def next_request(self, count: int = 1) -> Optional[aioscrapy.Request]:
+    async def next_request(self, count: int = 1):
         """
-        如果启用了缓存队列(USE_SCHEDULER_QUEUE_CACHE),则优先从缓存队列中获取任务,然后从redis等分布式队列中获取任务
+        Get the next request(s) to be processed.
+        获取要处理的下一个请求。
+
+        This method is an async generator that yields requests from the queue.
+        If a cache queue is enabled (USE_SCHEDULER_QUEUE_CACHE), it first tries
+        to get requests from the cache queue, then falls back to the main queue.
+        此方法是一个异步生成器,从队列中产生请求。
+        如果启用了缓存队列(USE_SCHEDULER_QUEUE_CACHE),它首先尝试从缓存队列中获取请求,
+        然后回退到主队列。
+
+        Note: This implementation differs from the BaseScheduler.next_request abstract
+        method, which returns a single request or None. This implementation is an
+        async generator that can yield multiple requests, making it more efficient
+        for batch processing.
+        注意:此实现与BaseScheduler.next_request抽象方法不同,后者返回单个请求或None。
+        此实现是一个异步生成器,可以产生多个请求,使其更适合批处理。
+
+        Args:
+            count: Maximum number of requests to return.
+                要返回的最大请求数。
+                Defaults to 1.
+                默认为1。
+
+        Yields:
+            Request: The next request(s) to be processed.
+                要处理的下一个请求。
         """
         flag = False
+        # First try to get requests from the cache queue if available
+        # 如果可用,首先尝试从缓存队列获取请求
         if self.cache_queue is not None:
             async for request in self.cache_queue.pop(count):
                 if request and self.stats:
@@ -183,14 +475,36 @@ class Scheduler(BaseScheduler):
                 yield request
                 flag = True
 
+        # If we got requests from the cache queue, we're done
+        # 如果我们从缓存队列获取了请求,我们就完成了
         if flag:
             return
 
+        # Otherwise, get requests from the main queue
+        # 否则,从主队列获取请求
         async for request in self.queue.pop(count):
             if request and self.stats:
                 self.stats.inc_value(self.queue.inc_key, spider=self.spider)
             yield request
 
     async def has_pending_requests(self) -> bool:
-        return await call_helper(self.queue.len) if self.cache_queue is None \
-            else (await call_helper(self.queue.len) + await call_helper(self.cache_queue.len)) > 0
+        """
+        Check if the scheduler has pending requests.
+        检查调度器是否有待处理的请求。
+
+        This method checks both the main queue and the cache queue (if enabled)
+        to determine if there are any pending requests.
+        此方法检查主队列和缓存队列(如果启用)以确定是否有任何待处理的请求。
+
+        Returns:
+            bool: True if there are pending requests, False otherwise.
+                如果有待处理的请求,则为True,否则为False。
+        """
+        # If no cache queue, just check the main queue
+        # 如果没有缓存队列,只检查主队列
+        if self.cache_queue is None:
+            return await call_helper(self.queue.len) > 0
+        # Otherwise, check both queues
+        # 否则,检查两个队列
+        else:
+            return (await call_helper(self.queue.len) + await call_helper(self.cache_queue.len)) > 0
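Unlike the BaseScheduler contract, Scheduler.next_request is an async generator that yields up to count requests and prefers the cache queue, so a caller drains it with async for rather than awaiting a single value. A hedged sketch of an engine-like consumer loop; pump_requests, downloader_queue and the batch size are illustrative names, not part of aioscrapy:

    # Sketch only: `scheduler` is assumed to be a Scheduler built via Scheduler.from_crawler(crawler),
    # and `downloader_queue` any object with an async put() method (for example an asyncio.Queue).
    async def pump_requests(scheduler, downloader_queue, batch_size: int = 16) -> None:
        # has_pending_requests() now checks the main queue and, if configured, the cache queue.
        while await scheduler.has_pending_requests():
            got_any = False
            # next_request(count) yields up to `count` requests, trying the cache queue first.
            async for request in scheduler.next_request(batch_size):
                got_any = True
                await downloader_queue.put(request)
            if not got_any:
                break  # queues report pending items but nothing was yielded; avoid a busy loop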