aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/core/engine.py CHANGED
@@ -1,9 +1,29 @@
  # _*_ coding: utf-8 _*_
+ """
+ Execution Engine Module
+ 执行引擎模块
+
+ This module provides the core execution engine for AioScrapy, which coordinates
+ the crawling process. The engine manages the scheduling of requests, downloading
+ of pages, and processing of responses through the scraper.
+ 此模块提供了AioScrapy的核心执行引擎,它协调爬取过程。引擎管理请求的调度、
+ 页面的下载以及通过抓取器处理响应。
+
+ The main components are:
+ 主要组件包括:
+
+ 1. ExecutionEngine: Coordinates the entire crawling process
+    协调整个爬取过程
+ 2. Slot: Holds spider running state and resources
+    保存爬虫运行状态和资源
+
+ The engine is the central component that connects all other parts of the crawling
+ system: the scheduler, downloader, scraper, and spider.
+ 引擎是连接爬取系统所有其他部分的中央组件:调度器、下载器、抓取器和爬虫。
+ """

  import asyncio
  import time
- from asyncio import Queue
- from asyncio.queues import QueueEmpty
  from typing import Optional, AsyncGenerator, Union, Callable

  import aioscrapy
@@ -17,66 +37,176 @@ from aioscrapy.http import Response
  from aioscrapy.http.request import Request
  from aioscrapy.utils.log import logger
  from aioscrapy.utils.misc import load_instance
- from aioscrapy.utils.tools import call_helper, create_task
+ from aioscrapy.utils.tools import call_helper


  class Slot:
+     """
+     A slot for holding spider running state and resources.
+     用于保存爬虫运行状态和资源的槽。
+
+     This class keeps track of in-progress requests and start requests
+     for a spider.
+     此类跟踪爬虫的进行中请求和起始请求。
+     """

      def __init__(self, start_requests: Optional[AsyncGenerator]) -> None:
-         self.inprogress: set[Request] = set()  # requests in progress
+         """
+         Initialize a new Slot.
+         初始化一个新的Slot。
+
+         Args:
+             start_requests: An async generator that yields initial requests.
+                             产生初始请求的异步生成器。
+         """
+         self.inprogress: set[Request] = set()  # requests in progress 进行中的请求
          self.start_requests = start_requests
-         self.lock: bool = False
+         self.lock: bool = False  # lock for accessing start_requests 访问start_requests的锁

      def add_request(self, request: Request) -> None:
+         """
+         Add a request to the set of in-progress requests.
+         将请求添加到进行中请求的集合中。
+
+         Args:
+             request: The request to add.
+                      要添加的请求。
+         """
          self.inprogress.add(request)

      def remove_request(self, request: Request) -> None:
+         """
+         Remove a request from the set of in-progress requests.
+         从进行中请求的集合中移除请求。
+
+         Args:
+             request: The request to remove.
+                      要移除的请求。
+         """
          self.inprogress.remove(request)


  class ExecutionEngine(object):
+     """
+     The execution engine coordinates the crawling process.
+     执行引擎协调爬取过程。
+
+     It manages the scheduling of requests, downloading of pages, and processing
+     of responses through the scraper. The engine is the central component that
+     connects all other parts of the crawling system.
+     它管理请求的调度、页面的下载以及通过抓取器处理响应。引擎是连接爬取系统
+     所有其他部分的中央组件。
+
+     The engine's main responsibilities include:
+     引擎的主要职责包括:
+
+     1. Starting and stopping the crawling process
+        启动和停止爬取过程
+     2. Scheduling requests through the scheduler
+        通过调度器调度请求
+     3. Sending requests to the downloader
+        将请求发送到下载器
+     4. Passing responses to the scraper
+        将响应传递给抓取器
+     5. Handling spider idle state
+        处理爬虫空闲状态
+
+     The engine maintains a slot for each running spider, which keeps track of
+     in-progress requests and start requests.
+     引擎为每个运行的爬虫维护一个槽,该槽跟踪进行中的请求和起始请求。
+     """

      def __init__(self, crawler: "aioscrapy.Crawler") -> None:
+         """
+         Initialize the execution engine.
+         初始化执行引擎。
+
+         Args:
+             crawler: The crawler instance that this engine belongs to.
+                      此引擎所属的爬虫实例。
+         """
          self.crawler = crawler
          self.settings = crawler.settings
          self.signals = crawler.signals
          self.logformatter = crawler.logformatter

+         # Components initialized during open()
+         # 在open()期间初始化的组件
          self.slot: Optional[Slot] = None
          self.spider: Optional[Spider] = None
          self.downloader: Optional[DownloaderTV] = None
          self.scraper: Optional[Scraper] = None
          self.scheduler: Optional[BaseScheduler] = None

-         self.running: bool = False
-         self.unlock: bool = True
-         self.finish: bool = False
+         # Engine state
+         # 引擎状态
+         self.running: bool = False  # True when engine is running
+         self.unlock: bool = True  # Lock for scheduler access
+         self.finish: bool = False  # True when engine is completely finished

      async def start(
              self,
              spider: Spider,
              start_requests: Optional[AsyncGenerator] = None
      ) -> None:
-         """Start the execution engine"""
+         """
+         Start the execution engine.
+         启动执行引擎。
+
+         This method initializes the engine components, opens the spider,
+         and starts the main crawling loop.
+         此方法初始化引擎组件,打开爬虫,并启动主爬取循环。
+
+         Args:
+             spider: The spider instance to run.
+                     要运行的爬虫实例。
+             start_requests: Optional async generator of initial requests.
+                             初始请求的可选异步生成器。
+
+         Raises:
+             RuntimeError: If the engine is already running.
+                           如果引擎已经在运行。
+         """
          if self.running:
              raise RuntimeError("Engine already running")

          self.running = True
          await self.signals.send_catch_log_deferred(signal=signals.engine_started)
          await self.open(spider, start_requests)
+
+         # Main crawling loop
+         # 主爬取循环
          while not self.finish:
              self.running and await self._next_request()
              await asyncio.sleep(1)
              self.running and await self._spider_idle(self.spider)

      async def stop(self, reason: str = 'shutdown') -> None:
-         """Stop the execution engine gracefully"""
+         """
+         Stop the execution engine gracefully.
+         优雅地停止执行引擎。
+
+         This method stops the engine, waits for all pending requests to complete,
+         closes the spider, and sends the engine_stopped signal.
+         此方法停止引擎,等待所有待处理的请求完成,关闭爬虫,并发送engine_stopped信号。
+
+         Args:
+             reason: The reason for stopping the engine.
+                     停止引擎的原因。
+
+         Raises:
+             RuntimeError: If the engine is not running.
+                           如果引擎没有运行。
+         """
          if not self.running:
              raise RuntimeError("Engine not running")
          self.running = False

+         # Wait for all pending requests to complete
+         # 等待所有待处理的请求完成
          while not self.is_idle():
              await asyncio.sleep(0.2)
+
          await self.close_spider(self.spider, reason=reason)
          await self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
          self.finish = True
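
Editorial note (not part of the diff): the new start()/stop() docstrings above describe the full engine lifecycle. A minimal sketch of driving that lifecycle directly is shown below, assuming an already-configured aioscrapy Crawler and Spider are available; only ExecutionEngine(crawler), start() and stop() come from the code shown in this hunk, everything else is illustrative.

# Minimal sketch of the ExecutionEngine lifecycle documented above.
# `crawler` and `spider` are assumed to be an already-built aioscrapy
# Crawler and its Spider; constructing them is outside this diff.
from aioscrapy.core.engine import ExecutionEngine


async def run(crawler, spider, start_requests=None) -> None:
    engine = ExecutionEngine(crawler)
    # start() opens the spider, loads the scheduler/downloader/scraper from
    # settings, then loops in _next_request() until engine.finish is set.
    await engine.start(spider, start_requests)


async def shutdown(engine: ExecutionEngine) -> None:
    # stop() flips running off, waits until is_idle(), closes the spider,
    # and sends engine_stopped; start() then exits its loop via finish.
    await engine.stop(reason='shutdown')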
@@ -86,39 +216,84 @@ class ExecutionEngine(object):
              spider: Spider,
              start_requests: Optional[AsyncGenerator] = None
      ) -> None:
+         """
+         Open a spider for crawling.
+         打开爬虫进行爬取。
+
+         This method initializes all the components needed for crawling:
+         scheduler, downloader, scraper, and slot. It also sends the spider_opened signal.
+         此方法初始化爬取所需的所有组件:调度器、下载器、抓取器和槽。它还发送spider_opened信号。
+
+         Args:
+             spider: The spider instance to open.
+                     要打开的爬虫实例。
+             start_requests: Optional async generator of initial requests.
+                             初始请求的可选异步生成器。
+         """
          logger.info("Spider opened")

          self.spider = spider
          await call_helper(self.crawler.stats.open_spider, spider)

+         # Initialize components
+         # 初始化组件
          self.scheduler = await load_instance(self.settings['SCHEDULER'], crawler=self.crawler)
          self.downloader = await load_instance(self.settings['DOWNLOADER'], crawler=self.crawler)
          self.scraper = await call_helper(Scraper.from_crawler, self.crawler)

+         # Process start requests through spider middleware
+         # 通过爬虫中间件处理起始请求
          start_requests = await call_helper(self.scraper.spidermw.process_start_requests, start_requests, spider)
          self.slot = Slot(start_requests)

          await self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)

      async def close(self) -> None:
-         """Close the execution engine gracefully.
+         """
+         Close the execution engine gracefully.
+         优雅地关闭执行引擎。

          If it has already been started, stop it. In all cases, close all spiders
          and the downloader.
+         如果它已经启动,则停止它。在所有情况下,关闭所有爬虫和下载器。
+
+         This method is the main entry point for shutting down the engine from
+         outside the engine itself.
+         此方法是从引擎外部关闭引擎的主要入口点。
          """
          if self.running:
              # Will also close spiders and downloader
+             # 也会关闭爬虫和下载器
              await self.stop()
          elif self.spider:
              # Will also close downloader
+             # 也会关闭下载器
              await self.close_spider(self.spider, reason='shutdown')
          else:
+             # Just close the downloader if no spider is running
+             # 如果没有爬虫在运行,只关闭下载器
              self.downloader.close()

      async def _next_request(self) -> None:
+         """
+         Process the next request from the scheduler or start requests.
+         处理来自调度器或起始请求的下一个请求。
+
+         This method is the core of the crawling process. It handles:
+         此方法是爬取过程的核心。它处理:
+
+         1. Spider pause/resume logic
+            爬虫暂停/恢复逻辑
+         2. Getting requests from the scheduler and sending them to the downloader
+            从调度器获取请求并将其发送到下载器
+         3. Processing start requests
+            处理起始请求
+         """
          if self.slot is None or self.spider is None:
              return

+         # Handle spider pause/resume logic
+         # 处理爬虫暂停/恢复逻辑
          if self.spider.pause:
              now = int(time.time())
              last_log_time = getattr(self.spider, "last_log_time", None)
@@ -130,6 +305,8 @@ class ExecutionEngine(object):
                  self.spider.pause = False
              return

+         # Get requests from scheduler and send them to downloader
+         # 从调度器获取请求并将其发送到下载器
          while self.unlock and not self._needs_backout() and self.unlock:
              self.unlock = False
              try:
@@ -141,21 +318,54 @@ class ExecutionEngine(object):
              finally:
                  self.unlock = True

+         # Process start requests if available
+         # 如果可用,处理起始请求
          if self.slot.start_requests and not self._needs_backout() and not self.slot.lock:
              self.slot.lock = True
              try:
+                 # Get the next request from start_requests
+                 # 从start_requests获取下一个请求
                  request = await self.slot.start_requests.__anext__()
              except StopAsyncIteration:
+                 # No more start requests, set to None
+                 # 没有更多的起始请求,设置为None
                  self.slot.start_requests = None
-             except Exception as e:
+             except Exception as exc:
+                 # Log any errors and stop processing start requests
+                 # 记录任何错误并停止处理起始请求
                  self.slot.start_requests = None
-                 logger.exception('Error while obtaining start requests')
+                 logger.exception('Error while obtaining start requests: %s', str(exc))
              else:
+                 # If we got a request, schedule it for crawling
+                 # 如果我们得到了请求,安排它进行爬取
                  request and await self.crawl(request)
              finally:
+                 # Always release the lock
+                 # 始终释放锁
                  self.slot.lock = False

      def _needs_backout(self) -> bool:
+         """
+         Check if the engine should temporarily stop processing more requests.
+         检查引擎是否应该暂时停止处理更多请求。
+
+         This method determines if the request processing loop should pause by checking:
+         此方法通过检查以下条件来确定请求处理循环是否应该暂停:
+
+         1. If the engine is no longer running (self.running is False)
+            引擎是否不再运行(self.running为False)
+         2. If the downloader is at capacity or needs to pause
+            下载器是否已达到容量或需要暂停
+         3. If the scraper is at capacity or needs to pause
+            抓取器是否已达到容量或需要暂停
+
+         This is used to implement flow control in the request processing pipeline.
+         这用于在请求处理管道中实现流量控制。
+
+         Returns:
+             True if request processing should pause, False if it can continue.
+             如果请求处理应该暂停,则返回True;如果可以继续,则返回False。
+         """
          return (
              not self.running
              or self.downloader.needs_backout()
@@ -165,6 +375,32 @@ class ExecutionEngine(object):
      async def handle_downloader_output(
              self, result: Union[Request, Response, BaseException, None], request: Request
      ) -> None:
+         """
+         Handle the output from the downloader.
+         处理下载器的输出。
+
+         This method processes the result of a download, which can be:
+         此方法处理下载的结果,可以是:
+
+         - None: Download was cancelled or failed without an exception
+           None:下载被取消或失败,没有异常
+         - Request: A new request to crawl
+           Request:要爬取的新请求
+         - Response: A successful response
+           Response:成功的响应
+         - BaseException: An exception that occurred during download
+           BaseException:下载过程中发生的异常
+
+         Args:
+             result: The result of the download.
+                     下载的结果。
+             request: The original request that was downloaded.
+                      被下载的原始请求。
+
+         Raises:
+             TypeError: If the result is not None, Request, Response, or BaseException.
+                        如果结果不是None、Request、Response或BaseException。
+         """
          try:
              if result is None:
                  return
@@ -176,56 +412,157 @@ class ExecutionEngine(object):
                  )

              if isinstance(result, Request):
+                 # Schedule new request
+                 # 调度新请求
                  await self.crawl(result)
                  return

+             # Set the original request on the result
+             # 在结果上设置原始请求
              result.request = request
+
              if isinstance(result, Response):
+                 # Log successful response and send signal
+                 # 记录成功的响应并发送信号
                  logger.log(**self.logformatter.crawled(request, result, self.spider))
                  await self.signals.send_catch_log(signals.response_received,
                                                    response=result, request=request, spider=self.spider)
+
+             # Send result to scraper for processing
+             # 将结果发送到抓取器进行处理
              await self.scraper.enqueue_scrape(result, request)

          finally:
+             # Always remove the request from in-progress and process next request
+             # 始终从进行中移除请求并处理下一个请求
              self.slot.remove_request(request)
              await self._next_request()

      def is_idle(self) -> bool:
-
+         """
+         Check if the engine is idle.
+         检查引擎是否空闲。
+
+         The engine is considered idle when:
+         在以下情况下,引擎被认为是空闲的:
+
+         1. The downloader has no active requests
+            下载器没有活动的请求
+         2. There are no requests in progress
+            没有正在进行的请求
+         3. The scraper is idle
+            抓取器是空闲的
+
+         Returns:
+             True if the engine is idle, False otherwise.
+             如果引擎空闲,则为True,否则为False。
+         """
          if self.downloader.active:
              # downloader has pending requests
+             # 下载器有待处理的请求
              return False

          if self.slot.inprogress:
              # not all start requests are handled
+             # 不是所有的起始请求都已处理
              return False

          if not self.scraper.is_idle():
              # scraper is not idle
+             # 抓取器不是空闲的
              return False

          return True

      async def crawl(self, request: Request) -> None:
+         """
+         Schedule a request for crawling.
+         调度请求进行爬取。
+
+         This method adds the request to the scheduler's queue.
+         此方法将请求添加到调度器的队列中。
+
+         Args:
+             request: The request to schedule.
+                      要调度的请求。
+         """
          await self.scheduler.enqueue_request(request)
-         # create_task(self._next_request())

      async def close_spider(self, spider: Spider, reason: str = 'cancelled') -> None:
-         """Close (cancel) spider and clear all its outstanding requests"""
-
+         """
+         Close (cancel) spider and clear all its outstanding requests.
+         关闭(取消)爬虫并清除其所有未完成的请求。
+
+         This method gracefully shuts down all components related to the spider:
+         此方法优雅地关闭与爬虫相关的所有组件:
+
+         1. Downloader
+            下载器
+         2. Scraper
+            抓取器
+         3. Scheduler
+            调度器
+         4. Stats collector
+            统计收集器
+         5. Sends the spider_closed signal
+            发送spider_closed信号
+
+         Args:
+             spider: The spider to close.
+                     要关闭的爬虫。
+             reason: The reason for closing the spider.
+                     关闭爬虫的原因。
+         """
          logger.info(f"Closing spider ({reason})")

+         # Helper function to handle exceptions during close operations
+         # 处理关闭操作期间异常的辅助函数
          async def close_handler(
                  callback: Callable,
                  *args,
-                 errmsg: str = '',
+                 errmsg: str = '',  # Error message to log if an exception occurs
+                                    # 如果发生异常时记录的错误消息
                  **kwargs
          ) -> None:
+             """
+             Call a callback and log any exceptions that occur.
+             调用回调并记录发生的任何异常。
+
+             This is an internal helper function used during the spider closing process
+             to ensure that exceptions in one closing operation don't prevent other
+             closing operations from being attempted. It wraps each callback in a
+             try-except block and logs any exceptions with the provided error message.
+             这是在爬虫关闭过程中使用的内部辅助函数,用于确保一个关闭操作中的异常
+             不会阻止尝试其他关闭操作。它将每个回调包装在try-except块中,并使用
+             提供的错误消息记录任何异常。
+
+             Args:
+                 callback: The callback function to call.
+                           要调用的回调函数。
+                 *args: Positional arguments to pass to the callback.
+                        传递给回调的位置参数。
+                 errmsg: Error message prefix to log if an exception occurs.
+                         如果发生异常时记录的错误消息前缀。
+                         This will be prepended to the exception string in the log.
+                         这将在日志中添加到异常字符串之前。
+                 **kwargs: Keyword arguments to pass to the callback.
+                           传递给回调的关键字参数。
+
+             Note:
+                 This function catches all exceptions (including BaseException) to ensure
+                 that the closing process continues even if a critical error occurs.
+                 此函数捕获所有异常(包括BaseException),以确保即使发生严重错误,
+                 关闭过程也会继续。
+             """
              try:
                  await call_helper(callback, *args, **kwargs)
-             except (Exception, BaseException) as e:
-                 logger.exception(errmsg)
+             except (Exception, BaseException) as exc:
+                 # Log the error message along with the exception details
+                 # 记录错误消息以及异常详细信息
+                 logger.exception(f"{errmsg}: {str(exc)}")

+         # Close all components in sequence
+         # 按顺序关闭所有组件
          await close_handler(self.downloader.close, errmsg='Downloader close failure')

          await close_handler(self.scraper.close, errmsg='Scraper close failure')
@@ -239,17 +576,41 @@ class ExecutionEngine(object):

          logger.info(f"Spider closed ({reason})")

+         # Clean up references
+         # 清理引用
          await close_handler(setattr, self, 'slot', None, errmsg='Error while unassigning slot')

          await close_handler(setattr, self, 'spider', None, errmsg='Error while unassigning spider')

      async def _spider_idle(self, spider: Spider) -> None:
+         """
+         Handle the spider_idle signal.
+         处理spider_idle信号。
+
+         This method is called when the spider becomes idle (no more requests to process).
+         当爬虫变为空闲状态(没有更多请求要处理)时,调用此方法。
+
+         It sends the spider_idle signal, which handlers can use to add more requests.
+         它发送spider_idle信号,处理程序可以使用该信号添加更多请求。
+
+         If no handler raises DontCloseSpider and there are no pending requests,
+         the spider is stopped.
+         如果没有处理程序引发DontCloseSpider且没有待处理的请求,则停止爬虫。
+
+         Args:
+             spider: The idle spider.
+                     空闲的爬虫。
+         """
          assert self.spider is not None
+
+         # Send spider_idle signal and check if any handler wants to keep the spider open
+         # 发送spider_idle信号并检查是否有任何处理程序希望保持爬虫打开
          res = await self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
          if any(isinstance(x, DontCloseSpider) for _, x in res):
              return

          # method of 'has_pending_requests' has IO, so method of 'is_idle' execute twice
+         # 'has_pending_requests'方法有IO操作,所以'is_idle'方法执行两次
          if self.is_idle() \
                  and self.slot.start_requests is None \
                  and not await self.scheduler.has_pending_requests() \
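
Editorial note (not part of the diff): the _spider_idle() hunk above documents that any spider_idle handler can keep the crawl alive by raising DontCloseSpider. A hedged sketch of such a handler is shown below; the signals and exceptions modules are those listed in this diff, while the handler itself, its condition, and the Scrapy-style crawler.signals.connect wiring are illustrative assumptions.

# Illustrative spider_idle handler, based only on the _spider_idle() logic above:
# the engine sends signals.spider_idle and stays open if any receiver raises
# DontCloseSpider.
from aioscrapy import signals                      # aioscrapy/signals.py is in this diff
from aioscrapy.exceptions import DontCloseSpider   # aioscrapy/exceptions.py is in this diff


def keep_spider_open(spider) -> None:
    # Called when the engine emits signals.spider_idle.
    # Raising DontCloseSpider makes ExecutionEngine._spider_idle() return early,
    # so the engine keeps polling instead of stopping the spider.
    if getattr(spider, 'expects_more_work', False):  # hypothetical condition
        raise DontCloseSpider


# Typical wiring (assumed, Scrapy-style convention):
# crawler.signals.connect(keep_spider_open, signal=signals.spider_idle)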