aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/core/engine.py
CHANGED
@@ -1,9 +1,29 @@
 # _*_ coding: utf-8 _*_
+"""
+Execution Engine Module
+执行引擎模块
+
+This module provides the core execution engine for AioScrapy, which coordinates
+the crawling process. The engine manages the scheduling of requests, downloading
+of pages, and processing of responses through the scraper.
+此模块提供了AioScrapy的核心执行引擎,它协调爬取过程。引擎管理请求的调度、
+页面的下载以及通过抓取器处理响应。
+
+The main components are:
+主要组件包括:
+
+1. ExecutionEngine: Coordinates the entire crawling process
+   协调整个爬取过程
+2. Slot: Holds spider running state and resources
+   保存爬虫运行状态和资源
+
+The engine is the central component that connects all other parts of the crawling
+system: the scheduler, downloader, scraper, and spider.
+引擎是连接爬取系统所有其他部分的中央组件:调度器、下载器、抓取器和爬虫。
+"""

 import asyncio
 import time
-from asyncio import Queue
-from asyncio.queues import QueueEmpty
 from typing import Optional, AsyncGenerator, Union, Callable

 import aioscrapy
@@ -17,66 +37,176 @@ from aioscrapy.http import Response
 from aioscrapy.http.request import Request
 from aioscrapy.utils.log import logger
 from aioscrapy.utils.misc import load_instance
-from aioscrapy.utils.tools import call_helper
+from aioscrapy.utils.tools import call_helper


 class Slot:
+    """
+    A slot for holding spider running state and resources.
+    用于保存爬虫运行状态和资源的槽。
+
+    This class keeps track of in-progress requests and start requests
+    for a spider.
+    此类跟踪爬虫的进行中请求和起始请求。
+    """

     def __init__(self, start_requests: Optional[AsyncGenerator]) -> None:
-
+        """
+        Initialize a new Slot.
+        初始化一个新的Slot。
+
+        Args:
+            start_requests: An async generator that yields initial requests.
+                            产生初始请求的异步生成器。
+        """
+        self.inprogress: set[Request] = set()  # requests in progress 进行中的请求
         self.start_requests = start_requests
-        self.lock: bool = False
+        self.lock: bool = False  # lock for accessing start_requests 访问start_requests的锁

     def add_request(self, request: Request) -> None:
+        """
+        Add a request to the set of in-progress requests.
+        将请求添加到进行中请求的集合中。
+
+        Args:
+            request: The request to add.
+                     要添加的请求。
+        """
         self.inprogress.add(request)

     def remove_request(self, request: Request) -> None:
+        """
+        Remove a request from the set of in-progress requests.
+        从进行中请求的集合中移除请求。
+
+        Args:
+            request: The request to remove.
+                     要移除的请求。
+        """
         self.inprogress.remove(request)


 class ExecutionEngine(object):
+    """
+    The execution engine coordinates the crawling process.
+    执行引擎协调爬取过程。
+
+    It manages the scheduling of requests, downloading of pages, and processing
+    of responses through the scraper. The engine is the central component that
+    connects all other parts of the crawling system.
+    它管理请求的调度、页面的下载以及通过抓取器处理响应。引擎是连接爬取系统
+    所有其他部分的中央组件。
+
+    The engine's main responsibilities include:
+    引擎的主要职责包括:
+
+    1. Starting and stopping the crawling process
+       启动和停止爬取过程
+    2. Scheduling requests through the scheduler
+       通过调度器调度请求
+    3. Sending requests to the downloader
+       将请求发送到下载器
+    4. Passing responses to the scraper
+       将响应传递给抓取器
+    5. Handling spider idle state
+       处理爬虫空闲状态
+
+    The engine maintains a slot for each running spider, which keeps track of
+    in-progress requests and start requests.
+    引擎为每个运行的爬虫维护一个槽,该槽跟踪进行中的请求和起始请求。
+    """

     def __init__(self, crawler: "aioscrapy.Crawler") -> None:
+        """
+        Initialize the execution engine.
+        初始化执行引擎。
+
+        Args:
+            crawler: The crawler instance that this engine belongs to.
+                     此引擎所属的爬虫实例。
+        """
         self.crawler = crawler
         self.settings = crawler.settings
         self.signals = crawler.signals
         self.logformatter = crawler.logformatter

+        # Components initialized during open()
+        # 在open()期间初始化的组件
         self.slot: Optional[Slot] = None
         self.spider: Optional[Spider] = None
         self.downloader: Optional[DownloaderTV] = None
         self.scraper: Optional[Scraper] = None
         self.scheduler: Optional[BaseScheduler] = None

-
-
-        self.
+        # Engine state
+        # 引擎状态
+        self.running: bool = False  # True when engine is running
+        self.unlock: bool = True  # Lock for scheduler access
+        self.finish: bool = False  # True when engine is completely finished

     async def start(
             self,
             spider: Spider,
             start_requests: Optional[AsyncGenerator] = None
     ) -> None:
-        """
+        """
+        Start the execution engine.
+        启动执行引擎。
+
+        This method initializes the engine components, opens the spider,
+        and starts the main crawling loop.
+        此方法初始化引擎组件,打开爬虫,并启动主爬取循环。
+
+        Args:
+            spider: The spider instance to run.
+                    要运行的爬虫实例。
+            start_requests: Optional async generator of initial requests.
+                            初始请求的可选异步生成器。
+
+        Raises:
+            RuntimeError: If the engine is already running.
+                          如果引擎已经在运行。
+        """
         if self.running:
             raise RuntimeError("Engine already running")

         self.running = True
         await self.signals.send_catch_log_deferred(signal=signals.engine_started)
         await self.open(spider, start_requests)
+
+        # Main crawling loop
+        # 主爬取循环
         while not self.finish:
             self.running and await self._next_request()
             await asyncio.sleep(1)
             self.running and await self._spider_idle(self.spider)

     async def stop(self, reason: str = 'shutdown') -> None:
-        """
+        """
+        Stop the execution engine gracefully.
+        优雅地停止执行引擎。
+
+        This method stops the engine, waits for all pending requests to complete,
+        closes the spider, and sends the engine_stopped signal.
+        此方法停止引擎,等待所有待处理的请求完成,关闭爬虫,并发送engine_stopped信号。
+
+        Args:
+            reason: The reason for stopping the engine.
+                    停止引擎的原因。
+
+        Raises:
+            RuntimeError: If the engine is not running.
+                          如果引擎没有运行。
+        """
         if not self.running:
             raise RuntimeError("Engine not running")
         self.running = False

+        # Wait for all pending requests to complete
+        # 等待所有待处理的请求完成
         while not self.is_idle():
             await asyncio.sleep(0.2)
+
         await self.close_spider(self.spider, reason=reason)
         await self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
         self.finish = True
@@ -86,39 +216,84 @@ class ExecutionEngine(object):
             spider: Spider,
             start_requests: Optional[AsyncGenerator] = None
     ) -> None:
+        """
+        Open a spider for crawling.
+        打开爬虫进行爬取。
+
+        This method initializes all the components needed for crawling:
+        scheduler, downloader, scraper, and slot. It also sends the spider_opened signal.
+        此方法初始化爬取所需的所有组件:调度器、下载器、抓取器和槽。它还发送spider_opened信号。
+
+        Args:
+            spider: The spider instance to open.
+                    要打开的爬虫实例。
+            start_requests: Optional async generator of initial requests.
+                            初始请求的可选异步生成器。
+        """
         logger.info("Spider opened")

         self.spider = spider
         await call_helper(self.crawler.stats.open_spider, spider)

+        # Initialize components
+        # 初始化组件
         self.scheduler = await load_instance(self.settings['SCHEDULER'], crawler=self.crawler)
         self.downloader = await load_instance(self.settings['DOWNLOADER'], crawler=self.crawler)
         self.scraper = await call_helper(Scraper.from_crawler, self.crawler)

+        # Process start requests through spider middleware
+        # 通过爬虫中间件处理起始请求
         start_requests = await call_helper(self.scraper.spidermw.process_start_requests, start_requests, spider)
         self.slot = Slot(start_requests)

         await self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)

     async def close(self) -> None:
-        """
+        """
+        Close the execution engine gracefully.
+        优雅地关闭执行引擎。

         If it has already been started, stop it. In all cases, close all spiders
         and the downloader.
+        如果它已经启动,则停止它。在所有情况下,关闭所有爬虫和下载器。
+
+        This method is the main entry point for shutting down the engine from
+        outside the engine itself.
+        此方法是从引擎外部关闭引擎的主要入口点。
         """
         if self.running:
             # Will also close spiders and downloader
+            # 也会关闭爬虫和下载器
             await self.stop()
         elif self.spider:
             # Will also close downloader
+            # 也会关闭下载器
             await self.close_spider(self.spider, reason='shutdown')
         else:
+            # Just close the downloader if no spider is running
+            # 如果没有爬虫在运行,只关闭下载器
             self.downloader.close()

     async def _next_request(self) -> None:
+        """
+        Process the next request from the scheduler or start requests.
+        处理来自调度器或起始请求的下一个请求。
+
+        This method is the core of the crawling process. It handles:
+        此方法是爬取过程的核心。它处理:
+
+        1. Spider pause/resume logic
+           爬虫暂停/恢复逻辑
+        2. Getting requests from the scheduler and sending them to the downloader
+           从调度器获取请求并将其发送到下载器
+        3. Processing start requests
+           处理起始请求
+        """
         if self.slot is None or self.spider is None:
             return

+        # Handle spider pause/resume logic
+        # 处理爬虫暂停/恢复逻辑
         if self.spider.pause:
             now = int(time.time())
             last_log_time = getattr(self.spider, "last_log_time", None)
@@ -130,6 +305,8 @@ class ExecutionEngine(object):
                 self.spider.pause = False
             return

+        # Get requests from scheduler and send them to downloader
+        # 从调度器获取请求并将其发送到下载器
         while self.unlock and not self._needs_backout() and self.unlock:
             self.unlock = False
             try:
@@ -141,21 +318,54 @@ class ExecutionEngine(object):
             finally:
                 self.unlock = True

+        # Process start requests if available
+        # 如果可用,处理起始请求
         if self.slot.start_requests and not self._needs_backout() and not self.slot.lock:
             self.slot.lock = True
             try:
+                # Get the next request from start_requests
+                # 从start_requests获取下一个请求
                 request = await self.slot.start_requests.__anext__()
             except StopAsyncIteration:
+                # No more start requests, set to None
+                # 没有更多的起始请求,设置为None
                 self.slot.start_requests = None
-            except Exception as
+            except Exception as exc:
+                # Log any errors and stop processing start requests
+                # 记录任何错误并停止处理起始请求
                 self.slot.start_requests = None
-                logger.exception('Error while obtaining start requests')
+                logger.exception('Error while obtaining start requests: %s', str(exc))
             else:
+                # If we got a request, schedule it for crawling
+                # 如果我们得到了请求,安排它进行爬取
                 request and await self.crawl(request)
             finally:
+                # Always release the lock
+                # 始终释放锁
                 self.slot.lock = False

     def _needs_backout(self) -> bool:
+        """
+        Check if the engine should temporarily stop processing more requests.
+        检查引擎是否应该暂时停止处理更多请求。
+
+        This method determines if the request processing loop should pause by checking:
+        此方法通过检查以下条件来确定请求处理循环是否应该暂停:
+
+        1. If the engine is no longer running (self.running is False)
+           引擎是否不再运行(self.running为False)
+        2. If the downloader is at capacity or needs to pause
+           下载器是否已达到容量或需要暂停
+        3. If the scraper is at capacity or needs to pause
+           抓取器是否已达到容量或需要暂停
+
+        This is used to implement flow control in the request processing pipeline.
+        这用于在请求处理管道中实现流量控制。
+
+        Returns:
+            True if request processing should pause, False if it can continue.
+            如果请求处理应该暂停,则返回True;如果可以继续,则返回False。
+        """
         return (
             not self.running
             or self.downloader.needs_backout()
@@ -165,6 +375,32 @@ class ExecutionEngine(object):
     async def handle_downloader_output(
             self, result: Union[Request, Response, BaseException, None], request: Request
     ) -> None:
+        """
+        Handle the output from the downloader.
+        处理下载器的输出。
+
+        This method processes the result of a download, which can be:
+        此方法处理下载的结果,可以是:
+
+        - None: Download was cancelled or failed without an exception
+          None:下载被取消或失败,没有异常
+        - Request: A new request to crawl
+          Request:要爬取的新请求
+        - Response: A successful response
+          Response:成功的响应
+        - BaseException: An exception that occurred during download
+          BaseException:下载过程中发生的异常
+
+        Args:
+            result: The result of the download.
+                    下载的结果。
+            request: The original request that was downloaded.
+                     被下载的原始请求。
+
+        Raises:
+            TypeError: If the result is not None, Request, Response, or BaseException.
+                       如果结果不是None、Request、Response或BaseException。
+        """
         try:
             if result is None:
                 return
@@ -176,56 +412,157 @@ class ExecutionEngine(object):
             )

             if isinstance(result, Request):
+                # Schedule new request
+                # 调度新请求
                 await self.crawl(result)
                 return

+            # Set the original request on the result
+            # 在结果上设置原始请求
             result.request = request
+
             if isinstance(result, Response):
+                # Log successful response and send signal
+                # 记录成功的响应并发送信号
                 logger.log(**self.logformatter.crawled(request, result, self.spider))
                 await self.signals.send_catch_log(signals.response_received,
                                                   response=result, request=request, spider=self.spider)
+
+            # Send result to scraper for processing
+            # 将结果发送到抓取器进行处理
             await self.scraper.enqueue_scrape(result, request)

         finally:
+            # Always remove the request from in-progress and process next request
+            # 始终从进行中移除请求并处理下一个请求
             self.slot.remove_request(request)
             await self._next_request()

     def is_idle(self) -> bool:
-
+        """
+        Check if the engine is idle.
+        检查引擎是否空闲。
+
+        The engine is considered idle when:
+        在以下情况下,引擎被认为是空闲的:
+
+        1. The downloader has no active requests
+           下载器没有活动的请求
+        2. There are no requests in progress
+           没有正在进行的请求
+        3. The scraper is idle
+           抓取器是空闲的
+
+        Returns:
+            True if the engine is idle, False otherwise.
+            如果引擎空闲,则为True,否则为False。
+        """
         if self.downloader.active:
             # downloader has pending requests
+            # 下载器有待处理的请求
             return False

         if self.slot.inprogress:
             # not all start requests are handled
+            # 不是所有的起始请求都已处理
             return False

         if not self.scraper.is_idle():
             # scraper is not idle
+            # 抓取器不是空闲的
             return False

         return True

     async def crawl(self, request: Request) -> None:
+        """
+        Schedule a request for crawling.
+        调度请求进行爬取。
+
+        This method adds the request to the scheduler's queue.
+        此方法将请求添加到调度器的队列中。
+
+        Args:
+            request: The request to schedule.
+                     要调度的请求。
+        """
         await self.scheduler.enqueue_request(request)
-        # create_task(self._next_request())

     async def close_spider(self, spider: Spider, reason: str = 'cancelled') -> None:
-        """
-
+        """
+        Close (cancel) spider and clear all its outstanding requests.
+        关闭(取消)爬虫并清除其所有未完成的请求。
+
+        This method gracefully shuts down all components related to the spider:
+        此方法优雅地关闭与爬虫相关的所有组件:
+
+        1. Downloader
+           下载器
+        2. Scraper
+           抓取器
+        3. Scheduler
+           调度器
+        4. Stats collector
+           统计收集器
+        5. Sends the spider_closed signal
+           发送spider_closed信号
+
+        Args:
+            spider: The spider to close.
+                    要关闭的爬虫。
+            reason: The reason for closing the spider.
+                    关闭爬虫的原因。
+        """
         logger.info(f"Closing spider ({reason})")

+        # Helper function to handle exceptions during close operations
+        # 处理关闭操作期间异常的辅助函数
         async def close_handler(
                 callback: Callable,
                 *args,
-                errmsg: str = '',
+                errmsg: str = '',  # Error message to log if an exception occurs
+                # 如果发生异常时记录的错误消息
                 **kwargs
         ) -> None:
+            """
+            Call a callback and log any exceptions that occur.
+            调用回调并记录发生的任何异常。
+
+            This is an internal helper function used during the spider closing process
+            to ensure that exceptions in one closing operation don't prevent other
+            closing operations from being attempted. It wraps each callback in a
+            try-except block and logs any exceptions with the provided error message.
+            这是在爬虫关闭过程中使用的内部辅助函数,用于确保一个关闭操作中的异常
+            不会阻止尝试其他关闭操作。它将每个回调包装在try-except块中,并使用
+            提供的错误消息记录任何异常。
+
+            Args:
+                callback: The callback function to call.
+                          要调用的回调函数。
+                *args: Positional arguments to pass to the callback.
+                       传递给回调的位置参数。
+                errmsg: Error message prefix to log if an exception occurs.
+                        如果发生异常时记录的错误消息前缀。
+                        This will be prepended to the exception string in the log.
+                        这将在日志中添加到异常字符串之前。
+                **kwargs: Keyword arguments to pass to the callback.
+                          传递给回调的关键字参数。
+
+            Note:
+                This function catches all exceptions (including BaseException) to ensure
+                that the closing process continues even if a critical error occurs.
+                此函数捕获所有异常(包括BaseException),以确保即使发生严重错误,
+                关闭过程也会继续。
+            """
             try:
                 await call_helper(callback, *args, **kwargs)
-            except (Exception, BaseException) as
-
+            except (Exception, BaseException) as exc:
+                # Log the error message along with the exception details
+                # 记录错误消息以及异常详细信息
+                logger.exception(f"{errmsg}: {str(exc)}")

+        # Close all components in sequence
+        # 按顺序关闭所有组件
         await close_handler(self.downloader.close, errmsg='Downloader close failure')

         await close_handler(self.scraper.close, errmsg='Scraper close failure')
@@ -239,17 +576,41 @@ class ExecutionEngine(object):

         logger.info(f"Spider closed ({reason})")

+        # Clean up references
+        # 清理引用
         await close_handler(setattr, self, 'slot', None, errmsg='Error while unassigning slot')

         await close_handler(setattr, self, 'spider', None, errmsg='Error while unassigning spider')

     async def _spider_idle(self, spider: Spider) -> None:
+        """
+        Handle the spider_idle signal.
+        处理spider_idle信号。
+
+        This method is called when the spider becomes idle (no more requests to process).
+        当爬虫变为空闲状态(没有更多请求要处理)时,调用此方法。
+
+        It sends the spider_idle signal, which handlers can use to add more requests.
+        它发送spider_idle信号,处理程序可以使用该信号添加更多请求。
+
+        If no handler raises DontCloseSpider and there are no pending requests,
+        the spider is stopped.
+        如果没有处理程序引发DontCloseSpider且没有待处理的请求,则停止爬虫。
+
+        Args:
+            spider: The idle spider.
+                    空闲的爬虫。
+        """
         assert self.spider is not None
+
+        # Send spider_idle signal and check if any handler wants to keep the spider open
+        # 发送spider_idle信号并检查是否有任何处理程序希望保持爬虫打开
         res = await self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
         if any(isinstance(x, DontCloseSpider) for _, x in res):
             return

         # method of 'has_pending_requests' has IO, so method of 'is_idle' execute twice
+        # 'has_pending_requests'方法有IO操作,所以'is_idle'方法执行两次
         if self.is_idle() \
                 and self.slot.start_requests is None \
                 and not await self.scheduler.has_pending_requests() \
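As context for the reworked _spider_idle shown above: the engine only stops a spider when no spider_idle handler raises DontCloseSpider and the scheduler reports no pending requests. The sketch below shows how an extension could hook into that behaviour. It is illustrative only: the import paths and the crawler.signals.connect() call are assumptions inferred from the modules listed in this diff (aioscrapy/signals.py, aioscrapy/exceptions.py, aioscrapy/signalmanager.py) and from the Scrapy-style API the code mirrors, not APIs confirmed by this diff.

# Minimal sketch of a keep-alive extension, assuming a Scrapy-like signal API.
from aioscrapy import signals                     # assumed import path
from aioscrapy.exceptions import DontCloseSpider  # assumed import path


class KeepSpiderAlive:
    """Raise DontCloseSpider on spider_idle so the engine keeps polling."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # Assumed SignalManager.connect() signature, mirroring Scrapy.
        crawler.signals.connect(ext.on_idle, signal=signals.spider_idle)
        return ext

    def on_idle(self, spider):
        # _spider_idle() collects the results of send_catch_log(); any handler
        # that raises DontCloseSpider keeps the spider open for another loop.
        raise DontCloseSpider

With such a handler registered (for example through an EXTENSIONS-style setting, also an assumption here), the "while not self.finish" loop in start() keeps calling _next_request() instead of shutting the spider down.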
|