aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff reflects the changes between publicly available package versions as released to a supported registry; it is provided for informational purposes only.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/crawler.py
CHANGED
@@ -1,3 +1,27 @@
+"""
+Crawler Module
+爬虫模块
+
+This module provides the core classes for managing the crawling process in AioScrapy.
+It defines the Crawler, CrawlerRunner, and CrawlerProcess classes that coordinate
+the execution of spiders and manage their lifecycle.
+此模块提供了AioScrapy中管理爬取过程的核心类。它定义了协调爬虫执行和管理其生命周期的
+Crawler、CrawlerRunner和CrawlerProcess类。
+
+The main classes are:
+主要类包括:
+
+1. Crawler: The main class that coordinates a single crawling process
+   协调单个爬取过程的主要类
+2. CrawlerRunner: Manages multiple crawlers
+   管理多个爬虫
+3. CrawlerProcess: Extends CrawlerRunner to run crawlers in the current process
+   扩展CrawlerRunner以在当前进程中运行爬虫
+
+These classes handle the initialization and shutdown of all components needed for
+crawling, such as the engine, extensions, signals, and database connections.
+这些类处理爬取所需的所有组件的初始化和关闭,例如引擎、扩展、信号和数据库连接。
+"""
 import asyncio
 import pprint
 import signal
@@ -32,9 +56,31 @@ from aioscrapy.statscollectors import StatsCollector


 class Crawler:
+    """
+    The Crawler is the main class that coordinates the crawling process.
+    Crawler是协调爬取过程的主要类。

-
+    It holds references to the main components of the crawling process and manages
+    their initialization and shutdown.
+    它持有爬取过程中主要组件的引用,并管理它们的初始化和关闭。
+    """

+    def __init__(self, spidercls: Type[Spider], settings: Union[Settings, dict, None] = None) -> None:
+        """
+        Initialize a new Crawler.
+        初始化一个新的Crawler。
+
+        Args:
+            spidercls: The Spider class to use for this crawler.
+                此爬虫使用的Spider类。
+            settings: The settings to use for this crawler. Can be a Settings object,
+                a dictionary, or None (in which case default settings are used).
+                此爬虫使用的设置。可以是Settings对象、字典或None(在这种情况下使用默认设置)。
+
+        Raises:
+            ValueError: If spidercls is a Spider instance instead of a Spider class.
+                如果spidercls是Spider实例而不是Spider类。
+        """
         if isinstance(spidercls, Spider):
             raise ValueError('The spidercls argument must be a class, not an object')

@@ -55,6 +101,25 @@ class Crawler:
         self.logformatter: Optional[LogFormatter] = None

     async def crawl(self, *args, **kwargs) -> None:
+        """
+        Start the crawling process.
+        开始爬取过程。
+
+        This method initializes all the components needed for crawling and starts the engine.
+        此方法初始化爬取所需的所有组件并启动引擎。
+
+        Args:
+            *args: Arguments to pass to the spider's constructor.
+                传递给爬虫构造函数的参数。
+            **kwargs: Keyword arguments to pass to the spider's constructor.
+                传递给爬虫构造函数的关键字参数。
+
+        Raises:
+            RuntimeError: If crawling is already taking place.
+                如果爬取已经在进行中。
+            Exception: Any exception that occurs during the crawling process.
+                爬取过程中发生的任何异常。
+        """
         try:
             configure_logging(self.spidercls, self.settings)

@@ -72,7 +137,7 @@ class Crawler:
             self.logformatter = await load_instance(self.settings['LOG_FORMATTER'], crawler=self)
             self.extensions = await ExtensionManager.from_crawler(self)
             self.engine = ExecutionEngine(self)
-            # 创建所有数据库链接
+            # 创建所有数据库链接 (Create all database connections)
             await db_manager.from_crawler(self)
             start_requests = await async_generator_wrapper(self.spider.start_requests())
             await self.engine.start(self.spider, start_requests)
@@ -84,8 +149,18 @@ class Crawler:
             raise e

     async def stop(self, signum=None) -> None:
-        """
-
+        """
+        Starts a graceful stop of the crawler.
+        开始优雅地停止爬虫。
+
+        This method is called when the crawler needs to be stopped, either by user
+        request or by a signal (e.g., SIGINT).
+        当爬虫需要停止时调用此方法,可能是由用户请求或信号(例如SIGINT)触发。
+
+        Args:
+            signum: The signal number that triggered the stop, if any.
+                触发停止的信号编号(如果有)。
+        """
         if signum is not None:
             asyncio.current_task().set_name(self.spidercls.name)
             logger.info(
@@ -98,19 +173,61 @@ class Crawler:
             await self.engine.stop()

     def _signal_shutdown(self, signum: Any, _) -> None:
+        """
+        Signal handler for shutdown signals.
+        关闭信号的信号处理程序。
+
+        This method is called when a shutdown signal (e.g., SIGINT) is received.
+        当接收到关闭信号(例如SIGINT)时调用此方法。
+
+        Args:
+            signum: The signal number.
+                信号编号。
+            _: The frame object (not used).
+                帧对象(未使用)。
+        """
         asyncio.create_task(self.stop(signum))


 class CrawlerRunner:
+    """
+    Class that manages multiple crawlers.
+    管理多个爬虫的类。
+
+    This class keeps track of all the crawlers started by it and provides
+    methods to start and stop them.
+    此类跟踪由它启动的所有爬虫,并提供启动和停止它们的方法。
+    """
+
     crawlers = property(
         lambda self: self._crawlers,
-        doc="Set of
-        "
+        doc="Set of crawlers started by crawl and managed by this class."
+            "由crawl方法启动并由此类管理的爬虫集合。"
     )

     @staticmethod
     def _get_spider_loader(settings: Settings) -> ISpiderLoader:
-        """
+        """
+        Get SpiderLoader instance from settings.
+        从设置中获取SpiderLoader实例。
+
+        This method loads the spider loader class specified in the settings and
+        creates an instance of it.
+        此方法加载设置中指定的爬虫加载器类并创建其实例。
+
+        Args:
+            settings: The settings object.
+                设置对象。
+
+        Returns:
+            An instance of the spider loader.
+            爬虫加载器的实例。
+
+        Warns:
+            AioScrapyDeprecationWarning: If the spider loader class does not fully
+                implement the ISpiderLoader interface.
+                如果爬虫加载器类未完全实现ISpiderLoader接口。
+        """
         cls_path = settings.get('SPIDER_LOADER_CLASS')
         loader_cls = load_object(cls_path)
         excs = (DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
@@ -126,16 +243,43 @@ class CrawlerRunner:
         return loader_cls.from_settings(settings.frozencopy())

     def __init__(self, settings: Union[Settings, dict, None] = None) -> None:
+        """
+        Initialize a new CrawlerRunner.
+        初始化一个新的CrawlerRunner。
+
+        Args:
+            settings: The settings to use for this crawler runner. Can be a Settings object,
+                a dictionary, or None (in which case default settings are used).
+                此爬虫运行器使用的设置。可以是Settings对象、字典或None(在这种情况下使用默认设置)。
+        """
         if isinstance(settings, dict) or settings is None:
             settings = Settings(settings)
         self.settings = settings
         self.spider_loader = self._get_spider_loader(settings)
-        self._crawlers = {}
-
-        self.
+        self._crawlers = {}  # Dictionary of crawlers and their args/kwargs
+        # 爬虫及其参数/关键字参数的字典
+        self._active = set()  # Set of active crawling tasks
+        # 活动爬取任务的集合
+        self.bootstrap_failed = False  # Flag indicating if bootstrap failed
+        # 指示引导是否失败的标志

     @property
     def spiders(self):
+        """
+        Deprecated property that returns the spider_loader.
+        已弃用的属性,返回spider_loader。
+
+        This property is deprecated and will be removed in a future version.
+        此属性已弃用,将在未来版本中删除。
+
+        Returns:
+            The spider loader instance.
+            爬虫加载器实例。
+
+        Warns:
+            AioScrapyDeprecationWarning: Always warns about deprecation.
+                始终警告关于弃用。
+        """
         warnings.warn("CrawlerRunner.spiders attribute is renamed to "
                       "CrawlerRunner.spider_loader.",
                       category=AioScrapyDeprecationWarning, stacklevel=2)
@@ -148,15 +292,65 @@ class CrawlerRunner:
             settings: Union[Settings, dict, None] = None,
             **kwargs
     ) -> None:
+        """
+        Schedule a crawler to run as soon as possible.
+        安排爬虫尽快运行。
+
+        This method creates a crawler (if needed) and schedules it to run.
+        此方法创建爬虫(如果需要)并安排其运行。
+
+        Args:
+            crawler_or_spidercls: A Crawler instance or a Spider class.
+                Crawler实例或Spider类。
+            *args: Arguments to pass to the spider's constructor.
+                传递给爬虫构造函数的参数。
+            settings: The settings to use for this crawler. Can be a Settings object,
+                a dictionary, or None (in which case default settings are used).
+                此爬虫使用的设置。可以是Settings对象、字典或None(在这种情况下使用默认设置)。
+            **kwargs: Keyword arguments to pass to the spider's constructor.
+                传递给爬虫构造函数的关键字参数。
+        """
         crawler = self.crawl(crawler_or_spidercls, settings=settings)
         self.crawlers.setdefault(crawler, (args, kwargs))
         self.active_crawler(crawler, *args, **kwargs)

     def active_crawler(self, crawler: Crawler, *args, **kwargs) -> None:
+        """
+        Activate a crawler by creating a task for it.
+        通过为爬虫创建任务来激活它。
+
+        This method creates an asyncio task for the crawler and adds it to the
+        set of active tasks.
+        此方法为爬虫创建一个asyncio任务,并将其添加到活动任务集中。
+
+        Args:
+            crawler: The crawler to activate.
+                要激活的爬虫。
+            *args: Arguments to pass to the crawler's crawl method.
+                传递给爬虫crawl方法的参数。
+            **kwargs: Keyword arguments to pass to the crawler's crawl method.
+                传递给爬虫crawl方法的关键字参数。
+        """
         task = asyncio.create_task(crawler.crawl(*args, **kwargs), name=crawler.spidercls.name)
         self._active.add(task)

         def _done(result):
+            """
+            Callback for when the task is done.
+            任务完成时的回调。
+
+            This function is called when the task completes, either successfully
+            or with an exception.
+            当任务完成时调用此函数,无论是成功还是出现异常。
+
+            Args:
+                result: The task result.
+                    任务结果。
+
+            Returns:
+                The task result.
+                任务结果。
+            """
             self.crawlers.pop(crawler, None)
             self._active.discard(task)
             self.bootstrap_failed |= not getattr(crawler, 'spider', None)
@@ -171,6 +365,33 @@ class CrawlerRunner:
             settings: Union[Settings, dict, None] = None,
             **kwargs
     ) -> Crawler:
+        """
+        Create a crawler and add it to the crawlers dict.
+        创建爬虫并将其添加到爬虫字典中。
+
+        This method creates a crawler (if needed) and adds it to the crawlers dict,
+        but does not start it.
+        此方法创建爬虫(如果需要)并将其添加到爬虫字典中,但不启动它。
+
+        Args:
+            crawler_or_spidercls: A Crawler instance or a Spider class.
+                Crawler实例或Spider类。
+            *args: Arguments to pass to the spider's constructor.
+                传递给爬虫构造函数的参数。
+            settings: The settings to use for this crawler. Can be a Settings object,
+                a dictionary, or None (in which case default settings are used).
+                此爬虫使用的设置。可以是Settings对象、字典或None(在这种情况下使用默认设置)。
+            **kwargs: Keyword arguments to pass to the spider's constructor.
+                传递给爬虫构造函数的关键字参数。
+
+        Returns:
+            The crawler instance.
+            爬虫实例。
+
+        Raises:
+            ValueError: If crawler_or_spidercls is a Spider instance instead of a Spider class.
+                如果crawler_or_spidercls是Spider实例而不是Spider类。
+        """
         if isinstance(crawler_or_spidercls, Spider):
             raise ValueError(
                 'The crawler_or_spidercls argument cannot be a spider object, '
@@ -184,12 +405,43 @@ class CrawlerRunner:
             crawler_or_spidercls: Union[Type[Spider], Crawler, str],
             settings: Union[Settings, dict, None]
     ) -> Crawler:
+        """
+        Create a crawler instance.
+        创建爬虫实例。
+
+        This method creates a crawler instance from a spider class, a crawler instance,
+        or a spider name. If a crawler instance is provided, it is returned as is.
+        此方法从爬虫类、爬虫实例或爬虫名称创建爬虫实例。如果提供了爬虫实例,则按原样返回。
+
+        Args:
+            crawler_or_spidercls: A Crawler instance, a Spider class, or a spider name.
+                Crawler实例、Spider类或爬虫名称。
+            settings: The settings to use for this crawler. Can be a Settings object,
+                a dictionary, or None.
+                此爬虫使用的设置。可以是Settings对象、字典或None。
+
+        Returns:
+            Crawler: The crawler instance.
+                爬虫实例。
+
+        Raises:
+            ValueError: If crawler_or_spidercls is a Spider instance instead of a Spider class.
+                如果crawler_or_spidercls是Spider实例而不是Spider类。
+        """
+        # Check if crawler_or_spidercls is a Spider instance (not allowed)
+        # 检查crawler_or_spidercls是否为Spider实例(不允许)
         if isinstance(crawler_or_spidercls, Spider):
             raise ValueError(
                 'The crawler_or_spidercls argument cannot be a spider object, '
                 'it must be a spider class (or a Crawler object)')
+
+        # If crawler_or_spidercls is already a Crawler, return it
+        # 如果crawler_or_spidercls已经是Crawler,则返回它
         if isinstance(crawler_or_spidercls, Crawler):
             return crawler_or_spidercls
+
+        # Otherwise, create a new crawler
+        # 否则,创建一个新的爬虫
         return self._create_crawler(crawler_or_spidercls, settings)

     def _create_crawler(
@@ -197,35 +449,136 @@ class CrawlerRunner:
             spidercls: Union[Type[Spider], str],
             settings: Union[Settings, dict, None]
     ) -> Crawler:
+        """
+        Internal method to create a crawler instance.
+        创建爬虫实例的内部方法。
+
+        This method creates a crawler instance from a spider class or a spider name.
+        If a spider name is provided, it is loaded using the spider loader.
+        此方法从爬虫类或爬虫名称创建爬虫实例。如果提供了爬虫名称,则使用爬虫加载器加载它。
+
+        Args:
+            spidercls: A Spider class or a spider name.
+                Spider类或爬虫名称。
+            settings: The settings to use for this crawler. Can be a Settings object,
+                a dictionary, or None.
+                此爬虫使用的设置。可以是Settings对象、字典或None。
+
+        Returns:
+            Crawler: The crawler instance.
+                爬虫实例。
+        """
+        # If spidercls is a string (spider name), load the spider class
+        # 如果spidercls是字符串(爬虫名称),则加载爬虫类
         if isinstance(spidercls, str):
             spidercls = self.spider_loader.load(spidercls)
+
+        # Create and return a new crawler instance
+        # 创建并返回一个新的爬虫实例
         return Crawler(spidercls, settings=settings)

     async def stop(self, signum=None) -> None:
+        """
+        Stop all crawlers managed by this runner.
+        停止此运行器管理的所有爬虫。
+
+        This method calls the stop method of all crawlers managed by this runner.
+        It waits for all crawlers to stop before returning.
+        此方法调用此运行器管理的所有爬虫的stop方法。它在返回之前等待所有爬虫停止。
+
+        Args:
+            signum: The signal number that triggered the stop, if any.
+                触发停止的信号编号(如果有)。
+                This is passed to each crawler's stop method.
+                这将传递给每个爬虫的stop方法。
+        """
+        # Stop all crawlers concurrently and wait for them to finish
+        # 并发停止所有爬虫并等待它们完成
         await asyncio.gather(*[c.stop(signum) for c in self.crawlers])


 class CrawlerProcess(CrawlerRunner):
+    """
+    A class to run multiple crawlers in a process.
+    在一个进程中运行多个爬虫的类。
+
+    This class extends CrawlerRunner by adding support for running the crawlers
+    in the current process and handling shutdown signals.
+    此类通过添加对在当前进程中运行爬虫和处理关闭信号的支持来扩展CrawlerRunner。
+    """

     def __init__(
             self,
             settings: Union[Settings, dict, None] = None,
             install_root_handler: bool = True
     ) -> None:
+        """
+        Initialize a new CrawlerProcess.
+        初始化一个新的CrawlerProcess。
+
+        Args:
+            settings: The settings to use for this crawler process. Can be a Settings object,
+                a dictionary, or None (in which case default settings are used).
+                此爬虫进程使用的设置。可以是Settings对象、字典或None(在这种情况下使用默认设置)。
+            install_root_handler: Whether to install the root handler for logging.
+                是否安装日志记录的根处理程序。
+        """
         super().__init__(settings)
         install_shutdown_handlers(self._signal_shutdown)

     def _signal_shutdown(self, signum: Any, _) -> None:
+        """
+        Signal handler for the first shutdown signal.
+        第一个关闭信号的信号处理程序。
+
+        This method is called when the first shutdown signal (e.g., SIGINT) is received.
+        当接收到第一个关闭信号(例如SIGINT)时调用此方法。
+
+        It installs a new signal handler for the second signal and starts the shutdown process.
+        它为第二个信号安装新的信号处理程序,并开始关闭过程。
+
+        Args:
+            signum: The signal number.
+                信号编号。
+            _: The frame object (not used).
+                帧对象(未使用)。
+        """
         install_shutdown_handlers(self._signal_kill)
         asyncio.create_task(self.stop(signum))

     def _signal_kill(self, signum: Any, _) -> None:
+        """
+        Signal handler for the second shutdown signal.
+        第二个关闭信号的信号处理程序。
+
+        This method is called when the second shutdown signal (e.g., SIGINT) is received.
+        当接收到第二个关闭信号(例如SIGINT)时调用此方法。
+
+        It forces an unclean shutdown of the process.
+        它强制进程进行不干净的关闭。
+
+        Args:
+            signum: The signal number.
+                信号编号。
+            _: The frame object (not used).
+                帧对象(未使用)。
+        """
         install_shutdown_handlers(signal.SIG_IGN)
         signame = signal_names[signum]
         logger.info('Received %(signame)s twice, forcing unclean shutdown' % {'signame': signame})
         asyncio.create_task(self._stop_reactor())

     async def run(self) -> None:
+        """
+        Run all crawlers until they finish.
+        运行所有爬虫直到它们完成。
+
+        This method activates all crawlers and waits for them to finish.
+        此方法激活所有爬虫并等待它们完成。
+
+        After all crawlers have finished, it recycles the database connections.
+        在所有爬虫完成后,它回收数据库连接。
+        """
         try:
             for crawler, (args, kwargs) in self.crawlers.items():
                 self.active_crawler(crawler, *args, **kwargs)
@@ -235,6 +588,18 @@ class CrawlerProcess(CrawlerRunner):
             await self.recycle_db_connect()

     def start(self, use_windows_selector_eventLoop: bool = False) -> None:
+        """
+        Start the crawler process.
+        启动爬虫进程。
+
+        This method sets up the event loop and runs the crawlers.
+        此方法设置事件循环并运行爬虫。
+
+        Args:
+            use_windows_selector_eventLoop: Whether to use the Windows selector event loop
+                instead of the ProactorEventLoop on Windows.
+                是否在Windows上使用Windows选择器事件循环而不是ProactorEventLoop。
+        """
         if sys.platform.startswith('win'):
             if use_windows_selector_eventLoop:
                 asyncio.set_event_loop_policy(asyncio.windows_events.WindowsSelectorEventLoopPolicy())
@@ -249,12 +614,29 @@ class CrawlerProcess(CrawlerRunner):
         asyncio.run(self.run())

     async def _stop_reactor(self) -> None:
+        """
+        Stop the reactor (event loop).
+        停止反应器(事件循环)。
+
+        This method is called when a forced shutdown is requested.
+        当请求强制关闭时调用此方法。
+
+        It tries to recycle database connections before stopping the event loop.
+        它在停止事件循环之前尝试回收数据库连接。
+        """
         try:
             await self.recycle_db_connect()
         finally:
             asyncio.get_event_loop().stop()

     async def recycle_db_connect(self) -> None:
+        """
+        Recycle database connections.
+        回收数据库连接。
+
+        This method closes all database connections if there are no active crawlers.
+        如果没有活动的爬虫,此方法将关闭所有数据库连接。
+        """
         # recycle pool of db_manager
         if not len(self._active):
             await db_manager.close_all()