aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff compares two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their public registry.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
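Beyond the documentation-heavy line counts above, a few modules moved in this release: the Playwright download handler now lives under `handlers/webdriver/` next to a new DrissionPage handler, `http/response/playwright.py` was replaced by `http/response/web_driver.py`, and the misspelled `libs/pipelines/execl.py` was renamed to `excel.py`. A hedged sketch of the path change follows; only the dotted module paths are confirmed by the file list, and any names defined inside them are not shown in this diff.

```python
# Hypothetical migration sketch: only the module paths are taken from the
# file list above. Anything imported from these modules (handler or response
# class names) may differ in 2.1.7 and would need to be checked separately.
OLD_PLAYWRIGHT_HANDLER_MODULE = "aioscrapy.core.downloader.handlers.playwright.webdriver"  # removed
NEW_PLAYWRIGHT_HANDLER_MODULE = "aioscrapy.core.downloader.handlers.webdriver.playwright"  # added
OLD_RESPONSE_MODULE = "aioscrapy.http.response.playwright"   # removed
NEW_RESPONSE_MODULE = "aioscrapy.http.response.web_driver"   # added
```

Any settings or imports that reference the old dotted paths would need updating accordingly. The detailed hunks below cover the three extension modules, starting with `closespider.py`: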
```diff
--- aioscrapy/libs/extensions/closespider.py (2.1.4)
+++ aioscrapy/libs/extensions/closespider.py (2.1.7)
@@ -1,7 +1,27 @@
-"""
-
-
-
+"""
+CloseSpider Extension for AioScrapy
+AioScrapy的CloseSpider扩展
+
+CloseSpider is an extension that forces spiders to be closed after certain
+conditions are met, such as a timeout, a maximum number of items scraped,
+pages downloaded, or errors encountered.
+CloseSpider是一个扩展,在满足特定条件后强制关闭爬虫,
+例如超时、抓取的最大项目数、下载的页面数或遇到的错误数。
+
+This extension can be configured using the following settings:
+此扩展可以使用以下设置进行配置:
+
+* CLOSESPIDER_TIMEOUT: Number of seconds after which the spider will be closed
+  爬虫将被关闭的秒数
+* CLOSESPIDER_ITEMCOUNT: Maximum number of items to scrape before closing
+  关闭前要抓取的最大项目数
+* CLOSESPIDER_PAGECOUNT: Maximum number of responses to download before closing
+  关闭前要下载的最大响应数
+* CLOSESPIDER_ERRORCOUNT: Maximum number of errors to allow before closing
+  关闭前允许的最大错误数
+
+See documentation in docs/topics/extensions.rst for more details.
+有关更多详细信息,请参阅docs/topics/extensions.rst中的文档。
 """
 import asyncio
 from typing import Optional
```
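The new module docstring documents the four closing conditions by their setting names. For orientation, here is a minimal sketch of how they might be supplied per spider: the setting names come from the docstring above, while the `aioscrapy.spiders.Spider` import and the `custom_settings` mechanism are assumed to behave as in Scrapy (from which aio-scrapy is derived) and are not part of this diff.

```python
# Hedged sketch: per-spider closing conditions. Setting names are from the
# docstring above; the Spider base class and custom_settings behaviour are
# assumed to match Scrapy and are not shown in this diff.
from aioscrapy.spiders import Spider  # assumption: same layout as scrapy.spiders


class DemoSpider(Spider):
    name = "demo"
    start_urls = ["https://example.com"]
    custom_settings = {
        "CLOSESPIDER_TIMEOUT": 3600,     # close after one hour of runtime
        "CLOSESPIDER_ITEMCOUNT": 1000,   # ...or after 1000 scraped items
        "CLOSESPIDER_PAGECOUNT": 5000,   # ...or after 5000 downloaded responses
        "CLOSESPIDER_ERRORCOUNT": 10,    # ...or after 10 spider errors
    }

    async def parse(self, response):
        yield {"url": response.url}
```

The remaining hunks of `closespider.py` document the class, its signal handlers, and the existing logic line by line: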
```diff
@@ -13,10 +33,34 @@ from aioscrapy.utils.tools import create_task


 class CloseSpider:
+    """
+    Extension to close spiders when certain conditions are met.
+    当满足特定条件时关闭爬虫的扩展。
+
+    This extension monitors the spider's activity and closes it when one of the
+    configured conditions is met: timeout, maximum number of items scraped,
+    maximum number of pages downloaded, or maximum number of errors encountered.
+    此扩展监控爬虫的活动,并在满足配置的条件之一时关闭它:
+    超时、抓取的最大项目数、下载的最大页面数或遇到的最大错误数。
+    """

     def __init__(self, crawler):
+        """
+        Initialize the CloseSpider extension.
+        初始化CloseSpider扩展。
+
+        Args:
+            crawler: The crawler instance that will use this extension.
+                将使用此扩展的爬虫实例。
+
+        Raises:
+            NotConfigured: If none of the CLOSESPIDER_* settings are set.
+                如果未设置任何CLOSESPIDER_*设置。
+        """
         self.crawler = crawler

+        # Dictionary of closing conditions from settings
+        # 来自设置的关闭条件字典
         self.close_on = {
             'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
             'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
@@ -24,12 +68,21 @@ class CloseSpider:
             'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
         }

+        # If no closing conditions are configured, don't enable the extension
+        # 如果未配置关闭条件,则不启用扩展
         if not any(self.close_on.values()):
             raise NotConfigured

+        # Counter for each condition
+        # 每个条件的计数器
         self.counter = defaultdict(int)
+
+        # Task for timeout handling
+        # 用于超时处理的任务
         self.task: Optional[asyncio.tasks.Task] = None

+        # Connect to signals based on configured conditions
+        # 根据配置的条件连接到信号
         if self.close_on.get('errorcount'):
             crawler.signals.connect(self.error_count, signal=signals.spider_error)
         if self.close_on.get('pagecount'):
@@ -38,34 +91,143 @@ class CloseSpider:
             crawler.signals.connect(self.timeout_close, signal=signals.spider_opened)
         if self.close_on.get('itemcount'):
             crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
+
+        # Always connect to spider_closed to clean up
+        # 始终连接到spider_closed以进行清理
         crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a CloseSpider instance from a crawler.
+        从爬虫创建CloseSpider实例。
+
+        This is the factory method used by AioScrapy to create extension instances.
+        这是AioScrapy用于创建扩展实例的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Returns:
+            CloseSpider: A new CloseSpider instance.
+                一个新的CloseSpider实例。
+        """
         return cls(crawler)

     async def error_count(self, failure, response, spider):
+        """
+        Signal handler for the spider_error signal.
+        spider_error信号的处理程序。
+
+        Increments the error counter and closes the spider if the maximum
+        number of errors has been reached.
+        增加错误计数器,如果达到最大错误数,则关闭爬虫。
+
+        Args:
+            failure: The exception that was raised.
+                引发的异常。
+            response: The response that caused the error.
+                导致错误的响应。
+            spider: The spider that raised the exception.
+                引发异常的爬虫。
+        """
+        # Increment the error counter
+        # 增加错误计数器
         self.counter['errorcount'] += 1
+
+        # Check if we've reached the maximum number of errors
+        # 检查是否达到最大错误数
         if self.counter['errorcount'] == self.close_on['errorcount']:
             create_task(self.crawler.engine.stop(reason='closespider_errorcount'))

     async def page_count(self, response, request, spider):
+        """
+        Signal handler for the response_received signal.
+        response_received信号的处理程序。
+
+        Increments the page counter and closes the spider if the maximum
+        number of pages has been downloaded.
+        增加页面计数器,如果下载的页面达到最大数量,则关闭爬虫。
+
+        Args:
+            response: The response that was received.
+                接收到的响应。
+            request: The request that generated the response.
+                生成响应的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+        """
+        # Increment the page counter
+        # 增加页面计数器
         self.counter['pagecount'] += 1
+
+        # Check if we've reached the maximum number of pages
+        # 检查是否达到最大页面数
         if self.counter['pagecount'] == self.close_on['pagecount']:
             create_task(self.crawler.engine.stop(reason='closespider_pagecount'))

     async def timeout_close(self, spider):
+        """
+        Signal handler for the spider_opened signal.
+        spider_opened信号的处理程序。
+
+        Starts a task that will close the spider after the configured timeout.
+        启动一个任务,该任务将在配置的超时后关闭爬虫。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
         async def close():
+            """
+            Inner function that waits for the timeout and then stops the engine.
+            等待超时然后停止引擎的内部函数。
+            """
             await asyncio.sleep(self.close_on['timeout'])
             create_task(self.crawler.engine.stop(reason='closespider_timeout'))

+        # Start the timeout task
+        # 启动超时任务
         self.task = create_task(close())

     async def item_scraped(self, item, spider):
+        """
+        Signal handler for the item_scraped signal.
+        item_scraped信号的处理程序。
+
+        Increments the item counter and closes the spider if the maximum
+        number of items has been scraped.
+        增加项目计数器,如果抓取的项目达到最大数量,则关闭爬虫。
+
+        Args:
+            item: The item that was scraped.
+                抓取的项目。
+            spider: The spider that scraped the item.
+                抓取项目的爬虫。
+        """
+        # Increment the item counter
+        # 增加项目计数器
         self.counter['itemcount'] += 1
+
+        # Check if we've reached the maximum number of items
+        # 检查是否达到最大项目数
         if self.counter['itemcount'] == self.close_on['itemcount']:
             create_task(self.crawler.engine.stop(reason='closespider_itemcount'))

     def spider_closed(self, spider):
+        """
+        Signal handler for the spider_closed signal.
+        spider_closed信号的处理程序。
+
+        Cancels the timeout task if it's still running when the spider is closed.
+        如果爬虫关闭时超时任务仍在运行,则取消该任务。
+
+        Args:
+            spider: The spider that was closed.
+                被关闭的爬虫。
+        """
+        # Cancel the timeout task if it exists and is not done
+        # 如果超时任务存在且未完成,则取消它
         if self.task and not self.task.done():
             self.task.cancel()
```
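Taken together, the docstrings added above spell out the extension contract: `from_crawler()` is the factory, `crawler.signals.connect()` wires handlers to signals, raising `NotConfigured` opts the extension out, and `create_task(crawler.engine.stop(reason=...))` shuts the crawl down. Below is a minimal sketch of a custom extension following the same pattern; the `MYEXT_ITEMLIMIT` setting name and the `NotConfigured` import path are assumptions, while the remaining calls mirror ones visible in the diff.

```python
# Sketch of a custom extension built on the same hooks CloseSpider uses.
# signals, crawler.settings, crawler.signals.connect, create_task and
# crawler.engine.stop(reason=...) all appear in the diff above; the
# NotConfigured import path and the MYEXT_ITEMLIMIT setting are assumptions.
from aioscrapy import signals
from aioscrapy.exceptions import NotConfigured  # assumed location, as in Scrapy
from aioscrapy.utils.tools import create_task


class ItemLimit:
    def __init__(self, crawler, limit):
        self.crawler = crawler
        self.limit = limit
        self.seen = 0

    @classmethod
    def from_crawler(cls, crawler):
        limit = crawler.settings.getint("MYEXT_ITEMLIMIT")  # hypothetical setting
        if not limit:
            raise NotConfigured
        ext = cls(crawler, limit)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    async def item_scraped(self, item, spider):
        self.seen += 1
        if self.seen == self.limit:
            # same shutdown call the CloseSpider handlers use above
            create_task(self.crawler.engine.stop(reason="myext_itemlimit"))
```

Registering such an extension (presumably through an EXTENSIONS-style setting, as in Scrapy) is not shown in this diff. The `corestats.py` changes below add the same kind of documentation around the statistics the crawler records: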
```diff
--- aioscrapy/libs/extensions/corestats.py (2.1.4)
+++ aioscrapy/libs/extensions/corestats.py (2.1.7)
@@ -1,5 +1,16 @@
 """
-
+Core Stats Extension
+核心统计扩展
+
+This extension collects and records essential statistics about the crawling process,
+including start and finish times, elapsed time, number of items scraped, number of
+responses received, and information about dropped items.
+此扩展收集并记录有关爬取过程的基本统计信息,包括开始和结束时间、经过的时间、
+已抓取的项目数量、已接收的响应数量以及有关丢弃项目的信息。
+
+These statistics are useful for monitoring the performance and behavior of spiders,
+and can be accessed through the Scrapy stats collector.
+这些统计信息对于监控爬虫的性能和行为很有用,可以通过Scrapy统计收集器访问。
 """
 from datetime import datetime

@@ -7,40 +18,179 @@ from aioscrapy import signals


 class CoreStats:
+    """
+    Extension for collecting core statistics about the crawling process.
+    用于收集有关爬取过程的核心统计信息的扩展。
+
+    This extension hooks into various Scrapy signals to collect statistics about
+    the crawling process, such as start and finish times, number of items scraped,
+    number of responses received, and information about dropped items.
+    此扩展挂钩到各种Scrapy信号,以收集有关爬取过程的统计信息,例如开始和结束时间、
+    已抓取的项目数量、已接收的响应数量以及有关丢弃项目的信息。
+    """

     def __init__(self, stats):
+        """
+        Initialize the CoreStats extension.
+        初始化CoreStats扩展。
+
+        Args:
+            stats: The Scrapy stats collector.
+                Scrapy统计收集器。
+        """
+        # Stats collector
+        # 统计收集器
         self.stats = stats
+
+        # Spider start time (will be set when spider opens)
+        # 爬虫开始时间(将在爬虫打开时设置)
         self.start_time = None

     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a CoreStats instance from a crawler.
+        从爬虫创建CoreStats实例。
+
+        This is the factory method used by Scrapy to create the extension.
+        这是Scrapy用于创建扩展的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Returns:
+            CoreStats: A new CoreStats instance.
+                一个新的CoreStats实例。
+        """
+        # Create a new instance with the crawler's stats collector
+        # 使用爬虫的统计收集器创建一个新实例
         o = cls(crawler.stats)
+
+        # Connect to signals
+        # 连接到信号
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
         crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
         crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
         crawler.signals.connect(o.response_received, signal=signals.response_received)
+
+        # Return the new instance
+        # 返回新实例
         return o

     def spider_opened(self, spider):
+        """
+        Handle the spider_opened signal.
+        处理spider_opened信号。
+
+        This method is called when a spider is opened. It records the start time
+        of the spider.
+        当爬虫打开时调用此方法。它记录爬虫的开始时间。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
+        # Record the start time
+        # 记录开始时间
         self.start_time = datetime.now()
+
+        # Store the start time in the stats
+        # 将开始时间存储在统计信息中
         self.stats.set_value('start_time', str(self.start_time), spider=spider)

     def spider_closed(self, spider, reason):
+        """
+        Handle the spider_closed signal.
+        处理spider_closed信号。
+
+        This method is called when a spider is closed. It calculates and records
+        the finish time, elapsed time, and finish reason.
+        当爬虫关闭时调用此方法。它计算并记录结束时间、经过的时间和结束原因。
+
+        Args:
+            spider: The spider that was closed.
+                被关闭的爬虫。
+            reason: The reason why the spider was closed.
+                爬虫被关闭的原因。
+        """
+        # Record the finish time
+        # 记录结束时间
         finish_time = datetime.now()
+
+        # Calculate elapsed time
+        # 计算经过的时间
         elapsed_time = finish_time - self.start_time
         elapsed_time_seconds = elapsed_time.total_seconds()
+
+        # Store finish statistics in the stats
+        # 将结束统计信息存储在统计信息中
         self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
         self.stats.set_value('finish_time', str(finish_time), spider=spider)
         self.stats.set_value('finish_reason', reason, spider=spider)

     def item_scraped(self, item, spider):
+        """
+        Handle the item_scraped signal.
+        处理item_scraped信号。
+
+        This method is called when an item is scraped by a spider. It increments
+        the item_scraped_count statistic.
+        当爬虫抓取项目时调用此方法。它增加item_scraped_count统计信息。
+
+        Args:
+            item: The item that was scraped.
+                被抓取的项目。
+            spider: The spider that scraped the item.
+                抓取项目的爬虫。
+        """
+        # Increment the item scraped count
+        # 增加已抓取项目计数
         self.stats.inc_value('item_scraped_count', spider=spider)

     def response_received(self, spider):
+        """
+        Handle the response_received signal.
+        处理response_received信号。
+
+        This method is called when a response is received by a spider. It increments
+        the response_received_count statistic.
+        当爬虫接收到响应时调用此方法。它增加response_received_count统计信息。
+
+        Args:
+            spider: The spider that received the response.
+                接收响应的爬虫。
+        """
+        # Increment the response received count
+        # 增加已接收响应计数
         self.stats.inc_value('response_received_count', spider=spider)

     def item_dropped(self, item, spider, exception):
+        """
+        Handle the item_dropped signal.
+        处理item_dropped信号。
+
+        This method is called when an item is dropped by a spider. It increments
+        the item_dropped_count statistic and records the reason why the item was dropped.
+        当爬虫丢弃项目时调用此方法。它增加item_dropped_count统计信息,并记录项目被丢弃的原因。
+
+        Args:
+            item: The item that was dropped.
+                被丢弃的项目。
+            spider: The spider that dropped the item.
+                丢弃项目的爬虫。
+            exception: The exception that caused the item to be dropped.
+                导致项目被丢弃的异常。
+        """
+        # Get the reason from the exception class name
+        # 从异常类名获取原因
         reason = exception.__class__.__name__
+
+        # Increment the item dropped count
+        # 增加已丢弃项目计数
         self.stats.inc_value('item_dropped_count', spider=spider)
+
+        # Increment the count for this specific reason
+        # 增加此特定原因的计数
         self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)
```
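Everything CoreStats records goes through the stats collector, so the same values can be read back with the `get_value` API that LogStats uses below. A small illustrative helper, not part of the package:

```python
# Illustrative only: reading back the keys CoreStats records, using the
# stats API shown in the diffs (set_value/inc_value above, get_value below).
def report_core_stats(crawler) -> dict:
    stats = crawler.stats
    return {
        "start_time": stats.get_value("start_time"),
        "finish_time": stats.get_value("finish_time"),
        "elapsed_time_seconds": stats.get_value("elapsed_time_seconds"),
        "finish_reason": stats.get_value("finish_reason"),
        "item_scraped_count": stats.get_value("item_scraped_count", 0),
        "response_received_count": stats.get_value("response_received_count", 0),
        "item_dropped_count": stats.get_value("item_dropped_count", 0),
    }
```

Finally, `logstats.py` receives the same treatment: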
```diff
--- aioscrapy/libs/extensions/logstats.py (2.1.4)
+++ aioscrapy/libs/extensions/logstats.py (2.1.7)
@@ -1,3 +1,14 @@
+"""
+Log Stats Extension
+日志统计扩展
+
+This extension logs basic crawling statistics periodically during the spider run.
+It provides information about the number of pages crawled, items scraped, and their
+respective rates per minute, which is useful for monitoring the progress and
+performance of spiders in real-time.
+此扩展在爬虫运行期间定期记录基本的爬取统计信息。它提供有关已爬取的页面数量、
+已抓取的项目数量及其各自的每分钟速率的信息,这对于实时监控爬虫的进度和性能很有用。
+"""
 import asyncio

 from aioscrapy import signals
@@ -7,44 +18,177 @@ from aioscrapy.utils.tools import create_task


 class LogStats:
-    """
+    """
+    Extension for logging basic crawling statistics periodically.
+    用于定期记录基本爬取统计信息的扩展。
+
+    This extension logs information about the number of pages crawled and items
+    scraped, along with their respective rates per minute. The statistics are
+    logged at regular intervals during the spider run, providing real-time
+    feedback on the spider's performance.
+    此扩展记录有关已爬取的页面数量和已抓取的项目数量的信息,以及它们各自的
+    每分钟速率。统计信息在爬虫运行期间以固定的时间间隔记录,提供有关爬虫性能
+    的实时反馈。
+    """

     def __init__(self, stats, interval=60.0):
+        """
+        Initialize the LogStats extension.
+        初始化LogStats扩展。
+
+        Args:
+            stats: The Scrapy stats collector.
+                Scrapy统计收集器。
+            interval: The time interval (in seconds) between log messages.
+                日志消息之间的时间间隔(以秒为单位)。
+                Defaults to 60.0 seconds.
+                默认为60.0秒。
+        """
+        # Stats collector
+        # 统计收集器
         self.stats = stats
+
+        # Interval between log messages (in seconds)
+        # 日志消息之间的间隔(以秒为单位)
         self.interval = interval
+
+        # Multiplier to convert stats to per-minute rates
+        # 将统计数据转换为每分钟速率的乘数
         self.multiplier = 60.0 / self.interval
+
+        # Async task for periodic logging
+        # 用于定期记录的异步任务
         self.task = None
+
+        # Previous values for calculating rates
+        # 用于计算速率的先前值
         self.pagesprev = 0
         self.itemsprev = 0

     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a LogStats instance from a crawler.
+        从爬虫创建LogStats实例。
+
+        This is the factory method used by Scrapy to create the extension.
+        这是Scrapy用于创建扩展的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Returns:
+            LogStats: A new LogStats instance.
+                一个新的LogStats实例。
+
+        Raises:
+            NotConfigured: If LOGSTATS_INTERVAL is not set or is zero in the settings.
+                如果在设置中未设置LOGSTATS_INTERVAL或其值为零。
+        """
+        # Get the log interval from settings
+        # 从设置获取日志间隔
         interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
+
+        # If no interval is configured, disable the extension
+        # 如果未配置间隔,则禁用扩展
         if not interval:
             raise NotConfigured
+
+        # Create a new instance with the crawler's stats collector and the configured interval
+        # 使用爬虫的统计收集器和配置的间隔创建一个新实例
         o = cls(crawler.stats, interval)
+
+        # Connect to signals
+        # 连接到信号
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+
+        # Return the new instance
+        # 返回新实例
         return o

     def spider_opened(self, spider):
+        """
+        Handle the spider_opened signal.
+        处理spider_opened信号。
+
+        This method is called when a spider is opened. It starts the periodic
+        logging task.
+        当爬虫打开时调用此方法。它启动定期记录任务。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
+        # Start the periodic logging task
+        # 启动定期记录任务
         self.task = create_task(self.log(spider))

     async def log(self, spider):
+        """
+        Log the current crawling statistics and schedule the next log.
+        记录当前爬取统计信息并安排下一次记录。
+
+        This method retrieves the current statistics, calculates the rates,
+        logs the information, and then schedules itself to run again after
+        the configured interval.
+        此方法检索当前统计信息,计算速率,记录信息,然后安排自己在配置的
+        间隔后再次运行。
+
+        Args:
+            spider: The spider whose statistics to log.
+                要记录其统计信息的爬虫。
+        """
+        # Wait for the configured interval
+        # 等待配置的间隔
         await asyncio.sleep(self.interval)
+
+        # Get current statistics
+        # 获取当前统计信息
         items = self.stats.get_value('item_scraped_count', 0)
         pages = self.stats.get_value('response_received_count', 0)
+
+        # Calculate rates (per minute)
+        # 计算速率(每分钟)
         irate = (items - self.itemsprev) * self.multiplier
         prate = (pages - self.pagesprev) * self.multiplier
+
+        # Update previous values for next calculation
+        # 更新先前值以供下次计算
         self.pagesprev, self.itemsprev = pages, items

+        # Prepare log message
+        # 准备日志消息
         msg = ("<%(spider_name)s> Crawled %(pages)d pages (at %(pagerate)d pages/min), "
                "scraped %(items)d items (at %(itemrate)d items/min)")
         log_args = {'pages': pages, 'pagerate': prate, 'spider_name': spider.name,
                     'items': items, 'itemrate': irate}
+
+        # Log the statistics
+        # 记录统计信息
         logger.info(msg % log_args)
+
+        # Schedule the next log
+        # 安排下一次记录
         self.task = create_task(self.log(spider))

     def spider_closed(self, spider, reason):
+        """
+        Handle the spider_closed signal.
+        处理spider_closed信号。
+
+        This method is called when a spider is closed. It cancels the periodic
+        logging task if it's still running.
+        当爬虫关闭时调用此方法。如果定期记录任务仍在运行,它会取消该任务。
+
+        Args:
+            spider: The spider that was closed.
+                被关闭的爬虫。
+            reason: The reason why the spider was closed.
+                爬虫被关闭的原因。
+        """
+        # Cancel the logging task if it's still running
+        # 如果记录任务仍在运行,则取消它
         if self.task and not self.task.done():
             self.task.cancel()
```