aio-scrapy 2.1.4__py3-none-any.whl → 2.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
  2. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -41
  3. aio_scrapy-2.1.6.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +187 -3
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +124 -3
  11. aioscrapy/core/downloader/handlers/httpx.py +133 -3
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +132 -3
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +313 -13
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  105. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  106. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  107. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  108. aioscrapy/http/response/playwright.py +0 -36
  109. aioscrapy/libs/pipelines/execl.py +0 -169
  110. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
aioscrapy/libs/extensions/closespider.py
@@ -1,7 +1,27 @@
-"""CloseSpider is an extension that forces spiders to be closed after certain
-conditions are met.
-
-See documentation in docs/topics/extensions.rst
+"""
+CloseSpider Extension for AioScrapy
+AioScrapy的CloseSpider扩展
+
+CloseSpider is an extension that forces spiders to be closed after certain
+conditions are met, such as a timeout, a maximum number of items scraped,
+pages downloaded, or errors encountered.
+CloseSpider是一个扩展,在满足特定条件后强制关闭爬虫,
+例如超时、抓取的最大项目数、下载的页面数或遇到的错误数。
+
+This extension can be configured using the following settings:
+此扩展可以使用以下设置进行配置:
+
+* CLOSESPIDER_TIMEOUT: Number of seconds after which the spider will be closed
+  爬虫将被关闭的秒数
+* CLOSESPIDER_ITEMCOUNT: Maximum number of items to scrape before closing
+  关闭前要抓取的最大项目数
+* CLOSESPIDER_PAGECOUNT: Maximum number of responses to download before closing
+  关闭前要下载的最大响应数
+* CLOSESPIDER_ERRORCOUNT: Maximum number of errors to allow before closing
+  关闭前允许的最大错误数
+
+See documentation in docs/topics/extensions.rst for more details.
+有关更多详细信息,请参阅docs/topics/extensions.rst中的文档。
 """
 import asyncio
 from typing import Optional
@@ -13,10 +33,34 @@ from aioscrapy.utils.tools import create_task
 
 
 class CloseSpider:
+    """
+    Extension to close spiders when certain conditions are met.
+    当满足特定条件时关闭爬虫的扩展。
+
+    This extension monitors the spider's activity and closes it when one of the
+    configured conditions is met: timeout, maximum number of items scraped,
+    maximum number of pages downloaded, or maximum number of errors encountered.
+    此扩展监控爬虫的活动,并在满足配置的条件之一时关闭它:
+    超时、抓取的最大项目数、下载的最大页面数或遇到的最大错误数。
+    """
 
     def __init__(self, crawler):
+        """
+        Initialize the CloseSpider extension.
+        初始化CloseSpider扩展。
+
+        Args:
+            crawler: The crawler instance that will use this extension.
+                将使用此扩展的爬虫实例。
+
+        Raises:
+            NotConfigured: If none of the CLOSESPIDER_* settings are set.
+                如果未设置任何CLOSESPIDER_*设置。
+        """
         self.crawler = crawler
 
+        # Dictionary of closing conditions from settings
+        # 来自设置的关闭条件字典
         self.close_on = {
             'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
             'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
@@ -24,12 +68,21 @@ class CloseSpider:
             'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
         }
 
+        # If no closing conditions are configured, don't enable the extension
+        # 如果未配置关闭条件,则不启用扩展
         if not any(self.close_on.values()):
             raise NotConfigured
 
+        # Counter for each condition
+        # 每个条件的计数器
         self.counter = defaultdict(int)
+
+        # Task for timeout handling
+        # 用于超时处理的任务
         self.task: Optional[asyncio.tasks.Task] = None
 
+        # Connect to signals based on configured conditions
+        # 根据配置的条件连接到信号
         if self.close_on.get('errorcount'):
             crawler.signals.connect(self.error_count, signal=signals.spider_error)
         if self.close_on.get('pagecount'):
@@ -38,34 +91,143 @@ class CloseSpider:
             crawler.signals.connect(self.timeout_close, signal=signals.spider_opened)
         if self.close_on.get('itemcount'):
             crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
+
+        # Always connect to spider_closed to clean up
+        # 始终连接到spider_closed以进行清理
         crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a CloseSpider instance from a crawler.
+        从爬虫创建CloseSpider实例。
+
+        This is the factory method used by AioScrapy to create extension instances.
+        这是AioScrapy用于创建扩展实例的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Returns:
+            CloseSpider: A new CloseSpider instance.
+                一个新的CloseSpider实例。
+        """
         return cls(crawler)
 
     async def error_count(self, failure, response, spider):
+        """
+        Signal handler for the spider_error signal.
+        spider_error信号的处理程序。
+
+        Increments the error counter and closes the spider if the maximum
+        number of errors has been reached.
+        增加错误计数器,如果达到最大错误数,则关闭爬虫。
+
+        Args:
+            failure: The exception that was raised.
+                引发的异常。
+            response: The response that caused the error.
+                导致错误的响应。
+            spider: The spider that raised the exception.
+                引发异常的爬虫。
+        """
+        # Increment the error counter
+        # 增加错误计数器
         self.counter['errorcount'] += 1
+
+        # Check if we've reached the maximum number of errors
+        # 检查是否达到最大错误数
         if self.counter['errorcount'] == self.close_on['errorcount']:
             create_task(self.crawler.engine.stop(reason='closespider_errorcount'))
 
     async def page_count(self, response, request, spider):
+        """
+        Signal handler for the response_received signal.
+        response_received信号的处理程序。
+
+        Increments the page counter and closes the spider if the maximum
+        number of pages has been downloaded.
+        增加页面计数器,如果下载的页面达到最大数量,则关闭爬虫。
+
+        Args:
+            response: The response that was received.
+                接收到的响应。
+            request: The request that generated the response.
+                生成响应的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+        """
+        # Increment the page counter
+        # 增加页面计数器
         self.counter['pagecount'] += 1
+
+        # Check if we've reached the maximum number of pages
+        # 检查是否达到最大页面数
         if self.counter['pagecount'] == self.close_on['pagecount']:
             create_task(self.crawler.engine.stop(reason='closespider_pagecount'))
 
     async def timeout_close(self, spider):
+        """
+        Signal handler for the spider_opened signal.
+        spider_opened信号的处理程序。
+
+        Starts a task that will close the spider after the configured timeout.
+        启动一个任务,该任务将在配置的超时后关闭爬虫。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
        async def close():
+            """
+            Inner function that waits for the timeout and then stops the engine.
+            等待超时然后停止引擎的内部函数。
+            """
             await asyncio.sleep(self.close_on['timeout'])
             create_task(self.crawler.engine.stop(reason='closespider_timeout'))
 
+        # Start the timeout task
+        # 启动超时任务
         self.task = create_task(close())
 
     async def item_scraped(self, item, spider):
+        """
+        Signal handler for the item_scraped signal.
+        item_scraped信号的处理程序。
+
+        Increments the item counter and closes the spider if the maximum
+        number of items has been scraped.
+        增加项目计数器,如果抓取的项目达到最大数量,则关闭爬虫。
+
+        Args:
+            item: The item that was scraped.
+                抓取的项目。
+            spider: The spider that scraped the item.
+                抓取项目的爬虫。
+        """
+        # Increment the item counter
+        # 增加项目计数器
         self.counter['itemcount'] += 1
+
+        # Check if we've reached the maximum number of items
+        # 检查是否达到最大项目数
         if self.counter['itemcount'] == self.close_on['itemcount']:
             create_task(self.crawler.engine.stop(reason='closespider_itemcount'))
 
     def spider_closed(self, spider):
+        """
+        Signal handler for the spider_closed signal.
+        spider_closed信号的处理程序。
+
+        Cancels the timeout task if it's still running when the spider is closed.
+        如果爬虫关闭时超时任务仍在运行,则取消该任务。
+
+        Args:
+            spider: The spider that was closed.
+                被关闭的爬虫。
+        """
+        # Cancel the timeout task if it exists and is not done
+        # 如果超时任务存在且未完成,则取消它
         if self.task and not self.task.done():
             self.task.cancel()
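
The new module docstring above documents the four CLOSESPIDER_* settings. As a minimal sketch of how they might be wired into a project, the snippet below assumes aio-scrapy keeps Scrapy's custom_settings convention and exposes Spider as aioscrapy.Spider; the spider name, URL, and limit values are hypothetical.

# Minimal sketch, assuming aio-scrapy follows Scrapy's custom_settings convention.
from aioscrapy import Spider  # assumed import path (see aioscrapy/spiders/__init__.py above)

class BoundedSpider(Spider):
    name = 'bounded_example'
    start_urls = ['https://example.com']

    custom_settings = {
        'CLOSESPIDER_TIMEOUT': 3600,     # close after one hour
        'CLOSESPIDER_ITEMCOUNT': 1000,   # ...or after 1000 items scraped
        'CLOSESPIDER_PAGECOUNT': 5000,   # ...or after 5000 responses downloaded
        'CLOSESPIDER_ERRORCOUNT': 10,    # ...or after 10 spider errors
    }

    async def parse(self, response):
        yield {'url': response.url}

Whichever limit is reached first stops the engine with the corresponding closespider_* reason, as shown in the handlers above.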
aioscrapy/libs/extensions/corestats.py
@@ -1,5 +1,16 @@
 """
-Extension for collecting core stats like items scraped and start/finish times
+Core Stats Extension
+核心统计扩展
+
+This extension collects and records essential statistics about the crawling process,
+including start and finish times, elapsed time, number of items scraped, number of
+responses received, and information about dropped items.
+此扩展收集并记录有关爬取过程的基本统计信息,包括开始和结束时间、经过的时间、
+已抓取的项目数量、已接收的响应数量以及有关丢弃项目的信息。
+
+These statistics are useful for monitoring the performance and behavior of spiders,
+and can be accessed through the Scrapy stats collector.
+这些统计信息对于监控爬虫的性能和行为很有用,可以通过Scrapy统计收集器访问。
 """
 from datetime import datetime
 
@@ -7,40 +18,179 @@ from aioscrapy import signals
 
 
 class CoreStats:
+    """
+    Extension for collecting core statistics about the crawling process.
+    用于收集有关爬取过程的核心统计信息的扩展。
+
+    This extension hooks into various Scrapy signals to collect statistics about
+    the crawling process, such as start and finish times, number of items scraped,
+    number of responses received, and information about dropped items.
+    此扩展挂钩到各种Scrapy信号,以收集有关爬取过程的统计信息,例如开始和结束时间、
+    已抓取的项目数量、已接收的响应数量以及有关丢弃项目的信息。
+    """
 
     def __init__(self, stats):
+        """
+        Initialize the CoreStats extension.
+        初始化CoreStats扩展。
+
+        Args:
+            stats: The Scrapy stats collector.
+                Scrapy统计收集器。
+        """
+        # Stats collector
+        # 统计收集器
         self.stats = stats
+
+        # Spider start time (will be set when spider opens)
+        # 爬虫开始时间(将在爬虫打开时设置)
         self.start_time = None
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a CoreStats instance from a crawler.
+        从爬虫创建CoreStats实例。
+
+        This is the factory method used by Scrapy to create the extension.
+        这是Scrapy用于创建扩展的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Returns:
+            CoreStats: A new CoreStats instance.
+                一个新的CoreStats实例。
+        """
+        # Create a new instance with the crawler's stats collector
+        # 使用爬虫的统计收集器创建一个新实例
         o = cls(crawler.stats)
+
+        # Connect to signals
+        # 连接到信号
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
         crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
         crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
         crawler.signals.connect(o.response_received, signal=signals.response_received)
+
+        # Return the new instance
+        # 返回新实例
         return o
 
     def spider_opened(self, spider):
+        """
+        Handle the spider_opened signal.
+        处理spider_opened信号。
+
+        This method is called when a spider is opened. It records the start time
+        of the spider.
+        当爬虫打开时调用此方法。它记录爬虫的开始时间。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
+        # Record the start time
+        # 记录开始时间
         self.start_time = datetime.now()
+
+        # Store the start time in the stats
+        # 将开始时间存储在统计信息中
         self.stats.set_value('start_time', str(self.start_time), spider=spider)
 
     def spider_closed(self, spider, reason):
+        """
+        Handle the spider_closed signal.
+        处理spider_closed信号。
+
+        This method is called when a spider is closed. It calculates and records
+        the finish time, elapsed time, and finish reason.
+        当爬虫关闭时调用此方法。它计算并记录结束时间、经过的时间和结束原因。
+
+        Args:
+            spider: The spider that was closed.
+                被关闭的爬虫。
+            reason: The reason why the spider was closed.
+                爬虫被关闭的原因。
+        """
+        # Record the finish time
+        # 记录结束时间
         finish_time = datetime.now()
+
+        # Calculate elapsed time
+        # 计算经过的时间
         elapsed_time = finish_time - self.start_time
         elapsed_time_seconds = elapsed_time.total_seconds()
+
+        # Store finish statistics in the stats
+        # 将结束统计信息存储在统计信息中
         self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
         self.stats.set_value('finish_time', str(finish_time), spider=spider)
         self.stats.set_value('finish_reason', reason, spider=spider)
 
     def item_scraped(self, item, spider):
+        """
+        Handle the item_scraped signal.
+        处理item_scraped信号。
+
+        This method is called when an item is scraped by a spider. It increments
+        the item_scraped_count statistic.
+        当爬虫抓取项目时调用此方法。它增加item_scraped_count统计信息。
+
+        Args:
+            item: The item that was scraped.
+                被抓取的项目。
+            spider: The spider that scraped the item.
+                抓取项目的爬虫。
+        """
+        # Increment the item scraped count
+        # 增加已抓取项目计数
         self.stats.inc_value('item_scraped_count', spider=spider)
 
     def response_received(self, spider):
+        """
+        Handle the response_received signal.
+        处理response_received信号。
+
+        This method is called when a response is received by a spider. It increments
+        the response_received_count statistic.
+        当爬虫接收到响应时调用此方法。它增加response_received_count统计信息。
+
+        Args:
+            spider: The spider that received the response.
+                接收响应的爬虫。
+        """
+        # Increment the response received count
+        # 增加已接收响应计数
         self.stats.inc_value('response_received_count', spider=spider)
 
     def item_dropped(self, item, spider, exception):
+        """
+        Handle the item_dropped signal.
+        处理item_dropped信号。
+
+        This method is called when an item is dropped by a spider. It increments
+        the item_dropped_count statistic and records the reason why the item was dropped.
+        当爬虫丢弃项目时调用此方法。它增加item_dropped_count统计信息,并记录项目被丢弃的原因。
+
+        Args:
+            item: The item that was dropped.
+                被丢弃的项目。
+            spider: The spider that dropped the item.
+                丢弃项目的爬虫。
+            exception: The exception that caused the item to be dropped.
+                导致项目被丢弃的异常。
+        """
+        # Get the reason from the exception class name
+        # 从异常类名获取原因
         reason = exception.__class__.__name__
+
+        # Increment the item dropped count
+        # 增加已丢弃项目计数
         self.stats.inc_value('item_dropped_count', spider=spider)
+
+        # Increment the count for this specific reason
+        # 增加此特定原因的计数
         self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)
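
The keys CoreStats writes above (start_time, finish_time, elapsed_time_seconds, finish_reason, item_scraped_count, response_received_count, item_dropped_count) can be read back through the same stats collector. Below is a minimal sketch of a custom extension doing so at shutdown; it reuses only the signals.connect, stats.get_value, and from_crawler patterns that appear in these hunks, while the class name and printed report are illustrative and the extension would still need to be enabled the way the built-in extensions are (e.g. via an EXTENSIONS-style setting in a Scrapy-style project).

# Minimal sketch of reading the statistics recorded by CoreStats.
from aioscrapy import signals

class StatsReport:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_closed(self, spider, reason):
        # Keys written by CoreStats, per the hunks above
        for key in ('start_time', 'finish_time', 'elapsed_time_seconds',
                    'finish_reason', 'item_scraped_count',
                    'response_received_count', 'item_dropped_count'):
            print(f'{spider.name} {key}: {self.stats.get_value(key)}')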
aioscrapy/libs/extensions/logstats.py
@@ -1,3 +1,14 @@
+"""
+Log Stats Extension
+日志统计扩展
+
+This extension logs basic crawling statistics periodically during the spider run.
+It provides information about the number of pages crawled, items scraped, and their
+respective rates per minute, which is useful for monitoring the progress and
+performance of spiders in real-time.
+此扩展在爬虫运行期间定期记录基本的爬取统计信息。它提供有关已爬取的页面数量、
+已抓取的项目数量及其各自的每分钟速率的信息,这对于实时监控爬虫的进度和性能很有用。
+"""
 import asyncio
 
 from aioscrapy import signals
@@ -7,44 +18,177 @@ from aioscrapy.utils.tools import create_task
 
 
 class LogStats:
-    """Log basic scraping stats periodically"""
+    """
+    Extension for logging basic crawling statistics periodically.
+    用于定期记录基本爬取统计信息的扩展。
+
+    This extension logs information about the number of pages crawled and items
+    scraped, along with their respective rates per minute. The statistics are
+    logged at regular intervals during the spider run, providing real-time
+    feedback on the spider's performance.
+    此扩展记录有关已爬取的页面数量和已抓取的项目数量的信息,以及它们各自的
+    每分钟速率。统计信息在爬虫运行期间以固定的时间间隔记录,提供有关爬虫性能
+    的实时反馈。
+    """
 
     def __init__(self, stats, interval=60.0):
+        """
+        Initialize the LogStats extension.
+        初始化LogStats扩展。
+
+        Args:
+            stats: The Scrapy stats collector.
+                Scrapy统计收集器。
+            interval: The time interval (in seconds) between log messages.
+                日志消息之间的时间间隔(以秒为单位)。
+                Defaults to 60.0 seconds.
+                默认为60.0秒。
+        """
+        # Stats collector
+        # 统计收集器
         self.stats = stats
+
+        # Interval between log messages (in seconds)
+        # 日志消息之间的间隔(以秒为单位)
         self.interval = interval
+
+        # Multiplier to convert stats to per-minute rates
+        # 将统计数据转换为每分钟速率的乘数
         self.multiplier = 60.0 / self.interval
+
+        # Async task for periodic logging
+        # 用于定期记录的异步任务
         self.task = None
+
+        # Previous values for calculating rates
+        # 用于计算速率的先前值
         self.pagesprev = 0
         self.itemsprev = 0
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a LogStats instance from a crawler.
+        从爬虫创建LogStats实例。
+
+        This is the factory method used by Scrapy to create the extension.
+        这是Scrapy用于创建扩展的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Returns:
+            LogStats: A new LogStats instance.
+                一个新的LogStats实例。
+
+        Raises:
+            NotConfigured: If LOGSTATS_INTERVAL is not set or is zero in the settings.
+                如果在设置中未设置LOGSTATS_INTERVAL或其值为零。
+        """
+        # Get the log interval from settings
+        # 从设置获取日志间隔
         interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
+
+        # If no interval is configured, disable the extension
+        # 如果未配置间隔,则禁用扩展
         if not interval:
             raise NotConfigured
+
+        # Create a new instance with the crawler's stats collector and the configured interval
+        # 使用爬虫的统计收集器和配置的间隔创建一个新实例
         o = cls(crawler.stats, interval)
+
+        # Connect to signals
+        # 连接到信号
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+
+        # Return the new instance
+        # 返回新实例
         return o
 
     def spider_opened(self, spider):
+        """
+        Handle the spider_opened signal.
+        处理spider_opened信号。
+
+        This method is called when a spider is opened. It starts the periodic
+        logging task.
+        当爬虫打开时调用此方法。它启动定期记录任务。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
+        # Start the periodic logging task
+        # 启动定期记录任务
         self.task = create_task(self.log(spider))
 
     async def log(self, spider):
+        """
+        Log the current crawling statistics and schedule the next log.
+        记录当前爬取统计信息并安排下一次记录。
+
+        This method retrieves the current statistics, calculates the rates,
+        logs the information, and then schedules itself to run again after
+        the configured interval.
+        此方法检索当前统计信息,计算速率,记录信息,然后安排自己在配置的
+        间隔后再次运行。
+
+        Args:
+            spider: The spider whose statistics to log.
+                要记录其统计信息的爬虫。
+        """
+        # Wait for the configured interval
+        # 等待配置的间隔
         await asyncio.sleep(self.interval)
+
+        # Get current statistics
+        # 获取当前统计信息
         items = self.stats.get_value('item_scraped_count', 0)
         pages = self.stats.get_value('response_received_count', 0)
+
+        # Calculate rates (per minute)
+        # 计算速率(每分钟)
         irate = (items - self.itemsprev) * self.multiplier
         prate = (pages - self.pagesprev) * self.multiplier
+
+        # Update previous values for next calculation
+        # 更新先前值以供下次计算
         self.pagesprev, self.itemsprev = pages, items
 
+        # Prepare log message
+        # 准备日志消息
         msg = ("<%(spider_name)s> Crawled %(pages)d pages (at %(pagerate)d pages/min), "
                "scraped %(items)d items (at %(itemrate)d items/min)")
         log_args = {'pages': pages, 'pagerate': prate, 'spider_name': spider.name,
                     'items': items, 'itemrate': irate}
+
+        # Log the statistics
+        # 记录统计信息
         logger.info(msg % log_args)
+
+        # Schedule the next log
+        # 安排下一次记录
         self.task = create_task(self.log(spider))
 
     def spider_closed(self, spider, reason):
+        """
+        Handle the spider_closed signal.
+        处理spider_closed信号。
+
+        This method is called when a spider is closed. It cancels the periodic
+        logging task if it's still running.
+        当爬虫关闭时调用此方法。如果定期记录任务仍在运行,它会取消该任务。
+
+        Args:
+            spider: The spider that was closed.
+                被关闭的爬虫。
+            reason: The reason why the spider was closed.
+                爬虫被关闭的原因。
+        """
+        # Cancel the logging task if it's still running
+        # 如果记录任务仍在运行,则取消它
         if self.task and not self.task.done():
             self.task.cancel()
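
LogStats converts per-interval deltas into per-minute rates via multiplier = 60.0 / interval, as shown in the log() hunk above. A small worked example of that calculation, using a hypothetical LOGSTATS_INTERVAL and hypothetical counter samples (the __init__ default interval shown above is 60.0):

# Worked example of LogStats' rate calculation; interval and counter values are hypothetical.
interval = 15.0                  # LOGSTATS_INTERVAL in seconds
multiplier = 60.0 / interval     # 4.0: converts a per-interval delta to a per-minute rate

pages_prev, pages = 120, 150     # response_received_count at the previous and current tick
items_prev, items = 80, 92       # item_scraped_count at the previous and current tick

prate = (pages - pages_prev) * multiplier   # (150 - 120) * 4.0 = 120.0 pages/min
irate = (items - items_prev) * multiplier   # (92 - 80) * 4.0 = 48.0 items/min

print(f"Crawled {pages} pages (at {prate:.0f} pages/min), "
      f"scraped {items} items (at {irate:.0f} items/min)")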