aio-scrapy 2.1.3__py3-none-any.whl → 2.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
  2. {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -40
  3. aio_scrapy-2.1.6.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +523 -18
  8. aioscrapy/core/downloader/handlers/__init__.py +188 -6
  9. aioscrapy/core/downloader/handlers/aiohttp.py +188 -4
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +125 -4
  11. aioscrapy/core/downloader/handlers/httpx.py +134 -4
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +133 -4
  13. aioscrapy/core/downloader/handlers/requests.py +121 -3
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +170 -14
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +193 -7
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +313 -13
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.3.dist-info/RECORD +0 -133
  105. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -110
  106. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -53
  107. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  108. aioscrapy/http/response/playwright.py +0 -36
  109. aioscrapy/libs/pipelines/execl.py +0 -169
  110. {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
  111. {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
aioscrapy/libs/extensions/metric.py +370 -1
@@ -1,3 +1,31 @@
+"""
+Metric Extension for AioScrapy
+AioScrapy的指标扩展
+
+This module provides extensions for collecting and reporting metrics from AioScrapy
+spiders. It supports sending metrics to InfluxDB over HTTP or logging them to files.
+此模块提供了用于收集和报告AioScrapy爬虫指标的扩展。
+它支持通过HTTP将指标发送到InfluxDB或将它们记录到文件中。
+
+The metrics are collected periodically and can be configured using the following settings:
+指标会定期收集,可以使用以下设置进行配置:
+
+* METRIC_INTERVAL: How often to collect and report metrics (in seconds)
+  收集和报告指标的频率(以秒为单位)
+* METRIC_INFLUXDB_URL: URL of the InfluxDB server
+  InfluxDB服务器的URL
+* METRIC_INFLUXDB_TOKEN: Authentication token for InfluxDB
+  InfluxDB的认证令牌
+* METRIC_LOCATION: Location identifier for the metrics
+  指标的位置标识符
+* METRIC_RETRY_TIMES: Number of times to retry sending metrics to InfluxDB
+  重试向InfluxDB发送指标的次数
+* METRIC_LOG_ARGS: Arguments for configuring metric logging
+  配置指标日志记录的参数
+* METRICS: Dictionary of metrics to collect (if not specified, all stats are collected)
+  要收集的指标字典(如果未指定,则收集所有统计信息)
+"""
+
 import asyncio
 import os
 import platform
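
For orientation, here is a minimal sketch of how these options might look in a project's settings module. The setting names come from the docstring above; the concrete values, and the InfluxDB 2.x-style write URL, are illustrative assumptions rather than anything shipped with the package:

    # settings.py (illustrative values only)
    METRIC_INTERVAL = 10.0          # collect and report every 10 seconds
    # For InfluxDB 2.x the write endpoint typically looks like this (assumed here):
    METRIC_INFLUXDB_URL = 'http://localhost:8086/api/v2/write?org=my-org&bucket=metrics&precision=ns'
    METRIC_INFLUXDB_TOKEN = 'my-influxdb-token'
    METRIC_LOCATION = 'crawler-node-1'   # defaults to "<hostname>_<pid>" when unset
    METRIC_RETRY_TIMES = 5               # matches the InfluxHttp default
    # When METRIC_INFLUXDB_URL is unset, metrics are written to a log file instead;
    # see METRIC_LOG_ARGS in the InfluxLog class below.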
@@ -13,135 +41,476 @@ from aioscrapy.utils.tools import create_task
 
 
 class InfluxBase:
+    """
+    Base class for InfluxDB metric reporters.
+    InfluxDB指标报告器的基类。
+
+    This abstract class defines the interface for classes that report metrics
+    to InfluxDB or similar time-series databases. It provides methods for
+    formatting metrics in InfluxDB line protocol format, recording metrics,
+    and closing connections.
+    这个抽象类定义了向InfluxDB或类似时间序列数据库报告指标的类的接口。
+    它提供了以InfluxDB行协议格式格式化指标、记录指标和关闭连接的方法。
+    """
+
     @staticmethod
     def format_metric(metric_name, value, spider_name, location, measurement=None):
+        """
+        Format a metric in InfluxDB line protocol format.
+        以InfluxDB行协议格式格式化指标。
+
+        The line protocol format is:
+        <measurement>,<tag_set> <field_set> <timestamp>
+
+        Args:
+            metric_name: The name of the metric.
+                指标的名称。
+            value: The value of the metric.
+                指标的值。
+            spider_name: The name of the spider that generated the metric.
+                生成指标的爬虫的名称。
+            location: The location identifier for the metric.
+                指标的位置标识符。
+            measurement: Optional measurement name. If not provided, metric_name is used.
+                可选的测量名称。如果未提供,则使用metric_name。
+
+        Returns:
+            str: The formatted metric in InfluxDB line protocol format.
+                以InfluxDB行协议格式格式化的指标。
+        """
+        # Use metric_name as measurement if not provided
+        # 如果未提供,则使用metric_name作为measurement
         measurement = measurement or metric_name
+
+        # Format the metric in InfluxDB line protocol format
+        # 以InfluxDB行协议格式格式化指标
+        # Add a random component to the timestamp to avoid collisions
+        # 向时间戳添加随机组件以避免冲突
         return f"{measurement},spider_name={spider_name},location={location} {metric_name}={value} {time.time_ns() + int(random.random() * 100000)}"
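
As a concrete illustration of the line protocol string built above (the spider and location names are made up; the trailing nanosecond timestamp carries the random jitter the code adds):

    >>> InfluxBase.format_metric('downloader-request_count', 5, 'demo', 'node1_4242')
    'downloader-request_count,spider_name=demo,location=node1_4242 downloader-request_count=5 1712345678901234567'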
 
     async def record(self, obj: "Metric"):
+        """
+        Record metrics from a Metric object.
+        记录来自Metric对象的指标。
+
+        This is an abstract method that must be implemented by subclasses.
+        这是一个必须由子类实现的抽象方法。
+
+        Args:
+            obj: The Metric object containing the metrics to record.
+                包含要记录的指标的Metric对象。
+
+        Raises:
+            NotImplementedError: This method must be implemented by subclasses.
+                此方法必须由子类实现。
+        """
         raise NotImplementedError
 
     async def close(self):
+        """
+        Close any resources used by the reporter.
+        关闭报告器使用的任何资源。
+
+        This method is called when the spider is closed. It should release
+        any resources used by the reporter, such as network connections.
+        当爬虫关闭时调用此方法。它应该释放报告器使用的任何资源,
+        例如网络连接。
+
+        Returns:
+            None
+        """
         pass
 
 
 class InfluxHttp(InfluxBase):
+    """
+    InfluxDB HTTP reporter for metrics.
+    用于指标的InfluxDB HTTP报告器。
+
+    This class sends metrics to an InfluxDB server over HTTP using the InfluxDB
+    line protocol. It handles authentication, retries, and connection management.
+    此类使用InfluxDB行协议通过HTTP将指标发送到InfluxDB服务器。
+    它处理身份验证、重试和连接管理。
+    """
+
     def __init__(self, spider_name: str, settings: Settings):
+        """
+        Initialize the InfluxDB HTTP reporter.
+        初始化InfluxDB HTTP报告器。
+
+        Args:
+            spider_name: The name of the spider generating the metrics.
+                生成指标的爬虫的名称。
+            settings: The AioScrapy settings object.
+                AioScrapy设置对象。
+        """
+        # Get configuration from settings
+        # 从设置获取配置
         influxdb_url = settings.get('METRIC_INFLUXDB_URL')
         token = settings.get('METRIC_INFLUXDB_TOKEN')
         location = settings.get('METRIC_LOCATION')
         self.retry_times = settings.getint('METRIC_RETRY_TIMES', 5)
+
+        # Set location identifier, using node name and process ID as default
+        # 设置位置标识符,默认使用节点名称和进程ID
         self.location = location or f"{platform.node()}_{os.getpid()}"
         self.spider_name = spider_name
+
+        # Create HTTP session with appropriate headers for InfluxDB
+        # 创建带有适用于InfluxDB的适当头部的HTTP会话
         self.session = ClientSession(headers={
             "Authorization": f"Token {token}",
             "Content-Type": "text/plain; charset=utf-8",
             "Accept": "application/json",
         })
         self.url = influxdb_url
+
+        # Lock to ensure only one record operation happens at a time
+        # 锁定以确保一次只发生一个记录操作
         self.lock = asyncio.Lock()
 
     async def emit(self, data):
+        """
+        Send metrics data to the InfluxDB server.
+        将指标数据发送到InfluxDB服务器。
+
+        Args:
+            data: The metrics data in InfluxDB line protocol format.
+                InfluxDB行协议格式的指标数据。
+
+        Returns:
+            None
+        """
+        # Send data to InfluxDB server
+        # 将数据发送到InfluxDB服务器
         async with self.session.post(self.url, data=data) as response:
             await response.read()
             logger.debug(f"emit metric success<{response.status}>: \n{data}")
 
     async def record(self, obj: "Metric"):
+        """
+        Record metrics from a Metric object to InfluxDB.
+        将Metric对象中的指标记录到InfluxDB。
+
+        This method calculates the delta for each metric since the last recording
+        and sends only the changes to InfluxDB.
+        此方法计算自上次记录以来每个指标的增量,并仅将更改发送到InfluxDB。
+
+        Args:
+            obj: The Metric object containing the metrics to record.
+                包含要记录的指标的Metric对象。
+        """
+        # Use lock to ensure only one record operation happens at a time
+        # 使用锁确保一次只发生一个记录操作
         async with self.lock:
             data = ''
+
+            # Process each metric
+            # 处理每个指标
             for metric_name in obj.metrics.keys():
+                # Get current value
+                # 获取当前值
                 current_cnt = obj.stats.get_value(metric_name, 0)
+
+                # Skip non-numeric metrics
+                # 跳过非数字指标
                 if not isinstance(current_cnt, (int, float)):
                     continue
+
+                # Calculate delta since last recording
+                # 计算自上次记录以来的增量
                 cnt = current_cnt - obj.prev.get(metric_name, 0)
+
+                # Only record if there's a change
+                # 仅在有变化时记录
                 if cnt:
                     data += self.format_metric(
                         metric_name.replace('/', '-'), cnt, self.spider_name, self.location
                     ) + '\n'
+
+                # Update previous value
+                # 更新先前的值
                 obj.prev[metric_name] = current_cnt
+
+            # If we have data to send
+            # 如果我们有数据要发送
             if data:
+                # Try to send data with retries
+                # 尝试使用重试发送数据
                 for _ in range(self.retry_times):
                     try:
                         await self.emit(data)
                         return
                     except:
                         continue
+
+                # Log warning if all retries failed
+                # 如果所有重试都失败,则记录警告
                 logger.warning(f"emit metric failed:\n{data}")
 
     async def close(self):
+        """
+        Close the HTTP session.
+        关闭HTTP会话。
+
+        This method is called when the spider is closed. It closes the HTTP session
+        and waits a short time to ensure all pending requests are completed.
+        当爬虫关闭时调用此方法。它关闭HTTP会话并等待短暂时间以确保所有
+        待处理的请求都已完成。
+        """
         if self.session is not None:
             await self.session.close()
+            # Wait a short time to ensure all pending requests are completed
+            # 等待短暂时间以确保所有待处理的请求都已完成
             await asyncio.sleep(0.250)
 
 
 class InfluxLog(InfluxBase):
+    """
+    Logger-based reporter for metrics.
+    基于日志记录器的指标报告器。
+
+    This class logs metrics to a file or other logging sink instead of sending them
+    to an InfluxDB server. It formats the metrics in InfluxDB line protocol format
+    for consistency with the InfluxHttp reporter.
+    此类将指标记录到文件或其他日志接收器,而不是将它们发送到InfluxDB服务器。
+    它以InfluxDB行协议格式格式化指标,以与InfluxHttp报告器保持一致。
+    """
+
     def __init__(self, spider_name: str, settings: Settings):
+        """
+        Initialize the logger-based metric reporter.
+        初始化基于日志记录器的指标报告器。
+
+        Args:
+            spider_name: The name of the spider generating the metrics.
+                生成指标的爬虫的名称。
+            settings: The AioScrapy settings object.
+                AioScrapy设置对象。
+        """
+        # Get location from settings or use default
+        # 从设置获取位置或使用默认值
         location = settings.get('METRIC_LOCATION')
         self.location = location or f"{platform.node()}_{os.getpid()}"
         self.spider_name = spider_name
 
+        # Configure logging based on settings
+        # 根据设置配置日志记录
         log_args = settings.getdict('METRIC_LOG_ARGS')
         if log_args:
+            # Add filter to only log records with metric extra field
+            # 添加过滤器,仅记录具有metric额外字段的记录
             log_args.update(dict(
                 filter=lambda record: record["extra"].get("metric") is not None,
                 encoding="utf-8"
             ))
+
+            # Set default logging parameters if not provided
+            # 如果未提供,则设置默认日志参数
             for k, v in dict(
                 sink=f'{spider_name}.metric', level="INFO", rotation='20MB', retention=3,
                 format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> <level>{message}</level>",
             ).items():
                 log_args.setdefault(k, v)
 
+            # Configure logger with the specified parameters
+            # 使用指定的参数配置日志记录器
             _logger.add(**log_args)
             self.log = _logger.bind(metric="metric")
         else:
+            # Use default logger if no specific configuration
+            # 如果没有特定配置,则使用默认日志记录器
             self.log = logger
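
A hedged example of METRIC_LOG_ARGS, based on the loguru parameters the code above passes to _logger.add(); keys you omit fall back to the defaults shown in the code, while filter and encoding are always overridden:

    # illustrative METRIC_LOG_ARGS value (loguru sink options)
    METRIC_LOG_ARGS = {
        'sink': '/var/log/crawler/myspider.metric',  # default is '<spider_name>.metric'
        'level': 'INFO',
        'rotation': '50MB',  # rotate the file once it reaches 50 MB
        'retention': 5,      # keep the 5 most recent rotated files
    }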
 
     async def record(self, obj: "Metric"):
+        """
+        Record metrics from a Metric object to the log.
+        将Metric对象中的指标记录到日志。
+
+        This method calculates the delta for each metric since the last recording
+        and logs only the changes.
+        此方法计算自上次记录以来每个指标的增量,并仅记录更改。
+
+        Args:
+            obj: The Metric object containing the metrics to record.
+                包含要记录的指标的Metric对象。
+        """
+        # Process each metric
+        # 处理每个指标
         for metric_name in obj.metrics.keys():
+            # Get current value
+            # 获取当前值
             current_cnt = obj.stats.get_value(metric_name, 0)
+
+            # Skip non-numeric metrics
+            # 跳过非数字指标
             if not isinstance(current_cnt, (int, float)):
                 continue
+
+            # Calculate delta since last recording
+            # 计算自上次记录以来的增量
             prev_cnt = obj.prev.get(metric_name, 0)
             cnt = current_cnt - prev_cnt
+
+            # Only log if there's a change
+            # 仅在有变化时记录
             if cnt:
+                # Format the metric and log it
+                # 格式化指标并记录它
                 msg = self.format_metric(metric_name.replace('/', '-'), cnt, self.spider_name, self.location)
                 self.log.info(f'METRIC: {msg}')
+
+            # Update previous value
+            # 更新先前的值
             obj.prev[metric_name] = current_cnt
 
 
 class Metric:
-    """Log Metric scraping stats periodically"""
+    """
+    Extension to log metrics from spider scraping stats periodically.
+    定期记录爬虫抓取统计信息指标的扩展。
+
+    This extension periodically collects statistics from the spider's stats collector
+    and records them using either an InfluxDB HTTP reporter or a logger-based reporter.
+    It calculates the delta for each metric since the last recording to track the
+    rate of change.
+    此扩展定期从爬虫的统计收集器收集统计信息,并使用InfluxDB HTTP报告器或
+    基于日志记录器的报告器记录它们。它计算自上次记录以来每个指标的增量,
+    以跟踪变化率。
+    """
 
     def __init__(self, stats, spider_name, settings, interval=10.0):
+        """
+        Initialize the Metric extension.
+        初始化Metric扩展。
+
+        Args:
+            stats: The stats collector instance.
+                统计收集器实例。
+            spider_name: The name of the spider.
+                爬虫的名称。
+            settings: The AioScrapy settings object.
+                AioScrapy设置对象。
+            interval: How often to collect and record metrics, in seconds.
+                收集和记录指标的频率,以秒为单位。
+                Defaults to 10.0 seconds.
+                默认为10.0秒。
+        """
+        # Choose the appropriate reporter based on settings
+        # 根据设置选择适当的报告器
         if settings.get('METRIC_INFLUXDB_URL'):
             self.influx = InfluxHttp(spider_name, settings)
         else:
             self.influx = InfluxLog(spider_name, settings)
+
         self.stats = stats
+
+        # Get metrics to collect from settings, or use all stats if not specified
+        # 从设置获取要收集的指标,如果未指定,则使用所有统计信息
         self.metrics = settings.getdict('METRICS') or self.stats._stats
         self.interval = interval
         self.task = None
+
+        # Dictionary to store previous values for calculating deltas
+        # 用于存储先前值以计算增量的字典
         self.prev = {}
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a Metric instance from a crawler.
+        从爬虫创建Metric实例。
+
+        This is the factory method used by AioScrapy to create extension instances.
+        这是AioScrapy用于创建扩展实例的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Returns:
+            Metric: A new Metric instance.
+                一个新的Metric实例。
+        """
+        # Get interval from settings
+        # 从设置获取间隔
         interval = crawler.settings.getfloat('METRIC_INTERVAL', 10.0)
+
+        # Create instance
+        # 创建实例
         o = cls(crawler.stats, crawler.spider.name, crawler.settings, interval)
+
+        # Connect to signals
+        # 连接到信号
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+
         return o
 
     def spider_opened(self, spider):
+        """
+        Signal handler for the spider_opened signal.
+        spider_opened信号的处理程序。
+
+        Starts the periodic task to collect and record metrics.
+        启动定期收集和记录指标的任务。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
+        # Start the periodic task
+        # 启动定期任务
         self.task = create_task(self.run(spider))
 
     async def run(self, spider):
+        """
+        Periodically collect and record metrics.
+        定期收集和记录指标。
+
+        This method waits for the configured interval, records the current metrics,
+        then schedules itself to run again.
+        此方法等待配置的间隔,记录当前指标,然后安排自己再次运行。
+
+        Args:
+            spider: The spider instance.
+                爬虫实例。
+        """
+        # Wait for the configured interval
+        # 等待配置的间隔
         await asyncio.sleep(self.interval)
+
+        # Record metrics
+        # 记录指标
         await self.influx.record(self)
+
+        # Schedule next run
+        # 安排下一次运行
         self.task = create_task(self.run(spider))
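
Note the scheduling pattern here: rather than looping, run() schedules its own successor with create_task, so self.task always points at the currently sleeping cycle, which is exactly the task spider_closed cancels below. A behaviorally equivalent loop form, as a sketch only (not the package's code):

    async def run(self, spider):
        while True:
            await asyncio.sleep(self.interval)
            await self.influx.record(self)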
 
     async def spider_closed(self, spider, reason):
+        """
+        Signal handler for the spider_closed signal.
+        spider_closed信号的处理程序。
+
+        Cancels the periodic task, records final metrics, and closes the reporter.
+        取消定期任务,记录最终指标,并关闭报告器。
+
+        Args:
+            spider: The spider that was closed.
+                被关闭的爬虫。
+            reason: The reason why the spider was closed.
+                爬虫被关闭的原因。
+        """
+        # Cancel the periodic task if it's running
+        # 如果定期任务正在运行,则取消它
         if self.task and not self.task.done():
             self.task.cancel()
+
+        # Record final metrics
+        # 记录最终指标
         await self.influx.record(self)
+
+        # Close the reporter
+        # 关闭报告器
         await self.influx.close()
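
To use the extension it still has to be registered with the crawler. A minimal sketch, assuming aioscrapy follows Scrapy's EXTENSIONS convention (the import path matches this file; the stat names under METRICS follow Scrapy's usual keys and are assumptions, and only the dictionary keys are read):

    # settings.py (sketch)
    EXTENSIONS = {
        'aioscrapy.libs.extensions.metric.Metric': 500,
    }
    # Optional: restrict reporting to specific stats; when METRICS is unset,
    # every entry in the stats collector is reported.
    METRICS = {
        'downloader/request_count': None,
        'downloader/response_count': None,
        'item_scraped_count': None,
    }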