aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
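A note on the most visible structural change in the list above: the Playwright-specific download handler package and aioscrapy/http/response/playwright.py are removed, while a generic webdriver handler package (Playwright and DrissionPage backends plus a shared driver pool) and aioscrapy/http/response/web_driver.py are added. The sketch below is not part of the package; it only illustrates, assuming the two response classes are otherwise drop-in equivalents, how downstream code might bridge the PlaywrightResponse to WebDriverResponse rename that also shows up in the scraper.py diff further down. The alias BrowserResponse and the helper function are hypothetical names.

# Hedged migration sketch (not from aio-scrapy): import whichever browser
# response class the installed version exposes. Both import paths are taken
# verbatim from the scraper.py diff below; the alias is illustrative only.
try:
    from aioscrapy.http import WebDriverResponse as BrowserResponse  # 2.1.7
except ImportError:  # older releases such as 2.1.4
    from aioscrapy.http import PlaywrightResponse as BrowserResponse


def is_browser_response(response) -> bool:
    """Return True when the response came from a browser-based download handler."""
    return isinstance(response, BrowserResponse)

Spiders normally do not need to release these responses themselves: as the scraper.py diff below shows, the Scraper awaits result.release() for any WebDriverResponse in the finally block of its _scrape coroutine.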
aioscrapy/core/scraper.py CHANGED
@@ -1,12 +1,41 @@
-"""This module implements the Scraper component which parses responses and
-extracts information from them"""
+"""
+Scraper Module
+抓取器模块
+
+This module implements the Scraper component which parses responses and
+extracts information from them. The Scraper is the central component that
+coordinates the processing of downloaded content and manages the flow of
+extracted data through the system.
+此模块实现了Scraper组件,用于解析响应并从中提取信息。Scraper是协调下载内容处理
+并管理提取数据在系统中流动的中央组件。
+
+The Scraper is responsible for:
+Scraper负责:
+1. Processing downloaded responses through spider callbacks
+   通过爬虫回调处理下载的响应
+2. Handling spider output (requests and items)
+   处理爬虫输出(请求和项目)
+3. Processing items through the item pipeline
+   通过项目管道处理项目
+4. Handling errors during the scraping process
+   处理抓取过程中的错误
+5. Managing memory usage and concurrency
+   管理内存使用和并发性
+
+The module contains two main classes:
+模块包含两个主要类:
+1. Slot: Tracks active requests and memory usage for a spider
+   跟踪爬虫的活动请求和内存使用情况
+2. Scraper: Processes responses and extracts items
+   处理响应并提取项目
+"""
 import asyncio
 from typing import Any, AsyncGenerator, Set, Union, Optional
 
 import aioscrapy
 from aioscrapy import signals, Spider
 from aioscrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
-from aioscrapy.http import PlaywrightResponse
+from aioscrapy.http import WebDriverResponse
 from aioscrapy.http import Request, Response
 from aioscrapy.logformatter import LogFormatter
 from aioscrapy.middleware import ItemPipelineManager, SpiderMiddlewareManager
@@ -17,39 +46,136 @@ from aioscrapy.utils.tools import call_helper, create_task
 
 
 class Slot:
-    """Scraper slot (one per running spider)"""
+    """
+    Scraper slot (one per running spider).
+    抓取器槽(每个运行的爬虫一个)。
 
-    MIN_RESPONSE_SIZE = 1024
+    This class keeps track of active requests and memory usage
+    to control the scraper's memory footprint.
+    此类跟踪活动请求和内存使用情况,以控制抓取器的内存占用。
+    """
+
+    MIN_RESPONSE_SIZE = 1024  # Minimum size in bytes to account for a response
+    # 计算响应的最小字节大小
 
     def __init__(self, max_active_size: int = 5000000):
-        self.max_active_size = max_active_size
-        self.active: Set[Request] = set()
-        self.active_size: int = 0
-        self.itemproc_size: int = 0
+        """
+        Initialize a scraper slot.
+        初始化抓取器槽。
+
+        Args:
+            max_active_size: Maximum allowed size in bytes for active responses.
+                活动响应允许的最大字节大小。
+                Default is 5MB.
+                默认为5MB。
+        """
+        self.max_active_size = max_active_size  # Maximum memory allowed for active responses
+        # 活动响应允许的最大内存
+        self.active: Set[Request] = set()  # Set of active requests being processed
+        # 正在处理的活动请求集合
+        self.active_size: int = 0  # Current memory usage of active responses
+        # 活动响应的当前内存使用量
+        self.itemproc_size: int = 0  # Number of items being processed by the item pipeline
+        # 项目管道正在处理的项目数量
 
     def add_response_request(self, result: Union[Response, BaseException], request: Request) -> None:
+        """
+        Add a request and its result to the active set.
+        将请求及其结果添加到活动集合中。
+
+        This method tracks the request and updates the memory usage counter
+        based on the size of the response.
+        此方法跟踪请求并根据响应的大小更新内存使用计数器。
+
+        Args:
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+            request: The request being processed.
+                正在处理的请求。
+        """
         self.active.add(request)
         if isinstance(result, Response):
+            # Account for the response body size, with a minimum threshold
+            # 计算响应体大小,设有最小阈值
             self.active_size += max(len(result.body), self.MIN_RESPONSE_SIZE)
         else:
+            # For exceptions, use the minimum size
+            # 对于异常,使用最小大小
             self.active_size += self.MIN_RESPONSE_SIZE
 
     def finish_response(self, request: Request, result: Union[Response, BaseException]) -> None:
+        """
+        Remove a request and its result from the active set.
+        从活动集合中移除请求及其结果。
+
+        This method is called when processing of a request is complete.
+        It updates the memory usage counter and cleans up resources.
+        当请求处理完成时调用此方法。它更新内存使用计数器并清理资源。
+
+        Args:
+            request: The request that has been processed.
+                已处理的请求。
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+        """
         self.active.remove(request)
         if isinstance(result, Response):
+            # Decrease the memory counter by the response size
+            # 按响应大小减少内存计数器
             self.active_size -= max(len(result.body), self.MIN_RESPONSE_SIZE)
+            # Clear cached selector to free memory
+            # 清除缓存的选择器以释放内存
             result._cached_selector = None
         else:
+            # For exceptions, decrease by the minimum size
+            # 对于异常,按最小大小减少
            self.active_size -= self.MIN_RESPONSE_SIZE
 
     def is_idle(self) -> bool:
+        """
+        Check if the slot is idle (no active requests).
+        检查槽是否空闲(没有活动请求)。
+
+        Returns:
+            bool: True if there are no active requests, False otherwise.
+                如果没有活动请求,则为True,否则为False。
+        """
         return not self.active
 
     def needs_backout(self) -> bool:
+        """
+        Check if the slot needs to back out (stop accepting new requests).
+        检查槽是否需要退出(停止接受新请求)。
+
+        This method determines if the memory usage has exceeded the maximum
+        allowed size, in which case the scraper should stop accepting new
+        requests until some current ones complete.
+        此方法确定内存使用是否已超过允许的最大大小,在这种情况下,
+        抓取器应停止接受新请求,直到一些当前请求完成。
+
+        Returns:
+            bool: True if memory usage exceeds the maximum, False otherwise.
+                如果内存使用超过最大值,则为True,否则为False。
+        """
         return self.active_size > self.max_active_size
 
 
 class Scraper:
+    """
+    The Scraper processes downloaded responses and extracts items.
+    Scraper处理下载的响应并提取项目。
+
+    This class is responsible for:
+    此类负责:
+    1. Processing responses through spider callbacks
+       通过爬虫回调处理响应
+    2. Handling spider output (requests and items)
+       处理爬虫输出(请求和项目)
+    3. Processing items through the item pipeline
+       通过项目管道处理项目
+    4. Managing memory usage and concurrency
+       管理内存使用和并发
+    """
 
     def __init__(
             self,
@@ -58,146 +184,428 @@ class Scraper:
             spidermw: SpiderMiddlewareManager,
             itemproc: ItemPipelineManager,
     ):
+        """
+        Initialize the Scraper.
+        初始化Scraper。
+
+        Args:
+            crawler: The crawler instance that this scraper belongs to.
+                此抓取器所属的爬虫实例。
+            slot: The slot for tracking active requests and memory usage.
+                用于跟踪活动请求和内存使用的槽。
+            spidermw: The spider middleware manager.
+                爬虫中间件管理器。
+            itemproc: The item pipeline manager.
+                项目管道管理器。
+        """
         self.crawler = crawler
         self.spider: Spider = crawler.spider
         self.signals: SignalManager = self.crawler.signals
         self.logformatter: LogFormatter = self.crawler.logformatter
 
-        self.slot = slot
-        self.spidermw = spidermw
-        self.itemproc = itemproc
-
-        self.finish: bool = False
+        self.slot = slot  # Slot for tracking active requests and memory
+        # 用于跟踪活动请求和内存的槽
+        self.spidermw = spidermw  # Spider middleware manager
+        # 爬虫中间件管理器
+        self.itemproc = itemproc  # Item pipeline manager
+        # 项目管道管理器
+
+        self.finish: bool = False  # Flag to indicate if scraper is shutting down
+        # 指示抓取器是否正在关闭的标志
+        # Semaphore to limit concurrent parsing
+        # 用于限制并发解析的信号量
         self.concurrent_parser = asyncio.Semaphore(crawler.settings.getint('CONCURRENT_PARSER', 1))
 
     @classmethod
     async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "Scraper":
+        """
+        Create a Scraper instance from a crawler.
+        从爬虫创建Scraper实例。
+
+        This factory method creates a new Scraper instance with all the
+        necessary components initialized from the crawler.
+        此工厂方法创建一个新的Scraper实例,所有必要的组件都从爬虫初始化。
+
+        Args:
+            crawler: The crawler instance that will use this scraper.
+                将使用此抓取器的爬虫实例。
+
+        Returns:
+            Scraper: A new scraper instance.
+                一个新的抓取器实例。
+        """
+        # Create the scraper instance with all required components
+        # 创建具有所有必需组件的抓取器实例
         instance: "Scraper" = cls(
             crawler,
+            # Create a slot with the maximum active size from settings
+            # 使用设置中的最大活动大小创建槽
             Slot(crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE')),
+            # Initialize the spider middleware manager
+            # 初始化爬虫中间件管理器
             await call_helper(SpiderMiddlewareManager.from_crawler, crawler),
+            # Initialize the item pipeline manager
+            # 初始化项目管道管理器
             await call_helper(load_object(crawler.settings['ITEM_PROCESSOR']).from_crawler, crawler)
         )
+        # Open the item processor for the spider
+        # 为爬虫打开项目处理器
         await instance.itemproc.open_spider(crawler.spider)
         return instance
 
     async def close(self) -> None:
-        """Close a spider being scraped and release its resources"""
+        """
+        Close a spider being scraped and release its resources.
+        关闭正在抓取的爬虫并释放其资源。
+
+        This method closes the item processor for the spider and
+        marks the scraper as finished.
+        此方法关闭爬虫的项目处理器并将抓取器标记为已完成。
+        """
         await self.itemproc.close_spider(self.spider)
         self.finish = True
 
     def is_idle(self) -> bool:
-        """Return True if there isn't any more spiders to process"""
+        """
+        Check if the scraper is idle (no active requests).
+        检查抓取器是否空闲(没有活动请求)。
+
+        Returns:
+            bool: True if there aren't any more requests to process, False otherwise.
+                如果没有更多要处理的请求,则为True,否则为False。
+        """
         return self.slot.is_idle()
 
     def needs_backout(self) -> bool:
+        """
+        Check if the scraper needs to back out (stop accepting new requests).
+        检查抓取器是否需要退出(停止接受新请求)。
+
+        This method delegates to the slot to determine if memory usage
+        has exceeded the maximum allowed size.
+        此方法委托给槽来确定内存使用是否已超过允许的最大大小。
+
+        Returns:
+            bool: True if memory usage exceeds the maximum, False otherwise.
+                如果内存使用超过最大值,则为True,否则为False。
+        """
         return self.slot.needs_backout()
 
     async def enqueue_scrape(self, result: Union[Response, BaseException], request: Request) -> None:
+        """
+        Enqueue a response or exception for scraping.
+        将响应或异常排队等待抓取。
+
+        This method adds the request and result to the active set in the slot
+        and starts the scraping process.
+        此方法将请求和结果添加到槽中的活动集合,并开始抓取过程。
+
+        Args:
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+            request: The request that was processed.
+                已处理的请求。
+        """
         # Cache the results in the slot
+        # 在槽中缓存结果
         self.slot.add_response_request(result, request)
         await self._scrape(result, request)
 
     async def _scrape(self, result: Union[Response, BaseException], request: Request) -> None:
-        """Handle the downloaded response or failure through the spider callback/errback"""
+        """
+        Handle the downloaded response or failure through the spider callback/errback.
+        通过爬虫回调/错误回调处理下载的响应或失败。
+
+        This method processes the response or exception through the appropriate
+        spider callback or errback, and handles any output or errors.
+        此方法通过适当的爬虫回调或错误回调处理响应或异常,并处理任何输出或错误。
+
+        Args:
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+            request: The request that was processed.
+                已处理的请求。
+        """
+        # Use semaphore to limit concurrent parsing
+        # 使用信号量限制并发解析
         async with self.concurrent_parser:
             try:
+                # Validate the result type
+                # 验证结果类型
                 if not isinstance(result, (Response, BaseException)):
                     raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
                 try:
+                    # Process the result through spider middleware and callbacks
+                    # 通过爬虫中间件和回调处理结果
                     output = await self._scrape2(result, request)  # returns spider's processed output
                 except BaseException as e:
+                    # Handle any errors during processing
+                    # 处理处理过程中的任何错误
                     await self.handle_spider_error(e, request, result)
                 else:
+                    # Handle the output from the spider
+                    # 处理爬虫的输出
                     await self.handle_spider_output(output, request, result)
             except BaseException as e:
+                # Handle any errors that weren't caught earlier
+                # 处理之前未捕获的任何错误
                 await self.handle_spider_error(e, request, result)
             finally:
-                # 控制指纹是否移除
+                # Update dupefilter with parse status
+                # 使用解析状态更新重复过滤器
                 self.spider.dupefilter and \
                     not request.dont_filter and \
                     await self.spider.dupefilter.done(request, done_type="parse_ok" if getattr(request, "parse_ok", False) else "parse_err")
 
-                if isinstance(result, PlaywrightResponse):
+                # Release playwright/drissionpage response resources if applicable
+                # 如果适用,释放playwright/drissionpage等响应资源
+                if isinstance(result, WebDriverResponse):
                     await result.release()
 
                 # Delete the cache result from the slot
+                # 从槽中删除缓存结果
                 self.slot.finish_response(request, result)
 
     async def _scrape2(self, result: Union[Response, BaseException], request: Request) -> Optional[AsyncGenerator]:
-        """Handle the different cases of request's result been a Response or a Exception"""
-
+        """
+        Handle the different cases of request's result being a Response or an Exception.
+        处理请求结果为Response或Exception的不同情况。
+
+        This method routes the result to the appropriate processing path based on
+        whether it's a successful response or an exception.
+        此方法根据结果是成功的响应还是异常,将结果路由到适当的处理路径。
+
+        Args:
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+            request: The request that was processed.
+                已处理的请求。
+
+        Returns:
+            Optional[AsyncGenerator]: The output from processing the result, or None.
+                处理结果的输出,或None。
+        """
         if isinstance(result, Response):
+            # For responses, pass through spider middleware
+            # 对于响应,通过爬虫中间件传递
             # Throw the response to the middleware of the spider,
             # and the processing results will be processed to the self.call_spider
+            # 将响应抛给爬虫的中间件,处理结果将被处理到self.call_spider
             return await self.spidermw.scrape_response(self.call_spider, result, request, self.spider)
         else:
             try:
+                # For exceptions, call spider directly (bypass middleware)
+                # 对于异常,直接调用爬虫(绕过中间件)
                 # Processing Exception of download and download's middleware
+                # 处理下载和下载中间件的异常
                 return await self.call_spider(result, request)
             except BaseException as e:
+                # Log any errors that occur during exception handling
+                # 记录异常处理期间发生的任何错误
                 await self._log_download_errors(e, result, request)
 
     async def call_spider(self, result: Union[Response, BaseException], request: Request) -> Optional[AsyncGenerator]:
+        """
+        Call the appropriate spider method to handle a result.
+        调用适当的爬虫方法来处理结果。
+
+        This method calls either the callback or errback method of the spider
+        based on whether the result is a response or an exception.
+        此方法根据结果是响应还是异常,调用爬虫的回调或错误回调方法。
+
+        Args:
+            result: The response or exception to process.
+                要处理的响应或异常。
+            request: The request associated with the result.
+                与结果关联的请求。
+
+        Returns:
+            Optional[AsyncGenerator]: The output from the spider method, or None.
+                爬虫方法的输出,或None。
+
+        Raises:
+            BaseException: If result is an exception and no errback is defined.
+                如果结果是异常且未定义错误回调。
+        """
         if isinstance(result, Response):
+            # For responses, call the callback method
+            # 对于响应,调用回调方法
             # throws Response to Spider's parse
+            # 将Response抛给爬虫的parse
             callback = request.callback or self.spider._parse
             return await call_helper(callback, result, **result.request.cb_kwargs)
         else:
+            # For exceptions, call the errback method if defined
+            # 对于异常,如果定义了错误回调方法,则调用它
            if request.errback is None:
+                # If no errback is defined, re-raise the exception
+                # 如果未定义错误回调,则重新引发异常
                 raise result
             # throws Exception of download and download's middleware to Spider's errback
+            # 将下载和下载中间件的异常抛给爬虫的errback
             return await call_helper(request.errback, result)
 
-    async def handle_spider_error(self, exc: BaseException, request: Request, response: Response) -> None:
+    async def handle_spider_error(self, exc: BaseException, request: Request, response: Union[Response, BaseException]) -> None:
+        """
+        Handle errors raised during spider callback processing.
+        处理爬虫回调处理期间引发的错误。
+
+        This method handles exceptions that occur during the processing of
+        responses by spider callbacks. It logs the error, sends the spider_error signal,
+        and updates error statistics.
+        此方法处理爬虫回调处理响应期间发生的异常。它记录错误、发送spider_error信号
+        并更新错误统计信息。
+
+        Args:
+            exc: The exception that was raised.
+                引发的异常。
+            request: The request being processed when the error occurred.
+                发生错误时正在处理的请求。
+            response: The response or exception being processed when the error occurred.
+                发生错误时正在处理的响应或异常。
+                This can be either a Response object or an Exception object in case
+                the error occurred during processing of an errback.
+                这可以是Response对象或Exception对象,以防错误发生在处理errback期间。
+        """
+        # Handle CloseSpider exceptions specially
+        # 特别处理CloseSpider异常
         if isinstance(exc, CloseSpider):
             create_task(self.crawler.engine.close_spider(self.spider, exc.reason or 'cancelled'))
             return
+
+        # Log the error
+        # 记录错误
         logger.exception(self.logformatter.spider_error(exc, request, response, self.spider))
+
+        # Send the spider_error signal
+        # 发送spider_error信号
         await self.signals.send_catch_log(
             signal=signals.spider_error,
             failure=exc, response=response,
             spider=self.spider
         )
+
+        # Update error statistics by exception type and total count
+        # 按异常类型和总计数更新错误统计信息
         self.crawler.stats.inc_value("spider_exceptions/%s" % exc.__class__.__name__, spider=self.spider)
         self.crawler.stats.inc_value("spider_exceptions", spider=self.spider)
 
-    async def handle_spider_output(self, result: AsyncGenerator, request: Request, response: Response) -> None:
-        """Iter each Request/Item (given in the output parameter) returned from the given spider"""
+    async def handle_spider_output(self, result: Optional[AsyncGenerator], request: Request, response: Union[Response, BaseException]) -> None:
+        """
+        Process each Request/Item returned from the spider.
+        处理从爬虫返回的每个Request/Item。
+
+        This method iterates through the async generator returned by the spider
+        callback and processes each yielded item. It handles any exceptions that
+        occur during iteration and marks the request as successfully parsed or not.
+        此方法遍历爬虫回调返回的异步生成器,并处理每个产生的项目。
+        它处理迭代期间发生的任何异常,并将请求标记为成功解析或未成功解析。
+
+        Args:
+            result: The async generator returned by the spider callback, or None.
+                爬虫回调返回的异步生成器,或None。
+                If None, the method returns immediately.
+                如果为None,方法立即返回。
+            request: The request that was processed.
+                已处理的请求。
+            response: The response or exception that was processed.
+                已处理的响应或异常。
+                This can be either a Response object or an Exception object in case
+                the output came from an errback.
+                这可以是Response对象或Exception对象,以防输出来自errback。
+        """
         if not result:
             return
 
         parse_ok = True
         while True:
             try:
+                # Get the next item from the generator
+                # 从生成器获取下一个项目
                 output = await result.__anext__()
             except StopAsyncIteration:
+                # End of generator
+                # 生成器结束
                 break
             except Exception as e:
+                # Error during iteration
+                # 迭代期间出错
                 parse_ok = False
                 await self.handle_spider_error(e, request, response)
             else:
+                # Process the output item
+                # 处理输出项目
                 await self._process_spidermw_output(output, request, response)
 
+        # Mark the request as successfully parsed (or not) for dupefilter
+        # 将请求标记为成功解析(或未成功)以供重复过滤器使用
         self.spider.dupefilter and \
             not request.dont_filter and \
             setattr(request, "parse_ok", parse_ok)
 
-    async def _process_spidermw_output(self, output: Any, request: Request, response: Response) -> None:
-        """Process each Request/Item (given in the output parameter) returned from the given spider"""
-
+    async def _process_spidermw_output(self, output: Any, request: Request, response: Union[Response, BaseException]) -> None:
+        """
+        Process each Request/Item returned from the spider.
+        处理从爬虫返回的每个Request/Item。
+
+        This method handles different types of output from the spider:
+        此方法处理爬虫的不同类型的输出:
+
+        - Request: Schedule it for crawling
+          Request:安排它进行爬取
+        - dict: Process it through the item pipeline
+          dict:通过项目管道处理它
+        - None: Ignore it
+          None:忽略它
+        - Other types: Log an error
+          其他类型:记录错误
+
+        Args:
+            output: The output from the spider to process.
+                要处理的爬虫输出。
+                This can be a Request, a dict (item), None, or any other type.
+                这可以是Request、dict(项目)、None或任何其他类型。
+            request: The original request that generated this output.
+                生成此输出的原始请求。
+                This is used for logging and tracking purposes.
+                这用于日志记录和跟踪目的。
+            response: The response or exception that was processed.
+                已处理的响应或异常。
+                This can be either a Response object or an Exception object in case
+                the output came from an errback.
+                这可以是Response对象或Exception对象,以防输出来自errback。
+        """
         if isinstance(output, Request):
+            # Schedule new requests for crawling
+            # 安排新请求进行爬取
             await self.crawler.engine.crawl(request=output)
         elif isinstance(output, dict):
+            # Process items through the item pipeline
+            # 通过项目管道处理项目
             self.slot.itemproc_size += 1
-            item = await self.itemproc.process_item(output, self.spider)
-            process_item_method = getattr(self.spider, 'process_item', None)
-            if process_item_method:
-                await call_helper(process_item_method, item)
+            try:
+                # Process the item through the pipeline
+                # 通过管道处理项目
+                item = await self.itemproc.process_item(output, self.spider)
+                # Call the spider's process_item method if it exists
+                # 如果存在,调用爬虫的process_item方法
+                if process_item_method := getattr(self.spider, 'process_item', None):
+                    await call_helper(process_item_method, item)
+            except Exception as e:
+                # Handle exceptions during item processing
+                # 处理项目处理期间的异常
+                item = output
+                output = e
+            # Handle the processed item or exception
+            # 处理已处理的项目或异常
             await self._itemproc_finished(output, item, response)
         elif output is None:
+            # Ignore None outputs
+            # 忽略None输出
             pass
         else:
+            # Log an error for unexpected output types
+            # 记录意外输出类型的错误
             typename = type(output).__name__
             logger.error(
                 'Spider must return request, item, or None, got %(typename)r in %(request)s' % {'request': request,
@@ -210,28 +618,96 @@ class Scraper:
             download_exception: BaseException,
             request: Request
     ) -> None:
-        """Process and record errors"""
+        """
+        Process and record download errors.
+        处理和记录下载错误。
+
+        This method logs download errors and re-raises spider exceptions
+        if they are different from the download exception. It's typically called
+        when an error occurs during the processing of an errback.
+        此方法记录下载错误,如果爬虫异常与下载异常不同,则重新引发爬虫异常。
+        它通常在处理errback期间发生错误时调用。
+
+        Args:
+            spider_exception: The exception raised during spider processing.
+                爬虫处理期间引发的异常。
+                This is the exception that occurred while processing
+                the download exception in the spider's errback.
+                这是在爬虫的errback中处理下载异常时发生的异常。
+            download_exception: The exception raised during download.
+                下载期间引发的异常。
+                This is the original exception that occurred during
+                the download process.
+                这是下载过程中发生的原始异常。
+            request: The request that caused the error.
+                导致错误的请求。
+                This is used for logging purposes and to provide context
+                about which request failed.
+                这用于日志记录目的,并提供有关哪个请求失败的上下文。
+
+        Raises:
+            BaseException: Re-raises spider_exception if it's different from download_exception.
+                如果spider_exception与download_exception不同,则重新引发spider_exception。
+                This ensures that new exceptions raised during errback processing
+                are properly propagated.
+                这确保在errback处理期间引发的新异常被正确传播。
+        """
+        # Log download errors (except IgnoreRequest which is not an error)
+        # 记录下载错误(除了IgnoreRequest,它不是错误)
         if isinstance(download_exception, BaseException) and not isinstance(download_exception, IgnoreRequest):
             logger.exception(self.logformatter.download_error(download_exception, request, self.spider))
 
+        # Re-raise spider exceptions if they're different from the download exception
+        # 如果爬虫异常与下载异常不同,则重新引发爬虫异常
         if spider_exception is not download_exception:
             raise spider_exception
 
     async def _itemproc_finished(self, output: Any, item: Any, response: Response) -> None:
-        """ItemProcessor finished for the given ``item`` and returned ``output``"""
+        """
+        Handle the result of item processing.
+        处理项目处理的结果。
+
+        This method is called when the item pipeline has finished processing an item.
+        It handles different outcomes based on the result:
+        当项目管道完成处理项目时调用此方法。它根据结果处理不同的结果:
+
+        - If output is a DropItem exception: Log it and send item_dropped signal
+          如果输出是DropItem异常:记录它并发送item_dropped信号
+        - If output is another exception: Log it and send item_error signal
+          如果输出是另一个异常:记录它并发送item_error信号
+        - If output is a valid item: Log it and send item_scraped signal
+          如果输出是有效项目:记录它并发送item_scraped信号
+
+        Args:
+            output: The result of item processing (item or exception).
+                项目处理的结果(项目或异常)。
+            item: The original item before processing.
+                处理前的原始项目。
+            response: The response from which the item was extracted.
+                从中提取项目的响应。
+        """
+        # Decrease the item processing counter
+        # 减少项目处理计数器
         self.slot.itemproc_size -= 1
+
         if isinstance(output, BaseException):
             if isinstance(output, DropItem):
+                # Item was intentionally dropped by a pipeline
+                # 项目被管道有意丢弃
                 logger.log(**self.logformatter.dropped(item, output, response, self.spider))
                 return await self.signals.send_catch_log_deferred(
                     signal=signals.item_dropped, item=item, response=response,
                     spider=self.spider, exception=output)
             else:
+                # An error occurred during item processing
+                # 项目处理期间发生错误
                 logger.exception(self.logformatter.item_error(item, output, response, self.spider))
                 return await self.signals.send_catch_log_deferred(
                     signal=signals.item_error, item=item, response=response,
                     spider=self.spider, failure=output)
         else:
+            # Item was successfully processed
+            # 项目已成功处理
             logger.log(**self.logformatter.scraped(output, response, self.spider))
             return await self.signals.send_catch_log_deferred(
                 signal=signals.item_scraped, item=output, response=response,
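The Slot docstrings added above spell out the scraper's backpressure rule: every in-flight response is charged at max(len(body), MIN_RESPONSE_SIZE) bytes against max_active_size (the SCRAPER_SLOT_MAX_ACTIVE_SIZE setting, 5 MB by default according to the __init__ docstring), and needs_backout() returns True once that budget is exceeded. A small standalone sketch of the same arithmetic, using invented body sizes rather than the real class, which tracks Request objects and is driven by the engine:

# Standalone illustration of Slot's accounting; the three body sizes are made up.
MIN_RESPONSE_SIZE = 1024      # minimum bytes charged per response
max_active_size = 5_000_000   # SCRAPER_SLOT_MAX_ACTIVE_SIZE default

active_size = 0
for body_size in (300, 2_048, 4_999_000):
    active_size += max(body_size, MIN_RESPONSE_SIZE)
    print(body_size, active_size, active_size > max_active_size)

# 300       -> charged 1024,  total 1,024      needs_backout False
# 2_048     -> charged 2048,  total 3,072      needs_backout False
# 4_999_000 -> charged as-is, total 5,002,072  needs_backout True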