aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,23 @@
 """
-Spider Middleware manager
+Spider Middleware Manager
+
+This module provides the SpiderMiddlewareManager class, which manages the execution
+of spider middleware components. Spider middlewares process the input and output of
+spiders, allowing modification of requests and responses, as well as handling of
+exceptions that occur during spider processing.
+
+Spider middlewares are loaded from the SPIDER_MIDDLEWARES setting and are executed
+in the order specified by their priority values. They can intercept and modify the
+requests generated by spiders, the responses passed to spiders, and the items and
+requests yielded by spiders.
 
 See documentation in docs/topics/spider-middleware.rst
 """
 from itertools import islice
 from typing import AsyncIterable, Iterable, Callable, Union
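
The module docstring above refers to the SPIDER_MIDDLEWARES setting. For reference,
such a configuration typically looks like the following sketch of a project
settings.py; the custom class path is hypothetical, and the built-in path merely
mirrors the aioscrapy/libs/spider layout listed in this diff (the exact class name
may differ):

    SPIDER_MIDDLEWARES = {
        # Lower priority values sit closer to the engine: process_spider_input
        # runs in ascending priority order, process_spider_output in descending.
        'myproject.middlewares.MySpiderMiddleware': 543,  # hypothetical class
        # A value of None disables a middleware entirely.
        'aioscrapy.libs.spider.offsite.OffsiteMiddleware': None,  # assumed name
    }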
@@ -18,115 +34,452 @@ from aioscrapy.middleware.absmanager import AbsMiddlewareManager
 
 
 def _fname(f):
+    """
+    Get the full name of a method.
+
+    This helper function returns the full name of a method in the format
+    "ClassName.method_name". It is used in error messages to identify which
+    middleware method raised an exception or returned an invalid output.
+
+    Args:
+        f: The bound method to get the name of.
+
+    Returns:
+        str: The full name of the method in the format "ClassName.method_name".
+    """
+    # Get the class name and method name and format them
     return "{}.{}".format(
-        f.__self__.__class__.__name__,
-        f.__func__.__name__
+        f.__self__.__class__.__name__,  # Class name
+        f.__func__.__name__  # Method name
     )
 
 
 class SpiderMiddlewareManager(AbsMiddlewareManager):
+    """
+    Manager for spider middleware components.
+
+    This class manages the execution of spider middleware components, which process
+    the input and output of spiders. It inherits from AbsMiddlewareManager and
+    implements the behavior specific to spider middlewares.
+
+    Spider middlewares can intercept and modify the requests generated by spiders,
+    the responses passed to spiders, and the items and requests yielded by spiders.
+    They can also handle exceptions that occur during spider processing.
+    """
+
+    # Name of the component
     component_name = 'spider middleware'
 
     @classmethod
     def _get_mwlist_from_settings(cls, settings: Settings):
+        """
+        Get the list of spider middleware classes from settings.
+
+        This method implements the abstract method from AbsMiddlewareManager.
+        It retrieves the list of spider middleware classes from the
+        SPIDER_MIDDLEWARES setting.
+
+        Args:
+            settings: The settings object.
+
+        Returns:
+            list: A list of spider middleware class paths.
+        """
+        # Build the component list from the SPIDER_MIDDLEWARES setting
         return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
 
     def _add_middleware(self, mw):
+        """
+        Add a middleware instance to the manager.
+
+        This method overrides the method from AbsMiddlewareManager to register
+        the specific methods of spider middlewares: process_spider_input,
+        process_start_requests, process_spider_output, and process_spider_exception.
+
+        Note that process_spider_input methods are called in the order they were
+        registered, while process_start_requests, process_spider_output, and
+        process_spider_exception methods are called in reverse order.
+
+        Args:
+            mw: The middleware instance to add.
+        """
+        # Call the parent method to register open_spider and close_spider methods
         super(SpiderMiddlewareManager, self)._add_middleware(mw)
+
+        # Register process_spider_input method if it exists
         if hasattr(mw, 'process_spider_input'):
             self.methods['process_spider_input'].append(mw.process_spider_input)
+
+        # Register process_start_requests method if it exists (added to the left for reverse order)
         if hasattr(mw, 'process_start_requests'):
             self.methods['process_start_requests'].appendleft(mw.process_start_requests)
+
+        # Get process_spider_output method if it exists
         process_spider_output = getattr(mw, 'process_spider_output', None)
+
+        # Register process_spider_output method (added to the left for reverse order)
         self.methods['process_spider_output'].appendleft(process_spider_output)
+
+        # Get process_spider_exception method if it exists
         process_spider_exception = getattr(mw, 'process_spider_exception', None)
+
+        # Register process_spider_exception method (added to the left for reverse order)
         self.methods['process_spider_exception'].appendleft(process_spider_exception)
161
 
45
162
  async def scrape_response(self, scrape_func: Callable, response: Response, request: Request, spider: Spider):
163
+ """
164
+ Process a response through the spider middleware chain.
165
+ 通过爬虫中间件链处理响应。
166
+
167
+ This method is the core of the spider middleware processing. It handles the
168
+ entire process of passing a response through the middleware chain, calling
169
+ the spider's callback function, and processing the output through the
170
+ middleware chain again.
171
+ 此方法是爬虫中间件处理的核心。它处理通过中间件链传递响应、调用爬虫的回调
172
+ 函数以及再次通过中间件链处理输出的整个过程。
173
+
174
+ Args:
175
+ scrape_func: The spider's callback function to process the response.
176
+ 处理响应的爬虫回调函数。
177
+ response: The response to process.
178
+ 要处理的响应。
179
+ request: The request that generated the response.
180
+ 生成响应的请求。
181
+ spider: The spider that made the request.
182
+ 发出请求的爬虫。
46
183
 
184
+ Returns:
185
+ The result of processing the response through the middleware chain.
186
+ 通过中间件链处理响应的结果。
187
+ """
47
188
  async def process_spider_input(response_) -> Union[AsyncIterable, Iterable]:
189
+ """
190
+ Process a response through all registered process_spider_input methods.
191
+ 通过所有已注册的process_spider_input方法处理响应。
192
+
193
+ This function calls each middleware's process_spider_input method in the
194
+ order they were registered. If any middleware raises an exception, it is
195
+ handled by the scrape_func.
196
+ 此函数按照它们注册的顺序调用每个中间件的process_spider_input方法。
197
+ 如果任何中间件引发异常,则由scrape_func处理。
198
+
199
+ Args:
200
+ response_: The response to process.
201
+ 要处理的响应。
202
+
203
+ Returns:
204
+ Union[AsyncIterable, Iterable]: The result of processing the response.
205
+ 处理响应的结果。
206
+ """
207
+ # Process the response through all registered process_spider_input methods
208
+ # 通过所有已注册的process_spider_input方法处理响应
48
209
  for method in self.methods['process_spider_input']:
49
210
  try:
211
+ # Call the method with the response
212
+ # 使用响应调用方法
50
213
  result = await call_helper(method, response=response_, spider=spider)
214
+
215
+ # Validate the return value
216
+ # 验证返回值
51
217
  if result is not None:
52
218
  raise _InvalidOutput(
53
219
  f"Middleware {_fname(method)} must return None or raise an exception, got {type(result)}"
54
220
  )
55
221
  except _InvalidOutput:
222
+ # Re-raise _InvalidOutput exceptions
223
+ # 重新引发_InvalidOutput异常
56
224
  raise
57
225
  except BaseException as exception:
226
+ # Handle other exceptions by calling the scrape_func with the exception
227
+ # 通过使用异常调用scrape_func来处理其他异常
58
228
  iterable_or_exception = await call_helper(scrape_func, exception, request)
229
+
230
+ # If the scrape_func returned the exception, re-raise it
231
+ # 如果scrape_func返回了异常,则重新引发它
59
232
  if iterable_or_exception is exception:
60
233
  raise iterable_or_exception
234
+
235
+ # Otherwise, return the result
236
+ # 否则,返回结果
61
237
  return iterable_or_exception
238
+
239
+ # If all middleware methods passed, call the scrape_func with the response
240
+ # 如果所有中间件方法都通过,则使用响应调用scrape_func
62
241
  return await call_helper(scrape_func, response_, request)
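
The hooks above are invoked through call_helper (an aioscrapy utility, presumably
from aioscrapy/utils/tools.py, which this diff also touches), so they may be either
synchronous or asynchronous. Roughly, and only as an assumption about its behavior,
it amounts to:

    import asyncio

    async def call_helper_sketch(fn, *args, **kwargs):
        # Call fn, awaiting the result if it is a coroutine, so sync and
        # async middleware methods can be treated uniformly.
        result = fn(*args, **kwargs)
        if asyncio.iscoroutine(result):
            result = await result
        return result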
 
         async def _evaluate_iterable(result: Union[AsyncIterable, Iterable], exception_processor_index):
+            """
+            Evaluate an iterable and handle any exceptions that occur.
+
+            This function converts any iterable to an async generator and yields
+            its items. If an exception occurs during iteration, it is handled by
+            the process_spider_exception function.
+
+            Args:
+                result: The iterable to evaluate.
+                exception_processor_index: The index to start processing exceptions from.
+
+            Yields:
+                The items from the iterable.
+            """
             try:
                 # Convert all non-AsyncGeneratorType objects to AsyncGeneratorType
                 async for r in await async_generator_wrapper(result):
                     yield r
             except BaseException as ex:
+                # Handle exceptions by calling process_spider_exception
                 exception_result = await process_spider_exception(ex, exception_processor_index)
+
+                # If the result is an exception, re-raise it
                 if isinstance(exception_result, BaseException):
                     raise exception_result
+
+                # Otherwise, recursively evaluate the result
                 async for r in _evaluate_iterable(exception_result, exception_processor_index):
                     yield r
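
_evaluate_iterable leans on async_generator_wrapper to normalize whatever a callback
returns. That helper is not defined in this diff; as an assumption about what it
does, based on how it is awaited and then iterated here, a sketch:

    from typing import AsyncIterable

    async def async_generator_wrapper_sketch(result):
        # Pass async iterables through unchanged; wrap plain iterables
        # in an async generator so callers can always use `async for`.
        if isinstance(result, AsyncIterable):
            return result

        async def _wrap():
            for item in result:
                yield item

        return _wrap()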
 
         async def process_spider_exception(exception, start_index=0):
-            # don't handle _InvalidOutput exception
+            """
+            Process an exception through all registered process_spider_exception methods.
+
+            This function calls each middleware's process_spider_exception method in
+            reverse order from how they were registered. If any middleware returns an
+            iterable, exception handling stops and the iterable is processed by the
+            process_spider_output function.
+
+            Args:
+                exception: The exception to process.
+                start_index: The index to start processing from.
+
+            Returns:
+                The result of processing the exception, or the exception itself if
+                no middleware handled it.
+            """
+            # Don't handle _InvalidOutput exceptions
             if isinstance(exception, _InvalidOutput):
                 raise exception
+
+            # Get the list of methods to call, starting from start_index
             method_list = islice(self.methods['process_spider_exception'], start_index, None)
+
+            # Process each method
             for method_index, method in enumerate(method_list, start=start_index):
+                # Skip None methods
                 if method is None:
                     continue
+
+                # Call the method with the exception
                 result = await call_helper(method, response=response, exception=exception, spider=spider)
+
+                # If the result is an iterable, stop exception handling and process the output
                 if isinstance(result, AsyncIterable):
-                    # stop exception handling by handing control over to the
+                    # Stop exception handling by handing control over to the
                     # process_spider_output chain if an iterable has been returned
                     return await process_spider_output(result, method_index + 1)
                 elif result is None:
+                    # If the result is None, continue to the next method
                     continue
                 else:
+                    # If the result is neither an iterable nor None, raise an error
                     raise _InvalidOutput(
                         f"Middleware {_fname(method)} must return None or an iterable, got {type(result)}"
                     )
+
+            # If no middleware handled the exception, re-raise it
             raise exception
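
Because a process_spider_exception hook can return an iterable to recover, a
middleware can substitute output for a failed callback. A sketch (the class is
hypothetical, and it assumes aioscrapy keeps Scrapy's response.request attribute
and Request.replace API):

    class RecoveringSpiderMiddleware:
        """Hypothetical middleware recovering from a specific callback error."""

        def process_spider_exception(self, response, exception, spider):
            if not isinstance(exception, KeyError):
                return None  # pass the exception to the next middleware

            async def fallback():
                # The recovered output is handed to the remaining
                # process_spider_output methods (start_index = method_index + 1).
                yield response.request.replace(dont_filter=True)

            return fallback()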
 
         async def process_spider_output(result, start_index=0):
-            # items in this iterable do not need to go through the process_spider_output
+            """
+            Process an iterable through all registered process_spider_output methods.
+
+            This function calls each middleware's process_spider_output method in
+            reverse order from how they were registered. Each middleware can modify
+            the iterable or return a new one.
+
+            Args:
+                result: The iterable to process.
+                start_index: The index to start processing from.
+
+            Returns:
+                The processed iterable.
+            """
+            # Items in this iterable do not need to go through the process_spider_output
             # chain, they went through it already from the process_spider_exception method
 
+            # Get the list of methods to call, starting from start_index
             method_list = islice(self.methods['process_spider_output'], start_index, None)
+
+            # Process each method
             for method_index, method in enumerate(method_list, start=start_index):
+                # Skip None methods
                 if method is None:
                     continue
+
                 try:
-                    # might fail directly if the output value is not a generator
+                    # Call the method with the result; this might fail directly
+                    # if the output value is not a generator
                     result = await call_helper(method, response=response, result=result, spider=spider)
                 except BaseException as ex:
+                    # Handle exceptions by calling process_spider_exception
                     exception_result = await process_spider_exception(ex, method_index + 1)
+
+                    # If the result is an exception, re-raise it
                     if isinstance(exception_result, BaseException):
                         raise
+
+                    # Otherwise, return the result
                     return exception_result
+
+                # Validate the return value
                 if isinstance(result, AsyncIterable):
+                    # If the result is an iterable, evaluate it
                     result = _evaluate_iterable(result, method_index + 1)
                 else:
+                    # If the result is not an iterable, raise an error
                     raise _InvalidOutput(f"Middleware {_fname(method)} must return an iterable, got {type(result)}")
 
+            # Return the final result
             return result
 
         async def process_callback_output(result: Union[AsyncIterable, Iterable]):
+            """
+            Process the output of the spider's callback function.
+
+            This function evaluates the iterable returned by the spider's callback
+            function and processes it through the process_spider_output chain.
+
+            Args:
+                result: The iterable returned by the spider's callback function.
+
+            Returns:
+                The processed iterable.
+            """
+            # Evaluate the iterable
             result: AsyncIterable = _evaluate_iterable(result, 0)
+
+            # Process the result through the process_spider_output chain
             return await process_spider_output(result)
 
         try:
+            # Process the response through the process_spider_input chain
             _iterable = await process_spider_input(response)
         except BaseException as exc:
+            # If an exception occurs, process it through the process_spider_exception chain
             return await process_spider_exception(exc)
         else:
+            # Otherwise, process the callback output through the process_spider_output chain
             return await process_callback_output(_iterable)
 
     async def process_start_requests(self, start_requests, spider):
+        """
+        Process start requests through all registered process_start_requests methods.
+
+        This method calls each middleware's process_start_requests method in
+        reverse registration order. Each middleware can modify the start requests
+        or return a new iterable of requests.
+
+        Args:
+            start_requests: The iterable of start requests from the spider.
+            spider: The spider that generated the start requests.
+
+        Returns:
+            The processed iterable of start requests.
+        """
+        # Process the start requests through the chain of process_start_requests methods
         return await self._process_chain('process_start_requests', start_requests, spider)
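
The final hook delegates to _process_chain, defined in AbsMiddlewareManager
(aioscrapy/middleware/absmanager.py, also updated in this release). Its
implementation is not shown in this hunk; roughly, and only as an assumption, it
threads the value through each registered method in turn:

    async def _process_chain_sketch(self, methodname, obj, *args):
        # Pass obj through each registered hook; every hook receives the
        # previous hook's return value (here, an iterable of start requests).
        for method in self.methods[methodname]:
            obj = await call_helper(method, obj, *args)
        return obj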