aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,17 @@
1
+ """
2
+ Downloader Middleware Manager Module
3
+ 下载器中间件管理器模块
4
+
5
+ This module provides the DownloaderMiddlewareManager class, which manages the execution
6
+ of downloader middleware components. Downloader middlewares process requests before they
7
+ are sent to the downloader and responses before they are sent to the spider.
8
+ 此模块提供了DownloaderMiddlewareManager类,用于管理下载器中间件组件的执行。
9
+ 下载器中间件在请求发送到下载器之前处理请求,在响应发送到爬虫之前处理响应。
10
+
11
+ Downloader middlewares can modify requests and responses, return alternative responses,
12
+ or handle exceptions that occur during the download process.
13
+ 下载器中间件可以修改请求和响应,返回替代响应,或处理下载过程中发生的异常。
14
+ """
1
15
  from aioscrapy.exceptions import _InvalidOutput
2
16
  from aioscrapy.http import Request, Response
3
17
  from aioscrapy.middleware.absmanager import AbsMiddlewareManager
@@ -6,64 +20,268 @@ from aioscrapy.utils.tools import call_helper
6
20
 
7
21
 
8
22
class DownloaderMiddlewareManager(AbsMiddlewareManager):
    """Manage the chain of downloader middleware components.

    Downloader middlewares process requests on their way to the downloader
    and responses on their way back to the spider.  ``process_request`` hooks
    run in registration order; ``process_response`` and ``process_exception``
    hooks run in reverse registration order (they are prepended on add).
    """

    # Human-readable component name used in log messages by the base manager.
    component_name = 'downloader middleware'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        """Return middleware class paths from the DOWNLOADER_MIDDLEWARES setting.

        Args:
            settings: The settings object to read from.

        Returns:
            list: Ordered middleware class paths.
        """
        mw_setting = settings.getwithbase('DOWNLOADER_MIDDLEWARES')
        return build_component_list(mw_setting)

    def _add_middleware(self, mw):
        """Register the hook methods exposed by *mw*.

        ``process_request`` is appended (forward order), while
        ``process_response`` and ``process_exception`` are prepended so they
        execute in reverse order relative to registration.

        Args:
            mw: The middleware instance to register.
        """
        for hook_name, register in (
            ('process_request', self.methods['process_request'].append),
            ('process_response', self.methods['process_response'].appendleft),
            ('process_exception', self.methods['process_exception'].appendleft),
        ):
            if hasattr(mw, hook_name):
                register(getattr(mw, hook_name))

    def iter_mw_method(self, spider, process_type: str):
        """Yield every registered hook named *process_type*, then the spider's
        own hook of the same name when the spider defines one.

        Args:
            spider: The running spider instance.
            process_type: Hook name, e.g. ``'process_request'``.

        Yields:
            callable: Each hook in execution order.
        """
        spider_hook = getattr(spider, process_type, None)
        yield from self.methods[process_type]
        if spider_hook:
            yield spider_hook

    async def process_request(self, spider, request):
        """Run *request* through every ``process_request`` hook in order.

        The first hook that returns a truthy value (a ``Response`` or a
        ``Request``) short-circuits the chain and that value is returned;
        otherwise ``None`` is returned.

        Args:
            spider: The running spider instance.
            request: The request being processed.

        Returns:
            None, Response, or Request.

        Raises:
            _InvalidOutput: If a hook returns something other than ``None``,
                a ``Response``, or a ``Request``.
        """
        for hook in self.iter_mw_method(spider, 'process_request'):
            result = await call_helper(hook, request=request, spider=spider)
            # Anything other than None/Response/Request is a contract breach.
            if result is not None and not isinstance(result, (Response, Request)):
                raise _InvalidOutput(
                    "Middleware %s.process_request must return None, Response or Request, got %s"
                    % (hook.__self__.__class__.__name__, result.__class__.__name__)
                )
            # A non-None result stops the chain immediately.
            if result:
                return result

    async def process_response(self, spider, request, response):
        """Run *response* through every ``process_response`` hook (reverse order).

        A hook returning a ``Request`` short-circuits the chain; otherwise each
        hook's return value feeds the next hook.

        Args:
            spider: The running spider instance.
            request: The request that produced *response*.
            response: The response being processed.

        Returns:
            Response or Request.

        Raises:
            TypeError: If *response* is ``None``.
            _InvalidOutput: If a hook returns something other than a
                ``Response`` or a ``Request``.
        """
        # Guard the input before entering the hook chain.
        if response is None:
            raise TypeError("Received None in process_response")
        elif isinstance(response, Request):
            return response

        for hook in self.iter_mw_method(spider, 'process_response'):
            response = await call_helper(hook, request=request, response=response, spider=spider)
            if not isinstance(response, (Response, Request)):
                raise _InvalidOutput(
                    "Middleware %s.process_response must return Response or Request, got %s"
                    % (hook.__self__.__class__.__name__, type(response))
                )
            # A Request redirects control back to the scheduler: stop here.
            if isinstance(response, Request):
                return response
        return response

    async def process_exception(self, spider, request, exception):
        """Run *exception* through every ``process_exception`` hook (reverse order).

        The first hook that returns a truthy value (a ``Response`` or a
        ``Request``) short-circuits the chain; if none handles the exception,
        the exception object itself is returned.

        Args:
            spider: The running spider instance.
            request: The request that caused *exception*.
            exception: The exception raised during download.

        Returns:
            Exception, Response, or Request.

        Raises:
            _InvalidOutput: If a hook returns something other than ``None``,
                a ``Response``, or a ``Request``.
        """
        for hook in self.iter_mw_method(spider, 'process_exception'):
            result = await call_helper(hook, request=request, exception=exception, spider=spider)
            if result is not None and not isinstance(result, (Response, Request)):
                raise _InvalidOutput(
                    "Middleware %s.process_exception must return None, Response or Request, got %s"
                    % (hook.__self__.__class__.__name__, type(result))
                )
            if result:
                return result
        # Unhandled: hand the exception back to the caller.
        return exception
@@ -1,16 +1,65 @@
1
1
  """
2
2
  The Extension Manager
3
+ 扩展管理器
3
4
 
4
- See documentation in docs/topics/extensions.rst
5
+ This module provides the ExtensionManager class, which manages the loading and
6
+ execution of extensions. Extensions are components that can hook into various
7
+ parts of the Scrapy process to add functionality or modify behavior.
8
+ 此模块提供了ExtensionManager类,用于管理扩展的加载和执行。扩展是可以挂钩到
9
+ Scrapy流程的各个部分以添加功能或修改行为的组件。
10
+
11
+ Extensions are loaded from the EXTENSIONS setting and can be enabled or disabled
12
+ through this setting. They can connect to signals to execute code at specific
13
+ points in the crawling process.
14
+ 扩展从EXTENSIONS设置加载,可以通过此设置启用或禁用。它们可以连接到信号,
15
+ 以在爬取过程的特定点执行代码。
5
16
  """
6
17
  from aioscrapy.middleware.absmanager import AbsMiddlewareManager
7
18
  from aioscrapy.utils.conf import build_component_list
8
19
 
9
20
 
10
21
class ExtensionManager(AbsMiddlewareManager):
    """Manage extension components.

    Extensions are components that hook into the crawling process (typically
    by connecting to signals) to add functionality or alter behavior.  They
    are enabled, disabled, and ordered through the EXTENSIONS setting.
    """

    # Human-readable component name used in log messages by the base manager.
    component_name = 'extension'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        """Return extension class paths from the EXTENSIONS setting.

        Implements the abstract hook from ``AbsMiddlewareManager``.

        Args:
            settings: The settings object to read from.

        Returns:
            list: Ordered extension class paths.
        """
        ext_setting = settings.getwithbase('EXTENSIONS')
        return build_component_list(ext_setting)
@@ -1,18 +1,114 @@
1
+ """
2
+ Item Pipeline Manager Module
3
+ 项目管道管理器模块
4
+
5
+ This module provides the ItemPipelineManager class, which manages the execution
6
+ of item pipeline components. Item pipelines are components that process items
7
+ after they have been extracted by spiders, typically for cleaning, validation,
8
+ persistence, or other post-processing tasks.
9
+ 此模块提供了ItemPipelineManager类,用于管理项目管道组件的执行。项目管道是
10
+ 在项目被爬虫提取后处理项目的组件,通常用于清洗、验证、持久化或其他后处理任务。
11
+
12
+ Item pipelines are loaded from the ITEM_PIPELINES setting and are executed in
13
+ the order specified by their priority values. Each pipeline component can process
14
+ an item and either return it for further processing, drop it, or raise an exception.
15
+ 项目管道从ITEM_PIPELINES设置加载,并按照其优先级值指定的顺序执行。每个管道组件
16
+ 可以处理一个项目,并返回它以供进一步处理、丢弃它或引发异常。
17
+ """
1
18
  from aioscrapy.middleware.absmanager import AbsMiddlewareManager
2
19
  from aioscrapy.utils.conf import build_component_list
3
20
 
4
21
 
5
22
class ItemPipelineManager(AbsMiddlewareManager):
    """Manage item pipeline components.

    Item pipelines process items after extraction by spiders — typically for
    cleaning, validation, or persistence.  They execute in the priority order
    given by the ITEM_PIPELINES setting, each receiving the previous
    pipeline's output in a chain.
    """

    # Human-readable component name used in log messages by the base manager.
    component_name = 'item pipeline'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        """Return pipeline class paths from the ITEM_PIPELINES setting.

        Implements the abstract hook from ``AbsMiddlewareManager``.

        Args:
            settings: The settings object to read from.

        Returns:
            list: Ordered item pipeline class paths.
        """
        pipeline_setting = settings.getwithbase('ITEM_PIPELINES')
        return build_component_list(pipeline_setting)

    def _add_middleware(self, pipe):
        """Register *pipe*, including its ``process_item`` hook if present.

        The parent implementation registers ``open_spider``/``close_spider``
        first; this override then appends ``process_item`` (forward order).

        Args:
            pipe: The pipeline instance to register.
        """
        super()._add_middleware(pipe)
        item_hooks = self.methods['process_item']
        if hasattr(pipe, 'process_item'):
            item_hooks.append(pipe.process_item)

    async def process_item(self, item, spider):
        """Pass *item* through the ``process_item`` chain.

        Each pipeline's return value is fed to the next one, so pipelines may
        transform the item or drop it along the way.

        Args:
            item: The item to process.
            spider: The spider that produced the item.

        Returns:
            The item after all pipelines have run.
        """
        chained = self._process_chain('process_item', item, spider)
        return await chained