aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Downloader Middleware Manager Module
|
|
3
|
+
下载器中间件管理器模块
|
|
4
|
+
|
|
5
|
+
This module provides the DownloaderMiddlewareManager class, which manages the execution
|
|
6
|
+
of downloader middleware components. Downloader middlewares process requests before they
|
|
7
|
+
are sent to the downloader and responses before they are sent to the spider.
|
|
8
|
+
此模块提供了DownloaderMiddlewareManager类,用于管理下载器中间件组件的执行。
|
|
9
|
+
下载器中间件在请求发送到下载器之前处理请求,在响应发送到爬虫之前处理响应。
|
|
10
|
+
|
|
11
|
+
Downloader middlewares can modify requests and responses, return alternative responses,
|
|
12
|
+
or handle exceptions that occur during the download process.
|
|
13
|
+
下载器中间件可以修改请求和响应,返回替代响应,或处理下载过程中发生的异常。
|
|
14
|
+
"""
|
|
1
15
|
from aioscrapy.exceptions import _InvalidOutput
|
|
2
16
|
from aioscrapy.http import Request, Response
|
|
3
17
|
from aioscrapy.middleware.absmanager import AbsMiddlewareManager
|
|
@@ -6,64 +20,268 @@ from aioscrapy.utils.tools import call_helper
|
|
|
6
20
|
|
|
7
21
|
|
|
8
22
|
class DownloaderMiddlewareManager(AbsMiddlewareManager):
    """
    Manages the chain of downloader middleware components.

    Requests pass through every registered ``process_request`` hook (in
    registration order) before reaching the downloader; responses pass through
    the ``process_response`` hooks (in reverse registration order) before
    reaching the spider; download errors are offered to the
    ``process_exception`` hooks (also in reverse order).
    """

    # Human-readable component name used by the base manager (e.g. in logs).
    component_name = 'downloader middleware'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        """
        Return the middleware class paths to load.

        Implements the AbsMiddlewareManager hook by reading the
        DOWNLOADER_MIDDLEWARES setting (merged with its _BASE counterpart).
        """
        middleware_paths = settings.getwithbase('DOWNLOADER_MIDDLEWARES')
        return build_component_list(middleware_paths)

    def _add_middleware(self, mw):
        """
        Register the hook methods exposed by a middleware instance.

        ``process_request`` hooks are appended so they run in registration
        order; ``process_response`` and ``process_exception`` hooks are
        prepended (appendleft) so they run in reverse registration order.

        Args:
            mw: The middleware instance whose hooks should be registered.
        """
        if hasattr(mw, 'process_request'):
            self.methods['process_request'].append(mw.process_request)
        if hasattr(mw, 'process_response'):
            self.methods['process_response'].appendleft(mw.process_response)
        if hasattr(mw, 'process_exception'):
            self.methods['process_exception'].appendleft(mw.process_exception)

    def iter_mw_method(self, spider, process_type: str):
        """
        Yield every registered hook of *process_type*, followed by the
        spider's own hook of the same name if the spider defines one.

        Args:
            spider: The running spider instance.
            process_type: Hook name, e.g. ``'process_request'``.

        Yields:
            callable: Each hook of the requested type.
        """
        spider_hook = getattr(spider, process_type, None)
        yield from self.methods[process_type]
        if spider_hook:
            yield spider_hook

    async def process_request(self, spider, request):
        """
        Run *request* through the process_request chain.

        Hooks are invoked in registration order; the first truthy value a
        hook returns short-circuits the chain and is handed back to the
        caller. Returns None when every hook declined.

        Args:
            spider: The spider that issued the request.
            request: The request about to be downloaded.

        Returns:
            None, Response or Request.

        Raises:
            _InvalidOutput: If a hook returns anything other than
                None, Response or Request.
        """
        for hook in self.iter_mw_method(spider, 'process_request'):
            result = await call_helper(hook, request=request, spider=spider)
            # A hook may only decline (None) or substitute a Response/Request.
            if not (result is None or isinstance(result, (Response, Request))):
                raise _InvalidOutput(
                    "Middleware %s.process_request must return None, Response or Request, got %s"
                    % (hook.__self__.__class__.__name__, result.__class__.__name__)
                )
            if result:
                return result

    async def process_response(self, spider, request, response):
        """
        Run *response* through the process_response chain.

        Hooks are invoked in reverse registration order, each receiving the
        previous hook's output. A hook that returns a Request short-circuits
        the chain (the request is rescheduled by the caller).

        Args:
            spider: The spider the response is destined for.
            request: The request that produced the response.
            response: The downloaded response.

        Returns:
            Response or Request.

        Raises:
            TypeError: If *response* is None.
            _InvalidOutput: If a hook returns anything other than a
                Response or Request.
        """
        if response is None:
            raise TypeError("Received None in process_response")
        if isinstance(response, Request):
            # Already redirected to a new request; nothing to process.
            return response

        for hook in self.iter_mw_method(spider, 'process_response'):
            response = await call_helper(
                hook, request=request, response=response, spider=spider)
            if not isinstance(response, (Response, Request)):
                raise _InvalidOutput(
                    "Middleware %s.process_response must return Response or Request, got %s"
                    % (hook.__self__.__class__.__name__, type(response))
                )
            if isinstance(response, Request):
                # Stop processing and let the caller schedule the new request.
                return response
        return response

    async def process_exception(self, spider, request, exception):
        """
        Offer a download error to the process_exception chain.

        Hooks are invoked in reverse registration order; the first truthy
        Response/Request returned short-circuits the chain. When no hook
        handles the error, the original exception object is returned.

        Args:
            spider: The spider whose request failed.
            request: The request that caused the exception.
            exception: The exception raised during download.

        Returns:
            Exception, Response or Request.

        Raises:
            _InvalidOutput: If a hook returns anything other than
                None, Response or Request.
        """
        for hook in self.iter_mw_method(spider, 'process_exception'):
            result = await call_helper(
                hook, request=request, exception=exception, spider=spider)
            if not (result is None or isinstance(result, (Response, Request))):
                raise _InvalidOutput(
                    "Middleware %s.process_exception must return None, Response or Request, got %s"
                    % (hook.__self__.__class__.__name__, type(result))
                )
            if result:
                return result
        # No middleware handled the error; hand the exception back unchanged.
        return exception
@@ -1,16 +1,65 @@
|
|
|
1
1
|
"""
|
|
2
2
|
The Extension Manager
|
|
3
|
+
扩展管理器
|
|
3
4
|
|
|
4
|
-
|
|
5
|
+
This module provides the ExtensionManager class, which manages the loading and
|
|
6
|
+
execution of extensions. Extensions are components that can hook into various
|
|
7
|
+
parts of the Scrapy process to add functionality or modify behavior.
|
|
8
|
+
此模块提供了ExtensionManager类,用于管理扩展的加载和执行。扩展是可以挂钩到
|
|
9
|
+
Scrapy流程的各个部分以添加功能或修改行为的组件。
|
|
10
|
+
|
|
11
|
+
Extensions are loaded from the EXTENSIONS setting and can be enabled or disabled
|
|
12
|
+
through this setting. They can connect to signals to execute code at specific
|
|
13
|
+
points in the crawling process.
|
|
14
|
+
扩展从EXTENSIONS设置加载,可以通过此设置启用或禁用。它们可以连接到信号,
|
|
15
|
+
以在爬取过程的特定点执行代码。
|
|
5
16
|
"""
|
|
6
17
|
from aioscrapy.middleware.absmanager import AbsMiddlewareManager
|
|
7
18
|
from aioscrapy.utils.conf import build_component_list
|
|
8
19
|
|
|
9
20
|
|
|
10
21
|
class ExtensionManager(AbsMiddlewareManager):
    """
    Loads and manages extension components.

    Extensions hook into the crawling process (typically by connecting to
    signals) to add functionality or alter behavior. They are enabled,
    disabled and ordered through the EXTENSIONS setting.
    """

    # Human-readable component name used by the base manager (e.g. in logs).
    component_name = 'extension'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        """
        Return the extension class paths to load.

        Implements the AbsMiddlewareManager hook by reading the EXTENSIONS
        setting (merged with its _BASE counterpart).
        """
        extension_paths = settings.getwithbase('EXTENSIONS')
        return build_component_list(extension_paths)
@@ -1,18 +1,114 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Item Pipeline Manager Module
|
|
3
|
+
项目管道管理器模块
|
|
4
|
+
|
|
5
|
+
This module provides the ItemPipelineManager class, which manages the execution
|
|
6
|
+
of item pipeline components. Item pipelines are components that process items
|
|
7
|
+
after they have been extracted by spiders, typically for cleaning, validation,
|
|
8
|
+
persistence, or other post-processing tasks.
|
|
9
|
+
此模块提供了ItemPipelineManager类,用于管理项目管道组件的执行。项目管道是
|
|
10
|
+
在项目被爬虫提取后处理项目的组件,通常用于清洗、验证、持久化或其他后处理任务。
|
|
11
|
+
|
|
12
|
+
Item pipelines are loaded from the ITEM_PIPELINES setting and are executed in
|
|
13
|
+
the order specified by their priority values. Each pipeline component can process
|
|
14
|
+
an item and either return it for further processing, drop it, or raise an exception.
|
|
15
|
+
项目管道从ITEM_PIPELINES设置加载,并按照其优先级值指定的顺序执行。每个管道组件
|
|
16
|
+
可以处理一个项目,并返回它以供进一步处理、丢弃它或引发异常。
|
|
17
|
+
"""
|
|
1
18
|
from aioscrapy.middleware.absmanager import AbsMiddlewareManager
|
|
2
19
|
from aioscrapy.utils.conf import build_component_list
|
|
3
20
|
|
|
4
21
|
|
|
5
22
|
class ItemPipelineManager(AbsMiddlewareManager):
    """
    Manages the chain of item pipeline components.

    Pipelines post-process items extracted by spiders (cleaning, validation,
    persistence, ...). They run in the priority order declared in the
    ITEM_PIPELINES setting, each one receiving the previous pipeline's output.
    """

    # Human-readable component name used by the base manager (e.g. in logs).
    component_name = 'item pipeline'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        """
        Return the pipeline class paths to load.

        Implements the AbsMiddlewareManager hook by reading the
        ITEM_PIPELINES setting (merged with its _BASE counterpart).
        """
        pipeline_paths = settings.getwithbase('ITEM_PIPELINES')
        return build_component_list(pipeline_paths)

    def _add_middleware(self, pipe):
        """
        Register a pipeline instance.

        The base-class registration wires up open_spider/close_spider when
        present; this override additionally registers the pipeline's
        process_item hook.

        Args:
            pipe: The pipeline instance to register.
        """
        super()._add_middleware(pipe)
        if hasattr(pipe, 'process_item'):
            self.methods['process_item'].append(pipe.process_item)

    async def process_item(self, item, spider):
        """
        Send *item* through every registered process_item hook.

        Hooks run in registration order as a chain: each hook's return value
        becomes the input of the next one.

        Args:
            item: The item to process.
            spider: The spider that produced the item.

        Returns:
            The item as transformed by the pipeline chain.
        """
        return await self._process_chain('process_item', item, spider)