aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/middleware/spider.py
CHANGED
|
@@ -1,7 +1,23 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Spider Middleware
|
|
2
|
+
Spider Middleware Manager
|
|
3
|
+
爬虫中间件管理器
|
|
4
|
+
|
|
5
|
+
This module provides the SpiderMiddlewareManager class, which manages the execution
|
|
6
|
+
of spider middleware components. Spider middlewares process the input and output of
|
|
7
|
+
spiders, allowing modification of requests and responses, as well as handling of
|
|
8
|
+
exceptions that occur during spider processing.
|
|
9
|
+
此模块提供了SpiderMiddlewareManager类,用于管理爬虫中间件组件的执行。爬虫中间件
|
|
10
|
+
处理爬虫的输入和输出,允许修改请求和响应,以及处理爬虫处理过程中发生的异常。
|
|
11
|
+
|
|
12
|
+
Spider middlewares are loaded from the SPIDER_MIDDLEWARES setting and are executed
|
|
13
|
+
in the order specified by their priority values. They can intercept and modify the
|
|
14
|
+
requests generated by spiders, the responses passed to spiders, and the items and
|
|
15
|
+
requests yielded by spiders.
|
|
16
|
+
爬虫中间件从SPIDER_MIDDLEWARES设置加载,并按照其优先级值指定的顺序执行。它们可以
|
|
17
|
+
拦截和修改由爬虫生成的请求、传递给爬虫的响应,以及由爬虫产生的项目和请求。
|
|
3
18
|
|
|
4
19
|
See documentation in docs/topics/spider-middleware.rst
|
|
20
|
+
参见文档 docs/topics/spider-middleware.rst
|
|
5
21
|
"""
|
|
6
22
|
from itertools import islice
|
|
7
23
|
from typing import AsyncIterable, Iterable, Callable, Union
|
|
@@ -18,115 +34,452 @@ from aioscrapy.middleware.absmanager import AbsMiddlewareManager
|
|
|
18
34
|
|
|
19
35
|
|
|
20
36
|
def _fname(f):
|
|
37
|
+
"""
|
|
38
|
+
Get the full name of a method.
|
|
39
|
+
获取方法的完整名称。
|
|
40
|
+
|
|
41
|
+
This helper function returns the full name of a method in the format
|
|
42
|
+
"ClassName.method_name". It is used for error messages to identify which
|
|
43
|
+
middleware method raised an exception or returned an invalid output.
|
|
44
|
+
此辅助函数返回格式为"ClassName.method_name"的方法的完整名称。它用于
|
|
45
|
+
错误消息,以识别哪个中间件方法引发了异常或返回了无效输出。
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
f: The method to get the name of.
|
|
49
|
+
要获取名称的方法。
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
str: The full name of the method in the format "ClassName.method_name".
|
|
53
|
+
格式为"ClassName.method_name"的方法的完整名称。
|
|
54
|
+
"""
|
|
55
|
+
# Get the class name and method name and format them
|
|
56
|
+
# 获取类名和方法名并格式化它们
|
|
21
57
|
return "{}.{}".format(
|
|
22
|
-
f.__self__.__class__.__name__,
|
|
23
|
-
f.__func__.__name__
|
|
58
|
+
f.__self__.__class__.__name__, # Class name 类名
|
|
59
|
+
f.__func__.__name__ # Method name 方法名
|
|
24
60
|
)
|
|
25
61
|
|
|
26
62
|
|
|
27
63
|
class SpiderMiddlewareManager(AbsMiddlewareManager):
|
|
64
|
+
"""
|
|
65
|
+
Manager for spider middleware components.
|
|
66
|
+
爬虫中间件组件的管理器。
|
|
67
|
+
|
|
68
|
+
This class manages the execution of spider middleware components, which process
|
|
69
|
+
the input and output of spiders. It inherits from AbsMiddlewareManager and
|
|
70
|
+
implements the specific behavior for spider middlewares.
|
|
71
|
+
此类管理爬虫中间件组件的执行,这些组件处理爬虫的输入和输出。它继承自
|
|
72
|
+
AbsMiddlewareManager,并实现了爬虫中间件的特定行为。
|
|
73
|
+
|
|
74
|
+
Spider middlewares can intercept and modify the requests generated by spiders,
|
|
75
|
+
the responses passed to spiders, and the items and requests yielded by spiders.
|
|
76
|
+
They can also handle exceptions that occur during spider processing.
|
|
77
|
+
爬虫中间件可以拦截和修改由爬虫生成的请求、传递给爬虫的响应,以及由爬虫产生的
|
|
78
|
+
项目和请求。它们还可以处理爬虫处理过程中发生的异常。
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
# Name of the component
|
|
82
|
+
# 组件的名称
|
|
28
83
|
component_name = 'spider middleware'
|
|
29
84
|
|
|
30
85
|
@classmethod
|
|
31
86
|
def _get_mwlist_from_settings(cls, settings: Settings):
|
|
87
|
+
"""
|
|
88
|
+
Get the list of spider middleware classes from settings.
|
|
89
|
+
从设置中获取爬虫中间件类列表。
|
|
90
|
+
|
|
91
|
+
This method implements the abstract method from AbsMiddlewareManager.
|
|
92
|
+
It retrieves the list of spider middleware classes from the
|
|
93
|
+
SPIDER_MIDDLEWARES setting.
|
|
94
|
+
此方法实现了AbsMiddlewareManager中的抽象方法。它从SPIDER_MIDDLEWARES
|
|
95
|
+
设置中检索爬虫中间件类列表。
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
settings: The settings object.
|
|
99
|
+
设置对象。
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
list: A list of spider middleware class paths.
|
|
103
|
+
爬虫中间件类路径列表。
|
|
104
|
+
"""
|
|
105
|
+
# Build component list from SPIDER_MIDDLEWARES setting
|
|
106
|
+
# 从SPIDER_MIDDLEWARES设置构建组件列表
|
|
32
107
|
return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
|
|
33
108
|
|
|
34
109
|
def _add_middleware(self, mw):
|
|
110
|
+
"""
|
|
111
|
+
Add a middleware instance to the manager.
|
|
112
|
+
将中间件实例添加到管理器。
|
|
113
|
+
|
|
114
|
+
This method overrides the method from AbsMiddlewareManager to register
|
|
115
|
+
the specific methods of spider middlewares: process_spider_input,
|
|
116
|
+
process_start_requests, process_spider_output, and process_spider_exception.
|
|
117
|
+
此方法覆盖了AbsMiddlewareManager中的方法,以注册爬虫中间件的特定方法:
|
|
118
|
+
process_spider_input、process_start_requests、process_spider_output和
|
|
119
|
+
process_spider_exception。
|
|
120
|
+
|
|
121
|
+
Note that process_spider_input methods are appended in the order they are
|
|
122
|
+
registered, while process_start_requests, process_spider_output and
|
|
123
|
+
process_spider_exception methods are prepended (appendleft), i.e. kept in reverse order.
|
|
124
|
+
请注意,process_spider_input方法按照它们注册的顺序追加,而
|
|
125
|
+
process_start_requests、process_spider_output和process_spider_exception方法
|
|
126
|
+
通过appendleft添加到左侧,即按照相反的顺序保存。
|
|
127
|
+
|
|
128
|
+
Args:
|
|
129
|
+
mw: The middleware instance to add.
|
|
130
|
+
要添加的中间件实例。
|
|
131
|
+
"""
|
|
132
|
+
# Call parent method to register open_spider and close_spider methods
|
|
133
|
+
# 调用父方法来注册open_spider和close_spider方法
|
|
35
134
|
super(SpiderMiddlewareManager, self)._add_middleware(mw)
|
|
135
|
+
|
|
136
|
+
# Register process_spider_input method if it exists
|
|
137
|
+
# 如果存在,则注册process_spider_input方法
|
|
36
138
|
if hasattr(mw, 'process_spider_input'):
|
|
37
139
|
self.methods['process_spider_input'].append(mw.process_spider_input)
|
|
140
|
+
|
|
141
|
+
# Register process_start_requests method if it exists (added to the left for reverse order)
|
|
142
|
+
# 如果存在,则注册process_start_requests方法(添加到左侧以便逆序执行)
|
|
38
143
|
if hasattr(mw, 'process_start_requests'):
|
|
39
144
|
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
|
|
145
|
+
|
|
146
|
+
# Get process_spider_output method if it exists
|
|
147
|
+
# 如果存在,则获取process_spider_output方法
|
|
40
148
|
process_spider_output = getattr(mw, 'process_spider_output', None)
|
|
149
|
+
|
|
150
|
+
# Register process_spider_output method (added to the left for reverse order)
|
|
151
|
+
# 注册process_spider_output方法(添加到左侧以便逆序执行)
|
|
41
152
|
self.methods['process_spider_output'].appendleft(process_spider_output)
|
|
153
|
+
|
|
154
|
+
# Get process_spider_exception method if it exists
|
|
155
|
+
# 如果存在,则获取process_spider_exception方法
|
|
42
156
|
process_spider_exception = getattr(mw, 'process_spider_exception', None)
|
|
157
|
+
|
|
158
|
+
# Register process_spider_exception method (added to the left for reverse order)
|
|
159
|
+
# 注册process_spider_exception方法(添加到左侧以便逆序执行)
|
|
43
160
|
self.methods['process_spider_exception'].appendleft(process_spider_exception)
|
|
44
161
|
|
|
45
162
|
async def scrape_response(self, scrape_func: Callable, response: Response, request: Request, spider: Spider):
|
|
163
|
+
"""
|
|
164
|
+
Process a response through the spider middleware chain.
|
|
165
|
+
通过爬虫中间件链处理响应。
|
|
166
|
+
|
|
167
|
+
This method is the core of the spider middleware processing. It handles the
|
|
168
|
+
entire process of passing a response through the middleware chain, calling
|
|
169
|
+
the spider's callback function, and processing the output through the
|
|
170
|
+
middleware chain again.
|
|
171
|
+
此方法是爬虫中间件处理的核心。它处理通过中间件链传递响应、调用爬虫的回调
|
|
172
|
+
函数以及再次通过中间件链处理输出的整个过程。
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
scrape_func: The spider's callback function to process the response.
|
|
176
|
+
处理响应的爬虫回调函数。
|
|
177
|
+
response: The response to process.
|
|
178
|
+
要处理的响应。
|
|
179
|
+
request: The request that generated the response.
|
|
180
|
+
生成响应的请求。
|
|
181
|
+
spider: The spider that made the request.
|
|
182
|
+
发出请求的爬虫。
|
|
46
183
|
|
|
184
|
+
Returns:
|
|
185
|
+
The result of processing the response through the middleware chain.
|
|
186
|
+
通过中间件链处理响应的结果。
|
|
187
|
+
"""
|
|
47
188
|
async def process_spider_input(response_) -> Union[AsyncIterable, Iterable]:
|
|
189
|
+
"""
|
|
190
|
+
Process a response through all registered process_spider_input methods.
|
|
191
|
+
通过所有已注册的process_spider_input方法处理响应。
|
|
192
|
+
|
|
193
|
+
This function calls each middleware's process_spider_input method in the
|
|
194
|
+
order they were registered. If any middleware raises an exception, it is
|
|
195
|
+
handled by the scrape_func.
|
|
196
|
+
此函数按照它们注册的顺序调用每个中间件的process_spider_input方法。
|
|
197
|
+
如果任何中间件引发异常,则由scrape_func处理。
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
response_: The response to process.
|
|
201
|
+
要处理的响应。
|
|
202
|
+
|
|
203
|
+
Returns:
|
|
204
|
+
Union[AsyncIterable, Iterable]: The result of processing the response.
|
|
205
|
+
处理响应的结果。
|
|
206
|
+
"""
|
|
207
|
+
# Process the response through all registered process_spider_input methods
|
|
208
|
+
# 通过所有已注册的process_spider_input方法处理响应
|
|
48
209
|
for method in self.methods['process_spider_input']:
|
|
49
210
|
try:
|
|
211
|
+
# Call the method with the response
|
|
212
|
+
# 使用响应调用方法
|
|
50
213
|
result = await call_helper(method, response=response_, spider=spider)
|
|
214
|
+
|
|
215
|
+
# Validate the return value
|
|
216
|
+
# 验证返回值
|
|
51
217
|
if result is not None:
|
|
52
218
|
raise _InvalidOutput(
|
|
53
219
|
f"Middleware {_fname(method)} must return None or raise an exception, got {type(result)}"
|
|
54
220
|
)
|
|
55
221
|
except _InvalidOutput:
|
|
222
|
+
# Re-raise _InvalidOutput exceptions
|
|
223
|
+
# 重新引发_InvalidOutput异常
|
|
56
224
|
raise
|
|
57
225
|
except BaseException as exception:
|
|
226
|
+
# Handle other exceptions by calling the scrape_func with the exception
|
|
227
|
+
# 通过使用异常调用scrape_func来处理其他异常
|
|
58
228
|
iterable_or_exception = await call_helper(scrape_func, exception, request)
|
|
229
|
+
|
|
230
|
+
# If the scrape_func returned the exception, re-raise it
|
|
231
|
+
# 如果scrape_func返回了异常,则重新引发它
|
|
59
232
|
if iterable_or_exception is exception:
|
|
60
233
|
raise iterable_or_exception
|
|
234
|
+
|
|
235
|
+
# Otherwise, return the result
|
|
236
|
+
# 否则,返回结果
|
|
61
237
|
return iterable_or_exception
|
|
238
|
+
|
|
239
|
+
# If all middleware methods passed, call the scrape_func with the response
|
|
240
|
+
# 如果所有中间件方法都通过,则使用响应调用scrape_func
|
|
62
241
|
return await call_helper(scrape_func, response_, request)
|
|
63
242
|
|
|
64
243
|
async def _evaluate_iterable(result: Union[AsyncIterable, Iterable], exception_processor_index):
|
|
244
|
+
"""
|
|
245
|
+
Evaluate an iterable and handle any exceptions that occur.
|
|
246
|
+
评估可迭代对象并处理发生的任何异常。
|
|
247
|
+
|
|
248
|
+
This function converts any iterable to an async generator and yields
|
|
249
|
+
its items. If an exception occurs during iteration, it is handled by
|
|
250
|
+
the process_spider_exception function.
|
|
251
|
+
此函数将任何可迭代对象转换为异步生成器并产生其项目。如果在迭代期间
|
|
252
|
+
发生异常,则由process_spider_exception函数处理。
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
result: The iterable to evaluate.
|
|
256
|
+
要评估的可迭代对象。
|
|
257
|
+
exception_processor_index: The index to start processing exceptions from.
|
|
258
|
+
开始处理异常的索引。
|
|
259
|
+
|
|
260
|
+
Yields:
|
|
261
|
+
The items from the iterable.
|
|
262
|
+
可迭代对象中的项目。
|
|
263
|
+
"""
|
|
65
264
|
try:
|
|
265
|
+
# Convert all non-AsyncGeneratorType to AsyncGeneratorType objects
|
|
66
266
|
# 将所有非AsyncGeneratorType变成AsyncGeneratorType对象
|
|
67
267
|
async for r in await async_generator_wrapper(result):
|
|
68
268
|
yield r
|
|
69
269
|
except BaseException as ex:
|
|
270
|
+
# Handle exceptions by calling process_spider_exception
|
|
271
|
+
# 通过调用process_spider_exception处理异常
|
|
70
272
|
exception_result = await process_spider_exception(ex, exception_processor_index)
|
|
273
|
+
|
|
274
|
+
# If the result is an exception, re-raise it
|
|
275
|
+
# 如果结果是异常,则重新引发它
|
|
71
276
|
if isinstance(exception_result, BaseException):
|
|
72
277
|
raise exception_result
|
|
278
|
+
|
|
279
|
+
# Otherwise, recursively evaluate the result
|
|
280
|
+
# 否则,递归评估结果
|
|
73
281
|
async for r in _evaluate_iterable(exception_result, exception_processor_index):
|
|
74
282
|
yield r
|
|
75
283
|
|
|
76
284
|
async def process_spider_exception(exception, start_index=0):
|
|
77
|
-
|
|
285
|
+
"""
|
|
286
|
+
Process an exception through all registered process_spider_exception methods.
|
|
287
|
+
通过所有已注册的process_spider_exception方法处理异常。
|
|
288
|
+
|
|
289
|
+
This function calls each middleware's process_spider_exception method in
|
|
290
|
+
reverse order from how they were registered. If any middleware returns an
|
|
291
|
+
iterable, the exception handling stops and the iterable is processed by
|
|
292
|
+
the process_spider_output function.
|
|
293
|
+
此函数按照与它们注册的相反的顺序调用每个中间件的process_spider_exception方法。
|
|
294
|
+
如果任何中间件返回可迭代对象,则异常处理停止,并且可迭代对象由
|
|
295
|
+
process_spider_output函数处理。
|
|
296
|
+
|
|
297
|
+
Args:
|
|
298
|
+
exception: The exception to process.
|
|
299
|
+
要处理的异常。
|
|
300
|
+
start_index: The index to start processing from.
|
|
301
|
+
开始处理的索引。
|
|
302
|
+
|
|
303
|
+
Returns:
|
|
304
|
+
The processed output if a middleware handled the exception. If no
|
|
305
|
+
middleware handled it, the exception is re-raised rather than returned.
|
|
306
|
+
如果有中间件处理了异常,则返回处理后的输出;如果没有中间件处理它,则重新引发该异常。
|
|
307
|
+
"""
|
|
308
|
+
# Don't handle _InvalidOutput exceptions
|
|
309
|
+
# 不处理_InvalidOutput异常
|
|
78
310
|
if isinstance(exception, _InvalidOutput):
|
|
79
311
|
raise exception
|
|
312
|
+
|
|
313
|
+
# Get the list of methods to call, starting from start_index
|
|
314
|
+
# 获取要调用的方法列表,从start_index开始
|
|
80
315
|
method_list = islice(self.methods['process_spider_exception'], start_index, None)
|
|
316
|
+
|
|
317
|
+
# Process each method
|
|
318
|
+
# 处理每个方法
|
|
81
319
|
for method_index, method in enumerate(method_list, start=start_index):
|
|
320
|
+
# Skip None methods
|
|
321
|
+
# 跳过None方法
|
|
82
322
|
if method is None:
|
|
83
323
|
continue
|
|
324
|
+
|
|
325
|
+
# Call the method with the exception
|
|
326
|
+
# 使用异常调用方法
|
|
84
327
|
result = await call_helper(method, response=response, exception=exception, spider=spider)
|
|
328
|
+
|
|
329
|
+
# If the result is an iterable, stop exception handling and process the output
|
|
330
|
+
# 如果结果是可迭代对象,则停止异常处理并处理输出
|
|
85
331
|
if isinstance(result, AsyncIterable):
|
|
86
|
-
#
|
|
332
|
+
# Stop exception handling by handing control over to the
|
|
87
333
|
# process_spider_output chain if an iterable has been returned
|
|
334
|
+
# 如果返回了可迭代对象,则通过将控制权交给process_spider_output链来停止异常处理
|
|
88
335
|
return await process_spider_output(result, method_index + 1)
|
|
89
336
|
elif result is None:
|
|
337
|
+
# If the result is None, continue to the next method
|
|
338
|
+
# 如果结果为None,则继续下一个方法
|
|
90
339
|
continue
|
|
91
340
|
else:
|
|
341
|
+
# If the result is neither an iterable nor None, raise an error
|
|
342
|
+
# 如果结果既不是可迭代对象也不是None,则引发错误
|
|
92
343
|
raise _InvalidOutput(
|
|
93
344
|
f"Middleware {_fname(method)} must return None or an iterable, got {type(result)}"
|
|
94
345
|
)
|
|
346
|
+
|
|
347
|
+
# If no middleware handled the exception, re-raise it
|
|
348
|
+
# 如果没有中间件处理异常,则重新引发它
|
|
95
349
|
raise exception
|
|
96
350
|
|
|
97
351
|
async def process_spider_output(result, start_index=0):
|
|
98
|
-
|
|
352
|
+
"""
|
|
353
|
+
Process an iterable through all registered process_spider_output methods.
|
|
354
|
+
通过所有已注册的process_spider_output方法处理可迭代对象。
|
|
355
|
+
|
|
356
|
+
This function calls each middleware's process_spider_output method in
|
|
357
|
+
reverse order from how they were registered. Each middleware can modify
|
|
358
|
+
the iterable or return a new one.
|
|
359
|
+
此函数按照与它们注册的相反的顺序调用每个中间件的process_spider_output方法。
|
|
360
|
+
每个中间件可以修改可迭代对象或返回一个新的可迭代对象。
|
|
361
|
+
|
|
362
|
+
Args:
|
|
363
|
+
result: The iterable to process.
|
|
364
|
+
要处理的可迭代对象。
|
|
365
|
+
start_index: The index to start processing from.
|
|
366
|
+
开始处理的索引。
|
|
367
|
+
|
|
368
|
+
Returns:
|
|
369
|
+
The processed iterable.
|
|
370
|
+
处理后的可迭代对象。
|
|
371
|
+
"""
|
|
372
|
+
# Items in this iterable do not need to go through the process_spider_output
|
|
99
373
|
# chain, they went through it already from the process_spider_exception method
|
|
374
|
+
# 此可迭代对象中的项目不需要通过process_spider_output链,
|
|
375
|
+
# 它们已经通过process_spider_exception方法通过了它
|
|
100
376
|
|
|
377
|
+
# Get the list of methods to call, starting from start_index
|
|
378
|
+
# 获取要调用的方法列表,从start_index开始
|
|
101
379
|
method_list = islice(self.methods['process_spider_output'], start_index, None)
|
|
380
|
+
|
|
381
|
+
# Process each method
|
|
382
|
+
# 处理每个方法
|
|
102
383
|
for method_index, method in enumerate(method_list, start=start_index):
|
|
384
|
+
# Skip None methods
|
|
385
|
+
# 跳过None方法
|
|
103
386
|
if method is None:
|
|
104
387
|
continue
|
|
388
|
+
|
|
105
389
|
try:
|
|
106
|
-
#
|
|
390
|
+
# Call the method with the result
|
|
391
|
+
# 使用结果调用方法
|
|
392
|
+
# Might fail directly if the output value is not a generator
|
|
393
|
+
# 如果输出值不是生成器,可能会直接失败
|
|
107
394
|
result = await call_helper(method, response=response, result=result, spider=spider)
|
|
108
395
|
except BaseException as ex:
|
|
396
|
+
# Handle exceptions by calling process_spider_exception
|
|
397
|
+
# 通过调用process_spider_exception处理异常
|
|
109
398
|
exception_result = await process_spider_exception(ex, method_index + 1)
|
|
399
|
+
|
|
400
|
+
# If the result is an exception, re-raise it
|
|
401
|
+
# 如果结果是异常,则重新引发它
|
|
110
402
|
if isinstance(exception_result, BaseException):
|
|
111
403
|
raise
|
|
404
|
+
|
|
405
|
+
# Otherwise, return the result
|
|
406
|
+
# 否则,返回结果
|
|
112
407
|
return exception_result
|
|
408
|
+
|
|
409
|
+
# Validate the return value
|
|
410
|
+
# 验证返回值
|
|
113
411
|
if isinstance(result, AsyncIterable):
|
|
412
|
+
# If the result is an iterable, evaluate it
|
|
413
|
+
# 如果结果是可迭代对象,则评估它
|
|
114
414
|
result = _evaluate_iterable(result, method_index + 1)
|
|
115
415
|
else:
|
|
416
|
+
# If the result is not an iterable, raise an error
|
|
417
|
+
# 如果结果不是可迭代对象,则引发错误
|
|
116
418
|
raise _InvalidOutput(f"Middleware {_fname(method)} must return an iterable, got {type(result)}")
|
|
117
419
|
|
|
420
|
+
# Return the final result
|
|
421
|
+
# 返回最终结果
|
|
118
422
|
return result
|
|
119
423
|
|
|
120
424
|
async def process_callback_output(result: Union[AsyncIterable, Iterable]):
|
|
425
|
+
"""
|
|
426
|
+
Process the output of the spider's callback function.
|
|
427
|
+
处理爬虫回调函数的输出。
|
|
428
|
+
|
|
429
|
+
This function evaluates the iterable returned by the spider's callback
|
|
430
|
+
function and processes it through the process_spider_output chain.
|
|
431
|
+
此函数评估爬虫回调函数返回的可迭代对象,并通过process_spider_output链处理它。
|
|
432
|
+
|
|
433
|
+
Args:
|
|
434
|
+
result: The iterable returned by the spider's callback function.
|
|
435
|
+
爬虫回调函数返回的可迭代对象。
|
|
436
|
+
|
|
437
|
+
Returns:
|
|
438
|
+
The processed iterable.
|
|
439
|
+
处理后的可迭代对象。
|
|
440
|
+
"""
|
|
441
|
+
# Evaluate the iterable
|
|
442
|
+
# 评估可迭代对象
|
|
121
443
|
result: AsyncIterable = _evaluate_iterable(result, 0)
|
|
444
|
+
|
|
445
|
+
# Process the result through the process_spider_output chain
|
|
446
|
+
# 通过process_spider_output链处理结果
|
|
122
447
|
return await process_spider_output(result)
|
|
123
448
|
|
|
124
449
|
try:
|
|
450
|
+
# Process the response through the process_spider_input chain
|
|
451
|
+
# 通过process_spider_input链处理响应
|
|
125
452
|
_iterable = await process_spider_input(response)
|
|
126
453
|
except BaseException as exc:
|
|
454
|
+
# If an exception occurs, process it through the process_spider_exception chain
|
|
455
|
+
# 如果发生异常,则通过process_spider_exception链处理它
|
|
127
456
|
return await process_spider_exception(exc)
|
|
128
457
|
else:
|
|
458
|
+
# If no exception occurs, process the output through the process_callback_output chain
|
|
459
|
+
# 如果没有发生异常,则通过process_callback_output链处理输出
|
|
129
460
|
return await process_callback_output(_iterable)
|
|
130
461
|
|
|
131
462
|
async def process_start_requests(self, start_requests, spider):
|
|
463
|
+
"""
|
|
464
|
+
Process start requests through all registered process_start_requests methods.
|
|
465
|
+
通过所有已注册的process_start_requests方法处理起始请求。
|
|
466
|
+
|
|
467
|
+
This method passes the start requests through each middleware's
|
|
468
|
+
process_start_requests method via _process_chain. The methods are stored in reverse
|
|
469
|
+
registration order (appendleft). Each middleware can modify the start requests or return a new iterable of requests.
|
|
470
|
+
此方法通过_process_chain将起始请求传递给每个中间件的process_start_requests方法。
|
|
471
|
+
这些方法以与注册相反的顺序保存(appendleft)。每个中间件可以修改起始请求或返回一个新的请求可迭代对象。
|
|
472
|
+
|
|
473
|
+
Args:
|
|
474
|
+
start_requests: The iterable of start requests from the spider.
|
|
475
|
+
来自爬虫的起始请求可迭代对象。
|
|
476
|
+
spider: The spider that generated the start requests.
|
|
477
|
+
生成起始请求的爬虫。
|
|
478
|
+
|
|
479
|
+
Returns:
|
|
480
|
+
The processed iterable of start requests.
|
|
481
|
+
处理后的起始请求可迭代对象。
|
|
482
|
+
"""
|
|
483
|
+
# Process the start requests through the chain of process_start_requests methods
|
|
484
|
+
# 通过process_start_requests方法链处理起始请求
|
|
132
485
|
return await self._process_chain('process_start_requests', start_requests, spider)
|