aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/signalmanager.py CHANGED

@@ -1,68 +1,199 @@
+"""
+Signal Manager for AioScrapy
+AioScrapy的信号管理器
+
+This module provides the SignalManager class which is responsible for coordinating
+signals and receivers in the AioScrapy framework. It wraps PyDispatcher to provide
+a more convenient API for connecting, disconnecting, and sending signals.
+此模块提供了SignalManager类,负责协调AioScrapy框架中的信号和接收器。
+它封装了PyDispatcher,提供了更方便的API用于连接、断开和发送信号。
+"""
+
 from pydispatch import dispatcher
 from aioscrapy.utils import signal as _signal
 
 
 class SignalManager:
+    """
+    Class for managing signals in AioScrapy.
+    用于管理AioScrapy中信号的类。
+
+    This class provides methods to connect and disconnect receivers to signals,
+    as well as to send signals with proper exception handling. It serves as a
+    wrapper around PyDispatcher, providing a more convenient API for AioScrapy.
+    此类提供了将接收器连接到信号和断开连接的方法,以及发送带有适当异常处理的信号的方法。
+    它作为PyDispatcher的包装器,为AioScrapy提供了更方便的API。
+    """
 
     def __init__(self, sender=dispatcher.Anonymous):
+        """
+        Initialize a SignalManager.
+        初始化一个SignalManager。
+
+        Args:
+            sender: The default sender to use when connecting or sending signals.
+                连接或发送信号时使用的默认发送者。
+                Defaults to dispatcher.Anonymous, which is a special object
+                used to identify anonymous senders.
+                默认为dispatcher.Anonymous,这是一个用于标识匿名发送者的特殊对象。
+        """
         self.sender = sender
 
     def connect(self, receiver, signal, **kwargs):
         """
         Connect a receiver function to a signal.
+        将接收器函数连接到信号。
 
-
-
-
+        This method connects a receiver function to a signal so that the function
+        is called when the signal is sent. The receiver function will receive the
+        signal object and any additional keyword arguments passed when the signal
+        is sent.
+        此方法将接收器函数连接到信号,以便在发送信号时调用该函数。
+        接收器函数将接收信号对象和发送信号时传递的任何其他关键字参数。
 
-        :
-
+        Args:
+            receiver: The function to be connected to the signal.
+                要连接到信号的函数。
+                This function will be called when the signal is sent.
+                当信号发送时,将调用此函数。
+            signal: The signal to connect to.
+                要连接的信号。
+                This can be any object, although AioScrapy comes with predefined
+                signals in the aioscrapy.signals module.
+                这可以是任何对象,尽管AioScrapy在aioscrapy.signals模块中
+                提供了预定义的信号。
+            **kwargs: Additional keyword arguments to pass to PyDispatcher's connect.
+                传递给PyDispatcher的connect的其他关键字参数。
 
-        :
-
+        Returns:
+            bool: True if the receiver was successfully connected, False otherwise.
+                如果接收器成功连接,则为True,否则为False。
         """
+        # Set the default sender if not provided
+        # 如果未提供,则设置默认发送者
         kwargs.setdefault('sender', self.sender)
+        # Connect the receiver to the signal using PyDispatcher
+        # 使用PyDispatcher将接收器连接到信号
         return dispatcher.connect(receiver, signal, **kwargs)
 
     def disconnect(self, receiver, signal, **kwargs):
         """
-        Disconnect a receiver function from a signal.
-
-
+        Disconnect a receiver function from a signal.
+        断开接收器函数与信号的连接。
+
+        This method disconnects a previously connected receiver function from a signal.
+        It has the opposite effect of the connect method, and the arguments are the same.
+        此方法断开先前连接到信号的接收器函数。
+        它具有与connect方法相反的效果,参数相同。
+
+        Args:
+            receiver: The function to be disconnected from the signal.
+                要从信号断开连接的函数。
+            signal: The signal to disconnect from.
+                要断开连接的信号。
+            **kwargs: Additional keyword arguments to pass to PyDispatcher's disconnect.
+                传递给PyDispatcher的disconnect的其他关键字参数。
+
+        Returns:
+            bool: True if the receiver was successfully disconnected, False otherwise.
+                如果接收器成功断开连接,则为True,否则为False。
+                False might indicate that the receiver was not connected to the signal.
+                False可能表示接收器未连接到信号。
         """
+        # Set the default sender if not provided
+        # 如果未提供,则设置默认发送者
         kwargs.setdefault('sender', self.sender)
+        # Disconnect the receiver from the signal using PyDispatcher
+        # 使用PyDispatcher断开接收器与信号的连接
         return dispatcher.disconnect(receiver, signal, **kwargs)
 
     async def send_catch_log(self, signal, **kwargs):
         """
         Send a signal, catch exceptions and log them.
+        发送信号,捕获异常并记录它们。
+
+        This method sends a signal to all connected receivers. If a receiver raises
+        an exception, it is caught and logged, but the signal continues to be sent
+        to other receivers. This ensures that one failing receiver doesn't prevent
+        other receivers from receiving the signal.
+        此方法向所有连接的接收器发送信号。如果接收器引发异常,
+        则会捕获并记录该异常,但信号继续发送给其他接收器。
+        这确保一个失败的接收器不会阻止其他接收器接收信号。
+
+        Args:
+            signal: The signal to send.
+                要发送的信号。
+            **kwargs: Keyword arguments to pass to the signal handlers.
+                传递给信号处理程序的关键字参数。
 
-
-
+        Returns:
+            list: A list of (receiver, response) tuples, where response is either
+                the return value of the handler or the exception that was caught.
+                (接收器, 响应)元组的列表,其中响应是处理程序的返回值或捕获的异常。
         """
+        # Set the default sender if not provided
+        # 如果未提供,则设置默认发送者
         kwargs.setdefault('sender', self.sender)
+        # Send the signal using the utility function from aioscrapy.utils.signal
+        # 使用aioscrapy.utils.signal中的实用函数发送信号
         return await _signal.send_catch_log(signal, **kwargs)
 
     async def send_catch_log_deferred(self, signal, **kwargs):
         """
-
-
+        Send a signal and gather results from all handlers concurrently.
+        发送信号并同时收集所有处理程序的结果。
 
-
-
+        This method is similar to send_catch_log but runs all signal handlers
+        concurrently using asyncio tasks. It waits for all handlers to complete
+        before returning the results. This is useful when signal handlers are
+        independent of each other and can run in parallel.
+        此方法类似于send_catch_log,但使用asyncio任务同时运行所有信号处理程序。
+        它在返回结果之前等待所有处理程序完成。当信号处理程序彼此独立并且
+        可以并行运行时,这很有用。
 
-
-
+        Args:
+            signal: The signal to send.
+                要发送的信号。
+            **kwargs: Keyword arguments to pass to the signal handlers.
+                传递给信号处理程序的关键字参数。
+
+        Returns:
+            list: A list of results from all signal handlers, in the order they were
+                registered. Each result is either the return value of the handler
+                or the exception that was caught.
+                所有信号处理程序的结果列表,按它们注册的顺序排列。
+                每个结果要么是处理程序的返回值,要么是捕获的异常。
         """
+        # Set the default sender if not provided
+        # 如果未提供,则设置默认发送者
         kwargs.setdefault('sender', self.sender)
+        # Send the signal using the utility function from aioscrapy.utils.signal
+        # 使用aioscrapy.utils.signal中的实用函数发送信号
         return await _signal.send_catch_log_deferred(signal, **kwargs)
 
     def disconnect_all(self, signal, **kwargs):
         """
         Disconnect all receivers from the given signal.
+        断开给定信号的所有接收器。
+
+        This method disconnects all receivers that are connected to the specified
+        signal. It's useful for cleaning up signal connections, especially during
+        testing or when shutting down a component.
+        此方法断开连接到指定信号的所有接收器。
+        它对于清理信号连接很有用,特别是在测试期间或关闭组件时。
+
+        Args:
+            signal: The signal to disconnect all receivers from.
+                要断开所有接收器的信号。
+            **kwargs: Additional keyword arguments to pass to the disconnect_all function.
+                传递给disconnect_all函数的其他关键字参数。
 
-        :
-
+        Returns:
+            None
         """
+        # Set the default sender if not provided
+        # 如果未提供,则设置默认发送者
         kwargs.setdefault('sender', self.sender)
+        # Disconnect all receivers using the utility function from aioscrapy.utils.signal
+        # 使用aioscrapy.utils.signal中的实用函数断开所有接收器
         return _signal.disconnect_all(signal, **kwargs)
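
Taken together, the new docstrings describe a small but complete API: connect/disconnect register receivers, send_catch_log delivers a signal sequentially with exception isolation, and send_catch_log_deferred delivers it concurrently. A minimal usage sketch follows; it is hypothetical and assumes only what the docstrings above state (async receivers are awaited, receivers get the keyword arguments passed to the send call, and send_catch_log returns (receiver, response) tuples):

    import asyncio

    from aioscrapy.signalmanager import SignalManager

    # Signals are plain sentinel objects, as in aioscrapy.signals.
    item_processed = object()

    async def on_item_processed(value):
        # Receives the keyword arguments passed to send_catch_log.
        return value * 2

    async def main():
        manager = SignalManager()
        manager.connect(on_item_processed, signal=item_processed)
        # Per the docstring, receiver exceptions are caught and logged, not
        # propagated; the result is a list of (receiver, response) tuples.
        results = await manager.send_catch_log(item_processed, value=21)
        print(results)  # e.g. [(<function on_item_processed ...>, 42)]

    asyncio.run(main())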
aioscrapy/signals.py CHANGED

@@ -1,24 +1,206 @@
 """
-
+AioScrapy Signals
+AioScrapy信号
+
+This module defines all signals that the AioScrapy framework emits during the
+execution of a crawl. These signals allow developers to hook into various points
+of the crawling process to add custom functionality.
+此模块定义了AioScrapy框架在爬取执行过程中发出的所有信号。
+这些信号允许开发人员挂钩到爬取过程的各个点,以添加自定义功能。
+
+Signals are implemented using the PyDispatcher library and are represented as
+unique objects. To connect to a signal, use the crawler.signals.connect method.
+信号使用PyDispatcher库实现,并表示为唯一对象。
+要连接到信号,请使用crawler.signals.connect方法。
+
+Example:
+    def handle_spider_opened(spider):
+        print(f"Spider {spider.name} opened")
+
+    crawler.signals.connect(handle_spider_opened, signal=signals.spider_opened)
 
 These signals are documented in docs/topics/signals.rst. Please don't add new
 signals here without documenting them there.
+这些信号在docs/topics/signals.rst中有文档说明。
+请不要在此处添加新信号,除非在那里记录它们。
 """
 
+# Engine signals
+# 引擎信号
+
+#: Signal sent when the aioscrapy engine has started.
+#: 当aioscrapy引擎启动时发送的信号。
+#: Args: None
 engine_started = object()
+
+#: Signal sent when the aioscrapy engine has stopped.
+#: 当aioscrapy引擎停止时发送的信号。
+#: Args: None
 engine_stopped = object()
+
+
+# Spider signals
+# 爬虫信号
+
+#: Signal sent when a spider has been opened for crawling.
+#: 当爬虫被打开进行爬取时发送的信号。
+#: Args:
+#:     spider (Spider): The spider that has been opened.
+#:     已被打开的爬虫。
 spider_opened = object()
+
+#: Signal sent when a spider has no more requests to process.
+#: 当爬虫没有更多请求要处理时发送的信号。
+#: Args:
+#:     spider (Spider): The spider that has become idle.
+#:     变为空闲的爬虫。
 spider_idle = object()
+
+#: Signal sent when a spider has been closed.
+#: 当爬虫被关闭时发送的信号。
+#: Args:
+#:     spider (Spider): The spider that has been closed.
+#:     已被关闭的爬虫。
+#:     reason (str): A string describing the reason why the spider was closed.
+#:     描述爬虫被关闭原因的字符串。
 spider_closed = object()
+
+#: Signal sent when a spider callback generates an error.
+#: 当爬虫回调生成错误时发送的信号。
+#: Args:
+#:     failure (Failure): The exception information.
+#:     异常信息。
+#:     response (Response): The response that caused the error.
+#:     导致错误的响应。
+#:     spider (Spider): The spider that raised the exception.
+#:     引发异常的爬虫。
 spider_error = object()
+
+
+# Request signals
+# 请求信号
+
+#: Signal sent when a new Request is scheduled to be downloaded.
+#: 当新的Request被安排下载时发送的信号。
+#: Args:
+#:     request (Request): The request that reached the scheduler.
+#:     到达调度器的请求。
+#:     spider (Spider): The spider that generated the request.
+#:     生成请求的爬虫。
 request_scheduled = object()
+
+#: Signal sent when a Request is dropped by the scheduler.
+#: 当请求被调度器丢弃时发送的信号。
+#: Args:
+#:     request (Request): The request that was dropped.
+#:     被丢弃的请求。
+#:     spider (Spider): The spider that generated the request.
+#:     生成请求的爬虫。
 request_dropped = object()
+
+#: Signal sent when a Request reaches the downloader.
+#: 当请求到达下载器时发送的信号。
+#: Args:
+#:     request (Request): The request that reached the downloader.
+#:     到达下载器的请求。
+#:     spider (Spider): The spider that generated the request.
+#:     生成请求的爬虫。
 request_reached_downloader = object()
+
+#: Signal sent when a Request leaves the downloader.
+#: 当请求离开下载器时发送的信号。
+#: Args:
+#:     request (Request): The request that left the downloader.
+#:     离开下载器的请求。
+#:     spider (Spider): The spider that generated the request.
+#:     生成请求的爬虫。
 request_left_downloader = object()
+
+
+# Response signals
+# 响应信号
+
+#: Signal sent when the downloader receives a response from the web server.
+#: 当下载器从Web服务器接收到响应时发送的信号。
+#: Args:
+#:     response (Response): The response received.
+#:     接收到的响应。
+#:     request (Request): The request that generated the response.
+#:     生成响应的请求。
+#:     spider (Spider): The spider that generated the request.
+#:     生成请求的爬虫。
 response_received = object()
+
+#: Signal sent when a Response has been downloaded.
+#: 当响应已被下载时发送的信号。
+#: Args:
+#:     response (Response): The response downloaded.
+#:     下载的响应。
+#:     request (Request): The request that generated the response.
+#:     生成响应的请求。
+#:     spider (Spider): The spider that generated the request.
+#:     生成请求的爬虫。
 response_downloaded = object()
+
+#: Signal sent when the HTTP headers are received for a request.
+#: 当接收到请求的HTTP头时发送的信号。
+#: Args:
+#:     headers (dict): The HTTP headers received.
+#:     接收到的HTTP头。
+#:     body_length (int): Expected size of the response body.
+#:     预期的响应正文大小。
+#:     request (Request): The request that generated the response.
+#:     生成响应的请求。
+#:     spider (Spider): The spider that generated the request.
+#:     生成请求的爬虫。
 headers_received = object()
+
+#: Signal sent when a chunk of response data is received.
+#: 当接收到响应数据块时发送的信号。
+#: Args:
+#:     data (bytes): The chunk of data received.
+#:     接收到的数据块。
+#:     request (Request): The request that generated the response.
+#:     生成响应的请求。
+#:     spider (Spider): The spider that generated the request.
+#:     生成请求的爬虫。
 bytes_received = object()
+
+
+# Item signals
+# 项目信号
+
+#: Signal sent when an item has been scraped by a spider.
+#: 当项目被爬虫抓取时发送的信号。
+#: Args:
+#:     item (Item or dict): The item scraped.
+#:     抓取的项目。
+#:     response (Response): The response from which the item was scraped.
+#:     项目被抓取的响应。
+#:     spider (Spider): The spider which scraped the item.
+#:     抓取项目的爬虫。
 item_scraped = object()
+
+#: Signal sent when an item is dropped by an item pipeline.
+#: 当项目被项目管道丢弃时发送的信号。
+#: Args:
+#:     item (Item or dict): The item dropped from the pipeline.
+#:     从管道丢弃的项目。
+#:     exception (Exception): The exception that caused the item to be dropped.
+#:     导致项目被丢弃的异常。
+#:     spider (Spider): The spider which scraped the item.
+#:     抓取项目的爬虫。
 item_dropped = object()
+
+#: Signal sent when an item causes an error in an item pipeline.
+#: 当项目在项目管道中导致错误时发送的信号。
+#: Args:
+#:     item (Item or dict): The item that caused the error.
+#:     导致错误的项目。
+#:     exception (Exception): The exception raised.
+#:     引发的异常。
+#:     spider (Spider): The spider which scraped the item.
+#:     抓取项目的爬虫。
+#:     response (Response): The response from which the item was scraped.
+#:     项目被抓取的响应。
 item_error = object()
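
The module docstring's example shows the basic connection pattern; a slightly fuller sketch is below. It is hypothetical: it assumes aio-scrapy keeps Scrapy's from_crawler hook and per-spider logger, which this diff suggests but does not show, and the handler signatures simply follow the Args documented for each signal above:

    from aioscrapy import signals
    from aioscrapy.spiders import Spider

    class MonitoredSpider(Spider):
        name = 'monitored'

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            # Connect handlers whose parameters match each signal's documented Args.
            crawler.signals.connect(spider.on_item_scraped, signal=signals.item_scraped)
            crawler.signals.connect(spider.on_spider_closed, signal=signals.spider_closed)
            return spider

        def on_item_scraped(self, item, response, spider):
            self.logger.info('scraped an item from %s', response.url)

        def on_spider_closed(self, spider, reason):
            self.logger.info('spider closed: %s', reason)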
aioscrapy/spiderloader.py CHANGED

@@ -9,37 +9,113 @@ from aioscrapy.utils.spider import iter_spider_classes
 
 
 class ISpiderLoader(Interface):
+    """
+    Interface for spider loader implementations.
+    爬虫加载器实现的接口。
+
+    This interface defines the methods that spider loader implementations
+    must provide.
+    此接口定义了爬虫加载器实现必须提供的方法。
+    """
 
     def from_settings(settings):
-        """
+        """
+        Return an instance of the class for the given settings.
+        返回给定设置的类实例。
+
+        Args:
+            settings: The settings to use for the spider loader.
+                用于爬虫加载器的设置。
+
+        Returns:
+            An instance of the spider loader.
+            爬虫加载器的实例。
+        """
 
     def load(spider_name):
-        """
-
+        """
+        Return the Spider class for the given spider name.
+        返回给定爬虫名称的Spider类。
+
+        Args:
+            spider_name: The name of the spider to load.
+                要加载的爬虫的名称。
+
+        Returns:
+            The Spider class for the given spider name.
+            给定爬虫名称的Spider类。
+
+        Raises:
+            KeyError: If the spider name is not found.
+                如果找不到爬虫名称。
+        """
 
     def list():
-        """
-        project
+        """
+        Return a list with the names of all spiders available in the project.
+        返回项目中所有可用爬虫的名称列表。
+
+        Returns:
+            A list of spider names.
+            爬虫名称列表。
+        """
 
     def find_by_request(request):
-        """
+        """
+        Return the list of spider names that can handle the given request.
+        返回可以处理给定请求的爬虫名称列表。
+
+        Args:
+            request: The request to check.
+                要检查的请求。
+
+        Returns:
+            A list of spider names that can handle the request.
+            可以处理请求的爬虫名称列表。
+        """
 
 
 @implementer(ISpiderLoader)
 class SpiderLoader:
     """
-    SpiderLoader is a class which locates and loads spiders
-
+    SpiderLoader is a class which locates and loads spiders in a aioscrapy project.
+    SpiderLoader是一个定位和加载aioscrapy项目中爬虫的类。
+
+    This class implements the ISpiderLoader interface and provides methods to
+    find, load, and list spiders in a project.
+    此类实现了ISpiderLoader接口,并提供了在项目中查找、加载和列出爬虫的方法。
     """
 
     def __init__(self, settings):
+        """
+        Initialize the SpiderLoader.
+        初始化SpiderLoader。
+
+        This method initializes the SpiderLoader with the given settings and
+        loads all spiders from the specified modules.
+        此方法使用给定的设置初始化SpiderLoader,并从指定的模块加载所有爬虫。
+
+        Args:
+            settings: The settings object containing spider loader configuration.
+                包含爬虫加载器配置的设置对象。
+        """
         self.spider_modules = settings.getlist('SPIDER_MODULES')
         self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
-        self._spiders = {}
-
+        self._spiders = {}  # Dict of spider name -> spider class
+        # 爬虫名称 -> 爬虫类的字典
+        self._found = defaultdict(list)  # Dict of spider name -> list of (module, class) locations
+        # 爬虫名称 -> (模块, 类)位置列表的字典
         self._load_all_spiders()
 
     def _check_name_duplicates(self):
+        """
+        Check for duplicate spider names and issue warnings if found.
+        检查重复的爬虫名称,如果发现则发出警告。
+
+        This method checks if there are multiple spider classes with the same name
+        and issues a warning if duplicates are found.
+        此方法检查是否有多个具有相同名称的爬虫类,如果发现重复则发出警告。
+        """
         dupes = []
         for name, locations in self._found.items():
             dupes.extend([
@@ -57,11 +133,35 @@ class SpiderLoader:
             )
 
     def _load_spiders(self, module):
+        """
+        Load spiders from a given module.
+        从给定模块加载爬虫。
+
+        This method finds all spider classes in the given module and adds them
+        to the internal dictionaries.
+        此方法查找给定模块中的所有爬虫类,并将它们添加到内部字典中。
+
+        Args:
+            module: The module to load spiders from.
+                要从中加载爬虫的模块。
+        """
         for spcls in iter_spider_classes(module):
            self._found[spcls.name].append((module.__name__, spcls.__name__))
            self._spiders[spcls.name] = spcls
 
     def _load_all_spiders(self):
+        """
+        Load all spiders from all modules specified in SPIDER_MODULES setting.
+        从SPIDER_MODULES设置中指定的所有模块加载所有爬虫。
+
+        This method walks through all the modules specified in the SPIDER_MODULES
+        setting, loads all spiders from them, and checks for duplicate names.
+        此方法遍历SPIDER_MODULES设置中指定的所有模块,从中加载所有爬虫,并检查重复的名称。
+
+        If an import error occurs and SPIDER_LOADER_WARN_ONLY is True, a warning
+        is issued instead of raising the exception.
+        如果发生导入错误且SPIDER_LOADER_WARN_ONLY为True,则发出警告而不是引发异常。
+        """
         for name in self.spider_modules:
             try:
                 for module in walk_modules(name):
@@ -80,12 +180,43 @@ class SpiderLoader:
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a SpiderLoader instance from settings.
+        从设置创建SpiderLoader实例。
+
+        This is a factory method that creates a new SpiderLoader instance
+        with the given settings.
+        这是一个工厂方法,使用给定的设置创建一个新的SpiderLoader实例。
+
+        Args:
+            settings: The settings to use for the spider loader.
+                用于爬虫加载器的设置。
+
+        Returns:
+            A new SpiderLoader instance.
+            一个新的SpiderLoader实例。
+        """
         return cls(settings)
 
     def load(self, spider_name):
         """
-        Return the Spider class for the given spider name.
-
+        Return the Spider class for the given spider name.
+        返回给定爬虫名称的Spider类。
+
+        This method looks up the spider class by name in the internal dictionary.
+        此方法在内部字典中按名称查找爬虫类。
+
+        Args:
+            spider_name: The name of the spider to load.
+                要加载的爬虫的名称。
+
+        Returns:
+            The Spider class for the given spider name.
+            给定爬虫名称的Spider类。
+
+        Raises:
+            KeyError: If the spider name is not found.
+                如果找不到爬虫名称。
         """
         try:
             return self._spiders[spider_name]
@@ -95,6 +226,19 @@ class SpiderLoader:
     def find_by_request(self, request):
         """
         Return the list of spider names that can handle the given request.
+        返回可以处理给定请求的爬虫名称列表。
+
+        This method checks each spider's handles_request method to determine
+        if it can handle the given request.
+        此方法检查每个爬虫的handles_request方法,以确定它是否可以处理给定的请求。
+
+        Args:
+            request: The request to check.
+                要检查的请求。
+
+        Returns:
+            A list of spider names that can handle the request.
+            可以处理请求的爬虫名称列表。
         """
         return [
             name for name, cls in self._spiders.items()
@@ -104,5 +248,14 @@ class SpiderLoader:
     def list(self):
         """
         Return a list with the names of all spiders available in the project.
+        返回项目中所有可用爬虫的名称列表。
+
+        This method returns a list of all spider names that have been loaded
+        by the spider loader.
+        此方法返回已由爬虫加载器加载的所有爬虫名称的列表。
+
+        Returns:
+            A list of spider names.
+            爬虫名称列表。
         """
         return list(self._spiders.keys())
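
The documented SpiderLoader API can also be driven directly, which is occasionally useful in tooling. A hypothetical sketch (the project and spider names are invented; it assumes the Scrapy-style Settings class that aioscrapy.settings provides):

    from aioscrapy.settings import Settings
    from aioscrapy.spiderloader import SpiderLoader

    settings = Settings({
        'SPIDER_MODULES': ['myproject.spiders'],  # hypothetical project package
        'SPIDER_LOADER_WARN_ONLY': True,  # warn on import errors instead of raising
    })

    loader = SpiderLoader.from_settings(settings)
    print(loader.list())                # all spider names found, e.g. ['quotes']
    spider_cls = loader.load('quotes')  # raises KeyError if the name is unknown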
|