aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/signalmanager.py CHANGED
@@ -1,68 +1,199 @@
+ """
+ Signal Manager for AioScrapy
+ AioScrapy的信号管理器
+
+ This module provides the SignalManager class which is responsible for coordinating
+ signals and receivers in the AioScrapy framework. It wraps PyDispatcher to provide
+ a more convenient API for connecting, disconnecting, and sending signals.
+ 此模块提供了SignalManager类,负责协调AioScrapy框架中的信号和接收器。
+ 它封装了PyDispatcher,提供了更方便的API用于连接、断开和发送信号。
+ """
+
  from pydispatch import dispatcher
  from aioscrapy.utils import signal as _signal
 
 
  class SignalManager:
+     """
+     Class for managing signals in AioScrapy.
+     用于管理AioScrapy中信号的类。
+
+     This class provides methods to connect and disconnect receivers to signals,
+     as well as to send signals with proper exception handling. It serves as a
+     wrapper around PyDispatcher, providing a more convenient API for AioScrapy.
+     此类提供了将接收器连接到信号和断开连接的方法,以及发送带有适当异常处理的信号的方法。
+     它作为PyDispatcher的包装器,为AioScrapy提供了更方便的API。
+     """
 
      def __init__(self, sender=dispatcher.Anonymous):
+         """
+         Initialize a SignalManager.
+         初始化一个SignalManager。
+
+         Args:
+             sender: The default sender to use when connecting or sending signals.
+                 连接或发送信号时使用的默认发送者。
+                 Defaults to dispatcher.Anonymous, which is a special object
+                 used to identify anonymous senders.
+                 默认为dispatcher.Anonymous,这是一个用于标识匿名发送者的特殊对象。
+         """
          self.sender = sender
 
      def connect(self, receiver, signal, **kwargs):
          """
          Connect a receiver function to a signal.
+         将接收器函数连接到信号。
 
-         The signal can be any object, although Scrapy comes with some
-         predefined signals that are documented in the :ref:`topics-signals`
-         section.
+         This method connects a receiver function to a signal so that the function
+         is called when the signal is sent. The receiver function will receive the
+         signal object and any additional keyword arguments passed when the signal
+         is sent.
+         此方法将接收器函数连接到信号,以便在发送信号时调用该函数。
+         接收器函数将接收信号对象和发送信号时传递的任何其他关键字参数。
 
-         :param receiver: the function to be connected
-         :type receiver: callable
+         Args:
+             receiver: The function to be connected to the signal.
+                 要连接到信号的函数。
+                 This function will be called when the signal is sent.
+                 当信号发送时,将调用此函数。
+             signal: The signal to connect to.
+                 要连接的信号。
+                 This can be any object, although AioScrapy comes with predefined
+                 signals in the aioscrapy.signals module.
+                 这可以是任何对象,尽管AioScrapy在aioscrapy.signals模块中
+                 提供了预定义的信号。
+             **kwargs: Additional keyword arguments to pass to PyDispatcher's connect.
+                 传递给PyDispatcher的connect的其他关键字参数。
 
-         :param signal: the signal to connect to
-         :type signal: object
+         Returns:
+             bool: True if the receiver was successfully connected, False otherwise.
+                 如果接收器成功连接,则为True,否则为False。
          """
+         # Set the default sender if not provided
+         # 如果未提供,则设置默认发送者
          kwargs.setdefault('sender', self.sender)
+         # Connect the receiver to the signal using PyDispatcher
+         # 使用PyDispatcher将接收器连接到信号
          return dispatcher.connect(receiver, signal, **kwargs)
 
      def disconnect(self, receiver, signal, **kwargs):
          """
-         Disconnect a receiver function from a signal. This has the
-         opposite effect of the :meth:`connect` method, and the arguments
-         are the same.
+         Disconnect a receiver function from a signal.
+         断开接收器函数与信号的连接。
+
+         This method disconnects a previously connected receiver function from a signal.
+         It has the opposite effect of the connect method, and the arguments are the same.
+         此方法断开先前连接到信号的接收器函数。
+         它具有与connect方法相反的效果,参数相同。
+
+         Args:
+             receiver: The function to be disconnected from the signal.
+                 要从信号断开连接的函数。
+             signal: The signal to disconnect from.
+                 要断开连接的信号。
+             **kwargs: Additional keyword arguments to pass to PyDispatcher's disconnect.
+                 传递给PyDispatcher的disconnect的其他关键字参数。
+
+         Returns:
+             bool: True if the receiver was successfully disconnected, False otherwise.
+                 如果接收器成功断开连接,则为True,否则为False。
+                 False might indicate that the receiver was not connected to the signal.
+                 False可能表示接收器未连接到信号。
          """
+         # Set the default sender if not provided
+         # 如果未提供,则设置默认发送者
          kwargs.setdefault('sender', self.sender)
+         # Disconnect the receiver from the signal using PyDispatcher
+         # 使用PyDispatcher断开接收器与信号的连接
          return dispatcher.disconnect(receiver, signal, **kwargs)
 
      async def send_catch_log(self, signal, **kwargs):
          """
          Send a signal, catch exceptions and log them.
+         发送信号,捕获异常并记录它们。
+
+         This method sends a signal to all connected receivers. If a receiver raises
+         an exception, it is caught and logged, but the signal continues to be sent
+         to other receivers. This ensures that one failing receiver doesn't prevent
+         other receivers from receiving the signal.
+         此方法向所有连接的接收器发送信号。如果接收器引发异常,
+         则会捕获并记录该异常,但信号继续发送给其他接收器。
+         这确保一个失败的接收器不会阻止其他接收器接收信号。
+
+         Args:
+             signal: The signal to send.
+                 要发送的信号。
+             **kwargs: Keyword arguments to pass to the signal handlers.
+                 传递给信号处理程序的关键字参数。
 
-         The keyword arguments are passed to the signal handlers (connected
-         through the :meth:`connect` method).
+         Returns:
+             list: A list of (receiver, response) tuples, where response is either
+                 the return value of the handler or the exception that was caught.
+                 (接收器, 响应)元组的列表,其中响应是处理程序的返回值或捕获的异常。
          """
+         # Set the default sender if not provided
+         # 如果未提供,则设置默认发送者
          kwargs.setdefault('sender', self.sender)
+         # Send the signal using the utility function from aioscrapy.utils.signal
+         # 使用aioscrapy.utils.signal中的实用函数发送信号
          return await _signal.send_catch_log(signal, **kwargs)
 
      async def send_catch_log_deferred(self, signal, **kwargs):
          """
-         Like :meth:`send_catch_log` but supports returning
-         :class:`~twisted.internet.defer.Deferred` objects from signal handlers.
+         Send a signal and gather results from all handlers concurrently.
+         发送信号并同时收集所有处理程序的结果。
 
-         Returns a Deferred that gets fired once all signal handlers
-         deferreds were fired. Send a signal, catch exceptions and log them.
+         This method is similar to send_catch_log but runs all signal handlers
+         concurrently using asyncio tasks. It waits for all handlers to complete
+         before returning the results. This is useful when signal handlers are
+         independent of each other and can run in parallel.
+         此方法类似于send_catch_log,但使用asyncio任务同时运行所有信号处理程序。
+         它在返回结果之前等待所有处理程序完成。当信号处理程序彼此独立并且
+         可以并行运行时,这很有用。
 
-         The keyword arguments are passed to the signal handlers (connected
-         through the :meth:`connect` method).
+         Args:
+             signal: The signal to send.
+                 要发送的信号。
+             **kwargs: Keyword arguments to pass to the signal handlers.
+                 传递给信号处理程序的关键字参数。
+
+         Returns:
+             list: A list of results from all signal handlers, in the order they were
+                 registered. Each result is either the return value of the handler
+                 or the exception that was caught.
+                 所有信号处理程序的结果列表,按它们注册的顺序排列。
+                 每个结果要么是处理程序的返回值,要么是捕获的异常。
          """
+         # Set the default sender if not provided
+         # 如果未提供,则设置默认发送者
          kwargs.setdefault('sender', self.sender)
+         # Send the signal using the utility function from aioscrapy.utils.signal
+         # 使用aioscrapy.utils.signal中的实用函数发送信号
          return await _signal.send_catch_log_deferred(signal, **kwargs)
 
      def disconnect_all(self, signal, **kwargs):
          """
          Disconnect all receivers from the given signal.
+         断开给定信号的所有接收器。
+
+         This method disconnects all receivers that are connected to the specified
+         signal. It's useful for cleaning up signal connections, especially during
+         testing or when shutting down a component.
+         此方法断开连接到指定信号的所有接收器。
+         它对于清理信号连接很有用,特别是在测试期间或关闭组件时。
+
+         Args:
+             signal: The signal to disconnect all receivers from.
+                 要断开所有接收器的信号。
+             **kwargs: Additional keyword arguments to pass to the disconnect_all function.
+                 传递给disconnect_all函数的其他关键字参数。
 
-         :param signal: the signal to disconnect from
-         :type signal: object
+         Returns:
+             None
          """
+         # Set the default sender if not provided
+         # 如果未提供,则设置默认发送者
          kwargs.setdefault('sender', self.sender)
+         # Disconnect all receivers using the utility function from aioscrapy.utils.signal
+         # 使用aioscrapy.utils.signal中的实用函数断开所有接收器
          return _signal.disconnect_all(signal, **kwargs)
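Taken together, the docstrings added above describe a small publish/subscribe API. Below is a minimal standalone sketch of how it can be exercised. It only relies on what the diff shows (the SignalManager constructor, connect, and send_catch_log) plus the import paths implied by the file list (aioscrapy.signalmanager, aioscrapy.signals); the assumption that coroutine receivers are awaited by send_catch_log follows from the async design described above, and "example" is just a placeholder payload.

import asyncio

from aioscrapy import signals
from aioscrapy.signalmanager import SignalManager


async def on_spider_opened(spider, **kwargs):
    # Receivers get the keyword arguments passed to send_catch_log().
    print(f"spider opened: {spider}")


async def broken_receiver(spider, **kwargs):
    # Per the docstring, exceptions raised here are caught and logged;
    # the other receivers still run.
    raise RuntimeError("boom")


async def main():
    manager = SignalManager()
    manager.connect(on_spider_opened, signal=signals.spider_opened)
    manager.connect(broken_receiver, signal=signals.spider_opened)
    # Returns a list of (receiver, response) tuples, where response is either
    # the handler's return value or the exception that was caught.
    results = await manager.send_catch_log(signals.spider_opened, spider="example")
    print(results)


asyncio.run(main())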
aioscrapy/signals.py CHANGED
@@ -1,24 +1,206 @@
  """
- Scrapy signals
+ AioScrapy Signals
+ AioScrapy信号
+
+ This module defines all signals that the AioScrapy framework emits during the
+ execution of a crawl. These signals allow developers to hook into various points
+ of the crawling process to add custom functionality.
+ 此模块定义了AioScrapy框架在爬取执行过程中发出的所有信号。
+ 这些信号允许开发人员挂钩到爬取过程的各个点,以添加自定义功能。
+
+ Signals are implemented using the PyDispatcher library and are represented as
+ unique objects. To connect to a signal, use the crawler.signals.connect method.
+ 信号使用PyDispatcher库实现,并表示为唯一对象。
+ 要连接到信号,请使用crawler.signals.connect方法。
+
+ Example:
+     def handle_spider_opened(spider):
+         print(f"Spider {spider.name} opened")
+
+     crawler.signals.connect(handle_spider_opened, signal=signals.spider_opened)
 
  These signals are documented in docs/topics/signals.rst. Please don't add new
  signals here without documenting them there.
+ 这些信号在docs/topics/signals.rst中有文档说明。
+ 请不要在此处添加新信号,除非在那里记录它们。
  """
 
+ # Engine signals
+ # 引擎信号
+
+ #: Signal sent when the aioscrapy engine has started.
+ #: 当aioscrapy引擎启动时发送的信号。
+ #: Args: None
  engine_started = object()
+
+ #: Signal sent when the aioscrapy engine has stopped.
+ #: 当aioscrapy引擎停止时发送的信号。
+ #: Args: None
  engine_stopped = object()
+
+
+ # Spider signals
+ # 爬虫信号
+
+ #: Signal sent when a spider has been opened for crawling.
+ #: 当爬虫被打开进行爬取时发送的信号。
+ #: Args:
+ #:     spider (Spider): The spider that has been opened.
+ #:         已被打开的爬虫。
  spider_opened = object()
+
+ #: Signal sent when a spider has no more requests to process.
+ #: 当爬虫没有更多请求要处理时发送的信号。
+ #: Args:
+ #:     spider (Spider): The spider that has become idle.
+ #:         变为空闲的爬虫。
  spider_idle = object()
+
+ #: Signal sent when a spider has been closed.
+ #: 当爬虫被关闭时发送的信号。
+ #: Args:
+ #:     spider (Spider): The spider that has been closed.
+ #:         已被关闭的爬虫。
+ #:     reason (str): A string describing the reason why the spider was closed.
+ #:         描述爬虫被关闭原因的字符串。
  spider_closed = object()
+
+ #: Signal sent when a spider callback generates an error.
+ #: 当爬虫回调生成错误时发送的信号。
+ #: Args:
+ #:     failure (Failure): The exception information.
+ #:         异常信息。
+ #:     response (Response): The response that caused the error.
+ #:         导致错误的响应。
+ #:     spider (Spider): The spider that raised the exception.
+ #:         引发异常的爬虫。
  spider_error = object()
+
+
+ # Request signals
+ # 请求信号
+
+ #: Signal sent when a new Request is scheduled to be downloaded.
+ #: 当新的Request被安排下载时发送的信号。
+ #: Args:
+ #:     request (Request): The request that reached the scheduler.
+ #:         到达调度器的请求。
+ #:     spider (Spider): The spider that generated the request.
+ #:         生成请求的爬虫。
  request_scheduled = object()
+
+ #: Signal sent when a Request is dropped by the scheduler.
+ #: 当请求被调度器丢弃时发送的信号。
+ #: Args:
+ #:     request (Request): The request that was dropped.
+ #:         被丢弃的请求。
+ #:     spider (Spider): The spider that generated the request.
+ #:         生成请求的爬虫。
  request_dropped = object()
+
+ #: Signal sent when a Request reaches the downloader.
+ #: 当请求到达下载器时发送的信号。
+ #: Args:
+ #:     request (Request): The request that reached the downloader.
+ #:         到达下载器的请求。
+ #:     spider (Spider): The spider that generated the request.
+ #:         生成请求的爬虫。
  request_reached_downloader = object()
+
+ #: Signal sent when a Request leaves the downloader.
+ #: 当请求离开下载器时发送的信号。
+ #: Args:
+ #:     request (Request): The request that left the downloader.
+ #:         离开下载器的请求。
+ #:     spider (Spider): The spider that generated the request.
+ #:         生成请求的爬虫。
  request_left_downloader = object()
+
+
+ # Response signals
+ # 响应信号
+
+ #: Signal sent when the downloader receives a response from the web server.
+ #: 当下载器从Web服务器接收到响应时发送的信号。
+ #: Args:
+ #:     response (Response): The response received.
+ #:         接收到的响应。
+ #:     request (Request): The request that generated the response.
+ #:         生成响应的请求。
+ #:     spider (Spider): The spider that generated the request.
+ #:         生成请求的爬虫。
  response_received = object()
+
+ #: Signal sent when a Response has been downloaded.
+ #: 当响应已被下载时发送的信号。
+ #: Args:
+ #:     response (Response): The response downloaded.
+ #:         下载的响应。
+ #:     request (Request): The request that generated the response.
+ #:         生成响应的请求。
+ #:     spider (Spider): The spider that generated the request.
+ #:         生成请求的爬虫。
  response_downloaded = object()
+
+ #: Signal sent when the HTTP headers are received for a request.
+ #: 当接收到请求的HTTP头时发送的信号。
+ #: Args:
+ #:     headers (dict): The HTTP headers received.
+ #:         接收到的HTTP头。
+ #:     body_length (int): Expected size of the response body.
+ #:         预期的响应正文大小。
+ #:     request (Request): The request that generated the response.
+ #:         生成响应的请求。
+ #:     spider (Spider): The spider that generated the request.
+ #:         生成请求的爬虫。
  headers_received = object()
+
+ #: Signal sent when a chunk of response data is received.
+ #: 当接收到响应数据块时发送的信号。
+ #: Args:
+ #:     data (bytes): The chunk of data received.
+ #:         接收到的数据块。
+ #:     request (Request): The request that generated the response.
+ #:         生成响应的请求。
+ #:     spider (Spider): The spider that generated the request.
+ #:         生成请求的爬虫。
  bytes_received = object()
+
+
+ # Item signals
+ # 项目信号
+
+ #: Signal sent when an item has been scraped by a spider.
+ #: 当项目被爬虫抓取时发送的信号。
+ #: Args:
+ #:     item (Item or dict): The item scraped.
+ #:         抓取的项目。
+ #:     response (Response): The response from which the item was scraped.
+ #:         项目被抓取的响应。
+ #:     spider (Spider): The spider which scraped the item.
+ #:         抓取项目的爬虫。
  item_scraped = object()
+
+ #: Signal sent when an item is dropped by an item pipeline.
+ #: 当项目被项目管道丢弃时发送的信号。
+ #: Args:
+ #:     item (Item or dict): The item dropped from the pipeline.
+ #:         从管道丢弃的项目。
+ #:     exception (Exception): The exception that caused the item to be dropped.
+ #:         导致项目被丢弃的异常。
+ #:     spider (Spider): The spider which scraped the item.
+ #:         抓取项目的爬虫。
  item_dropped = object()
+
+ #: Signal sent when an item causes an error in an item pipeline.
+ #: 当项目在项目管道中导致错误时发送的信号。
+ #: Args:
+ #:     item (Item or dict): The item that caused the error.
+ #:         导致错误的项目。
+ #:     exception (Exception): The exception raised.
+ #:         引发的异常。
+ #:     spider (Spider): The spider which scraped the item.
+ #:         抓取项目的爬虫。
+ #:     response (Response): The response from which the item was scraped.
+ #:         项目被抓取的响应。
  item_error = object()
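The signal catalogue above is normally consumed through crawler.signals, which is the SignalManager documented in the previous file. The following is a hedged sketch of a small extension that listens for spider_opened and spider_closed; the from_crawler hook and the way extensions are registered are assumed to mirror Scrapy's conventions (which this code base tracks) and are not shown in this diff, and SpiderLifecycleLogger is a hypothetical name.

from aioscrapy import signals


class SpiderLifecycleLogger:
    """Hypothetical extension that logs spider lifecycle signals."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # crawler.signals is the SignalManager wrapper around PyDispatcher.
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    async def spider_opened(self, spider, **kwargs):
        print(f"{spider.name} opened")

    async def spider_closed(self, spider, reason, **kwargs):
        # spider_closed carries the spider and a reason string, per the notes above.
        print(f"{spider.name} closed ({reason})")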
aioscrapy/spiderloader.py CHANGED
@@ -9,37 +9,113 @@ from aioscrapy.utils.spider import iter_spider_classes
 
 
  class ISpiderLoader(Interface):
+     """
+     Interface for spider loader implementations.
+     爬虫加载器实现的接口。
+
+     This interface defines the methods that spider loader implementations
+     must provide.
+     此接口定义了爬虫加载器实现必须提供的方法。
+     """
 
      def from_settings(settings):
-         """Return an instance of the class for the given settings"""
+         """
+         Return an instance of the class for the given settings.
+         返回给定设置的类实例。
+
+         Args:
+             settings: The settings to use for the spider loader.
+                 用于爬虫加载器的设置。
+
+         Returns:
+             An instance of the spider loader.
+             爬虫加载器的实例。
+         """
 
      def load(spider_name):
-         """Return the Spider class for the given spider name. If the spider
-         name is not found, it must raise a KeyError."""
+         """
+         Return the Spider class for the given spider name.
+         返回给定爬虫名称的Spider类。
+
+         Args:
+             spider_name: The name of the spider to load.
+                 要加载的爬虫的名称。
+
+         Returns:
+             The Spider class for the given spider name.
+             给定爬虫名称的Spider类。
+
+         Raises:
+             KeyError: If the spider name is not found.
+                 如果找不到爬虫名称。
+         """
 
      def list():
-         """Return a list with the names of all spiders available in the
-         project"""
+         """
+         Return a list with the names of all spiders available in the project.
+         返回项目中所有可用爬虫的名称列表。
+
+         Returns:
+             A list of spider names.
+             爬虫名称列表。
+         """
 
      def find_by_request(request):
-         """Return the list of spiders names that can handle the given request"""
+         """
+         Return the list of spider names that can handle the given request.
+         返回可以处理给定请求的爬虫名称列表。
+
+         Args:
+             request: The request to check.
+                 要检查的请求。
+
+         Returns:
+             A list of spider names that can handle the request.
+             可以处理请求的爬虫名称列表。
+         """
 
 
  @implementer(ISpiderLoader)
  class SpiderLoader:
      """
-     SpiderLoader is a class which locates and loads spiders
-     in a aioscrapy project.
+     SpiderLoader is a class which locates and loads spiders in a aioscrapy project.
+     SpiderLoader是一个定位和加载aioscrapy项目中爬虫的类。
+
+     This class implements the ISpiderLoader interface and provides methods to
+     find, load, and list spiders in a project.
+     此类实现了ISpiderLoader接口,并提供了在项目中查找、加载和列出爬虫的方法。
      """
 
      def __init__(self, settings):
+         """
+         Initialize the SpiderLoader.
+         初始化SpiderLoader。
+
+         This method initializes the SpiderLoader with the given settings and
+         loads all spiders from the specified modules.
+         此方法使用给定的设置初始化SpiderLoader,并从指定的模块加载所有爬虫。
+
+         Args:
+             settings: The settings object containing spider loader configuration.
+                 包含爬虫加载器配置的设置对象。
+         """
          self.spider_modules = settings.getlist('SPIDER_MODULES')
          self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
-         self._spiders = {}
-         self._found = defaultdict(list)
+         self._spiders = {}  # Dict of spider name -> spider class
+                             # 爬虫名称 -> 爬虫类的字典
+         self._found = defaultdict(list)  # Dict of spider name -> list of (module, class) locations
+                                          # 爬虫名称 -> (模块, 类)位置列表的字典
          self._load_all_spiders()
 
      def _check_name_duplicates(self):
+         """
+         Check for duplicate spider names and issue warnings if found.
+         检查重复的爬虫名称,如果发现则发出警告。
+
+         This method checks if there are multiple spider classes with the same name
+         and issues a warning if duplicates are found.
+         此方法检查是否有多个具有相同名称的爬虫类,如果发现重复则发出警告。
+         """
          dupes = []
          for name, locations in self._found.items():
              dupes.extend([
@@ -57,11 +133,35 @@ class SpiderLoader:
              )
 
      def _load_spiders(self, module):
+         """
+         Load spiders from a given module.
+         从给定模块加载爬虫。
+
+         This method finds all spider classes in the given module and adds them
+         to the internal dictionaries.
+         此方法查找给定模块中的所有爬虫类,并将它们添加到内部字典中。
+
+         Args:
+             module: The module to load spiders from.
+                 要从中加载爬虫的模块。
+         """
          for spcls in iter_spider_classes(module):
              self._found[spcls.name].append((module.__name__, spcls.__name__))
              self._spiders[spcls.name] = spcls
 
      def _load_all_spiders(self):
+         """
+         Load all spiders from all modules specified in SPIDER_MODULES setting.
+         从SPIDER_MODULES设置中指定的所有模块加载所有爬虫。
+
+         This method walks through all the modules specified in the SPIDER_MODULES
+         setting, loads all spiders from them, and checks for duplicate names.
+         此方法遍历SPIDER_MODULES设置中指定的所有模块,从中加载所有爬虫,并检查重复的名称。
+
+         If an import error occurs and SPIDER_LOADER_WARN_ONLY is True, a warning
+         is issued instead of raising the exception.
+         如果发生导入错误且SPIDER_LOADER_WARN_ONLY为True,则发出警告而不是引发异常。
+         """
          for name in self.spider_modules:
              try:
                  for module in walk_modules(name):
@@ -80,12 +180,43 @@ class SpiderLoader:
 
      @classmethod
      def from_settings(cls, settings):
+         """
+         Create a SpiderLoader instance from settings.
+         从设置创建SpiderLoader实例。
+
+         This is a factory method that creates a new SpiderLoader instance
+         with the given settings.
+         这是一个工厂方法,使用给定的设置创建一个新的SpiderLoader实例。
+
+         Args:
+             settings: The settings to use for the spider loader.
+                 用于爬虫加载器的设置。
+
+         Returns:
+             A new SpiderLoader instance.
+             一个新的SpiderLoader实例。
+         """
          return cls(settings)
 
      def load(self, spider_name):
          """
-         Return the Spider class for the given spider name. If the spider
-         name is not found, raise a KeyError.
+         Return the Spider class for the given spider name.
+         返回给定爬虫名称的Spider类。
+
+         This method looks up the spider class by name in the internal dictionary.
+         此方法在内部字典中按名称查找爬虫类。
+
+         Args:
+             spider_name: The name of the spider to load.
+                 要加载的爬虫的名称。
+
+         Returns:
+             The Spider class for the given spider name.
+             给定爬虫名称的Spider类。
+
+         Raises:
+             KeyError: If the spider name is not found.
+                 如果找不到爬虫名称。
          """
          try:
              return self._spiders[spider_name]
@@ -95,6 +226,19 @@ class SpiderLoader:
      def find_by_request(self, request):
          """
          Return the list of spider names that can handle the given request.
+         返回可以处理给定请求的爬虫名称列表。
+
+         This method checks each spider's handles_request method to determine
+         if it can handle the given request.
+         此方法检查每个爬虫的handles_request方法,以确定它是否可以处理给定的请求。
+
+         Args:
+             request: The request to check.
+                 要检查的请求。
+
+         Returns:
+             A list of spider names that can handle the request.
+             可以处理请求的爬虫名称列表。
          """
          return [
              name for name, cls in self._spiders.items()
@@ -104,5 +248,14 @@ class SpiderLoader:
      def list(self):
          """
          Return a list with the names of all spiders available in the project.
+         返回项目中所有可用爬虫的名称列表。
+
+         This method returns a list of all spider names that have been loaded
+         by the spider loader.
+         此方法返回已由爬虫加载器加载的所有爬虫名称的列表。
+
+         Returns:
+             A list of spider names.
+             爬虫名称列表。
          """
          return list(self._spiders.keys())
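As a rough usage sketch of the loader documented above: SPIDER_MODULES and SPIDER_LOADER_WARN_ONLY are the two settings read in __init__, and load() and list() behave as the docstrings state. The Settings import path and its dict-style constructor are assumptions based on the aioscrapy/settings package in the file list; 'myproject.spiders' and 'demo' are placeholders.

from aioscrapy.settings import Settings  # assumed import path, mirrors the settings/ package listed above
from aioscrapy.spiderloader import SpiderLoader

# The two settings consumed by SpiderLoader.__init__ above.
settings = Settings({
    'SPIDER_MODULES': ['myproject.spiders'],   # placeholder module path
    'SPIDER_LOADER_WARN_ONLY': True,           # warn instead of raising on import errors
})

loader = SpiderLoader.from_settings(settings)
print(loader.list())               # names of every spider found in SPIDER_MODULES
spider_cls = loader.load('demo')   # raises KeyError if 'demo' is not a known spider name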