aio-scrapy 2.1.4__py3-none-any.whl → 2.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
  2. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -41
  3. aio_scrapy-2.1.6.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +187 -3
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +124 -3
  11. aioscrapy/core/downloader/handlers/httpx.py +133 -3
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +132 -3
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +313 -13
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  105. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  106. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  107. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  108. aioscrapy/http/response/playwright.py +0 -36
  109. aioscrapy/libs/pipelines/execl.py +0 -169
  110. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,17 @@
1
+ """
2
+ Abstract Middleware Manager Module
3
+ 抽象中间件管理器模块
4
+
5
+ This module provides the abstract base class for middleware managers in AioScrapy.
6
+ Middleware managers are responsible for loading, organizing, and executing middleware
7
+ components in the correct order during the request/response processing cycle.
8
+ 此模块提供了AioScrapy中间件管理器的抽象基类。中间件管理器负责在请求/响应处理
9
+ 周期中以正确的顺序加载、组织和执行中间件组件。
10
+
11
+ Concrete implementations of this abstract class include the DownloaderMiddlewareManager
12
+ and SpiderMiddlewareManager classes.
13
+ 此抽象类的具体实现包括DownloaderMiddlewareManager和SpiderMiddlewareManager类。
14
+ """
1
15
  import pprint
2
16
  from abc import ABCMeta, abstractmethod
3
17
  from collections import defaultdict, deque
@@ -9,82 +23,394 @@ from aioscrapy.utils.tools import call_helper
9
23
 
10
24
 
11
25
  class AbsMiddlewareManager(object, metaclass=ABCMeta):
12
- """Base class for implementing middleware managers"""
26
+ """
27
+ Abstract base class for implementing middleware managers.
28
+ 实现中间件管理器的抽象基类。
29
+
30
+ This class provides the common functionality for managing middleware components,
31
+ including loading middleware from settings, organizing them in the correct order,
32
+ and executing their methods during the request/response processing cycle.
33
+ 此类提供了管理中间件组件的通用功能,包括从设置加载中间件、以正确的顺序组织它们,
34
+ 以及在请求/响应处理周期中执行它们的方法。
13
35
 
36
+ Concrete subclasses must implement the _get_mwlist_from_settings method to
37
+ specify how to retrieve the middleware list from settings.
38
+ 具体子类必须实现_get_mwlist_from_settings方法,以指定如何从设置中检索中间件列表。
39
+ """
40
+
41
+ # Name of the middleware component, to be overridden by subclasses
42
+ # 中间件组件的名称,由子类覆盖
14
43
  component_name = 'foo middleware'
15
44
 
16
45
  def __init__(self, *middlewares):
46
+ """
47
+ Initialize the middleware manager with a list of middleware instances.
48
+ 使用中间件实例列表初始化中间件管理器。
49
+
50
+ Args:
51
+ *middlewares: Variable length list of middleware instances.
52
+ 可变长度的中间件实例列表。
53
+ """
54
+ # Store the middleware instances
55
+ # 存储中间件实例
17
56
  self.middlewares = middlewares
57
+
58
+ # Dictionary to store middleware methods by name
59
+ # 按名称存储中间件方法的字典
18
60
  self.methods = defaultdict(deque)
61
+
62
+ # Add each middleware to the manager
63
+ # 将每个中间件添加到管理器
19
64
  for mw in middlewares:
20
65
  self._add_middleware(mw)
21
66
 
22
67
  @classmethod
23
68
  @abstractmethod
24
69
  def _get_mwlist_from_settings(cls, settings):
25
- """get middleware list from settings"""
70
+ """
71
+ Get middleware list from settings.
72
+ 从设置中获取中间件列表。
73
+
74
+ This abstract method must be implemented by subclasses to specify
75
+ how to retrieve the middleware list from the settings object.
76
+ 此抽象方法必须由子类实现,以指定如何从设置对象中检索中间件列表。
77
+
78
+ Args:
79
+ settings: The settings object.
80
+ 设置对象。
81
+
82
+ Returns:
83
+ list: A list of middleware class paths.
84
+ 中间件类路径列表。
85
+ """
86
+ pass
26
87
 
27
88
  @classmethod
28
89
  async def from_settings(cls, settings, crawler=None):
90
+ """
91
+ Create a middleware manager from settings.
92
+ 从设置创建中间件管理器。
93
+
94
+ This method loads middleware instances from the settings object and
95
+ creates a new middleware manager with those instances.
96
+ 此方法从设置对象加载中间件实例,并使用这些实例创建新的中间件管理器。
97
+
98
+ Args:
99
+ settings: The settings object.
100
+ 设置对象。
101
+ crawler: Optional crawler instance.
102
+ 可选的爬虫实例。
103
+ Defaults to None.
104
+ 默认为None。
105
+
106
+ Returns:
107
+ AbsMiddlewareManager: A new middleware manager instance.
108
+ 新的中间件管理器实例。
109
+ """
110
+ # Get middleware list from settings
111
+ # 从设置中获取中间件列表
29
112
  mwlist = cls._get_mwlist_from_settings(settings)
113
+
114
+ # Lists to store loaded middleware instances and enabled middleware paths
115
+ # 用于存储已加载的中间件实例和已启用的中间件路径的列表
30
116
  middlewares = []
31
117
  enabled = []
118
+
119
+ # Load each middleware
120
+ # 加载每个中间件
32
121
  for clspath in mwlist:
33
122
  try:
123
+ # Load the middleware instance
124
+ # 加载中间件实例
34
125
  middlewares.append(await load_instance(clspath, settings=settings, crawler=crawler))
126
+
127
+ # Add to enabled list
128
+ # 添加到已启用列表
35
129
  enabled.append(clspath)
36
130
  except NotConfigured as e:
131
+ # Log warning if middleware is disabled with a reason
132
+ # 如果中间件因某种原因被禁用,则记录警告
37
133
  if e.args:
38
134
  clsname = clspath.split('.')[-1]
39
135
  logger.warning("Disabled %(clsname)s: %(eargs)s" % {'clsname': clsname, 'eargs': e.args[0]})
40
136
 
137
+ # Log enabled middlewares
138
+ # 记录已启用的中间件
41
139
  logger.info(f"Enabled {cls.component_name}s:\n{pprint.pformat(enabled)}")
140
+
141
+ # Create and return a new middleware manager instance
142
+ # 创建并返回新的中间件管理器实例
42
143
  return cls(*middlewares)
43
144
 
44
145
  @classmethod
45
146
  async def from_crawler(cls, crawler):
147
+ """
148
+ Create a middleware manager from a crawler.
149
+ 从爬虫创建中间件管理器。
150
+
151
+ This is a convenience method that calls from_settings with the crawler's settings.
152
+ 这是一个便捷方法,使用爬虫的设置调用from_settings。
153
+
154
+ Args:
155
+ crawler: The crawler instance.
156
+ 爬虫实例。
157
+
158
+ Returns:
159
+ AbsMiddlewareManager: A new middleware manager instance.
160
+ 新的中间件管理器实例。
161
+ """
162
+ # Create middleware manager from crawler's settings
163
+ # 从爬虫的设置创建中间件管理器
46
164
  return await cls.from_settings(crawler.settings, crawler)
47
165
 
48
166
  def _add_middleware(self, mw):
167
+ """
168
+ Add a middleware instance to the manager.
169
+ 将中间件实例添加到管理器。
170
+
171
+ This method registers the middleware's open_spider and close_spider methods
172
+ if they exist. Note that close_spider methods are added to the left of the
173
+ deque, so they are executed in reverse order.
174
+ 此方法注册中间件的open_spider和close_spider方法(如果存在)。请注意,
175
+ close_spider方法被添加到deque的左侧,因此它们以相反的顺序执行。
176
+
177
+ Args:
178
+ mw: The middleware instance to add.
179
+ 要添加的中间件实例。
180
+ """
181
+ # Register open_spider method if it exists
182
+ # 如果存在,则注册open_spider方法
49
183
  if hasattr(mw, 'open_spider'):
50
184
  self.methods['open_spider'].append(mw.open_spider)
185
+
186
+ # Register close_spider method if it exists (added to the left for reverse order)
187
+ # 如果存在,则注册close_spider方法(添加到左侧以便逆序执行)
51
188
  if hasattr(mw, 'close_spider'):
52
189
  self.methods['close_spider'].appendleft(mw.close_spider)
53
190
 
54
191
  async def _process_parallel(self, methodname, obj, *args):
192
+ """
193
+ Process middleware methods in parallel.
194
+ 并行处理中间件方法。
195
+
196
+ This method calls the process_parallel static method with the middleware
197
+ methods registered for the given method name.
198
+ 此方法使用为给定方法名注册的中间件方法调用process_parallel静态方法。
199
+
200
+ Args:
201
+ methodname: The name of the middleware method to call.
202
+ 要调用的中间件方法的名称。
203
+ obj: The object to pass to the middleware methods.
204
+ 传递给中间件方法的对象。
205
+ *args: Additional arguments to pass to the middleware methods.
206
+ 传递给中间件方法的其他参数。
207
+
208
+ Returns:
209
+ The result of process_parallel.
210
+ process_parallel的结果。
211
+ """
212
+ # Call process_parallel with the methods registered for methodname
213
+ # 使用为methodname注册的方法调用process_parallel
55
214
  return await self.process_parallel(self.methods[methodname], obj, *args)
56
215
 
57
216
  async def _process_chain(self, methodname, obj, *args):
217
+ """
218
+ Process middleware methods in a chain.
219
+ 链式处理中间件方法。
220
+
221
+ This method calls the process_chain static method with the middleware
222
+ methods registered for the given method name.
223
+ 此方法使用为给定方法名注册的中间件方法调用process_chain静态方法。
224
+
225
+ Args:
226
+ methodname: The name of the middleware method to call.
227
+ 要调用的中间件方法的名称。
228
+ obj: The object to pass to the middleware methods.
229
+ 传递给中间件方法的对象。
230
+ *args: Additional arguments to pass to the middleware methods.
231
+ 传递给中间件方法的其他参数。
232
+
233
+ Returns:
234
+ The result of process_chain.
235
+ process_chain的结果。
236
+ """
237
+ # Call process_chain with the methods registered for methodname
238
+ # 使用为methodname注册的方法调用process_chain
58
239
  return await self.process_chain(self.methods[methodname], obj, *args)
59
240
 
60
241
  async def _process_chain_both(self, cb_methodname, eb_methodname, obj, *args):
242
+ """
243
+ Process middleware methods in a chain with error handling.
244
+ 带错误处理的链式处理中间件方法。
245
+
246
+ This method calls the process_chain_both static method with the middleware
247
+ methods registered for the given callback and errback method names.
248
+ 此方法使用为给定回调和错误回调方法名注册的中间件方法调用process_chain_both静态方法。
249
+
250
+ Args:
251
+ cb_methodname: The name of the callback middleware method.
252
+ 回调中间件方法的名称。
253
+ eb_methodname: The name of the errback middleware method.
254
+ 错误回调中间件方法的名称。
255
+ obj: The object to pass to the middleware methods.
256
+ 传递给中间件方法的对象。
257
+ *args: Additional arguments to pass to the middleware methods.
258
+ 传递给中间件方法的其他参数。
259
+
260
+ Returns:
261
+ The result of process_chain_both.
262
+ process_chain_both的结果。
263
+ """
264
+ # Call process_chain_both with the methods registered for cb_methodname and eb_methodname
265
+ # 使用为cb_methodname和eb_methodname注册的方法调用process_chain_both
61
266
  return await self.process_chain_both(self.methods[cb_methodname],
62
267
  self.methods[eb_methodname], obj, *args)
63
268
 
64
269
  async def open_spider(self, spider):
270
+ """
271
+ Call the open_spider method of all middlewares.
272
+ 调用所有中间件的open_spider方法。
273
+
274
+ This method is called when a spider is opened. It calls the open_spider
275
+ method of all middlewares in parallel.
276
+ 当爬虫打开时调用此方法。它并行调用所有中间件的open_spider方法。
277
+
278
+ Args:
279
+ spider: The spider being opened.
280
+ 正在打开的爬虫。
281
+ """
282
+ # Process open_spider methods in parallel
283
+ # 并行处理open_spider方法
65
284
  return await self._process_parallel('open_spider', spider)
66
285
 
67
286
  async def close_spider(self, spider):
287
+ """
288
+ Call the close_spider method of all middlewares.
289
+ 调用所有中间件的close_spider方法。
290
+
291
+ This method is called when a spider is closed. It calls the close_spider
292
+ method of all middlewares in parallel, but in reverse order from how they
293
+ were registered.
294
+ 当爬虫关闭时调用此方法。它并行调用所有中间件的close_spider方法,
295
+ 但顺序与它们注册的顺序相反。
296
+
297
+ Args:
298
+ spider: The spider being closed.
299
+ 正在关闭的爬虫。
300
+ """
301
+ # Process close_spider methods in parallel (in reverse order)
302
+ # 并行处理close_spider方法(以相反的顺序)
68
303
  return await self._process_parallel('close_spider', spider)
69
304
 
70
305
  @staticmethod
71
306
  async def process_parallel(callbacks, input_, *a, **kw):
307
+ """
308
+ Process callbacks in parallel.
309
+ 并行处理回调函数。
310
+
311
+ This method calls all callbacks with the same input object. The callbacks
312
+ are executed in the order they were registered, but their results are not
313
+ passed to subsequent callbacks.
314
+ 此方法使用相同的输入对象调用所有回调函数。回调函数按照它们注册的顺序执行,
315
+ 但它们的结果不会传递给后续的回调函数。
316
+
317
+ Args:
318
+ callbacks: List of callback functions.
319
+ 回调函数列表。
320
+ input_: Input object to pass to callbacks.
321
+ 传递给回调函数的输入对象。
322
+ *a: Additional positional arguments.
323
+ 额外的位置参数。
324
+ **kw: Additional keyword arguments.
325
+ 额外的关键字参数。
326
+ """
327
+ # Call each callback with the same input
328
+ # 使用相同的输入调用每个回调函数
72
329
  for callback in callbacks:
73
330
  await call_helper(callback, input_, *a, **kw)
74
331
 
75
332
  @staticmethod
76
333
  async def process_chain(callbacks, input_, *a, **kw):
334
+ """
335
+ Process callbacks in a chain.
336
+ 链式处理回调函数。
337
+
338
+ This method calls callbacks in sequence, passing the result of each callback
339
+ to the next one. If a callback returns None, the current input (the latest non-None result) is passed
340
+ to the next callback instead.
341
+ 此方法按顺序调用回调函数,将每个回调函数的结果传递给下一个回调函数。
342
+ 如果回调函数返回None,则原始输入将传递给下一个回调函数。
343
+
344
+ Args:
345
+ callbacks: List of callback functions.
346
+ 回调函数列表。
347
+ input_: Initial input object.
348
+ 初始输入对象。
349
+ *a: Additional positional arguments.
350
+ 额外的位置参数。
351
+ **kw: Additional keyword arguments.
352
+ 额外的关键字参数。
353
+
354
+ Returns:
355
+ object: The final result after all callbacks have been processed.
356
+ 所有回调函数处理后的最终结果。
357
+ """
358
+ # Process each callback in sequence
359
+ # 按顺序处理每个回调函数
77
360
  for callback in callbacks:
361
+ # Call the callback with the current input
362
+ # 使用当前输入调用回调函数
78
363
  input_result = await call_helper(callback, input_, *a, **kw)
364
+
365
+ # Update input if the callback returned a non-None result
366
+ # 如果回调函数返回非None结果,则更新输入
79
367
  if input_result is not None:
80
368
  input_ = input_result
369
+
370
+ # Return the final result
371
+ # 返回最终结果
81
372
  return input_
82
373
 
83
374
  @staticmethod
84
375
  async def process_chain_both(callbacks, errbacks, input_, *a, **kw):
376
+ """
377
+ Process callbacks and errbacks in a chain.
378
+ 链式处理回调函数和错误回调函数。
379
+
380
+ This method calls callbacks in sequence, passing the result of each callback
381
+ to the next one. If a callback raises an exception, the corresponding errback
382
+ is called with the same input.
383
+ 此方法按顺序调用回调函数,将每个回调函数的结果传递给下一个回调函数。
384
+ 如果回调函数引发异常,则使用相同的输入调用相应的错误回调函数。
385
+
386
+ Args:
387
+ callbacks: List of callback functions.
388
+ 回调函数列表。
389
+ errbacks: List of error callback functions.
390
+ 错误回调函数列表。
391
+ input_: Initial input object.
392
+ 初始输入对象。
393
+ *a: Additional positional arguments.
394
+ 额外的位置参数。
395
+ **kw: Additional keyword arguments.
396
+ 额外的关键字参数。
397
+
398
+ Returns:
399
+ object: The final result after all callbacks have been processed.
400
+ 所有回调函数处理后的最终结果。
401
+ """
402
+ # Process each callback/errback pair
403
+ # 处理每对回调/错误回调函数
85
404
  for cb, eb in zip(callbacks, errbacks):
86
405
  try:
406
+ # Try to call the callback
407
+ # 尝试调用回调函数
87
408
  input_ = await call_helper(cb, input_, *a, **kw)
88
409
  except(Exception, BaseException):
410
+ # If an exception occurs, call the errback
411
+ # 如果发生异常,调用错误回调函数
89
412
  input_ = await call_helper(eb, input_, *a, **kw)
413
+
414
+ # Return after the first callback/errback pair
415
+ # 在第一对回调/错误回调函数后返回
90
416
  return input_