aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/extensions/throttle.py
@@ -1,51 +1,237 @@
+"""
+Auto Throttle Extension
+自动限速扩展
+
+This extension automatically adjusts the download delay between requests based on
+the response latency, helping to avoid overloading servers and improving crawling
+efficiency. It dynamically increases or decreases the delay to maintain a target
+level of concurrency.
+此扩展根据响应延迟自动调整请求之间的下载延迟,有助于避免服务器过载并提高爬取效率。
+它动态地增加或减少延迟以维持目标并发级别。
+
+The extension works by measuring the latency of responses and adjusting the delay
+to try to maintain a specified number of concurrent requests to each domain.
+该扩展通过测量响应的延迟并调整延迟来尝试维持对每个域的指定数量的并发请求。
+"""
 from aioscrapy import signals
 from aioscrapy.exceptions import NotConfigured
 from aioscrapy.utils.log import logger
 
 
 class AutoThrottle:
+    """
+    Extension for automatically adjusting download delays based on response latency.
+    基于响应延迟自动调整下载延迟的扩展。
+
+    This extension dynamically adjusts the download delay between requests to maintain
+    a target level of concurrency. It helps to avoid overloading servers while
+    maximizing the crawling speed.
+    此扩展动态调整请求之间的下载延迟以维持目标并发级别。它有助于避免服务器过载,
+    同时最大化爬取速度。
+    """
 
     def __init__(self, crawler):
+        """
+        Initialize the AutoThrottle extension.
+        初始化AutoThrottle扩展。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Raises:
+            NotConfigured: If AUTOTHROTTLE_ENABLED is not set to True in the settings.
+                如果在设置中未将AUTOTHROTTLE_ENABLED设置为True。
+        """
+        # Store the crawler
+        # 存储爬虫
         self.crawler = crawler
+
+        # Check if the extension is enabled
+        # 检查扩展是否已启用
         if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
             raise NotConfigured
 
+        # Get debug setting
+        # 获取调试设置
         self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
+
+        # Get target concurrency setting
+        # 获取目标并发设置
         self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
+
+        # Connect to signals
+        # 连接到信号
         crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create an AutoThrottle instance from a crawler.
+        从爬虫创建AutoThrottle实例。
+
+        This is the factory method used by Scrapy to create the extension.
+        这是Scrapy用于创建扩展的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this extension.
+                将使用此扩展的爬虫。
+
+        Returns:
+            AutoThrottle: A new AutoThrottle instance.
+                一个新的AutoThrottle实例。
+        """
+        # Create and return a new instance
+        # 创建并返回一个新实例
         return cls(crawler)
 
     def _spider_opened(self, spider):
+        """
+        Handle the spider_opened signal.
+        处理spider_opened信号。
+
+        This method is called when a spider is opened. It initializes the minimum,
+        maximum, and starting download delays.
+        当爬虫打开时调用此方法。它初始化最小、最大和起始下载延迟。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
+        # Calculate minimum delay
+        # 计算最小延迟
         self.mindelay = self._min_delay(spider)
+
+        # Calculate maximum delay
+        # 计算最大延迟
         self.maxdelay = self._max_delay(spider)
+
+        # Set initial download delay for the spider
+        # 为爬虫设置初始下载延迟
         spider.download_delay = self._start_delay(spider)
 
     def _min_delay(self, spider):
+        """
+        Get the minimum download delay.
+        获取最小下载延迟。
+
+        This method returns the minimum download delay, which is either the spider's
+        download_delay attribute or the DOWNLOAD_DELAY setting.
+        此方法返回最小下载延迟,即爬虫的download_delay属性或DOWNLOAD_DELAY设置。
+
+        Args:
+            spider: The spider to get the minimum delay for.
+                要获取最小延迟的爬虫。
+
+        Returns:
+            float: The minimum download delay in seconds.
+                最小下载延迟(以秒为单位)。
+        """
+        # Get settings
+        # 获取设置
         s = self.crawler.settings
+
+        # Return spider's download_delay attribute or DOWNLOAD_DELAY setting
+        # 返回爬虫的download_delay属性或DOWNLOAD_DELAY设置
         return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))
 
     def _max_delay(self, spider):
+        """
+        Get the maximum download delay.
+        获取最大下载延迟。
+
+        This method returns the maximum download delay from the AUTOTHROTTLE_MAX_DELAY setting.
+        此方法从AUTOTHROTTLE_MAX_DELAY设置返回最大下载延迟。
+
+        Args:
+            spider: The spider to get the maximum delay for.
+                要获取最大延迟的爬虫。
+
+        Returns:
+            float: The maximum download delay in seconds.
+                最大下载延迟(以秒为单位)。
+        """
+        # Return AUTOTHROTTLE_MAX_DELAY setting
+        # 返回AUTOTHROTTLE_MAX_DELAY设置
         return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')
 
     def _start_delay(self, spider):
+        """
+        Get the initial download delay.
+        获取初始下载延迟。
+
+        This method returns the initial download delay, which is the maximum of
+        the minimum delay and the AUTOTHROTTLE_START_DELAY setting.
+        此方法返回初始下载延迟,即最小延迟和AUTOTHROTTLE_START_DELAY设置的最大值。
+
+        Args:
+            spider: The spider to get the start delay for.
+                要获取起始延迟的爬虫。
+
+        Returns:
+            float: The initial download delay in seconds.
+                初始下载延迟(以秒为单位)。
+        """
+        # Return the maximum of minimum delay and AUTOTHROTTLE_START_DELAY setting
+        # 返回最小延迟和AUTOTHROTTLE_START_DELAY设置的最大值
         return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))
 
     def _response_downloaded(self, response, request, spider):
+        """
+        Handle the response_downloaded signal.
+        处理response_downloaded信号。
+
+        This method is called when a response is downloaded. It adjusts the download
+        delay based on the response latency and logs debug information if enabled.
+        当下载响应时调用此方法。它根据响应延迟调整下载延迟,并在启用时记录调试信息。
+
+        Args:
+            response: The downloaded response.
+                下载的响应。
+            request: The request that generated the response.
+                生成响应的请求。
+            spider: The spider that made the request.
+                发出请求的爬虫。
+        """
+        # Get the download slot for the request
+        # 获取请求的下载槽
         key, slot = self._get_slot(request, spider)
+
+        # Get the download latency from the request metadata
+        # 从请求元数据获取下载延迟
         latency = request.meta.get('download_latency')
+
+        # If latency or slot is not available, do nothing
+        # 如果延迟或槽不可用,则不执行任何操作
         if latency is None or slot is None:
             return
 
+        # Store the old delay for logging
+        # 存储旧延迟以供记录
         olddelay = slot.delay
+
+        # Adjust the delay based on the latency and response
+        # 根据延迟和响应调整延迟
         self._adjust_delay(slot, latency, response)
+
+        # Log debug information if enabled
+        # 如果启用,则记录调试信息
         if self.debug:
+            # Calculate the delay difference
+            # 计算延迟差异
            diff = slot.delay - olddelay
+
+            # Get the response size
+            # 获取响应大小
            size = len(response.body)
+
+            # Get the number of concurrent requests
+            # 获取并发请求数
            conc = len(slot.transferring)
+
+            # Log the debug information
+            # 记录调试信息
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
@@ -57,32 +243,80 @@ class AutoThrottle:
             )
 
     def _get_slot(self, request, spider):
+        """
+        Get the download slot for a request.
+        获取请求的下载槽。
+
+        This method returns the download slot key and the slot object for a request.
+        此方法返回请求的下载槽键和槽对象。
+
+        Args:
+            request: The request to get the slot for.
+                要获取槽的请求。
+            spider: The spider that made the request.
+                发出请求的爬虫。
+
+        Returns:
+            tuple: A tuple containing the slot key and the slot object.
+                包含槽键和槽对象的元组。
+        """
+        # Get the download slot key from the request metadata
+        # 从请求元数据获取下载槽键
         key = request.meta.get('download_slot')
+
+        # Return the key and the corresponding slot object
+        # 返回键和相应的槽对象
         return key, self.crawler.engine.downloader.slots.get(key)
 
     def _adjust_delay(self, slot, latency, response):
-        """Define delay adjustment policy"""
+        """
+        Adjust the download delay based on the response latency.
+        根据响应延迟调整下载延迟。
+
+        This method implements the delay adjustment policy. It calculates a new
+        download delay based on the response latency and the target concurrency,
+        and updates the slot's delay if appropriate.
+        此方法实现延迟调整策略。它根据响应延迟和目标并发计算新的下载延迟,
+        并在适当时更新槽的延迟。
 
+        Args:
+            slot: The download slot to adjust the delay for.
+                要调整延迟的下载槽。
+            latency: The download latency of the response.
+                响应的下载延迟。
+            response: The downloaded response.
+                下载的响应。
+        """
         # If a server needs `latency` seconds to respond then
         # we should send a request each `latency/N` seconds
         # to have N requests processed in parallel
+        # 如果服务器需要`latency`秒来响应,那么我们应该每`latency/N`秒发送一个请求,
+        # 以便并行处理N个请求
         target_delay = latency / self.target_concurrency
 
         # Adjust the delay to make it closer to target_delay
+        # 调整延迟使其更接近target_delay
         new_delay = (slot.delay + target_delay) / 2.0
 
         # If target delay is bigger than old delay, then use it instead of mean.
         # It works better with problematic sites.
+        # 如果目标延迟大于旧延迟,则使用它而不是平均值。
+        # 这对于有问题的站点效果更好。
         new_delay = max(target_delay, new_delay)
 
         # Make sure self.mindelay <= new_delay <= self.max_delay
+        # 确保self.mindelay <= new_delay <= self.max_delay
         new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
 
         # Dont adjust delay if response status != 200 and new delay is smaller
         # than old one, as error pages (and redirections) are usually small and
         # so tend to reduce latency, thus provoking a positive feedback by
         # reducing delay instead of increase.
+        # 如果响应状态 != 200且新延迟小于旧延迟,则不调整延迟,因为错误页面(和重定向)
+        # 通常很小,因此倾向于减少延迟,从而引发正反馈,减少延迟而不是增加。
         if response.status != 200 and new_delay <= slot.delay:
             return
 
+        # Update the slot's delay
+        # 更新槽的延迟
         slot.delay = new_delay
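
For context, the extension is opt-in: its constructor raises NotConfigured unless AUTOTHROTTLE_ENABLED is true. Below is a minimal sketch of the project settings that drive the diffed code; the setting names are the ones read above (AUTOTHROTTLE_ENABLED, AUTOTHROTTLE_DEBUG, AUTOTHROTTLE_TARGET_CONCURRENCY, AUTOTHROTTLE_START_DELAY, AUTOTHROTTLE_MAX_DELAY, DOWNLOAD_DELAY), while the values are illustrative assumptions, not defaults taken from the package.

# settings.py (sketch; values are illustrative assumptions)
AUTOTHROTTLE_ENABLED = True             # required, otherwise the extension raises NotConfigured
AUTOTHROTTLE_DEBUG = True               # log one line per response with the adjusted delay
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0   # desired concurrent requests per download slot
AUTOTHROTTLE_START_DELAY = 5.0          # initial delay, lower-bounded by the minimum delay
AUTOTHROTTLE_MAX_DELAY = 60.0           # upper bound for the adjusted delay
DOWNLOAD_DELAY = 0.25                   # minimum delay (overridden by spider.download_delay if set)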
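
The adjustment policy itself is unchanged by this release; the diff only documents it. The following self-contained sketch reproduces the arithmetic from _adjust_delay with a worked example; the helper name adjust_delay and its argument list are hypothetical, introduced only to show the math outside the extension (the real code reads and writes the delay on the downloader slot).

# Standalone sketch of the _adjust_delay policy (hypothetical helper, illustrative only).
def adjust_delay(current_delay, latency, target_concurrency,
                 min_delay, max_delay, status):
    # Send one request every latency / N seconds to keep N requests in flight.
    target_delay = latency / target_concurrency
    # Move halfway from the current delay toward the target delay.
    new_delay = (current_delay + target_delay) / 2.0
    # If the target is larger than the current delay, jump straight to it.
    new_delay = max(target_delay, new_delay)
    # Clamp the result to the configured bounds.
    new_delay = min(max(min_delay, new_delay), max_delay)
    # Never speed up on non-200 responses: error pages and redirects are small
    # and fast, so lowering the delay there would create a feedback loop.
    if status != 200 and new_delay <= current_delay:
        return current_delay
    return new_delay

# Example: a 2 s latency with target concurrency 2 pulls a 0.25 s delay up to 1 s.
print(adjust_delay(0.25, 2.0, 2.0, 0.0, 60.0, 200))  # 1.0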