aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,38 @@
+"""
+Downloader Module
+下载器模块
+
+This module provides the core downloader functionality for AioScrapy. The downloader
+is responsible for fetching web pages and other resources from the internet, managing
+concurrency, handling delays between requests, and processing the results through
+middleware.
+此模块为AioScrapy提供核心下载功能。下载器负责从互联网获取网页和其他资源,
+管理并发,处理请求之间的延迟,并通过中间件处理结果。
+
+The main components are:
+主要组件包括:
+
+1. BaseDownloader: Abstract base class defining the downloader interface
+   定义下载器接口的抽象基类
+2. Downloader: Default implementation of the downloader
+   下载器的默认实现
+3. Slot: Class for managing per-domain or per-IP concurrency and delays
+   用于管理每个域名或每个IP的并发和延迟的类
+
+The downloader respects various settings like:
+下载器遵循各种设置,如:
+
+- CONCURRENT_REQUESTS: Global concurrency limit
+  全局并发限制
+- CONCURRENT_REQUESTS_PER_DOMAIN: Per-domain concurrency limit
+  每个域名的并发限制
+- CONCURRENT_REQUESTS_PER_IP: Per-IP concurrency limit
+  每个IP的并发限制
+- DOWNLOAD_DELAY: Delay between requests
+  请求之间的延迟
+- RANDOMIZE_DOWNLOAD_DELAY: Whether to randomize delays
+  是否随机化延迟
+"""
 import asyncio
 import random
 from abc import abstractmethod
@@ -21,11 +56,48 @@ from aioscrapy.utils.tools import call_helper, create_task
 
 
 class BaseDownloaderMeta(type):
+    """
+    Metaclass for BaseDownloader that implements the virtual subclass pattern.
+    BaseDownloader的元类,实现虚拟子类模式。
+
+    This metaclass allows classes to be recognized as BaseDownloader subclasses
+    if they implement the required interface, even if they don't explicitly inherit from it.
+    该元类允许类被识别为BaseDownloader的子类,如果它们实现了所需的接口,即使它们没有显式地继承它。
+    """
 
     def __instancecheck__(cls, instance):
+        """
+        Check if an instance is an instance of this class.
+        检查实例是否是此类的实例。
+
+        Args:
+            instance: The instance to check.
+                要检查的实例。
+
+        Returns:
+            bool: True if the instance implements the required interface.
+                如果实例实现了所需的接口,则为True。
+        """
         return cls.__subclasscheck__(type(instance))
 
     def __subclasscheck__(cls, subclass):
+        """
+        Check if a class is a subclass of this class.
+        检查类是否是此类的子类。
+
+        A class is considered a subclass if it implements the required methods:
+        如果类实现了所需的方法,则被视为子类:
+        - fetch: For downloading requests
+        - needs_backout: For checking if the downloader is at capacity
+
+        Args:
+            subclass: The class to check.
+                要检查的类。
+
+        Returns:
+            bool: True if the class implements the required interface.
+                如果类实现了所需的接口,则为True。
+        """
         return (
             hasattr(subclass, "fetch") and callable(subclass.fetch)
             and hasattr(subclass, "needs_backout") and callable(subclass.needs_backout)
@@ -33,20 +105,79 @@ class BaseDownloaderMeta(type):
 
 
 class BaseDownloader(metaclass=BaseDownloaderMeta):
+    """
+    Abstract base class for downloaders.
+    下载器的抽象基类。
+
+    This class defines the interface that all downloaders must implement.
+    此类定义了所有下载器必须实现的接口。
+    """
 
     @classmethod
     async def from_crawler(cls, crawler) -> "BaseDownloader":
+        """
+        Create a downloader instance from a crawler.
+        从爬虫创建下载器实例。
+
+        This is a factory method that creates a downloader instance from a crawler.
+        In the base class, the crawler parameter is not used, but subclasses can
+        override this method to use crawler settings or other attributes to
+        configure the downloader.
+        这是一个工厂方法,从爬虫创建下载器实例。在基类中,crawler参数未被使用,
+        但子类可以覆盖此方法以使用爬虫设置或其他属性来配置下载器。
+
+        Args:
+            crawler: The crawler instance that will use this downloader.
+                将使用此下载器的爬虫实例。
+                This parameter is not used in the base implementation but is
+                provided for subclasses to use.
+                此参数在基本实现中未使用,但提供给子类使用。
+
+        Returns:
+            BaseDownloader: A new downloader instance.
+                一个新的下载器实例。
+        """
+        # The crawler parameter is intentionally unused in the base implementation
+        # 在基本实现中有意不使用crawler参数
         return cls()
 
     async def close(self) -> None:
+        """
+        Close the downloader and release its resources.
+        关闭下载器并释放其资源。
+
+        This method is called when the spider is closed.
+        当爬虫关闭时调用此方法。
+        """
         pass
 
     @abstractmethod
     async def fetch(self, request: Request) -> None:
+        """
+        Fetch a request.
+        获取请求。
+
+        This method should download the given request and call the appropriate
+        callback with the result.
+        此方法应下载给定的请求并使用结果调用适当的回调。
+
+        Args:
+            request: The request to fetch.
+                要获取的请求。
+        """
         raise NotImplementedError()
 
     @abstractmethod
     def needs_backout(self) -> bool:
+        """
+        Check if the downloader needs to back out (stop accepting new requests).
+        检查下载器是否需要退出(停止接受新请求)。
+
+        Returns:
+            bool: True if the downloader is at capacity and should not accept
+                new requests, False otherwise.
+                如果下载器已达到容量并且不应接受新请求,则为True,否则为False。
+        """
         raise NotImplementedError()
 
 
@@ -54,33 +185,96 @@ DownloaderTV = TypeVar("DownloaderTV", bound="Downloader")
 
 
 class Slot:
-    """Downloader slot"""
+    """
+    Downloader slot for managing per-domain or per-IP concurrency and delays.
+    用于管理每个域名或每个IP的并发和延迟的下载器槽。
+
+    Each domain or IP has its own slot to control:
+    每个域名或IP都有自己的槽来控制:
+    - Concurrency: How many requests can be processed simultaneously
+      并发:可以同时处理多少请求
+    - Delay: How long to wait between requests
+      延迟:请求之间等待多长时间
+    """
 
     def __init__(self, concurrency: int, delay: float, randomize_delay: bool) -> None:
+        """
+        Initialize a new downloader slot.
+        初始化一个新的下载器槽。
+
+        Args:
+            concurrency: Maximum number of concurrent requests for this slot.
+                此槽的最大并发请求数。
+            delay: Minimum delay between requests in seconds.
+                请求之间的最小延迟(秒)。
+            randomize_delay: Whether to randomize the delay between requests.
+                是否随机化请求之间的延迟。
+        """
         self.concurrency = concurrency
         self.delay = delay
         self.randomize_delay = randomize_delay
 
-        self.active: Set[Request] = set()
-        self.transferring: Set[Request] = set()
-        self.queue: Deque[Request] = deque()
-        self.lastseen: float = 0
-        self.delay_lock: bool = False
-
-    def free_transfer_slots(self):
+        self.active: Set[Request] = set()  # All requests being processed by this slot
+        # 此槽正在处理的所有请求
+        self.transferring: Set[Request] = set()  # Requests being downloaded
+        # 正在下载的请求
+        self.queue: Deque[Request] = deque()  # Requests queued for download
+        # 排队等待下载的请求
+        self.lastseen: float = 0  # Timestamp of last request processed
+        # 上次处理请求的时间戳
+        self.delay_lock: bool = False  # Lock to prevent concurrent delay processing
+        # 锁定以防止并发延迟处理
+
+    def free_transfer_slots(self) -> int:
+        """
+        Calculate how many more requests can be processed concurrently.
+        计算可以同时处理多少个更多的请求。
+
+        Returns:
+            int: Number of available transfer slots.
+                可用传输槽的数量。
+        """
         return self.concurrency - len(self.transferring)
 
     def download_delay(self) -> float:
+        """
+        Get the delay to use between requests.
+        获取请求之间使用的延迟。
+
+        If randomize_delay is True, the delay will be randomized between
+        0.5 and 1.5 times the configured delay.
+        如果randomize_delay为True,延迟将在配置的延迟的0.5到1.5倍之间随机化。
+
+        Returns:
+            float: The delay in seconds.
+                延迟(秒)。
+        """
         if self.randomize_delay:
             return random.uniform(0.5 * self.delay, 1.5 * self.delay)
         return self.delay
 
-    def __repr__(self):
+    def __repr__(self) -> str:
+        """
+        Return a string representation of the slot for debugging.
+        返回用于调试的槽的字符串表示。
+
+        Returns:
+            str: A string representation of the slot.
+                槽的字符串表示。
+        """
         cls_name = self.__class__.__name__
         return "%s(concurrency=%r, delay=%0.2f, randomize_delay=%r)" % (
            cls_name, self.concurrency, self.delay, self.randomize_delay)
 
-    def __str__(self):
+    def __str__(self) -> str:
+        """
+        Return a detailed string representation of the slot.
+        返回槽的详细字符串表示。
+
+        Returns:
+            str: A detailed string representation of the slot.
+                槽的详细字符串表示。
+        """
         return (
             "<downloader.Slot concurrency=%r delay=%0.2f randomize_delay=%r "
             "len(active)=%d len(queue)=%d len(transferring)=%d lastseen=%s>" % (
@@ -92,10 +286,37 @@ class Slot:
 
 
 def _get_concurrency_delay(concurrency: int, spider: Spider, settings: Settings) -> Tuple[int, float]:
+    """
+    Get the concurrency and delay settings for a spider.
+    获取爬虫的并发和延迟设置。
+
+    This function determines the appropriate concurrency and delay values
+    by checking both the settings and spider attributes.
+    此函数通过检查设置和爬虫属性来确定适当的并发和延迟值。
+
+    Spider-specific settings take precedence over global settings.
+    爬虫特定的设置优先于全局设置。
+
+    Args:
+        concurrency: Default concurrency value from settings.
+            来自设置的默认并发值。
+        spider: The spider instance.
+            爬虫实例。
+        settings: The settings object.
+            设置对象。
+
+    Returns:
+        Tuple[int, float]: A tuple containing (concurrency, delay).
+            包含(并发,延迟)的元组。
+    """
+    # Get delay from settings, then override with spider attribute if available
+    # 从设置获取延迟,然后如果可用,用爬虫属性覆盖
     delay = settings.getfloat('DOWNLOAD_DELAY')
     if hasattr(spider, 'download_delay'):
         delay = spider.download_delay
 
+    # Get concurrency from settings, then override with spider attribute if available
+    # 从设置获取并发,然后如果可用,用爬虫属性覆盖
     if hasattr(spider, 'max_concurrent_requests'):
         concurrency = spider.max_concurrent_requests
 
@@ -103,7 +324,17 @@ def _get_concurrency_delay(concurrency: int, spider: Spider, settings: Settings)
 
 
 class Downloader(BaseDownloader):
-    DOWNLOAD_SLOT: str = 'download_slot'
+    """
+    Default implementation of the downloader.
+    下载器的默认实现。
+
+    This class handles downloading requests, managing concurrency and delays,
+    and processing the results through middleware.
+    此类处理下载请求、管理并发和延迟,并通过中间件处理结果。
+    """
+
+    DOWNLOAD_SLOT: str = 'download_slot'  # Meta key for custom download slot
+    # 自定义下载槽的元键
 
     def __init__(
         self,
@@ -114,140 +345,414 @@ class Downloader(BaseDownloader):
         proxy: Optional[AbsProxy] = None,
         dupefilter: Optional[DupeFilterBase] = None,
     ):
+        """
+        Initialize the downloader.
+        初始化下载器。
+
+        Args:
+            crawler: The crawler instance that this downloader belongs to.
+                此下载器所属的爬虫实例。
+            handler: The download handler manager.
+                下载处理程序管理器。
+            middleware: The downloader middleware manager.
+                下载器中间件管理器。
+            proxy: Optional proxy handler for managing proxies.
+                可选的代理处理程序,用于管理代理。
+            dupefilter: Optional duplicate filter for avoiding duplicate requests.
+                可选的重复过滤器,用于避免重复请求。
+        """
+        # Components from crawler
+        # 来自爬虫的组件
         self.settings: Settings = crawler.settings
         self.signals: SignalManager = crawler.signals
         self.spider: Spider = crawler.spider
         self.spider.proxy = proxy
         self._call_engine: Callable = crawler.engine.handle_downloader_output
 
+        # External components
+        # 外部组件
         self.middleware = middleware
         self.handler = handler
         self.proxy = proxy
         self.dupefilter = dupefilter
 
+        # Concurrency and delay settings
+        # 并发和延迟设置
         self.total_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS')
         self.get_requests_count: int = self.settings.getint('GET_REQUESTS_COUNT') or self.total_concurrency
         self.domain_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
         self.ip_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
         self.randomize_delay: bool = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
 
-        self.active: Set[Request] = set()
-        self.slots: dict = {}
+        # State
+        # 状态
+        self.active: Set[Request] = set()  # All active requests
+        # 所有活动请求
+        self.slots: dict = {}  # Domain/IP -> Slot mapping
+        # 域名/IP -> 槽映射
         self.running: bool = True
+
+        # Start slot garbage collector
+        # 启动槽垃圾收集器
         create_task(self._slot_gc(60))
 
     @classmethod
     async def from_crawler(cls, crawler) -> "Downloader":
+        """
+        Create a downloader instance from a crawler.
+        从爬虫创建下载器实例。
+
+        This factory method creates and initializes a downloader with all the
+        necessary components from the crawler.
+        此工厂方法创建并初始化具有爬虫中所有必要组件的下载器。
+
+        Args:
+            crawler: The crawler instance that will use this downloader.
+                将使用此下载器的爬虫实例。
+
+        Returns:
+            Downloader: A new downloader instance.
+                一个新的下载器实例。
+        """
+        # Initialize dupefilter if configured
+        # 如果已配置,则初始化重复过滤器
         df = crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(crawler.settings['DUPEFILTER_CLASS'],
                                                                               crawler=crawler)
+        # Bind dupefilter to spider for access in spider callbacks
+        # 将重复过滤器绑定到爬虫,以便在爬虫回调中访问
         crawler.spider.dupefilter = df  # 将指纹绑定到Spider 在解析成功的时候 调用DUPEFILTER_CLASS的success方法
+
+        # Initialize proxy handler if configured
+        # 如果已配置,则初始化代理处理程序
+        proxy_handler = crawler.settings.get("PROXY_HANDLER") and await load_instance(
+            crawler.settings["PROXY_HANDLER"],
+            crawler=crawler
+        )
+
         return cls(
             crawler,
             await call_helper(DownloadHandlerManager.from_crawler, crawler),
             await call_helper(DownloaderMiddlewareManager.from_crawler, crawler),
-            proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"],
-                                                                                crawler=crawler),
+            proxy=proxy_handler,
             dupefilter=df
         )
 
     async def fetch(self, request: Request) -> None:
+        """
+        Fetch a request.
+        获取请求。
+
+        This method adds the request to the appropriate download slot and
+        starts processing the queue if possible.
+        此方法将请求添加到适当的下载槽,并在可能的情况下开始处理队列。
+
+        Args:
+            request: The request to fetch.
+                要获取的请求。
+        """
+        # Add to global active requests set
+        # 添加到全局活动请求集
         self.active.add(request)
+
+        # Get the appropriate slot for this request
+        # 获取此请求的适当槽
         key, slot = self._get_slot(request, self.spider)
         request.meta[self.DOWNLOAD_SLOT] = key
 
+        # Add to slot's active and queue sets
+        # 添加到槽的活动和队列集
         slot.active.add(request)
         slot.queue.append(request)
+
+        # Start processing the queue
+        # 开始处理队列
         await self._process_queue(slot)
 
     async def _process_queue(self, slot: Slot) -> None:
+        """
+        Process the request queue for a slot.
+        处理槽的请求队列。
+
+        This method handles the download delay between requests and starts
+        downloading requests when slots are available.
+        此方法处理请求之间的下载延迟,并在槽可用时开始下载请求。
+
+        Args:
+            slot: The slot whose queue should be processed.
+                应处理其队列的槽。
+        """
+        # If the slot is already waiting for a delay, don't process again
+        # 如果槽已经在等待延迟,则不要再次处理
         if slot.delay_lock:
             return
 
         now = time()
         delay = slot.download_delay()
+
+        # Handle download delay between requests
+        # 处理请求之间的下载延迟
         if delay:
             penalty = delay - now + slot.lastseen
             if penalty > 0:
+                # Need to wait before processing next request
+                # 需要等待才能处理下一个请求
                 slot.delay_lock = True
                 await asyncio.sleep(penalty)
                 slot.delay_lock = False
+                # Schedule another processing after the delay
+                # 延迟后安排另一次处理
                 create_task(self._process_queue(slot))
                 return
 
+        # Process as many queued requests as possible
+        # 尽可能多地处理排队的请求
         while slot.queue and slot.free_transfer_slots() > 0:
             request = slot.queue.popleft()
             slot.transferring.add(request)
             create_task(self._download(slot, request))
+            # If there's a delay, only process one request at a time
+            # 如果有延迟,一次只处理一个请求
             if delay:
                 break
 
     async def _download(self, slot: Slot, request: Request) -> None:
+        """
+        Download a request and process the result.
+        下载请求并处理结果。
+
+        This method handles the entire download process including:
+        此方法处理整个下载过程,包括:
+
+        1. Duplicate filtering
+           重复过滤
+        2. Middleware processing
+           中间件处理
+        3. Actual downloading
+           实际下载
+        4. Proxy handling
+           代理处理
+        5. Response processing
+           响应处理
+        6. Exception handling
+           异常处理
+        7. Cleanup and callback
+           清理和回调
+
+        Args:
+            slot: The slot that the request belongs to.
+                请求所属的槽。
+            request: The request to download.
+                要下载的请求。
+        """
         result = None
         try:
+            # Check if request is a duplicate
+            # 检查请求是否重复
             if self.dupefilter and not request.dont_filter and await self.dupefilter.request_seen(request):
                 self.dupefilter.log(request, self.spider)
                 return
+
+            # Update last seen timestamp
+            # 更新上次看到的时间戳
             slot.lastseen = time()
+
+            # Process request through middleware
+            # 通过中间件处理请求
             result = await self.middleware.process_request(self.spider, request)
+
+            # If middleware didn't return a response, download the request
+            # 如果中间件没有返回响应,则下载请求
             if result is None:
+                # Add proxy if available
+                # 如果可用,添加代理
                 self.proxy and await self.proxy.add_proxy(request)
                 result = await self.handler.download_request(request, self.spider)
         except BaseException as exc:
+            # Handle exceptions
+            # 处理异常
             self.proxy and self.proxy.check(request, exception=exc)
             result = await self.middleware.process_exception(self.spider, request, exc)
         else:
+            # Process successful response
+            # 处理成功的响应
             if isinstance(result, Response):
                 try:
+                    # Check proxy status with response
+                    # 使用响应检查代理状态
                     self.proxy and self.proxy.check(request, response=result)
                     result = await self.middleware.process_response(self.spider, request, result)
                 except BaseException as exc:
                     result = exc
         finally:
+            # Cleanup: remove request from all tracking collections
+            # 清理:从所有跟踪集合中删除请求
             slot.transferring.remove(request)
             slot.active.remove(request)
             self.active.remove(request)
 
+            # Send signal if we got a response
+            # 如果我们得到响应,发送信号
             if isinstance(result, Response):
                 await self.signals.send_catch_log(signal=signals.response_downloaded,
                                                   response=result,
                                                   request=request,
                                                   spider=self.spider)
-            # 控制指纹是否移除
+
+            # Update dupefilter with request status
+            # 使用请求状态更新重复过滤器
             self.dupefilter and \
                 not request.dont_filter and \
                 await self.dupefilter.done(request, done_type="request_ok" if isinstance(result, Response) else "request_err")
 
+            # Send result to engine and process next request
+            # 将结果发送到引擎并处理下一个请求
             await self._call_engine(result, request)
             await self._process_queue(slot)
 
     async def close(self) -> None:
+        """
+        Close the downloader and release its resources.
+        关闭下载器并释放其资源。
+
+        This method stops the downloader from accepting new requests and
+        closes the dupefilter if one is being used.
+        此方法停止下载器接受新请求,并在使用重复过滤器时关闭它。
+        """
+        # Stop accepting new requests
+        # 停止接受新请求
         self.running = False
+
+        # Close the dupefilter if one exists
+        # 如果存在重复过滤器,则关闭它
         self.dupefilter and await self.dupefilter.close()
 
     async def _slot_gc(self, age=60):
+        """
+        Garbage collector for download slots.
+        下载槽的垃圾收集器。
+
+        This method periodically checks for inactive slots and removes them
+        to free up memory.
+        此方法定期检查不活动的槽并删除它们以释放内存。
+
+        Args:
+            age: The minimum age in seconds for a slot to be considered for removal.
+                槽被考虑删除的最小年龄(秒)。
+        """
         while self.running:
+            # Wait for the specified age before checking
+            # 在检查之前等待指定的年龄
             await asyncio.sleep(age)
+
+            # Iterate through a copy of the slots to avoid modification during iteration
+            # 遍历槽的副本以避免在迭代期间修改
             for key, slot in list(self.slots.items()):
+                # Log slot state for debugging
+                # 记录槽状态以进行调试
                 logger.debug(slot)
+
+                # Remove slots that have been inactive for at least 'age' seconds
+                # 删除至少'age'秒不活动的槽
                 if not slot.active and slot.lastseen + slot.delay < (time() - age):
                     self.slots.pop(key)
 
-    def needs_backout(self):
+    def needs_backout(self) -> bool:
+        """
+        Check if the downloader needs to stop accepting new requests.
+        检查下载器是否停止接受新请求。
+
+        This method checks if the downloader has reached its maximum concurrency
+        limit and should not accept new requests.
+        此方法检查下载器是否已达到其最大并发限制,并且不应接受新请求。
+
+        Returns:
+            bool: True if the downloader is at capacity and should not accept
+                new requests, False otherwise.
+                如果下载器已达到容量并且不应接受新请求,则为True,否则为False。
+        """
         return len(self.active) >= self.total_concurrency
 
     def _get_slot(self, request, spider):
+        """
+        Get or create a download slot for a request.
+        获取或创建请求的下载槽。
+
+        This method determines which slot a request should use based on its
+        domain, IP, or custom slot key, and creates the slot if it doesn't exist.
+        此方法根据请求的域名、IP或自定义槽键确定请求应使用哪个槽,如果槽不存在则创建它。
+
+        Args:
+            request: The request to get a slot for.
+                要获取槽的请求。
+            spider: The spider making the request.
+                发出请求的爬虫。
+
+        Returns:
+            Tuple[str, Slot]: A tuple containing the slot key and the slot object.
+                包含槽键和槽对象的元组。
+        """
+        # Get the slot key for this request
+        # 获取此请求的槽键
         key = self._get_slot_key(request, spider)
+
+        # Create the slot if it doesn't exist
+        # 如果槽不存在,则创建它
         if key not in self.slots:
+            # Determine concurrency based on settings
+            # 根据设置确定并发
             conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
+
+            # Get spider-specific concurrency and delay
+            # 获取爬虫特定的并发和延迟
             conc, delay = _get_concurrency_delay(conc, spider, self.settings)
+
+            # Create a new slot with the determined settings
+            # 使用确定的设置创建新槽
             self.slots[key] = Slot(conc, delay, self.randomize_delay)
+
         return key, self.slots[key]
 
     def _get_slot_key(self, request, spider):
+        """
+        Get the key for determining which download slot to use for a request.
+        获取用于确定请求使用哪个下载槽的键。
+
+        The slot key is determined in the following order:
+        槽键按以下顺序确定:
+
+        1. Custom slot from request.meta['download_slot'] if present
+           如果存在,则从request.meta['download_slot']获取自定义槽
+        2. Proxy address if IP concurrency is enabled
+           如果启用了IP并发,则使用代理地址
+        3. Request hostname
+           请求主机名
+
+        Args:
+            request: The request to get a slot key for.
+                要获取槽键的请求。
+            spider: The spider making the request (not used in this implementation
+                but kept for interface consistency).
+                发出请求的爬虫(在此实现中未使用,但保留以保持接口一致性)。
+                This parameter is included to maintain a consistent interface with
+                other methods that might need the spider instance, and to allow
+                subclasses to use it if needed.
+                包含此参数是为了保持与可能需要爬虫实例的其他方法的一致接口,
+                并允许子类在需要时使用它。
+
+        Returns:
+            str: The slot key for the request.
+                请求的槽键。
+        """
+        # Check for custom slot in request meta
+        # 检查请求元数据中的自定义槽
         if self.DOWNLOAD_SLOT in request.meta:
             return request.meta[self.DOWNLOAD_SLOT]
 
+        # Use proxy as key if IP concurrency is enabled
+        # 如果启用了IP并发,则使用代理作为键
         if self.ip_concurrency:
             return request.meta.get("proxy", '')
+        # Otherwise use hostname
+        # 否则使用主机名
         else:
            return urlparse_cached(request).hostname or ''
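
Note on the scheduling rules documented in this hunk: spider attributes (download_delay, max_concurrent_requests) override the global DOWNLOAD_DELAY and concurrency settings, a randomized delay is drawn uniformly from 0.5x to 1.5x the configured value, and the slot key falls back from request.meta['download_slot'] to the proxy address (when CONCURRENT_REQUESTS_PER_IP is set) and finally to the request hostname. A minimal standalone sketch of those rules follows; the SETTINGS dict and the helper names effective_delay and slot_key are illustrative stand-ins, not part of aioscrapy's API.

import random
from urllib.parse import urlparse

# Illustrative stand-in for a crawler's settings; the keys are the setting
# names listed in the module docstring above.
SETTINGS = {
    "CONCURRENT_REQUESTS_PER_IP": 0,
    "DOWNLOAD_DELAY": 1.0,
    "RANDOMIZE_DOWNLOAD_DELAY": True,
}

def effective_delay(delay: float, randomize: bool) -> float:
    # Mirrors Slot.download_delay(): randomized delays are drawn from
    # [0.5 * delay, 1.5 * delay].
    return random.uniform(0.5 * delay, 1.5 * delay) if randomize else delay

def slot_key(url: str, meta: dict, ip_concurrency: int) -> str:
    # Mirrors the fallback order of Downloader._get_slot_key(): an explicit
    # 'download_slot' in meta wins, then the proxy address when per-IP
    # concurrency is enabled, else the hostname.
    if "download_slot" in meta:
        return meta["download_slot"]
    if ip_concurrency:
        return meta.get("proxy", "")
    return urlparse(url).hostname or ""

print(slot_key("https://example.com/page", {}, SETTINGS["CONCURRENT_REQUESTS_PER_IP"]))  # example.com
print(effective_delay(SETTINGS["DOWNLOAD_DELAY"], SETTINGS["RANDOMIZE_DOWNLOAD_DELAY"]))  # e.g. 0.87

Grouping requests by such a key is what lets CONCURRENT_REQUESTS_PER_DOMAIN and CONCURRENT_REQUESTS_PER_IP be enforced per slot rather than only through the global CONCURRENT_REQUESTS limit.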