aio-scrapy 2.1.4__py3-none-any.whl → 2.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
  2. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -41
  3. aio_scrapy-2.1.6.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +187 -3
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +124 -3
  11. aioscrapy/core/downloader/handlers/httpx.py +133 -3
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +132 -3
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +313 -13
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  105. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  106. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  107. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  108. aioscrapy/http/response/playwright.py +0 -36
  109. aioscrapy/libs/pipelines/execl.py +0 -169
  110. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
aioscrapy/dupefilters/redis.py
@@ -1,3 +1,15 @@
+ """
+ Redis-based Duplicate Filters for AioScrapy
+ AioScrapy的基于Redis的重复过滤器
+
+ This module provides several implementations of duplicate filters that use Redis
+ for storage. It includes a simple set-based filter, a Bloom filter implementation,
+ and extended versions of both that support removing fingerprints under certain
+ conditions.
+ 此模块提供了几种使用Redis进行存储的重复过滤器实现。它包括一个简单的基于集合的过滤器、
+ 一个布隆过滤器实现,以及两者的扩展版本,支持在特定条件下移除指纹。
+ """
+
  from typing import Literal

  from aioscrapy import Request
@@ -6,7 +18,16 @@ from aioscrapy.dupefilters import DupeFilterBase


  class RedisRFPDupeFilter(DupeFilterBase):
- """Request Fingerprint duplicates filter built with Set of Redis"""
+ """
+ Request Fingerprint duplicates filter built with Set of Redis.
+ 使用Redis集合构建的请求指纹重复过滤器。
+
+ This filter uses a Redis SET to store request fingerprints. It implements
+ the DupeFilterBase interface and provides methods for checking if a request
+ has been seen before and for clearing the filter.
+ 此过滤器使用Redis SET来存储请求指纹。它实现了DupeFilterBase接口,
+ 并提供了检查请求是否已经被看到过以及清除过滤器的方法。
+ """

  def __init__(
  self,
@@ -16,179 +37,724 @@ class RedisRFPDupeFilter(DupeFilterBase):
  keep_on_close: bool = True,
  info: bool = False,
  ):
+ """
+ Initialize the Redis-based request fingerprint filter.
+ 初始化基于Redis的请求指纹过滤器。
+
+ Args:
+ server: The Redis server connection.
+ Redis服务器连接。
+ key: The Redis key to use for storing fingerprints.
+ 用于存储指纹的Redis键。
+ debug: Whether to log filtered requests.
+ 是否记录被过滤的请求。
+ Defaults to False.
+ 默认为False。
+ keep_on_close: Whether to keep the fingerprints in Redis when the spider closes.
+ 爬虫关闭时是否保留Redis中的指纹。
+ Defaults to True.
+ 默认为True。
+ info: Whether to log duplicate requests at INFO level.
+ 是否在INFO级别记录重复的请求。
+ Defaults to False.
+ 默认为False。
+ """
+ # Redis server connection
+ # Redis服务器连接
  self.server = server
+
+ # Redis key for storing fingerprints
+ # 用于存储指纹的Redis键
  self.key = key
+
+ # Whether to log filtered requests
+ # 是否记录被过滤的请求
  self.debug = debug
+
+ # Whether to keep fingerprints when the spider closes
+ # 爬虫关闭时是否保留指纹
  self.keep_on_close = keep_on_close
+
+ # Whether to log duplicate requests (will be set to False after first log)
+ # 是否记录重复的请求(在第一次记录后将设置为False)
  self.logdupes: bool = True
+
+ # Whether to log duplicate requests at INFO level
+ # 是否在INFO级别记录重复的请求
  self.info: bool = info

  @classmethod
  def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+ """
+ Create a RedisRFPDupeFilter instance from a crawler.
+ 从爬虫创建RedisRFPDupeFilter实例。
+
+ This is the factory method used by AioScrapy to create the dupefilter.
+ 这是AioScrapy用于创建重复过滤器的工厂方法。
+
+ Args:
+ crawler: The crawler that will use this dupefilter.
+ 将使用此重复过滤器的爬虫。
+
+ Returns:
+ RedisRFPDupeFilter: A new RedisRFPDupeFilter instance.
+ 一个新的RedisRFPDupeFilter实例。
+ """
+ # Get Redis connection from database manager
+ # 从数据库管理器获取Redis连接
  server = db_manager.redis.queue
+
+ # Get dupefilter key pattern from settings, default to '%(spider)s:dupefilter'
+ # 从设置获取重复过滤器键模式,默认为'%(spider)s:dupefilter'
  dupefilter_key = crawler.settings.get("SCHEDULER_DUPEFILTER_KEY", '%(spider)s:dupefilter')
+
+ # Get keep_on_close setting, default to True
+ # 获取keep_on_close设置,默认为True
  keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
+
+ # Format the key with the spider name
+ # 使用爬虫名称格式化键
  key = dupefilter_key % {'spider': crawler.spider.name}
+
+ # Get debug setting, default to False
+ # 获取debug设置,默认为False
  debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+
+ # Get info setting, default to False
+ # 获取info设置,默认为False
  info = crawler.settings.getbool('DUPEFILTER_INFO', False)
+
+ # Create and return a new instance
+ # 创建并返回一个新实例
  instance = cls(server, key=key, debug=debug, keep_on_close=keep_on_close, info=info)
  return instance

- async def request_seen(self, request: Request):
+ async def request_seen(self, request: Request) -> bool:
+ """
+ Check if a request has been seen before.
+ 检查请求是否已经被请求过。
+
+ This method adds the request's fingerprint to the Redis SET and checks
+ if it was already there. If the fingerprint was already in the SET,
+ the request is considered a duplicate.
+ 此方法将请求的指纹添加到Redis SET中,并检查它是否已经存在。
+ 如果指纹已经在SET中,则认为请求是重复的。
+
+ Args:
+ request: The request to check.
+ 要检查的请求。
+
+ Returns:
+ bool: True if the request has been seen before, False otherwise.
+ 如果请求之前已经被看到过,则为True,否则为False。
+ """
+ # Add the fingerprint to the Redis SET and check if it was already there
+ # sadd returns 0 if the member already exists in the set
+ # 将指纹添加到Redis SET中,并检查它是否已经存在
+ # sadd在成员已经存在于集合中时返回0
  return await self.server.sadd(self.key, request.fingerprint) == 0

- async def close(self, reason=''):
+ async def close(self, reason: str = ''):
+ """
+ Close the dupefilter.
+ 关闭重复过滤器。
+
+ This method is called when the spider is closed. If keep_on_close is False,
+ it clears the fingerprints from Redis.
+ 当爬虫关闭时调用此方法。如果keep_on_close为False,它会从Redis中清除指纹。
+
+ Args:
+ reason: The reason why the spider was closed.
+ 爬虫被关闭的原因。
+ """
+ # If keep_on_close is False, clear the fingerprints
+ # 如果keep_on_close为False,清除指纹
  if not self.keep_on_close:
  await self.clear()

  async def clear(self):
+ """
+ Clear all fingerprints from Redis.
+ 从Redis中清除所有指纹。
+
+ This method deletes the Redis key used to store fingerprints,
+ effectively clearing the filter.
+ 此方法删除用于存储指纹的Redis键,有效地清除过滤器。
+ """
+ # Delete the Redis key
+ # 删除Redis键
  await self.server.delete(self.key)


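Note: the filters in this module are configured entirely through crawler settings read in from_crawler. A minimal sketch of a settings dict for the plain SET-based filter follows; the DUPEFILTER_CLASS key is an assumed Scrapy-style convention that does not appear in this diff, while the remaining keys and defaults are the ones read by RedisRFPDupeFilter.from_crawler above.

    # Hypothetical settings sketch for RedisRFPDupeFilter; DUPEFILTER_CLASS is assumed,
    # the other keys are read by from_crawler in the diff above.
    SETTINGS = {
        "DUPEFILTER_CLASS": "aioscrapy.dupefilters.redis.RedisRFPDupeFilter",  # assumed key name
        "SCHEDULER_DUPEFILTER_KEY": "%(spider)s:dupefilter",  # %(spider)s is replaced by the spider name
        "KEEP_DUPEFILTER_DATA_ON_CLOSE": True,   # keep fingerprints in Redis between runs
        "DUPEFILTER_DEBUG": False,               # log every filtered request
        "DUPEFILTER_INFO": False,                # log duplicates at INFO level
    }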
  class HashMap(object):
- def __init__(self, m, seed):
+ """
+ Simple hash map implementation for Bloom filter.
+ 布隆过滤器的简单哈希映射实现。
+
+ This class implements a simple hash function that can be used by a Bloom filter
+ to map values to bit positions in the filter.
+ 此类实现了一个简单的哈希函数,布隆过滤器可以使用它将值映射到过滤器中的位位置。
+ """
+
+ def __init__(self, m: int, seed: int):
+ """
+ Initialize the hash map.
+ 初始化哈希映射。
+
+ Args:
+ m: The size of the bit array (should be a power of 2).
+ 位数组的大小(应该是2的幂)。
+ seed: The seed value for the hash function.
+ 哈希函数的种子值。
+ """
+ # Size of the bit array
+ # 位数组的大小
  self.m = m
+
+ # Seed value for the hash function
+ # 哈希函数的种子值
  self.seed = seed

- def hash(self, value):
+ def hash(self, value: str) -> int:
  """
- Hash Algorithm
- :param value: Value
- :return: Hash Value
+ Hash a string value to an integer.
+ 将字符串值哈希为整数。
+
+ This method implements a simple hash function that converts a string
+ to an integer hash value within the range of the bit array.
+ 此方法实现了一个简单的哈希函数,将字符串转换为位数组范围内的整数哈希值。
+
+ Args:
+ value: The string value to hash.
+ 要哈希的字符串值。
+
+ Returns:
+ int: The hash value, which is an integer between 0 and m-1.
+ 哈希值,是0到m-1之间的整数。
  """
+ # Initialize the return value
+ # 初始化返回值
  ret = 0
+
+ # Calculate the hash value
+ # 计算哈希值
  for i in range(len(value)):
  ret += self.seed * ret + ord(value[i])
+
+ # Ensure the hash value is within the range of the bit array
+ # 确保哈希值在位数组的范围内
  return (self.m - 1) & ret


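Note: to make the bit mapping concrete, the snippet below mirrors HashMap.hash as a standalone function. It folds the characters of a fingerprint into an integer and masks the result with m - 1, which only yields an offset in 0..m-1 because m is a power of two; the fingerprint string is made up for illustration.

    def bit_offset(value: str, m: int, seed: int) -> int:
        # Same accumulation as HashMap.hash in the diff above
        ret = 0
        for ch in value:
            ret += seed * ret + ord(ch)
        # Mask with m - 1; valid because m is a power of two
        return (m - 1) & ret

    m = 1 << 30  # BloomFilter default (bit=30)
    # One fingerprint is mapped to hash_number offsets, one per seed (range(6) by default)
    print([bit_offset("0123456789abcdef", m, seed) for seed in range(6)])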
  class BloomFilter(object):
- def __init__(self, server, key, bit=30, hash_number=6):
+ """
+ Bloom filter implementation using Redis bitsets.
+ 使用Redis位集实现的布隆过滤器。
+
+ A Bloom filter is a space-efficient probabilistic data structure that is used
+ to test whether an element is a member of a set. False positives are possible,
+ but false negatives are not.
+ 布隆过滤器是一种空间效率高的概率数据结构,用于测试元素是否是集合的成员。
+ 可能出现假阳性,但不会出现假阴性。
+ """
+
+ def __init__(self, server: "redis.asyncio.Redis", key: str, bit: int = 30, hash_number: int = 6):
  """
- Initialize BloomFilter
- :param server: Redis Server
- :param key: BloomFilter Key
- :param bit: m = 2 ^ bit
- :param hash_number: the number of hash function
+ Initialize the Bloom filter.
+ 初始化布隆过滤器。
+
+ Args:
+ server: The Redis server connection.
+ Redis服务器连接。
+ key: The Redis key to use for the Bloom filter.
+ 用于布隆过滤器的Redis键。
+ bit: The power of 2 to use for the bit array size (m = 2^bit).
+ 用于位数组大小的2的幂(m = 2^bit)。
+ Defaults to 30, which gives a bit array of size 2^30 = 1,073,741,824 bits = 128MB.
+ 默认为30,这给出了大小为2^30 = 1,073,741,824位 = 128MB的位数组。
+ hash_number: The number of hash functions to use.
+ 要使用的哈希函数的数量。
+ Defaults to 6.
+ 默认为6。
  """
- # default to 1 << 30 = 10,7374,1824 = 2^30 = 128MB, max filter 2^30/hash_number = 1,7895,6970 fingerprints
+ # Calculate the bit array size (m = 2^bit)
+ # 计算位数组大小(m = 2^bit)
+ # default to 1 << 30 = 1,073,741,824 = 2^30 = 128MB
+ # max filter capacity is approximately 2^30/hash_number = 178,956,970 fingerprints
+ # 默认为1 << 30 = 1,073,741,824 = 2^30 = 128MB
+ # 最大过滤器容量约为2^30/hash_number = 178,956,970个指纹
  self.m = 1 << bit
+
+ # Generate seeds for the hash functions
+ # 生成哈希函数的种子
  self.seeds = range(hash_number)
+
+ # Redis server connection
+ # Redis服务器连接
  self.server = server
+
+ # Redis key for the Bloom filter
+ # 布隆过滤器的Redis键
  self.key = key
+
+ # Create hash maps for each seed
+ # 为每个种子创建哈希映射
  self.maps = [HashMap(self.m, seed) for seed in self.seeds]

- async def exists(self, value):
+ async def exists(self, value: str) -> bool:
+ """
+ Check if a value might exist in the Bloom filter.
+ 检查值是否可能存在于布隆过滤器中。
+
+ This method checks if a value might be in the set represented by the Bloom filter.
+ If it returns False, the value is definitely not in the set. If it returns True,
+ the value might be in the set (false positives are possible).
+ 此方法检查值是否可能在布隆过滤器表示的集合中。
+ 如果返回False,则该值肯定不在集合中。如果返回True,
+ 则该值可能在集合中(可能出现假阳性)。
+
+ Args:
+ value: The value to check.
+ 要检查的值。
+
+ Returns:
+ bool: True if the value might exist in the set, False if it definitely does not.
+ 如果值可能存在于集合中,则为True;如果它肯定不存在,则为False。
+ """
+ # Empty values are never in the set
+ # 空值永远不在集合中
  if not value:
  return False
+
+ # Use a Redis pipeline to get all the bits in one round-trip
+ # 使用Redis管道在一次往返中获取所有位
  async with self.server.pipeline(transaction=True) as pipe:
+ # For each hash function, get the bit at the hashed position
+ # 对于每个哈希函数,获取哈希位置的位
  for f in self.maps:
  offset = f.hash(value)
  pipe.getbit(self.key, offset)
+
+ # Execute the pipeline and get the results
+ # 执行管道并获取结果
  result = await pipe.execute()
+
+ # If all bits are set, the value might be in the set
+ # 如果所有位都已设置,则该值可能在集合中
  return all(result)

- async def insert(self, value):
+ async def insert(self, value: str) -> None:
  """
- add value to bloom
- :param value:
- :return:
+ Insert a value into the Bloom filter.
+ 将值插入布隆过滤器。
+
+ This method sets the bits in the Bloom filter corresponding to the value,
+ so that future calls to exists() for this value will return True.
+ 此方法设置布隆过滤器中与值对应的位,
+ 以便将来对此值调用exists()将返回True。
+
+ Args:
+ value: The value to insert.
+ 要插入的值。
  """
+ # Use a Redis pipeline to set all the bits in one round-trip
+ # 使用Redis管道在一次往返中设置所有位
  async with self.server.pipeline(transaction=True) as pipe:
+ # For each hash function, set the bit at the hashed position
+ # 对于每个哈希函数,设置哈希位置的位
  for f in self.maps:
  offset = f.hash(value)
  pipe.setbit(self.key, offset, 1)
+
+ # Execute the pipeline
+ # 执行管道
  await pipe.execute()


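Note: the sizing comment in BloomFilter.__init__ can be verified with a few lines of arithmetic. The memory and capacity figures below follow directly from the defaults quoted in the diff; the false-positive estimate uses the standard Bloom filter approximation p ≈ (1 - e^(-k*n/m))^k, which is general theory rather than something taken from this module.

    import math

    bit, hash_number = 30, 6          # defaults from BloomFilter.__init__
    m = 1 << bit                      # 1,073,741,824 bits
    print(m // 8 // (1024 * 1024))    # 128 -> megabytes of Redis bitmap
    print(m // hash_number)           # 178,956,970 -> capacity quoted in the diff

    # Standard approximation, assumed here: p ≈ (1 - e^(-k*n/m))^k
    n, k = m // hash_number, hash_number
    print((1 - math.exp(-k * n / m)) ** k)  # ≈ 0.064 false-positive rate at full capacity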
  class RedisBloomDupeFilter(RedisRFPDupeFilter):
- """Bloom filter built with the bitis bitmap of redis"""
-
- def __init__(self, server, key, debug, bit, hash_number, keep_on_close, info):
+ """
+ Bloom filter-based duplicate filter built with Redis bitmaps.
+ 使用Redis位图构建的基于布隆过滤器的重复过滤器。
+
+ This filter uses a Bloom filter implemented with Redis bitmaps to store
+ request fingerprints. It is more space-efficient than the simple SET-based
+ filter, but has a small probability of false positives.
+ 此过滤器使用使用Redis位图实现的布隆过滤器来存储请求指纹。
+ 它比简单的基于SET的过滤器更节省空间,但有小概率出现假阳性。
+ """
+
+ def __init__(self, server: "redis.asyncio.Redis", key: str, debug: bool, bit: int,
+ hash_number: int, keep_on_close: bool, info: bool):
+ """
+ Initialize the Bloom filter-based duplicate filter.
+ 初始化基于布隆过滤器的重复过滤器。
+
+ Args:
+ server: The Redis server connection.
+ Redis服务器连接。
+ key: The Redis key to use for the Bloom filter.
+ 用于布隆过滤器的Redis键。
+ debug: Whether to log filtered requests.
+ 是否记录被过滤的请求。
+ bit: The power of 2 to use for the bit array size (m = 2^bit).
+ 用于位数组大小的2的幂(m = 2^bit)。
+ hash_number: The number of hash functions to use.
+ 要使用的哈希函数的数量。
+ keep_on_close: Whether to keep the fingerprints in Redis when the spider closes.
+ 爬虫关闭时是否保留Redis中的指纹。
+ info: Whether to log duplicate requests at INFO level.
+ 是否在INFO级别记录重复的请求。
+ """
+ # Initialize the parent class
+ # 初始化父类
  super().__init__(server, key, debug, keep_on_close, info)
+
+ # Store Bloom filter parameters
+ # 存储布隆过滤器参数
  self.bit = bit
  self.hash_number = hash_number
+
+ # Create the Bloom filter
+ # 创建布隆过滤器
  self.bf = BloomFilter(server, self.key, bit, hash_number)

  @classmethod
  async def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+ """
+ Create a RedisBloomDupeFilter instance from a crawler.
+ 从爬虫创建RedisBloomDupeFilter实例。
+
+ This is the factory method used by AioScrapy to create the dupefilter.
+ 这是AioScrapy用于创建重复过滤器的工厂方法。
+
+ Args:
+ crawler: The crawler that will use this dupefilter.
+ 将使用此重复过滤器的爬虫。
+
+ Returns:
+ RedisBloomDupeFilter: A new RedisBloomDupeFilter instance.
+ 一个新的RedisBloomDupeFilter实例。
+ """
+ # Get Redis connection from database manager
+ # 从数据库管理器获取Redis连接
  server = db_manager.redis.queue
+
+ # Get dupefilter key pattern from settings, default to '%(spider)s:bloomfilter'
+ # 从设置获取重复过滤器键模式,默认为'%(spider)s:bloomfilter'
  dupefilter_key = crawler.settings.get("SCHEDULER_DUPEFILTER_KEY", '%(spider)s:bloomfilter')
+
+ # Get keep_on_close setting, default to True
+ # 获取keep_on_close设置,默认为True
  keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
+
+ # Format the key with the spider name
+ # 使用爬虫名称格式化键
  key = dupefilter_key % {'spider': crawler.spider.name}
+
+ # Get debug setting, default to False
+ # 获取debug设置,默认为False
  debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+
+ # Get info setting, default to False
+ # 获取info设置,默认为False
  info = crawler.settings.getbool('DUPEFILTER_INFO', False)
+
+ # Get Bloom filter parameters from settings
+ # 从设置获取布隆过滤器参数
  bit = crawler.settings.getint('BLOOMFILTER_BIT', 30)
  hash_number = crawler.settings.getint('BLOOMFILTER_HASH_NUMBER', 6)
- return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number, keep_on_close=keep_on_close, info=info)
+
+ # Create and return a new instance
+ # 创建并返回一个新实例
+ return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number,
+ keep_on_close=keep_on_close, info=info)

  async def request_seen(self, request: Request) -> bool:
+ """
+ Check if a request has been seen before.
+ 检查请求是否已经被看到过。
+
+ This method checks if the request's fingerprint exists in the Bloom filter.
+ If it does, the request is considered a duplicate. If not, the fingerprint
+ is added to the Bloom filter.
+ 此方法检查请求的指纹是否存在于布隆过滤器中。
+ 如果存在,则认为请求是重复的。如果不存在,则将指纹添加到布隆过滤器中。
+
+ Args:
+ request: The request to check.
+ 要检查的请求。
+
+ Returns:
+ bool: True if the request has been seen before, False otherwise.
+ 如果请求之前已经被看到过,则为True,否则为False。
+ """
+ # Check if the fingerprint exists in the Bloom filter
+ # 检查指纹是否存在于布隆过滤器中
  fp = await self.bf.exists(request.fingerprint)
+
+ # If the fingerprint exists, the request is a duplicate
+ # 如果指纹存在,则请求是重复的
  if fp:
  return True
+
+ # If not, add the fingerprint to the Bloom filter
+ # 如果不存在,则将指纹添加到布隆过滤器中
  await self.bf.insert(request.fingerprint)
+
+ # The request has not been seen before
+ # 请求之前未被看到过
  return False


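Note: the BloomFilter helper can also be exercised on its own, outside the crawler plumbing. The sketch below assumes the class is importable as aioscrapy.dupefilters.redis.BloomFilter and that a redis.asyncio client is available; both are consistent with this diff but not verified here, and the key name and fingerprint are illustrative.

    import asyncio
    from redis.asyncio import Redis
    from aioscrapy.dupefilters.redis import BloomFilter  # import path assumed from this diff

    async def main():
        server = Redis(host="localhost", port=6379)
        bf = BloomFilter(server, key="demo:bloomfilter", bit=30, hash_number=6)
        fp = "0123456789abcdef"      # illustrative fingerprint
        print(await bf.exists(fp))   # False on an empty filter
        await bf.insert(fp)
        print(await bf.exists(fp))   # True, subject to the usual false-positive caveat
        await server.close()

    asyncio.run(main())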
  class ExRedisBloomDupeFilter(RedisBloomDupeFilter):
-
- def __init__(self, server, key, key_set, ttl, debug, bit, hash_number, keep_on_close, info):
+ """
+ Extended Bloom filter-based duplicate filter with temporary SET storage.
+ 具有临时SET存储的扩展基于布隆过滤器的重复过滤器。
+
+ This filter extends the RedisBloomDupeFilter by adding a temporary SET to store
+ fingerprints of requests that are currently being processed. This allows for
+ removing fingerprints from the filter if the request fails, which can be useful
+ for retrying failed requests.
+ 此过滤器通过添加一个临时SET来扩展RedisBloomDupeFilter,用于存储当前正在处理的
+ 请求的指纹。这允许在请求失败时从过滤器中删除指纹,这对于重试失败的请求很有用。
+ """
+
+ def __init__(self, server: "redis.asyncio.Redis", key: str, key_set: str, ttl: int,
+ debug: bool, bit: int, hash_number: int, keep_on_close: bool, info: bool):
+ """
+ Initialize the extended Bloom filter-based duplicate filter.
+ 初始化扩展的基于布隆过滤器的重复过滤器。
+
+ Args:
+ server: The Redis server connection.
+ Redis服务器连接。
+ key: The Redis key to use for the Bloom filter.
+ 用于布隆过滤器的Redis键。
+ key_set: The Redis key to use for the temporary SET.
+ 用于临时SET的Redis键。
+ ttl: The time-to-live in seconds for the temporary SET.
+ 临时SET的生存时间(秒)。
+ debug: Whether to log filtered requests.
+ 是否记录被过滤的请求。
+ bit: The power of 2 to use for the bit array size (m = 2^bit).
+ 用于位数组大小的2的幂(m = 2^bit)。
+ hash_number: The number of hash functions to use.
+ 要使用的哈希函数的数量。
+ keep_on_close: Whether to keep the fingerprints in Redis when the spider closes.
+ 爬虫关闭时是否保留Redis中的指纹。
+ info: Whether to log duplicate requests at INFO level.
+ 是否在INFO级别记录重复的请求。
+ """
+ # Initialize the parent class
+ # 初始化父类
  super().__init__(server, key, debug, bit, hash_number, keep_on_close, info)
+
+ # Redis key for the temporary SET
+ # 临时SET的Redis键
  self.key_set = key_set
+
+ # Time-to-live for the temporary SET
+ # 临时SET的生存时间
  self.ttl = ttl

  @classmethod
  async def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+ """
+ Create an ExRedisBloomDupeFilter instance from a crawler.
+ 从爬虫创建ExRedisBloomDupeFilter实例。
+
+ This is the factory method used by AioScrapy to create the dupefilter.
+ 这是AioScrapy用于创建重复过滤器的工厂方法。
+
+ Args:
+ crawler: The crawler that will use this dupefilter.
+ 将使用此重复过滤器的爬虫。
+
+ Returns:
+ ExRedisBloomDupeFilter: A new ExRedisBloomDupeFilter instance.
+ 一个新的ExRedisBloomDupeFilter实例。
+ """
+ # Get Redis connection from database manager
+ # 从数据库管理器获取Redis连接
  server = db_manager.redis.queue
+
+ # Get dupefilter key pattern from settings, default to '%(spider)s:bloomfilter'
+ # 从设置获取重复过滤器键模式,默认为'%(spider)s:bloomfilter'
  dupefilter_key = crawler.settings.get("SCHEDULER_DUPEFILTER_KEY", '%(spider)s:bloomfilter')
+
+ # Get keep_on_close setting, default to True
+ # 获取keep_on_close设置,默认为True
  keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
+
+ # Format the key with the spider name
+ # 使用爬虫名称格式化键
  key = dupefilter_key % {'spider': crawler.spider.name}
+
+ # Get debug setting, default to False
+ # 获取debug设置,默认为False
  debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+
+ # Get info setting, default to False
+ # 获取info设置,默认为False
  info = crawler.settings.getbool('DUPEFILTER_INFO', False)
+
+ # Get Bloom filter parameters from settings
+ # 从设置获取布隆过滤器参数
  bit = crawler.settings.getint('BLOOMFILTER_BIT', 30)
  hash_number = crawler.settings.getint('BLOOMFILTER_HASH_NUMBER', 6)
+
+ # Get TTL for the temporary SET, default to 180 seconds
+ # 获取临时SET的TTL,默认为180秒
  ttl = crawler.settings.getint('DUPEFILTER_SET_KEY_TTL', 180)
- return cls(server, key=key, key_set=key + "_set", ttl=ttl, debug=debug, bit=bit, hash_number=hash_number,
- keep_on_close=keep_on_close, info=info)
+
+ # Create and return a new instance
+ # 创建并返回一个新实例
+ return cls(server, key=key, key_set=key + "_set", ttl=ttl, debug=debug, bit=bit,
+ hash_number=hash_number, keep_on_close=keep_on_close, info=info)

  async def request_seen(self, request: Request) -> bool:
+ """
+ Check if a request has been seen before.
+ 检查请求是否已经被看到过。
+
+ This method first checks if the request's fingerprint exists in the Bloom filter.
+ If it does, the request is considered a duplicate. If not, the fingerprint is
+ added to the temporary SET with a TTL, but not yet to the Bloom filter.
+ 此方法首先检查请求的指纹是否存在于布隆过滤器中。
+ 如果存在,则认为请求是重复的。如果不存在,则将指纹添加到具有TTL的临时SET中,
+ 但尚未添加到布隆过滤器中。
+
+ Args:
+ request: The request to check.
+ 要检查的请求。
+
+ Returns:
+ bool: True if the request has been seen before, False otherwise.
+ 如果请求之前已经被看到过,则为True,否则为False。
+ """
+ # Check if the fingerprint exists in the Bloom filter
+ # 检查指纹是否存在于布隆过滤器中
  fp = await self.bf.exists(request.fingerprint)
+
+ # If the fingerprint exists in the Bloom filter, the request is a duplicate
+ # 如果指纹存在于布隆过滤器中,则请求是重复的
  if fp:
  return True
+
+ # If not, add the fingerprint to the temporary SET with a TTL
+ # 如果不存在,则将指纹添加到具有TTL的临时SET中
  async with self.server.pipeline() as pipe:
  pipe.sadd(self.key_set, request.fingerprint)
  pipe.expire(self.key_set, self.ttl)
  ret, _ = await pipe.execute()
+
+ # Return True if the fingerprint was already in the temporary SET (ret == 0)
+ # 如果指纹已经在临时SET中,则返回True(ret == 0)
  return ret == 0

  async def done(
  self,
  request: Request,
  done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
- ):
+ ) -> None:
+ """
+ Handle the completion of a request.
+ 处理请求的完成。
+
+ This method is called when a request has been processed. It handles the
+ fingerprint differently based on the done_type:
+ - For "request_ok" or "request_err", it removes the fingerprint from the temporary SET.
+ - For "parse_ok", it adds the fingerprint to the Bloom filter.
+ 当请求已处理时调用此方法。它根据done_type不同地处理指纹:
+ - 对于"request_ok"或"request_err",它从临时SET中删除指纹。
+ - 对于"parse_ok",它将指纹添加到布隆过滤器中。
+
+ Args:
+ request: The request that has been processed.
+ 已处理的请求。
+ done_type: The status of the request processing.
+ 请求处理的状态。
+ Can be one of: "request_ok", "request_err", "parse_ok", "parse_err".
+ 可以是以下之一:"request_ok"、"request_err"、"parse_ok"、"parse_err"。
+ """
+ # If the request was successful or failed at the request level,
+ # remove the fingerprint from the temporary SET
+ # 如果请求成功或在请求级别失败,则从临时SET中删除指纹
  if done_type == "request_ok" or done_type == "request_err":
  await self.server.srem(self.key_set, request.fingerprint)
+
+ # If the request was successfully parsed, add the fingerprint to the Bloom filter
+ # 如果请求成功解析,则将指纹添加到布隆过滤器中
  elif done_type == "parse_ok":
  await self.bf.insert(request.fingerprint)

- async def close(self, reason=''):
+ async def close(self, reason: str = ''):
+ """
+ Close the dupefilter.
+ 关闭重复过滤器。
+
+ This method is called when the spider is closed. If keep_on_close is False,
+ it clears the Bloom filter. It also deletes the temporary SET.
+ 当爬虫关闭时调用此方法。如果keep_on_close为False,它会清除布隆过滤器。
+ 它还会删除临时SET。
+
+ Args:
+ reason: The reason why the spider was closed.
+ 爬虫被关闭的原因。
+ """
+ # If keep_on_close is False, clear the Bloom filter
+ # 如果keep_on_close为False,清除布隆过滤器
  if not self.keep_on_close:
  await self.clear()
+
+ # Delete the temporary SET
+ # 删除临时SET
  await self.server.delete(self.key_set)


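Note: the division of labour between request_seen and done in ExRedisBloomDupeFilter is easiest to see as a timeline. The driver below is hypothetical, not aioscrapy's actual engine code; download() and parse() are placeholders, and df stands for an already-constructed ExRedisBloomDupeFilter. It simply restates the transitions documented above: a new fingerprint first lands in the TTL-guarded SET, leaves that SET once the download finishes, and is committed to the Bloom filter only after a successful parse.

    async def download(request):
        ...  # placeholder for the real HTTP download

    def parse(response):
        ...  # placeholder for the spider callback

    async def crawl_once(df, request):
        # df: ExRedisBloomDupeFilter, request: aioscrapy Request (both supplied by the caller)
        if await df.request_seen(request):
            return                                 # duplicate: in the Bloom filter or the in-flight SET
        try:
            response = await download(request)
            await df.done(request, "request_ok")   # fingerprint leaves the temporary SET
        except Exception:
            await df.done(request, "request_err")  # removed as well, so the request can be retried
            raise
        try:
            parse(response)
            await df.done(request, "parse_ok")     # only now is the fingerprint written to the Bloom filter
        except Exception:
            await df.done(request, "parse_err")    # nothing to undo; the SET entry is already gone
            raise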
  class ExRedisRFPDupeFilter(RedisRFPDupeFilter):
+ """
+ Extended Redis SET-based duplicate filter with fingerprint removal.
+ 具有指纹移除功能的扩展Redis SET基于的重复过滤器。
+
+ This filter extends the RedisRFPDupeFilter by adding the ability to remove
+ fingerprints from the filter if the request fails, which can be useful for
+ retrying failed requests.
+ 此过滤器通过添加在请求失败时从过滤器中删除指纹的功能来扩展RedisRFPDupeFilter,
+ 这对于重试失败的请求很有用。
+ """

  async def done(
  self,
  request: Request,
  done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
- ):
- # 当请求失败或解析失败的时候 从Redis的Set中移除指纹
+ ) -> None:
+ """
+ Handle the completion of a request.
+ 处理请求的完成。
+
+ This method is called when a request has been processed. It removes the
+ fingerprint from the Redis SET if the request or parsing failed, allowing
+ the request to be retried.
+ 当请求已处理时调用此方法。如果请求或解析失败,它会从Redis SET中删除指纹,
+ 允许重试请求。
+
+ Args:
+ request: The request that has been processed.
+ 已处理的请求。
+ done_type: The status of the request processing.
+ 请求处理的状态。
+ Can be one of: "request_ok", "request_err", "parse_ok", "parse_err".
+ 可以是以下之一:"request_ok"、"request_err"、"parse_ok"、"parse_err"。
+ """
+ # When the request or parsing fails, remove the fingerprint from the Redis SET
+ # 当请求失败或解析失败时,从Redis的Set中移除指纹
  if done_type == "request_err" or done_type == "parse_err":
  await self.server.srem(self.key, request.fingerprint)


+ # Aliases for backward compatibility
+ # 用于向后兼容的别名
  RFPDupeFilter = RedisRFPDupeFilter
  ExRFPDupeFilter = ExRedisRFPDupeFilter
  BloomDupeFilter = RedisBloomDupeFilter
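Note: the Bloom-based variants add three more knobs on top of the settings shown earlier. The sketch below lists them with the defaults read by from_crawler in this diff; DUPEFILTER_CLASS is again an assumed Scrapy-style key, and the aliases above (RFPDupeFilter, ExRFPDupeFilter, BloomDupeFilter) keep the shorter class names importable.

    # Hypothetical settings sketch for the Bloom-based filters; values are the
    # defaults read by from_crawler in this diff, DUPEFILTER_CLASS itself is assumed.
    SETTINGS = {
        "DUPEFILTER_CLASS": "aioscrapy.dupefilters.redis.ExRedisBloomDupeFilter",  # assumed key name
        "SCHEDULER_DUPEFILTER_KEY": "%(spider)s:bloomfilter",
        "BLOOMFILTER_BIT": 30,            # m = 2**30 bits = 128MB of Redis bitmap
        "BLOOMFILTER_HASH_NUMBER": 6,
        "DUPEFILTER_SET_KEY_TTL": 180,    # seconds; only used by ExRedisBloomDupeFilter
        "KEEP_DUPEFILTER_DATA_ON_CLOSE": True,
    }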