aio-scrapy 2.1.3__py3-none-any.whl → 2.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -40
- aio_scrapy-2.1.6.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +523 -18
- aioscrapy/core/downloader/handlers/__init__.py +188 -6
- aioscrapy/core/downloader/handlers/aiohttp.py +188 -4
- aioscrapy/core/downloader/handlers/curl_cffi.py +125 -4
- aioscrapy/core/downloader/handlers/httpx.py +134 -4
- aioscrapy/core/downloader/handlers/pyhttpx.py +133 -4
- aioscrapy/core/downloader/handlers/requests.py +121 -3
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +170 -14
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +193 -7
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +313 -13
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.3.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -110
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -53
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
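The dupefilter rewrite below is the largest single file in this release by changed lines. The new `from_crawler()` factories in `aioscrapy/dupefilters/redis.py` read their configuration from crawler settings; a minimal sketch of pinning them in a project's settings module, using only the setting names and defaults visible in the diff below — the `DUPEFILTER_CLASS` selector is an assumption borrowed from Scrapy's convention and is not shown in this diff:

    # Hypothetical settings.py fragment; setting names taken from the diff below,
    # the class selector is assumed (Scrapy-style), not confirmed by this diff
    DUPEFILTER_CLASS = "aioscrapy.dupefilters.redis.RedisBloomDupeFilter"  # assumed selector
    SCHEDULER_DUPEFILTER_KEY = "%(spider)s:bloomfilter"  # Redis key pattern per spider
    KEEP_DUPEFILTER_DATA_ON_CLOSE = True  # keep fingerprints when the spider closes
    DUPEFILTER_DEBUG = False              # log filtered requests
    DUPEFILTER_INFO = False               # log duplicates at INFO level
    BLOOMFILTER_BIT = 30                  # bitmap size: 2**30 bits = 128MB in Redis
    BLOOMFILTER_HASH_NUMBER = 6           # hash functions per fingerprint
    DUPEFILTER_SET_KEY_TTL = 180          # TTL (seconds) of the Ex* filters' temporary SET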
aioscrapy/dupefilters/redis.py
CHANGED
@@ -1,3 +1,15 @@
+"""
+Redis-based Duplicate Filters for AioScrapy
+AioScrapy的基于Redis的重复过滤器
+
+This module provides several implementations of duplicate filters that use Redis
+for storage. It includes a simple set-based filter, a Bloom filter implementation,
+and extended versions of both that support removing fingerprints under certain
+conditions.
+此模块提供了几种使用Redis进行存储的重复过滤器实现。它包括一个简单的基于集合的过滤器、
+一个布隆过滤器实现,以及两者的扩展版本,支持在特定条件下移除指纹。
+"""
+
 from typing import Literal
 
 from aioscrapy import Request
@@ -6,7 +18,16 @@ from aioscrapy.dupefilters import DupeFilterBase
 
 
 class RedisRFPDupeFilter(DupeFilterBase):
-    """
+    """
+    Request Fingerprint duplicates filter built with Set of Redis.
+    使用Redis集合构建的请求指纹重复过滤器。
+
+    This filter uses a Redis SET to store request fingerprints. It implements
+    the DupeFilterBase interface and provides methods for checking if a request
+    has been seen before and for clearing the filter.
+    此过滤器使用Redis SET来存储请求指纹。它实现了DupeFilterBase接口,
+    并提供了检查请求是否已经被看到过以及清除过滤器的方法。
+    """
 
     def __init__(
             self,
@@ -16,179 +37,724 @@ class RedisRFPDupeFilter(DupeFilterBase):
             keep_on_close: bool = True,
             info: bool = False,
     ):
+        """
+        Initialize the Redis-based request fingerprint filter.
+        初始化基于Redis的请求指纹过滤器。
+
+        Args:
+            server: The Redis server connection.
+                    Redis服务器连接。
+            key: The Redis key to use for storing fingerprints.
+                 用于存储指纹的Redis键。
+            debug: Whether to log filtered requests.
+                   是否记录被过滤的请求。
+                   Defaults to False.
+                   默认为False。
+            keep_on_close: Whether to keep the fingerprints in Redis when the spider closes.
+                           爬虫关闭时是否保留Redis中的指纹。
+                           Defaults to True.
+                           默认为True。
+            info: Whether to log duplicate requests at INFO level.
+                  是否在INFO级别记录重复的请求。
+                  Defaults to False.
+                  默认为False。
+        """
+        # Redis server connection
+        # Redis服务器连接
         self.server = server
+
+        # Redis key for storing fingerprints
+        # 用于存储指纹的Redis键
         self.key = key
+
+        # Whether to log filtered requests
+        # 是否记录被过滤的请求
         self.debug = debug
+
+        # Whether to keep fingerprints when the spider closes
+        # 爬虫关闭时是否保留指纹
         self.keep_on_close = keep_on_close
+
+        # Whether to log duplicate requests (will be set to False after first log)
+        # 是否记录重复的请求(在第一次记录后将设置为False)
         self.logdupes: bool = True
+
+        # Whether to log duplicate requests at INFO level
+        # 是否在INFO级别记录重复的请求
         self.info: bool = info
 
     @classmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+        """
+        Create a RedisRFPDupeFilter instance from a crawler.
+        从爬虫创建RedisRFPDupeFilter实例。
+
+        This is the factory method used by AioScrapy to create the dupefilter.
+        这是AioScrapy用于创建重复过滤器的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this dupefilter.
+                     将使用此重复过滤器的爬虫。
+
+        Returns:
+            RedisRFPDupeFilter: A new RedisRFPDupeFilter instance.
+                                一个新的RedisRFPDupeFilter实例。
+        """
+        # Get Redis connection from database manager
+        # 从数据库管理器获取Redis连接
         server = db_manager.redis.queue
+
+        # Get dupefilter key pattern from settings, default to '%(spider)s:dupefilter'
+        # 从设置获取重复过滤器键模式,默认为'%(spider)s:dupefilter'
         dupefilter_key = crawler.settings.get("SCHEDULER_DUPEFILTER_KEY", '%(spider)s:dupefilter')
+
+        # Get keep_on_close setting, default to True
+        # 获取keep_on_close设置,默认为True
         keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
+
+        # Format the key with the spider name
+        # 使用爬虫名称格式化键
         key = dupefilter_key % {'spider': crawler.spider.name}
+
+        # Get debug setting, default to False
+        # 获取debug设置,默认为False
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+
+        # Get info setting, default to False
+        # 获取info设置,默认为False
         info = crawler.settings.getbool('DUPEFILTER_INFO', False)
+
+        # Create and return a new instance
+        # 创建并返回一个新实例
         instance = cls(server, key=key, debug=debug, keep_on_close=keep_on_close, info=info)
         return instance
 
-    async def request_seen(self, request: Request):
+    async def request_seen(self, request: Request) -> bool:
+        """
+        Check if a request has been seen before.
+        检查请求是否已经被请求过。
+
+        This method adds the request's fingerprint to the Redis SET and checks
+        if it was already there. If the fingerprint was already in the SET,
+        the request is considered a duplicate.
+        此方法将请求的指纹添加到Redis SET中,并检查它是否已经存在。
+        如果指纹已经在SET中,则认为请求是重复的。
+
+        Args:
+            request: The request to check.
+                     要检查的请求。
+
+        Returns:
+            bool: True if the request has been seen before, False otherwise.
+                  如果请求之前已经被看到过,则为True,否则为False。
+        """
+        # Add the fingerprint to the Redis SET and check if it was already there
+        # sadd returns 0 if the member already exists in the set
+        # 将指纹添加到Redis SET中,并检查它是否已经存在
+        # sadd在成员已经存在于集合中时返回0
         return await self.server.sadd(self.key, request.fingerprint) == 0
 
-    async def close(self, reason=''):
+    async def close(self, reason: str = ''):
+        """
+        Close the dupefilter.
+        关闭重复过滤器。
+
+        This method is called when the spider is closed. If keep_on_close is False,
+        it clears the fingerprints from Redis.
+        当爬虫关闭时调用此方法。如果keep_on_close为False,它会从Redis中清除指纹。
+
+        Args:
+            reason: The reason why the spider was closed.
+                    爬虫被关闭的原因。
+        """
+        # If keep_on_close is False, clear the fingerprints
+        # 如果keep_on_close为False,清除指纹
         if not self.keep_on_close:
             await self.clear()
 
     async def clear(self):
+        """
+        Clear all fingerprints from Redis.
+        从Redis中清除所有指纹。
+
+        This method deletes the Redis key used to store fingerprints,
+        effectively clearing the filter.
+        此方法删除用于存储指纹的Redis键,有效地清除过滤器。
+        """
+        # Delete the Redis key
+        # 删除Redis键
         await self.server.delete(self.key)
 
 
 class HashMap(object):
-
+    """
+    Simple hash map implementation for Bloom filter.
+    布隆过滤器的简单哈希映射实现。
+
+    This class implements a simple hash function that can be used by a Bloom filter
+    to map values to bit positions in the filter.
+    此类实现了一个简单的哈希函数,布隆过滤器可以使用它将值映射到过滤器中的位位置。
+    """
+
+    def __init__(self, m: int, seed: int):
+        """
+        Initialize the hash map.
+        初始化哈希映射。
+
+        Args:
+            m: The size of the bit array (should be a power of 2).
+               位数组的大小(应该是2的幂)。
+            seed: The seed value for the hash function.
+                  哈希函数的种子值。
+        """
+        # Size of the bit array
+        # 位数组的大小
         self.m = m
+
+        # Seed value for the hash function
+        # 哈希函数的种子值
         self.seed = seed
 
-    def hash(self, value):
+    def hash(self, value: str) -> int:
         """
-        Hash
-
-
+        Hash a string value to an integer.
+        将字符串值哈希为整数。
+
+        This method implements a simple hash function that converts a string
+        to an integer hash value within the range of the bit array.
+        此方法实现了一个简单的哈希函数,将字符串转换为位数组范围内的整数哈希值。
+
+        Args:
+            value: The string value to hash.
+                   要哈希的字符串值。
+
+        Returns:
+            int: The hash value, which is an integer between 0 and m-1.
+                 哈希值,是0到m-1之间的整数。
         """
+        # Initialize the return value
+        # 初始化返回值
         ret = 0
+
+        # Calculate the hash value
+        # 计算哈希值
         for i in range(len(value)):
             ret += self.seed * ret + ord(value[i])
+
+        # Ensure the hash value is within the range of the bit array
+        # 确保哈希值在位数组的范围内
         return (self.m - 1) & ret
 
 
 class BloomFilter(object):
-
+    """
+    Bloom filter implementation using Redis bitsets.
+    使用Redis位集实现的布隆过滤器。
+
+    A Bloom filter is a space-efficient probabilistic data structure that is used
+    to test whether an element is a member of a set. False positives are possible,
+    but false negatives are not.
+    布隆过滤器是一种空间效率高的概率数据结构,用于测试元素是否是集合的成员。
+    可能出现假阳性,但不会出现假阴性。
+    """
+
+    def __init__(self, server: "redis.asyncio.Redis", key: str, bit: int = 30, hash_number: int = 6):
         """
-        Initialize
-
-
-        :
-
+        Initialize the Bloom filter.
+        初始化布隆过滤器。
+
+        Args:
+            server: The Redis server connection.
+                    Redis服务器连接。
+            key: The Redis key to use for the Bloom filter.
+                 用于布隆过滤器的Redis键。
+            bit: The power of 2 to use for the bit array size (m = 2^bit).
+                 用于位数组大小的2的幂(m = 2^bit)。
+                 Defaults to 30, which gives a bit array of size 2^30 = 1,073,741,824 bits = 128MB.
+                 默认为30,这给出了大小为2^30 = 1,073,741,824位 = 128MB的位数组。
+            hash_number: The number of hash functions to use.
+                         要使用的哈希函数的数量。
+                         Defaults to 6.
+                         默认为6。
         """
-        #
+        # Calculate the bit array size (m = 2^bit)
+        # 计算位数组大小(m = 2^bit)
+        # default to 1 << 30 = 1,073,741,824 = 2^30 = 128MB
+        # max filter capacity is approximately 2^30/hash_number = 178,956,970 fingerprints
+        # 默认为1 << 30 = 1,073,741,824 = 2^30 = 128MB
+        # 最大过滤器容量约为2^30/hash_number = 178,956,970个指纹
         self.m = 1 << bit
+
+        # Generate seeds for the hash functions
+        # 生成哈希函数的种子
         self.seeds = range(hash_number)
+
+        # Redis server connection
+        # Redis服务器连接
         self.server = server
+
+        # Redis key for the Bloom filter
+        # 布隆过滤器的Redis键
         self.key = key
+
+        # Create hash maps for each seed
+        # 为每个种子创建哈希映射
         self.maps = [HashMap(self.m, seed) for seed in self.seeds]
 
-    async def exists(self, value):
+    async def exists(self, value: str) -> bool:
+        """
+        Check if a value might exist in the Bloom filter.
+        检查值是否可能存在于布隆过滤器中。
+
+        This method checks if a value might be in the set represented by the Bloom filter.
+        If it returns False, the value is definitely not in the set. If it returns True,
+        the value might be in the set (false positives are possible).
+        此方法检查值是否可能在布隆过滤器表示的集合中。
+        如果返回False,则该值肯定不在集合中。如果返回True,
+        则该值可能在集合中(可能出现假阳性)。
+
+        Args:
+            value: The value to check.
+                   要检查的值。
+
+        Returns:
+            bool: True if the value might exist in the set, False if it definitely does not.
+                  如果值可能存在于集合中,则为True;如果它肯定不存在,则为False。
+        """
+        # Empty values are never in the set
+        # 空值永远不在集合中
         if not value:
             return False
+
+        # Use a Redis pipeline to get all the bits in one round-trip
+        # 使用Redis管道在一次往返中获取所有位
         async with self.server.pipeline(transaction=True) as pipe:
+            # For each hash function, get the bit at the hashed position
+            # 对于每个哈希函数,获取哈希位置的位
             for f in self.maps:
                 offset = f.hash(value)
                 pipe.getbit(self.key, offset)
+
+            # Execute the pipeline and get the results
+            # 执行管道并获取结果
             result = await pipe.execute()
+
+        # If all bits are set, the value might be in the set
+        # 如果所有位都已设置,则该值可能在集合中
         return all(result)
 
-    async def insert(self, value):
+    async def insert(self, value: str) -> None:
         """
-
-
-
+        Insert a value into the Bloom filter.
+        将值插入布隆过滤器。
+
+        This method sets the bits in the Bloom filter corresponding to the value,
+        so that future calls to exists() for this value will return True.
+        此方法设置布隆过滤器中与值对应的位,
+        以便将来对此值调用exists()将返回True。
+
+        Args:
+            value: The value to insert.
+                   要插入的值。
         """
+        # Use a Redis pipeline to set all the bits in one round-trip
+        # 使用Redis管道在一次往返中设置所有位
         async with self.server.pipeline(transaction=True) as pipe:
+            # For each hash function, set the bit at the hashed position
+            # 对于每个哈希函数,设置哈希位置的位
            for f in self.maps:
                 offset = f.hash(value)
                 pipe.setbit(self.key, offset, 1)
+
+            # Execute the pipeline
+            # 执行管道
             await pipe.execute()
 
 
 class RedisBloomDupeFilter(RedisRFPDupeFilter):
-    """
-
-
+    """
+    Bloom filter-based duplicate filter built with Redis bitmaps.
+    使用Redis位图构建的基于布隆过滤器的重复过滤器。
+
+    This filter uses a Bloom filter implemented with Redis bitmaps to store
+    request fingerprints. It is more space-efficient than the simple SET-based
+    filter, but has a small probability of false positives.
+    此过滤器使用使用Redis位图实现的布隆过滤器来存储请求指纹。
+    它比简单的基于SET的过滤器更节省空间,但有小概率出现假阳性。
+    """
+
+    def __init__(self, server: "redis.asyncio.Redis", key: str, debug: bool, bit: int,
+                 hash_number: int, keep_on_close: bool, info: bool):
+        """
+        Initialize the Bloom filter-based duplicate filter.
+        初始化基于布隆过滤器的重复过滤器。
+
+        Args:
+            server: The Redis server connection.
+                    Redis服务器连接。
+            key: The Redis key to use for the Bloom filter.
+                 用于布隆过滤器的Redis键。
+            debug: Whether to log filtered requests.
+                   是否记录被过滤的请求。
+            bit: The power of 2 to use for the bit array size (m = 2^bit).
+                 用于位数组大小的2的幂(m = 2^bit)。
+            hash_number: The number of hash functions to use.
+                         要使用的哈希函数的数量。
+            keep_on_close: Whether to keep the fingerprints in Redis when the spider closes.
+                           爬虫关闭时是否保留Redis中的指纹。
+            info: Whether to log duplicate requests at INFO level.
+                  是否在INFO级别记录重复的请求。
+        """
+        # Initialize the parent class
+        # 初始化父类
         super().__init__(server, key, debug, keep_on_close, info)
+
+        # Store Bloom filter parameters
+        # 存储布隆过滤器参数
         self.bit = bit
         self.hash_number = hash_number
+
+        # Create the Bloom filter
+        # 创建布隆过滤器
         self.bf = BloomFilter(server, self.key, bit, hash_number)
 
     @classmethod
     async def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+        """
+        Create a RedisBloomDupeFilter instance from a crawler.
+        从爬虫创建RedisBloomDupeFilter实例。
+
+        This is the factory method used by AioScrapy to create the dupefilter.
+        这是AioScrapy用于创建重复过滤器的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this dupefilter.
+                     将使用此重复过滤器的爬虫。
+
+        Returns:
+            RedisBloomDupeFilter: A new RedisBloomDupeFilter instance.
+                                  一个新的RedisBloomDupeFilter实例。
+        """
+        # Get Redis connection from database manager
+        # 从数据库管理器获取Redis连接
         server = db_manager.redis.queue
+
+        # Get dupefilter key pattern from settings, default to '%(spider)s:bloomfilter'
+        # 从设置获取重复过滤器键模式,默认为'%(spider)s:bloomfilter'
         dupefilter_key = crawler.settings.get("SCHEDULER_DUPEFILTER_KEY", '%(spider)s:bloomfilter')
+
+        # Get keep_on_close setting, default to True
+        # 获取keep_on_close设置,默认为True
         keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
+
+        # Format the key with the spider name
+        # 使用爬虫名称格式化键
         key = dupefilter_key % {'spider': crawler.spider.name}
+
+        # Get debug setting, default to False
+        # 获取debug设置,默认为False
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+
+        # Get info setting, default to False
+        # 获取info设置,默认为False
         info = crawler.settings.getbool('DUPEFILTER_INFO', False)
+
+        # Get Bloom filter parameters from settings
+        # 从设置获取布隆过滤器参数
         bit = crawler.settings.getint('BLOOMFILTER_BIT', 30)
         hash_number = crawler.settings.getint('BLOOMFILTER_HASH_NUMBER', 6)
-
+
+        # Create and return a new instance
+        # 创建并返回一个新实例
+        return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number,
+                   keep_on_close=keep_on_close, info=info)
 
     async def request_seen(self, request: Request) -> bool:
+        """
+        Check if a request has been seen before.
+        检查请求是否已经被看到过。
+
+        This method checks if the request's fingerprint exists in the Bloom filter.
+        If it does, the request is considered a duplicate. If not, the fingerprint
+        is added to the Bloom filter.
+        此方法检查请求的指纹是否存在于布隆过滤器中。
+        如果存在,则认为请求是重复的。如果不存在,则将指纹添加到布隆过滤器中。
+
+        Args:
+            request: The request to check.
+                     要检查的请求。
+
+        Returns:
+            bool: True if the request has been seen before, False otherwise.
+                  如果请求之前已经被看到过,则为True,否则为False。
+        """
+        # Check if the fingerprint exists in the Bloom filter
+        # 检查指纹是否存在于布隆过滤器中
         fp = await self.bf.exists(request.fingerprint)
+
+        # If the fingerprint exists, the request is a duplicate
+        # 如果指纹存在,则请求是重复的
         if fp:
             return True
+
+        # If not, add the fingerprint to the Bloom filter
+        # 如果不存在,则将指纹添加到布隆过滤器中
         await self.bf.insert(request.fingerprint)
+
+        # The request has not been seen before
+        # 请求之前未被看到过
         return False
 
 
 class ExRedisBloomDupeFilter(RedisBloomDupeFilter):
-
-
+    """
+    Extended Bloom filter-based duplicate filter with temporary SET storage.
+    具有临时SET存储的扩展基于布隆过滤器的重复过滤器。
+
+    This filter extends the RedisBloomDupeFilter by adding a temporary SET to store
+    fingerprints of requests that are currently being processed. This allows for
+    removing fingerprints from the filter if the request fails, which can be useful
+    for retrying failed requests.
+    此过滤器通过添加一个临时SET来扩展RedisBloomDupeFilter,用于存储当前正在处理的
+    请求的指纹。这允许在请求失败时从过滤器中删除指纹,这对于重试失败的请求很有用。
+    """
+
+    def __init__(self, server: "redis.asyncio.Redis", key: str, key_set: str, ttl: int,
+                 debug: bool, bit: int, hash_number: int, keep_on_close: bool, info: bool):
+        """
+        Initialize the extended Bloom filter-based duplicate filter.
+        初始化扩展的基于布隆过滤器的重复过滤器。
+
+        Args:
+            server: The Redis server connection.
+                    Redis服务器连接。
+            key: The Redis key to use for the Bloom filter.
+                 用于布隆过滤器的Redis键。
+            key_set: The Redis key to use for the temporary SET.
+                     用于临时SET的Redis键。
+            ttl: The time-to-live in seconds for the temporary SET.
+                 临时SET的生存时间(秒)。
+            debug: Whether to log filtered requests.
+                   是否记录被过滤的请求。
+            bit: The power of 2 to use for the bit array size (m = 2^bit).
+                 用于位数组大小的2的幂(m = 2^bit)。
+            hash_number: The number of hash functions to use.
+                         要使用的哈希函数的数量。
+            keep_on_close: Whether to keep the fingerprints in Redis when the spider closes.
+                           爬虫关闭时是否保留Redis中的指纹。
+            info: Whether to log duplicate requests at INFO level.
+                  是否在INFO级别记录重复的请求。
+        """
+        # Initialize the parent class
+        # 初始化父类
         super().__init__(server, key, debug, bit, hash_number, keep_on_close, info)
+
+        # Redis key for the temporary SET
+        # 临时SET的Redis键
         self.key_set = key_set
+
+        # Time-to-live for the temporary SET
+        # 临时SET的生存时间
         self.ttl = ttl
 
     @classmethod
     async def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+        """
+        Create an ExRedisBloomDupeFilter instance from a crawler.
+        从爬虫创建ExRedisBloomDupeFilter实例。
+
+        This is the factory method used by AioScrapy to create the dupefilter.
+        这是AioScrapy用于创建重复过滤器的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this dupefilter.
+                     将使用此重复过滤器的爬虫。
+
+        Returns:
+            ExRedisBloomDupeFilter: A new ExRedisBloomDupeFilter instance.
+                                    一个新的ExRedisBloomDupeFilter实例。
+        """
+        # Get Redis connection from database manager
+        # 从数据库管理器获取Redis连接
         server = db_manager.redis.queue
+
+        # Get dupefilter key pattern from settings, default to '%(spider)s:bloomfilter'
+        # 从设置获取重复过滤器键模式,默认为'%(spider)s:bloomfilter'
         dupefilter_key = crawler.settings.get("SCHEDULER_DUPEFILTER_KEY", '%(spider)s:bloomfilter')
+
+        # Get keep_on_close setting, default to True
+        # 获取keep_on_close设置,默认为True
         keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
+
+        # Format the key with the spider name
+        # 使用爬虫名称格式化键
         key = dupefilter_key % {'spider': crawler.spider.name}
+
+        # Get debug setting, default to False
+        # 获取debug设置,默认为False
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+
+        # Get info setting, default to False
+        # 获取info设置,默认为False
         info = crawler.settings.getbool('DUPEFILTER_INFO', False)
+
+        # Get Bloom filter parameters from settings
+        # 从设置获取布隆过滤器参数
         bit = crawler.settings.getint('BLOOMFILTER_BIT', 30)
         hash_number = crawler.settings.getint('BLOOMFILTER_HASH_NUMBER', 6)
+
+        # Get TTL for the temporary SET, default to 180 seconds
+        # 获取临时SET的TTL,默认为180秒
         ttl = crawler.settings.getint('DUPEFILTER_SET_KEY_TTL', 180)
-
-
+
+        # Create and return a new instance
+        # 创建并返回一个新实例
+        return cls(server, key=key, key_set=key + "_set", ttl=ttl, debug=debug, bit=bit,
+                   hash_number=hash_number, keep_on_close=keep_on_close, info=info)
 
     async def request_seen(self, request: Request) -> bool:
+        """
+        Check if a request has been seen before.
+        检查请求是否已经被看到过。
+
+        This method first checks if the request's fingerprint exists in the Bloom filter.
+        If it does, the request is considered a duplicate. If not, the fingerprint is
+        added to the temporary SET with a TTL, but not yet to the Bloom filter.
+        此方法首先检查请求的指纹是否存在于布隆过滤器中。
+        如果存在,则认为请求是重复的。如果不存在,则将指纹添加到具有TTL的临时SET中,
+        但尚未添加到布隆过滤器中。
+
+        Args:
+            request: The request to check.
+                     要检查的请求。
+
+        Returns:
+            bool: True if the request has been seen before, False otherwise.
+                  如果请求之前已经被看到过,则为True,否则为False。
+        """
+        # Check if the fingerprint exists in the Bloom filter
+        # 检查指纹是否存在于布隆过滤器中
         fp = await self.bf.exists(request.fingerprint)
+
+        # If the fingerprint exists in the Bloom filter, the request is a duplicate
+        # 如果指纹存在于布隆过滤器中,则请求是重复的
         if fp:
             return True
+
+        # If not, add the fingerprint to the temporary SET with a TTL
+        # 如果不存在,则将指纹添加到具有TTL的临时SET中
         async with self.server.pipeline() as pipe:
             pipe.sadd(self.key_set, request.fingerprint)
             pipe.expire(self.key_set, self.ttl)
             ret, _ = await pipe.execute()
+
+        # Return True if the fingerprint was already in the temporary SET (ret == 0)
+        # 如果指纹已经在临时SET中,则返回True(ret == 0)
         return ret == 0
 
     async def done(
             self,
             request: Request,
             done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
-    ):
+    ) -> None:
+        """
+        Handle the completion of a request.
+        处理请求的完成。
+
+        This method is called when a request has been processed. It handles the
+        fingerprint differently based on the done_type:
+        - For "request_ok" or "request_err", it removes the fingerprint from the temporary SET.
+        - For "parse_ok", it adds the fingerprint to the Bloom filter.
+        当请求已处理时调用此方法。它根据done_type不同地处理指纹:
+        - 对于"request_ok"或"request_err",它从临时SET中删除指纹。
+        - 对于"parse_ok",它将指纹添加到布隆过滤器中。
+
+        Args:
+            request: The request that has been processed.
+                     已处理的请求。
+            done_type: The status of the request processing.
+                       请求处理的状态。
+                       Can be one of: "request_ok", "request_err", "parse_ok", "parse_err".
+                       可以是以下之一:"request_ok"、"request_err"、"parse_ok"、"parse_err"。
+        """
+        # If the request was successful or failed at the request level,
+        # remove the fingerprint from the temporary SET
+        # 如果请求成功或在请求级别失败,则从临时SET中删除指纹
         if done_type == "request_ok" or done_type == "request_err":
             await self.server.srem(self.key_set, request.fingerprint)
+
+        # If the request was successfully parsed, add the fingerprint to the Bloom filter
+        # 如果请求成功解析,则将指纹添加到布隆过滤器中
         elif done_type == "parse_ok":
             await self.bf.insert(request.fingerprint)
 
-    async def close(self, reason=''):
+    async def close(self, reason: str = ''):
+        """
+        Close the dupefilter.
+        关闭重复过滤器。
+
+        This method is called when the spider is closed. If keep_on_close is False,
+        it clears the Bloom filter. It also deletes the temporary SET.
+        当爬虫关闭时调用此方法。如果keep_on_close为False,它会清除布隆过滤器。
+        它还会删除临时SET。
+
+        Args:
+            reason: The reason why the spider was closed.
+                    爬虫被关闭的原因。
+        """
+        # If keep_on_close is False, clear the Bloom filter
+        # 如果keep_on_close为False,清除布隆过滤器
         if not self.keep_on_close:
             await self.clear()
+
+        # Delete the temporary SET
+        # 删除临时SET
         await self.server.delete(self.key_set)
 
 
 class ExRedisRFPDupeFilter(RedisRFPDupeFilter):
+    """
+    Extended Redis SET-based duplicate filter with fingerprint removal.
+    具有指纹移除功能的扩展Redis SET基于的重复过滤器。
+
+    This filter extends the RedisRFPDupeFilter by adding the ability to remove
+    fingerprints from the filter if the request fails, which can be useful for
+    retrying failed requests.
+    此过滤器通过添加在请求失败时从过滤器中删除指纹的功能来扩展RedisRFPDupeFilter,
+    这对于重试失败的请求很有用。
+    """
 
     async def done(
             self,
             request: Request,
             done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
-    ):
-
+    ) -> None:
+        """
+        Handle the completion of a request.
+        处理请求的完成。
+
+        This method is called when a request has been processed. It removes the
+        fingerprint from the Redis SET if the request or parsing failed, allowing
+        the request to be retried.
+        当请求已处理时调用此方法。如果请求或解析失败,它会从Redis SET中删除指纹,
+        允许重试请求。
+
+        Args:
+            request: The request that has been processed.
+                     已处理的请求。
+            done_type: The status of the request processing.
+                       请求处理的状态。
+                       Can be one of: "request_ok", "request_err", "parse_ok", "parse_err".
+                       可以是以下之一:"request_ok"、"request_err"、"parse_ok"、"parse_err"。
+        """
+        # When the request or parsing fails, remove the fingerprint from the Redis SET
+        # 当请求失败或解析失败时,从Redis的Set中移除指纹
         if done_type == "request_err" or done_type == "parse_err":
             await self.server.srem(self.key, request.fingerprint)
 
 
+# Aliases for backward compatibility
+# 用于向后兼容的别名
 RFPDupeFilter = RedisRFPDupeFilter
 ExRFPDupeFilter = ExRedisRFPDupeFilter
 BloomDupeFilter = RedisBloomDupeFilter