aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/db/aioredis.py
CHANGED
@@ -1,3 +1,14 @@
+"""
+Redis connection pool manager for aioscrapy.
+aioscrapy的Redis连接池管理器。
+
+This module provides classes for managing Redis connection pools in aioscrapy.
+It includes a pool manager for creating and managing Redis clients, and an executor
+for convenient access to Redis commands.
+此模块提供了在aioscrapy中管理Redis连接池的类。
+它包括一个用于创建和管理Redis客户端的池管理器,以及一个用于方便访问Redis命令的执行器。
+"""
+
 from redis.asyncio import BlockingConnectionPool, Redis

 import aioscrapy
@@ -5,90 +16,328 @@ from aioscrapy.db.absmanager import AbsDBPoolManager


 class RedisExecutor:
+    """
+    Executor for Redis commands.
+    Redis命令的执行器。
+
+    This class provides a convenient way to execute Redis commands on a specific
+    Redis client. It dynamically forwards command calls to the underlying Redis client.
+    此类提供了一种在特定Redis客户端上执行Redis命令的便捷方式。
+    它动态地将命令调用转发到底层Redis客户端。
+    """

     def __init__(self, alias: str, pool_manager: "AioRedisPoolManager"):
+        """
+        Initialize a RedisExecutor.
+        初始化RedisExecutor。
+
+        Args:
+            alias: The alias of the Redis client to use.
+                   要使用的Redis客户端的别名。
+            pool_manager: The Redis pool manager that manages the Redis client.
+                          管理Redis客户端的Redis池管理器。
+        """
         self.alias = alias
         self.pool_manager = pool_manager

     def __getattr__(self, command: str):
+        """
+        Dynamically forward command calls to the Redis client.
+        动态地将命令调用转发到Redis客户端。
+
+        This method allows calling Redis commands directly on the executor:
+        executor.get('key'), executor.set('key', 'value'), etc.
+        此方法允许直接在执行器上调用Redis命令:
+        executor.get('key')、executor.set('key', 'value')等。
+
+        Args:
+            command: The Redis command to execute.
+                     要执行的Redis命令。
+
+        Returns:
+            The method of the Redis client corresponding to the command.
+            对应于命令的Redis客户端的方法。
+        """
         redis_pool: Redis = self.pool_manager.get_pool(self.alias)
         return getattr(redis_pool, command)


 class AioRedisPoolManager(AbsDBPoolManager):
+    """
+    Pool manager for Redis connections.
+    Redis连接的池管理器。
+
+    This class manages Redis connection pools and clients. It implements the
+    AbsDBPoolManager interface for Redis connections, providing methods for
+    creating, accessing, and closing Redis clients.
+    此类管理Redis连接池和客户端。它为Redis连接实现了AbsDBPoolManager接口,
+    提供了创建、访问和关闭Redis客户端的方法。
+    """
+
+    # Dictionary to store Redis clients by alias
+    # 按别名存储Redis客户端的字典
     _clients = {}

     async def create(self, alias: str, params: dict) -> Redis:
-        """
+        """
+        Create a new Redis client.
+        创建新的Redis客户端。
+
+        This method creates a new Redis client with the given alias and parameters.
+        If a client with the given alias already exists, it returns the existing client.
+        此方法使用给定的别名和参数创建新的Redis客户端。
+        如果具有给定别名的客户端已经存在,则返回现有客户端。
+
+        Args:
+            alias: The alias for the new Redis client.
+                   新Redis客户端的别名。
+            params: The parameters for creating the Redis client. Can include:
+                    创建Redis客户端的参数。可以包括:
+                    - url: Redis connection URL (e.g., 'redis://user:password@host:port/db')
+                      Redis连接URL(例如,'redis://user:password@host:port/db')
+                    - host: Redis server host
+                      Redis服务器主机
+                    - port: Redis server port
+                      Redis服务器端口
+                    - db: Redis database number
+                      Redis数据库编号
+                    - password: Redis server password
+                      Redis服务器密码
+                    - socket_connect_timeout: Connection timeout in seconds
+                      连接超时(秒)
+                    - and other parameters accepted by BlockingConnectionPool
+                      以及BlockingConnectionPool接受的其他参数
+
+        Returns:
+            Redis: The created or existing Redis client.
+                   创建的或现有的Redis客户端。
+        """
+        # Return existing client if it exists
+        # 如果客户端已存在,则返回现有客户端
         if alias in self._clients:
             return self._clients[alias]

+        # Make a copy of params to avoid modifying the original
+        # 复制params以避免修改原始参数
         params = params.copy()
+
+        # Extract URL if provided
+        # 如果提供了URL,则提取它
         url = params.pop('url', None)
+
+        # Set default connection timeout
+        # 设置默认连接超时
         params.setdefault('socket_connect_timeout', 30)
+
+        # Create connection pool from URL or parameters
+        # 从URL或参数创建连接池
         if url:
             connection_pool = BlockingConnectionPool.from_url(url, **params)
         else:
             connection_pool = BlockingConnectionPool(**params)
+
+        # Create Redis client with the connection pool
+        # 使用连接池创建Redis客户端
         redis = Redis(connection_pool=connection_pool)
+
+        # Store and return the client
+        # 存储并返回客户端
         return self._clients.setdefault(alias, redis)

-    def get_pool(self, alias: str):
-        """
+    def get_pool(self, alias: str) -> Redis:
+        """
+        Get a Redis client by its alias.
+        通过别名获取Redis客户端。
+
+        This method retrieves an existing Redis client with the given alias.
+        此方法检索具有给定别名的现有Redis客户端。
+
+        Args:
+            alias: The alias of the Redis client to retrieve.
+                   要检索的Redis客户端的别名。
+
+        Returns:
+            Redis: The Redis client with the given alias.
+                   具有给定别名的Redis客户端。
+
+        Raises:
+            AssertionError: If no Redis client exists with the given alias.
+                            如果不存在具有给定别名的Redis客户端。
+        """
         redis_pool: Redis = self._clients.get(alias)
         assert redis_pool is not None, f"Dont create the redis client named {alias}"
         return redis_pool

     def executor(self, alias: str) -> RedisExecutor:
-        """
+        """
+        Get a RedisExecutor for a specific Redis client.
+        获取特定Redis客户端的RedisExecutor。
+
+        This method creates a RedisExecutor that provides a convenient way to
+        execute commands on the Redis client with the given alias.
+        此方法创建一个RedisExecutor,提供了一种在具有给定别名的Redis客户端上
+        执行命令的便捷方式。
+
+        Args:
+            alias: The alias of the Redis client to use.
+                   要使用的Redis客户端的别名。
+
+        Returns:
+            RedisExecutor: An executor for the Redis client.
+                           Redis客户端的执行器。
+        """
         return RedisExecutor(alias, self)

     async def close(self, alias: str):
-        """
+        """
+        Close a specific Redis client.
+        关闭特定的Redis客户端。
+
+        This method closes the Redis client with the given alias and removes it
+        from the managed clients.
+        此方法关闭具有给定别名的Redis客户端,并将其从管理的客户端中移除。
+
+        Args:
+            alias: The alias of the Redis client to close.
+                   要关闭的Redis客户端的别名。
+
+        Returns:
+            None
+        """
+        # Remove the client from the managed clients
+        # 从管理的客户端中移除客户端
         redis = self._clients.pop(alias, None)
+
+        # Close the client if it exists
+        # 如果客户端存在,则关闭它
         if redis:
+            # Close the Redis client
+            # 关闭Redis客户端
             await redis.close()
+
+            # Disconnect the connection pool
+            # 断开连接池
             await redis.connection_pool.disconnect()

     async def close_all(self):
-        """
+        """
+        Close all Redis clients.
+        关闭所有Redis客户端。
+
+        This method closes all Redis clients managed by this manager.
+        此方法关闭此管理器管理的所有Redis客户端。
+
+        Returns:
+            None
+        """
+        # Create a copy of the keys to avoid modifying the dictionary during iteration
+        # 创建键的副本,以避免在迭代期间修改字典
         for alias in list(self._clients.keys()):
             await self.close(alias)

     async def from_dict(self, db_args: dict):
-        """
+        """
+        Initialize Redis clients from a configuration dictionary.
+        从配置字典初始化Redis客户端。
+
+        This method creates Redis clients based on the configuration in db_args.
+        此方法根据db_args中的配置创建Redis客户端。
+
+        Args:
+            db_args: A dictionary mapping aliases to Redis connection parameters.
+                     将别名映射到Redis连接参数的字典。
+                     Example:
+                     {
+                         'default': {'url': 'redis://localhost:6379/0'},
+                         'cache': {'host': 'cache.example.com', 'port': 6379, 'db': 1}
+                     }
+
+        Returns:
+            None
+        """
         for alias, redis_args in db_args.items():
             await self.create(alias, redis_args)

     async def from_settings(self, settings: aioscrapy.Settings):
-        """
+        """
+        Initialize Redis clients from aioscrapy settings.
+        从aioscrapy设置初始化Redis客户端。
+
+        This method creates Redis clients based on the REDIS_ARGS setting.
+        此方法根据REDIS_ARGS设置创建Redis客户端。
+
+        The REDIS_ARGS setting should be a dictionary mapping aliases to Redis
+        connection parameters, for example:
+        REDIS_ARGS设置应该是一个将别名映射到Redis连接参数的字典,例如:
+
+        ```python
+        REDIS_ARGS = {
+            'default': {'url': 'redis://localhost:6379/0'},
+            'cache': {'host': 'cache.example.com', 'port': 6379, 'db': 1}
+        }
+        ```
+
+        Args:
+            settings: The aioscrapy settings object.
+                      aioscrapy设置对象。
+
+        Returns:
+            None
+        """
         for alias, redis_args in settings.getdict('REDIS_ARGS').items():
             await self.create(alias, redis_args)


+# Singleton instance of AioRedisPoolManager
+# AioRedisPoolManager的单例实例
 redis_manager = AioRedisPoolManager()

+# Example usage
+# 示例用法
 if __name__ == '__main__':
     import asyncio


     async def test():
+        """
+        Test function demonstrating the usage of the Redis manager.
+        演示Redis管理器用法的测试函数。
+        """
+        # Create a Redis client with alias 'default'
+        # 创建别名为'default'的Redis客户端
         await redis_manager.create('default', {
             'url': 'redis://@192.168.234.128:6379/9',
         })
+
+        # Get a Redis executor for the 'default' client
+        # 获取'default'客户端的Redis执行器
         redis = redis_manager.executor('default')
+
+        # Add a value to a sorted set
+        # 向有序集合添加一个值
         print(await redis.zadd('key1', {'value': 2}))

+        # Use a pipeline to execute multiple commands atomically
+        # 使用管道原子地执行多个命令
         async with redis.pipeline(transaction=True) as pipe:
-
+            # Get the first element and remove it in one transaction
+            # 在一个事务中获取第一个元素并删除它
+            results, _ = await (
                 pipe.zrange('key1', 0, 0)
                 .zremrangebyrank('key1', 0, 0)
                 .execute()
             )

+        # Print the results
+        # 打印结果
         print(results)
+
+        # Close all Redis clients
+        # 关闭所有Redis客户端
         await redis_manager.close_all()


-    #
-
+    # Run the test function
+    # 运行测试函数
+    # asyncio.run(test())  # For Python 3.7+
+    asyncio.get_event_loop().run_until_complete(test())  # For Python 3.6
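For reference, a minimal usage sketch of the pool manager documented above; the alias name and connection URL are placeholders, and the snippet assumes redis-py's asyncio client is installed alongside aioscrapy.

```python
import asyncio

from aioscrapy.db.aioredis import redis_manager


async def main():
    # Register a client under the 'default' alias; the dict has the same
    # shape as one entry of the REDIS_ARGS setting shown in the docstrings.
    await redis_manager.create('default', {'url': 'redis://localhost:6379/0'})

    # executor() returns a thin wrapper that forwards any Redis command
    # to the underlying redis.asyncio.Redis client.
    redis = redis_manager.executor('default')
    await redis.set('greeting', 'hello')
    print(await redis.get('greeting'))

    # Close every managed client when done.
    await redis_manager.close_all()


asyncio.run(main())
```

Because the executor simply forwards attribute access, any coroutine method available on `redis.asyncio.Redis` can be called on it without the executor needing to wrap each command explicitly.
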
aioscrapy/dupefilters/__init__.py
CHANGED
@@ -1,3 +1,14 @@
+"""
+Duplicate Filter Base Module for AioScrapy
+AioScrapy的重复过滤器基础模块
+
+This module provides the abstract base class for duplicate filters in AioScrapy.
+Duplicate filters are used to avoid crawling the same URL multiple times by
+tracking request fingerprints.
+此模块提供了AioScrapy中重复过滤器的抽象基类。
+重复过滤器用于通过跟踪请求指纹来避免多次爬取相同的URL。
+"""
+
 from typing import Literal
 from abc import ABCMeta, abstractmethod

@@ -6,30 +17,104 @@ from aioscrapy.utils.log import logger


 class DupeFilterBase(metaclass=ABCMeta):
-    """
+    """
+    Abstract base class for request fingerprint duplicate filters.
+    请求指纹重复过滤器的抽象基类。
+
+    This class defines the interface that all duplicate filters must implement.
+    Duplicate filters are used to avoid crawling the same URL multiple times by
+    tracking request fingerprints.
+    此类定义了所有重复过滤器必须实现的接口。
+    重复过滤器用于通过跟踪请求指纹来避免多次爬取相同的URL。
+    """

     @classmethod
     @abstractmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
-        """
+        """
+        Create a duplicate filter instance from a crawler.
+        从爬虫创建重复过滤器实例。
+
+        This is the factory method used by AioScrapy to create the dupefilter.
+        这是AioScrapy用于创建重复过滤器的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this dupefilter.
+                     将使用此重复过滤器的爬虫。
+
+        Returns:
+            DupeFilterBase: A new dupefilter instance.
+                            一个新的重复过滤器实例。
+        """
+        pass

     @abstractmethod
     async def request_seen(self, request: Request) -> bool:
-        """
+        """
+        Check if a request has been seen before.
+        检查请求是否已经被看到过。
+
+        This method checks if the request's fingerprint is in the set of seen
+        fingerprints. If it is, the request is considered a duplicate.
+        此方法检查请求的指纹是否在已见过的指纹集合中。如果是,则认为请求是重复的。
+
+        Args:
+            request: The request to check.
+                     要检查的请求。
+
+        Returns:
+            bool: True if the request has been seen before, False otherwise.
+                  如果请求之前已经被看到过,则为True,否则为False。
+        """
+        pass

     @abstractmethod
     async def close(self, reason: str = '') -> None:
-        """
+        """
+        Close the dupefilter.
+        关闭过滤器。
+
+        This method is called when the spider is closed. It should clean up
+        any resources used by the dupefilter.
+        当爬虫关闭时调用此方法。它应该清理重复过滤器使用的任何资源。
+
+        Args:
+            reason: The reason why the spider was closed.
+                    爬虫被关闭的原因。
+        """
+        pass

     def log(self, request: Request, spider: Spider):
+        """
+        Log a filtered duplicate request.
+        记录被过滤的重复请求。
+
+        This method logs information about duplicate requests based on the
+        logging settings (info, debug, logdupes). It also increments the
+        dupefilter/filtered stats counter.
+        此方法根据日志设置(info、debug、logdupes)记录有关重复请求的信息。
+        它还增加dupefilter/filtered统计计数器。
+
+        Args:
+            request: The duplicate request that was filtered.
+                     被过滤的重复请求。
+            spider: The spider that generated the request.
+                    生成请求的爬虫。
+        """
+        # Log at INFO level if info is True
+        # 如果info为True,则在INFO级别记录
         if self.info:
             logger.info("Filtered duplicate request: %(request)s" % {
                 'request': request.meta.get('dupefilter_msg') or request
             })
+        # Log at DEBUG level if debug is True
+        # 如果debug为True,则在DEBUG级别记录
         elif self.debug:
             logger.debug("Filtered duplicate request: %(request)s" % {
                 'request': request.meta.get('dupefilter_msg') or request
             })
+        # Log the first duplicate at DEBUG level and disable further logging
+        # 在DEBUG级别记录第一个重复项并禁用进一步的日志记录
         elif self.logdupes:
             msg = ("Filtered duplicate request: %(request)s"
                    " - no more duplicates will be shown"
@@ -37,6 +122,8 @@ class DupeFilterBase(metaclass=ABCMeta):
             logger.debug(msg % {'request': request.meta.get('dupefilter_msg') or request})
             self.logdupes = False

+        # Increment the dupefilter/filtered stats counter
+        # 增加dupefilter/filtered统计计数器
         spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)

     async def done(
@@ -44,4 +131,22 @@ class DupeFilterBase(metaclass=ABCMeta):
             request: Request,
             done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
     ) -> None:
-        """
+        """
+        Control the removal of fingerprints based on the done_type status.
+        根据done_type的状态控制指纹的移除。
+
+        This method can be implemented by subclasses to handle the removal of
+        fingerprints from the filter based on the status of the request processing.
+        子类可以实现此方法,以根据请求处理的状态处理从过滤器中移除指纹。
+
+        Args:
+            request: The request that has been processed.
+                     已处理的请求。
+            done_type: The status of the request processing.
+                       请求处理的状态。
+                       Can be one of: "request_ok", "request_err", "parse_ok", "parse_err".
+                       可以是以下之一:"request_ok"、"request_err"、"parse_ok"、"parse_err"。
+        """
+        # Default implementation does nothing
+        # 默认实现不执行任何操作
+        pass
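As a reading aid, here is a minimal in-memory sketch of the DupeFilterBase interface described above. It is not part of aioscrapy; the class name is hypothetical, and it simply reuses the DUPEFILTER_DEBUG/DUPEFILTER_INFO settings read by the built-in filters.

```python
from typing import Set

from aioscrapy.dupefilters import DupeFilterBase


class MemoryRFPDupeFilter(DupeFilterBase):
    """Hypothetical dupefilter that keeps fingerprints only in memory."""

    def __init__(self, debug: bool = False, info: bool = False):
        self.debug = debug
        self.info = info
        self.logdupes = True                 # attributes expected by log()
        self.fingerprints: Set[str] = set()

    @classmethod
    def from_crawler(cls, crawler):
        # Same settings keys as the built-in filters.
        return cls(
            debug=crawler.settings.getbool('DUPEFILTER_DEBUG'),
            info=crawler.settings.getbool('DUPEFILTER_INFO'),
        )

    async def request_seen(self, request) -> bool:
        # A request is a duplicate if its fingerprint was already recorded.
        if request.fingerprint in self.fingerprints:
            return True
        self.fingerprints.add(request.fingerprint)
        return False

    async def close(self, reason: str = '') -> None:
        # Nothing is persisted, so closing just drops the in-memory set.
        self.fingerprints.clear()
```
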
aioscrapy/dupefilters/disk.py
CHANGED
@@ -1,3 +1,14 @@
+"""
+Disk-based Request Fingerprint Duplicate Filter for AioScrapy
+AioScrapy的基于磁盘的请求指纹重复过滤器
+
+This module provides a duplicate filter that stores request fingerprints on disk,
+allowing for persistence between crawler runs. It implements the DupeFilterBase
+interface and is used to avoid crawling the same URL multiple times.
+此模块提供了一个将请求指纹存储在磁盘上的重复过滤器,允许在爬虫运行之间保持持久性。
+它实现了DupeFilterBase接口,用于避免多次爬取相同的URL。
+"""
+
 import os
 from typing import Optional, Set

@@ -6,14 +17,51 @@ from aioscrapy.dupefilters import DupeFilterBase


 class DiskRFPDupeFilter(DupeFilterBase):
-    """
+    """
+    Request Fingerprint duplicates filter built with Disk storage.
+    基于磁盘存储构建的请求指纹重复过滤器。
+
+    This filter stores request fingerprints in memory and on disk, allowing for
+    persistence between crawler runs. It implements the DupeFilterBase interface
+    and is used to avoid crawling the same URL multiple times.
+    此过滤器将请求指纹存储在内存和磁盘上,允许在爬虫运行之间保持持久性。
+    它实现了DupeFilterBase接口,用于避免多次爬取相同的URL。
+    """

     def __init__(self, path: Optional[str] = None, debug: bool = False, info: bool = False):
-
+        """
+        Initialize the disk-based request fingerprint filter.
+        初始化基于磁盘的请求指纹过滤器。
+
+        Args:
+            path: Directory path where to store the requests.seen file.
+                  存储requests.seen文件的目录路径。
+                  If None, no persistence will be used.
+                  如果为None,则不会使用持久性存储。
+            debug: Whether to log filtered requests.
+                   是否记录被过滤的请求。
+            info: Whether to log duplicate requests.
+                  是否记录重复的请求。
+        """
+        # File handle for the requests.seen file
+        # requests.seen文件的文件句柄
+        self.file: Optional[object] = None
+
+        # Whether to log filtered requests
+        # 是否记录被过滤的请求
         self.debug = debug
+
+        # Set of request fingerprints
+        # 请求指纹的集合
         self.fingerprints: Set = set()
+
+        # Whether to log duplicate requests
+        # 是否记录重复的请求
         self.logdupes: bool = True
         self.info: bool = info
+
+        # If a path is provided, open the requests.seen file and load existing fingerprints
+        # 如果提供了路径,则打开requests.seen文件并加载现有的指纹
         if path:
             self.file = open(os.path.join(path, 'requests.seen'), 'a+')
             self.file.seek(0)
@@ -21,24 +69,98 @@ class DiskRFPDupeFilter(DupeFilterBase):

     @classmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+        """
+        Create a DiskRFPDupeFilter instance from a crawler.
+        从爬虫创建DiskRFPDupeFilter实例。
+
+        This is the factory method used by AioScrapy to create the dupefilter.
+        这是AioScrapy用于创建重复过滤器的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this dupefilter.
+                     将使用此重复过滤器的爬虫。
+
+        Returns:
+            DiskRFPDupeFilter: A new DiskRFPDupeFilter instance.
+                               一个新的DiskRFPDupeFilter实例。
+        """
+        # Get debug setting from crawler settings
+        # 从爬虫设置获取debug设置
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
+
+        # Get info setting from crawler settings
+        # 从爬虫设置获取info设置
         info = crawler.settings.getbool('DUPEFILTER_INFO')
+
+        # Get job directory from crawler settings, default to './job_dir'
+        # 从爬虫设置获取作业目录,默认为'./job_dir'
         path = crawler.settings.get('JOBDIR', './job_dir')
+
+        # Create the job directory if it doesn't exist
+        # 如果作业目录不存在,则创建它
         if path and not os.path.exists(path):
             os.makedirs(path)
+
+        # Create and return a new instance
+        # 创建并返回一个新实例
         return cls(path, debug, info)

     async def request_seen(self, request: Request) -> bool:
+        """
+        Check if a request has been seen before.
+        检查请求是否已经被看到过。
+
+        This method checks if the request's fingerprint is in the set of seen
+        fingerprints. If it is, the request is considered a duplicate. If not,
+        the fingerprint is added to the set and written to the requests.seen file.
+        此方法检查请求的指纹是否在已见过的指纹集合中。如果是,则认为请求是重复的。
+        如果不是,则将指纹添加到集合中并写入requests.seen文件。
+
+        Args:
+            request: The request to check.
+                     要检查的请求。
+
+        Returns:
+            bool: True if the request has been seen before, False otherwise.
+                  如果请求之前已经被看到过,则为True,否则为False。
+        """
+        # Check if the request's fingerprint is in the set of seen fingerprints
+        # 检查请求的指纹是否在已见过的指纹集合中
         if request.fingerprint in self.fingerprints:
             return True
+
+        # Add the fingerprint to the set
+        # 将指纹添加到集合中
         self.fingerprints.add(request.fingerprint)
+
+        # If we're using a file, write the fingerprint to it
+        # 如果我们使用文件,则将指纹写入文件
         if self.file:
             self.file.write(request.fingerprint + '\n')
+
+        # The request has not been seen before
+        # 请求之前未被看到过
         return False

     async def close(self, reason: str = '') -> None:
+        """
+        Close the dupefilter.
+        关闭重复过滤器。
+
+        This method is called when the spider is closed. It closes the requests.seen
+        file if it was opened.
+        当爬虫关闭时调用此方法。如果requests.seen文件已打开,则关闭它。
+
+        Args:
+            reason: The reason why the spider was closed.
+                    爬虫被关闭的原因。
+        """
+        # Close the file if it was opened
+        # 如果文件已打开,则关闭它
         if self.file:
             self.file.close()


+# Alias for backward compatibility
+# 用于向后兼容的别名
 RFPDupeFilter = DiskRFPDupeFilter
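A hypothetical settings fragment for enabling the disk-based filter. JOBDIR, DUPEFILTER_DEBUG, and DUPEFILTER_INFO are the keys read in from_crawler above; the DUPEFILTER_CLASS key and the dotted path are assumptions based on the usual Scrapy-style convention for selecting a dupefilter.

```python
# Spider- or project-level settings (illustrative values only).
custom_settings = {
    # Assumed Scrapy-style key for choosing the dupefilter implementation.
    'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.DiskRFPDupeFilter',
    'JOBDIR': './job_dir',        # directory holding the requests.seen file
    'DUPEFILTER_DEBUG': False,    # log each filtered request at DEBUG level
    'DUPEFILTER_INFO': False,     # log each filtered request at INFO level
}
```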