aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/db/aioredis.py CHANGED
@@ -1,3 +1,14 @@
+"""
+Redis connection pool manager for aioscrapy.
+aioscrapy的Redis连接池管理器。
+
+This module provides classes for managing Redis connection pools in aioscrapy.
+It includes a pool manager for creating and managing Redis clients, and an executor
+for convenient access to Redis commands.
+此模块提供了在aioscrapy中管理Redis连接池的类。
+它包括一个用于创建和管理Redis客户端的池管理器,以及一个用于方便访问Redis命令的执行器。
+"""
+
 from redis.asyncio import BlockingConnectionPool, Redis
 
 import aioscrapy
@@ -5,90 +16,328 @@ from aioscrapy.db.absmanager import AbsDBPoolManager
 
 
 class RedisExecutor:
+    """
+    Executor for Redis commands.
+    Redis命令的执行器。
+
+    This class provides a convenient way to execute Redis commands on a specific
+    Redis client. It dynamically forwards command calls to the underlying Redis client.
+    此类提供了一种在特定Redis客户端上执行Redis命令的便捷方式。
+    它动态地将命令调用转发到底层Redis客户端。
+    """
 
     def __init__(self, alias: str, pool_manager: "AioRedisPoolManager"):
+        """
+        Initialize a RedisExecutor.
+        初始化RedisExecutor。
+
+        Args:
+            alias: The alias of the Redis client to use.
+                要使用的Redis客户端的别名。
+            pool_manager: The Redis pool manager that manages the Redis client.
+                管理Redis客户端的Redis池管理器。
+        """
         self.alias = alias
         self.pool_manager = pool_manager
 
     def __getattr__(self, command: str):
+        """
+        Dynamically forward command calls to the Redis client.
+        动态地将命令调用转发到Redis客户端。
+
+        This method allows calling Redis commands directly on the executor:
+        executor.get('key'), executor.set('key', 'value'), etc.
+        此方法允许直接在执行器上调用Redis命令:
+        executor.get('key')、executor.set('key', 'value')等。
+
+        Args:
+            command: The Redis command to execute.
+                要执行的Redis命令。
+
+        Returns:
+            The method of the Redis client corresponding to the command.
+            对应于命令的Redis客户端的方法。
+        """
         redis_pool: Redis = self.pool_manager.get_pool(self.alias)
         return getattr(redis_pool, command)
 
 
 class AioRedisPoolManager(AbsDBPoolManager):
+    """
+    Pool manager for Redis connections.
+    Redis连接的池管理器。
+
+    This class manages Redis connection pools and clients. It implements the
+    AbsDBPoolManager interface for Redis connections, providing methods for
+    creating, accessing, and closing Redis clients.
+    此类管理Redis连接池和客户端。它为Redis连接实现了AbsDBPoolManager接口,
+    提供了创建、访问和关闭Redis客户端的方法。
+    """
+
+    # Dictionary to store Redis clients by alias
+    # 按别名存储Redis客户端的字典
     _clients = {}
 
     async def create(self, alias: str, params: dict) -> Redis:
-        """Create Redis client"""
+        """
+        Create a new Redis client.
+        创建新的Redis客户端。
+
+        This method creates a new Redis client with the given alias and parameters.
+        If a client with the given alias already exists, it returns the existing client.
+        此方法使用给定的别名和参数创建新的Redis客户端。
+        如果具有给定别名的客户端已经存在,则返回现有客户端。
+
+        Args:
+            alias: The alias for the new Redis client.
+                新Redis客户端的别名。
+            params: The parameters for creating the Redis client. Can include:
+                创建Redis客户端的参数。可以包括:
+                - url: Redis connection URL (e.g., 'redis://user:password@host:port/db')
+                  Redis连接URL(例如,'redis://user:password@host:port/db')
+                - host: Redis server host
+                  Redis服务器主机
+                - port: Redis server port
+                  Redis服务器端口
+                - db: Redis database number
+                  Redis数据库编号
+                - password: Redis server password
+                  Redis服务器密码
+                - socket_connect_timeout: Connection timeout in seconds
+                  连接超时(秒)
+                - and other parameters accepted by BlockingConnectionPool
+                  以及BlockingConnectionPool接受的其他参数
+
+        Returns:
+            Redis: The created or existing Redis client.
+                创建的或现有的Redis客户端。
+        """
+        # Return existing client if it exists
+        # 如果客户端已存在,则返回现有客户端
         if alias in self._clients:
             return self._clients[alias]
 
+        # Make a copy of params to avoid modifying the original
+        # 复制params以避免修改原始参数
         params = params.copy()
+
+        # Extract URL if provided
+        # 如果提供了URL,则提取它
        url = params.pop('url', None)
+
+        # Set default connection timeout
+        # 设置默认连接超时
        params.setdefault('socket_connect_timeout', 30)
+
+        # Create connection pool from URL or parameters
+        # 从URL或参数创建连接池
        if url:
            connection_pool = BlockingConnectionPool.from_url(url, **params)
        else:
            connection_pool = BlockingConnectionPool(**params)
+
+        # Create Redis client with the connection pool
+        # 使用连接池创建Redis客户端
        redis = Redis(connection_pool=connection_pool)
+
+        # Store and return the client
+        # 存储并返回客户端
        return self._clients.setdefault(alias, redis)
 
-    def get_pool(self, alias: str):
-        """Get redis client"""
+    def get_pool(self, alias: str) -> Redis:
+        """
+        Get a Redis client by its alias.
+        通过别名获取Redis客户端。
+
+        This method retrieves an existing Redis client with the given alias.
+        此方法检索具有给定别名的现有Redis客户端。
+
+        Args:
+            alias: The alias of the Redis client to retrieve.
+                要检索的Redis客户端的别名。
+
+        Returns:
+            Redis: The Redis client with the given alias.
+                具有给定别名的Redis客户端。
+
+        Raises:
+            AssertionError: If no Redis client exists with the given alias.
+                如果不存在具有给定别名的Redis客户端。
+        """
        redis_pool: Redis = self._clients.get(alias)
        assert redis_pool is not None, f"Dont create the redis client named {alias}"
        return redis_pool
 
     def executor(self, alias: str) -> RedisExecutor:
-        """Get RedisExecutor"""
+        """
+        Get a RedisExecutor for a specific Redis client.
+        获取特定Redis客户端的RedisExecutor。
+
+        This method creates a RedisExecutor that provides a convenient way to
+        execute commands on the Redis client with the given alias.
+        此方法创建一个RedisExecutor,提供了一种在具有给定别名的Redis客户端上
+        执行命令的便捷方式。
+
+        Args:
+            alias: The alias of the Redis client to use.
+                要使用的Redis客户端的别名。
+
+        Returns:
+            RedisExecutor: An executor for the Redis client.
+                Redis客户端的执行器。
+        """
        return RedisExecutor(alias, self)
 
     async def close(self, alias: str):
-        """Close redis pool named `alias`"""
+        """
+        Close a specific Redis client.
+        关闭特定的Redis客户端。
+
+        This method closes the Redis client with the given alias and removes it
+        from the managed clients.
+        此方法关闭具有给定别名的Redis客户端,并将其从管理的客户端中移除。
+
+        Args:
+            alias: The alias of the Redis client to close.
+                要关闭的Redis客户端的别名。
+
+        Returns:
+            None
+        """
+        # Remove the client from the managed clients
+        # 从管理的客户端中移除客户端
        redis = self._clients.pop(alias, None)
+
+        # Close the client if it exists
+        # 如果客户端存在,则关闭它
        if redis:
+            # Close the Redis client
+            # 关闭Redis客户端
            await redis.close()
+
+            # Disconnect the connection pool
+            # 断开连接池
            await redis.connection_pool.disconnect()
 
     async def close_all(self):
-        """Close all clients of redis"""
+        """
+        Close all Redis clients.
+        关闭所有Redis客户端。
+
+        This method closes all Redis clients managed by this manager.
+        此方法关闭此管理器管理的所有Redis客户端。
+
+        Returns:
+            None
+        """
+        # Create a copy of the keys to avoid modifying the dictionary during iteration
+        # 创建键的副本,以避免在迭代期间修改字典
        for alias in list(self._clients.keys()):
            await self.close(alias)
 
     async def from_dict(self, db_args: dict):
-        """Create redis with dict"""
+        """
+        Initialize Redis clients from a configuration dictionary.
+        从配置字典初始化Redis客户端。
+
+        This method creates Redis clients based on the configuration in db_args.
+        此方法根据db_args中的配置创建Redis客户端。
+
+        Args:
+            db_args: A dictionary mapping aliases to Redis connection parameters.
+                将别名映射到Redis连接参数的字典。
+                Example:
+                    {
+                        'default': {'url': 'redis://localhost:6379/0'},
+                        'cache': {'host': 'cache.example.com', 'port': 6379, 'db': 1}
+                    }
+
+        Returns:
+            None
+        """
        for alias, redis_args in db_args.items():
            await self.create(alias, redis_args)
 
     async def from_settings(self, settings: aioscrapy.Settings):
-        """Create redis with settings"""
+        """
+        Initialize Redis clients from aioscrapy settings.
+        从aioscrapy设置初始化Redis客户端。
+
+        This method creates Redis clients based on the REDIS_ARGS setting.
+        此方法根据REDIS_ARGS设置创建Redis客户端。
+
+        The REDIS_ARGS setting should be a dictionary mapping aliases to Redis
+        connection parameters, for example:
+        REDIS_ARGS设置应该是一个将别名映射到Redis连接参数的字典,例如:
+
+        ```python
+        REDIS_ARGS = {
+            'default': {'url': 'redis://localhost:6379/0'},
+            'cache': {'host': 'cache.example.com', 'port': 6379, 'db': 1}
+        }
+        ```
+
+        Args:
+            settings: The aioscrapy settings object.
+                aioscrapy设置对象。
+
+        Returns:
+            None
+        """
        for alias, redis_args in settings.getdict('REDIS_ARGS').items():
            await self.create(alias, redis_args)
 
 
+# Singleton instance of AioRedisPoolManager
+# AioRedisPoolManager的单例实例
 redis_manager = AioRedisPoolManager()
 
+# Example usage
+# 示例用法
 if __name__ == '__main__':
     import asyncio
 
 
     async def test():
+        """
+        Test function demonstrating the usage of the Redis manager.
+        演示Redis管理器用法的测试函数。
+        """
+        # Create a Redis client with alias 'default'
+        # 创建别名为'default'的Redis客户端
        await redis_manager.create('default', {
            'url': 'redis://@192.168.234.128:6379/9',
        })
+
+        # Get a Redis executor for the 'default' client
+        # 获取'default'客户端的Redis执行器
        redis = redis_manager.executor('default')
+
+        # Add a value to a sorted set
+        # 向有序集合添加一个值
        print(await redis.zadd('key1', {'value': 2}))
 
+        # Use a pipeline to execute multiple commands atomically
+        # 使用管道原子地执行多个命令
        async with redis.pipeline(transaction=True) as pipe:
-            results, count = await (
+            # Get the first element and remove it in one transaction
+            # 在一个事务中获取第一个元素并删除它
+            results, _ = await (
                pipe.zrange('key1', 0, 0)
                .zremrangebyrank('key1', 0, 0)
                .execute()
            )
 
+        # Print the results
+        # 打印结果
        print(results)
+
+        # Close all Redis clients
+        # 关闭所有Redis客户端
        await redis_manager.close_all()
 
 
-    # asyncio.run(test())
-    asyncio.get_event_loop().run_until_complete(test())
+    # Run the test function
+    # 运行测试函数
+    # asyncio.run(test())  # For Python 3.7+
+    asyncio.get_event_loop().run_until_complete(test())  # For Python 3.6
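The diff above shows the full pool-manager API. Below is a minimal usage sketch (not part of the package diff) wiring it together via `from_dict`, `executor`, and `close_all`; the alias, URL, and key names are assumptions, and a reachable Redis server is required:

```python
import asyncio

from aioscrapy.db.aioredis import redis_manager


async def main():
    # Register a Redis client under the alias 'default' (URL is an assumption).
    await redis_manager.from_dict({
        'default': {'url': 'redis://localhost:6379/0'},
    })

    # The executor forwards any Redis command call to the underlying client.
    redis = redis_manager.executor('default')
    await redis.set('greeting', 'hello')
    print(await redis.get('greeting'))

    # Close every client managed by the pool manager.
    await redis_manager.close_all()


asyncio.run(main())
```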
aioscrapy/dupefilters/__init__.py CHANGED
@@ -1,3 +1,14 @@
+"""
+Duplicate Filter Base Module for AioScrapy
+AioScrapy的重复过滤器基础模块
+
+This module provides the abstract base class for duplicate filters in AioScrapy.
+Duplicate filters are used to avoid crawling the same URL multiple times by
+tracking request fingerprints.
+此模块提供了AioScrapy中重复过滤器的抽象基类。
+重复过滤器用于通过跟踪请求指纹来避免多次爬取相同的URL。
+"""
+
 from typing import Literal
 from abc import ABCMeta, abstractmethod
 
@@ -6,30 +17,104 @@ from aioscrapy.utils.log import logger
 
 
 class DupeFilterBase(metaclass=ABCMeta):
-    """Request Fingerprint duplicates filter"""
+    """
+    Abstract base class for request fingerprint duplicate filters.
+    请求指纹重复过滤器的抽象基类。
+
+    This class defines the interface that all duplicate filters must implement.
+    Duplicate filters are used to avoid crawling the same URL multiple times by
+    tracking request fingerprints.
+    此类定义了所有重复过滤器必须实现的接口。
+    重复过滤器用于通过跟踪请求指纹来避免多次爬取相同的URL。
+    """
 
     @classmethod
     @abstractmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
-        """ Get Instance of RFPDupeFilter from crawler """
+        """
+        Create a duplicate filter instance from a crawler.
+        从爬虫创建重复过滤器实例。
+
+        This is the factory method used by AioScrapy to create the dupefilter.
+        这是AioScrapy用于创建重复过滤器的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this dupefilter.
+                将使用此重复过滤器的爬虫。
+
+        Returns:
+            DupeFilterBase: A new dupefilter instance.
+                一个新的重复过滤器实例。
+        """
+        pass
 
     @abstractmethod
     async def request_seen(self, request: Request) -> bool:
-        """ Check whether fingerprint of request exists """
+        """
+        Check if a request has been seen before.
+        检查请求是否已经被看到过。
+
+        This method checks if the request's fingerprint is in the set of seen
+        fingerprints. If it is, the request is considered a duplicate.
+        此方法检查请求的指纹是否在已见过的指纹集合中。如果是,则认为请求是重复的。
+
+        Args:
+            request: The request to check.
+                要检查的请求。
+
+        Returns:
+            bool: True if the request has been seen before, False otherwise.
+                如果请求之前已经被看到过,则为True,否则为False。
+        """
+        pass
 
     @abstractmethod
     async def close(self, reason: str = '') -> None:
-        """ Delete data on close """
+        """
+        Close the dupefilter.
+        关闭过滤器。
+
+        This method is called when the spider is closed. It should clean up
+        any resources used by the dupefilter.
+        当爬虫关闭时调用此方法。它应该清理重复过滤器使用的任何资源。
+
+        Args:
+            reason: The reason why the spider was closed.
+                爬虫被关闭的原因。
+        """
+        pass
 
     def log(self, request: Request, spider: Spider):
+        """
+        Log a filtered duplicate request.
+        记录被过滤的重复请求。
+
+        This method logs information about duplicate requests based on the
+        logging settings (info, debug, logdupes). It also increments the
+        dupefilter/filtered stats counter.
+        此方法根据日志设置(info、debug、logdupes)记录有关重复请求的信息。
+        它还增加dupefilter/filtered统计计数器。
+
+        Args:
+            request: The duplicate request that was filtered.
+                被过滤的重复请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+        """
+        # Log at INFO level if info is True
+        # 如果info为True,则在INFO级别记录
        if self.info:
            logger.info("Filtered duplicate request: %(request)s" % {
                'request': request.meta.get('dupefilter_msg') or request
            })
+        # Log at DEBUG level if debug is True
+        # 如果debug为True,则在DEBUG级别记录
        elif self.debug:
            logger.debug("Filtered duplicate request: %(request)s" % {
                'request': request.meta.get('dupefilter_msg') or request
            })
+        # Log the first duplicate at DEBUG level and disable further logging
+        # 在DEBUG级别记录第一个重复项并禁用进一步的日志记录
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
@@ -37,6 +122,8 @@ class DupeFilterBase(metaclass=ABCMeta):
            logger.debug(msg % {'request': request.meta.get('dupefilter_msg') or request})
            self.logdupes = False
 
+        # Increment the dupefilter/filtered stats counter
+        # 增加dupefilter/filtered统计计数器
        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
 
     async def done(
@@ -44,4 +131,22 @@ class DupeFilterBase(metaclass=ABCMeta):
            request: Request,
            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
     ) -> None:
-        """ 根据done_type的状态 控制指纹的移除 """
+        """
+        Control the removal of fingerprints based on the done_type status.
+        根据done_type的状态控制指纹的移除。
+
+        This method can be implemented by subclasses to handle the removal of
+        fingerprints from the filter based on the status of the request processing.
+        子类可以实现此方法,以根据请求处理的状态处理从过滤器中移除指纹。
+
+        Args:
+            request: The request that has been processed.
+                已处理的请求。
+            done_type: The status of the request processing.
+                请求处理的状态。
+                Can be one of: "request_ok", "request_err", "parse_ok", "parse_err".
+                可以是以下之一:"request_ok"、"request_err"、"parse_ok"、"parse_err"。
+        """
+        # Default implementation does nothing
+        # 默认实现不执行任何操作
+        pass
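To make the abstract interface above concrete, here is a short illustrative sketch (not from the package) of an in-memory subclass; the class name MemoryDupeFilter is hypothetical, while the settings keys, the `request.fingerprint` check, and the attributes read by `log()` follow the diff above:

```python
from aioscrapy.dupefilters import DupeFilterBase


class MemoryDupeFilter(DupeFilterBase):
    """Hypothetical in-memory dupefilter, shown for illustration only."""

    def __init__(self, debug: bool = False, info: bool = False):
        self.fingerprints = set()
        self.debug = debug
        self.info = info
        self.logdupes = True  # attributes read by DupeFilterBase.log()

    @classmethod
    def from_crawler(cls, crawler):
        # Same settings keys as the built-in filters use.
        return cls(
            debug=crawler.settings.getbool('DUPEFILTER_DEBUG'),
            info=crawler.settings.getbool('DUPEFILTER_INFO'),
        )

    async def request_seen(self, request) -> bool:
        # A request is a duplicate if its fingerprint was recorded before.
        if request.fingerprint in self.fingerprints:
            return True
        self.fingerprints.add(request.fingerprint)
        return False

    async def close(self, reason: str = '') -> None:
        # Nothing is persisted, so simply drop the in-memory set.
        self.fingerprints.clear()
```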
aioscrapy/dupefilters/disk.py CHANGED
@@ -1,3 +1,14 @@
+"""
+Disk-based Request Fingerprint Duplicate Filter for AioScrapy
+AioScrapy的基于磁盘的请求指纹重复过滤器
+
+This module provides a duplicate filter that stores request fingerprints on disk,
+allowing for persistence between crawler runs. It implements the DupeFilterBase
+interface and is used to avoid crawling the same URL multiple times.
+此模块提供了一个将请求指纹存储在磁盘上的重复过滤器,允许在爬虫运行之间保持持久性。
+它实现了DupeFilterBase接口,用于避免多次爬取相同的URL。
+"""
+
 import os
 from typing import Optional, Set
 
@@ -6,14 +17,51 @@ from aioscrapy.dupefilters import DupeFilterBase
 
 
 class DiskRFPDupeFilter(DupeFilterBase):
-    """Request Fingerprint duplicates filter built with Disk storage"""
+    """
+    Request Fingerprint duplicates filter built with Disk storage.
+    基于磁盘存储构建的请求指纹重复过滤器。
+
+    This filter stores request fingerprints in memory and on disk, allowing for
+    persistence between crawler runs. It implements the DupeFilterBase interface
+    and is used to avoid crawling the same URL multiple times.
+    此过滤器将请求指纹存储在内存和磁盘上,允许在爬虫运行之间保持持久性。
+    它实现了DupeFilterBase接口,用于避免多次爬取相同的URL。
+    """
 
     def __init__(self, path: Optional[str] = None, debug: bool = False, info: bool = False):
-        self.file: Optional["File object"] = None
+        """
+        Initialize the disk-based request fingerprint filter.
+        初始化基于磁盘的请求指纹过滤器。
+
+        Args:
+            path: Directory path where to store the requests.seen file.
+                存储requests.seen文件的目录路径。
+                If None, no persistence will be used.
+                如果为None,则不会使用持久性存储。
+            debug: Whether to log filtered requests.
+                是否记录被过滤的请求。
+            info: Whether to log duplicate requests.
+                是否记录重复的请求。
+        """
+        # File handle for the requests.seen file
+        # requests.seen文件的文件句柄
+        self.file: Optional[object] = None
+
+        # Whether to log filtered requests
+        # 是否记录被过滤的请求
        self.debug = debug
+
+        # Set of request fingerprints
+        # 请求指纹的集合
        self.fingerprints: Set = set()
+
+        # Whether to log duplicate requests
+        # 是否记录重复的请求
        self.logdupes: bool = True
        self.info: bool = info
+
+        # If a path is provided, open the requests.seen file and load existing fingerprints
+        # 如果提供了路径,则打开requests.seen文件并加载现有的指纹
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
@@ -21,24 +69,98 @@ class DiskRFPDupeFilter(DupeFilterBase):
 
     @classmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+        """
+        Create a DiskRFPDupeFilter instance from a crawler.
+        从爬虫创建DiskRFPDupeFilter实例。
+
+        This is the factory method used by AioScrapy to create the dupefilter.
+        这是AioScrapy用于创建重复过滤器的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this dupefilter.
+                将使用此重复过滤器的爬虫。
+
+        Returns:
+            DiskRFPDupeFilter: A new DiskRFPDupeFilter instance.
+                一个新的DiskRFPDupeFilter实例。
+        """
+        # Get debug setting from crawler settings
+        # 从爬虫设置获取debug设置
        debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
+
+        # Get info setting from crawler settings
+        # 从爬虫设置获取info设置
        info = crawler.settings.getbool('DUPEFILTER_INFO')
+
+        # Get job directory from crawler settings, default to './job_dir'
+        # 从爬虫设置获取作业目录,默认为'./job_dir'
        path = crawler.settings.get('JOBDIR', './job_dir')
+
+        # Create the job directory if it doesn't exist
+        # 如果作业目录不存在,则创建它
        if path and not os.path.exists(path):
            os.makedirs(path)
+
+        # Create and return a new instance
+        # 创建并返回一个新实例
        return cls(path, debug, info)
 
     async def request_seen(self, request: Request) -> bool:
+        """
+        Check if a request has been seen before.
+        检查请求是否已经被看到过。
+
+        This method checks if the request's fingerprint is in the set of seen
+        fingerprints. If it is, the request is considered a duplicate. If not,
+        the fingerprint is added to the set and written to the requests.seen file.
+        此方法检查请求的指纹是否在已见过的指纹集合中。如果是,则认为请求是重复的。
+        如果不是,则将指纹添加到集合中并写入requests.seen文件。
+
+        Args:
+            request: The request to check.
+                要检查的请求。
+
+        Returns:
+            bool: True if the request has been seen before, False otherwise.
+                如果请求之前已经被看到过,则为True,否则为False。
+        """
+        # Check if the request's fingerprint is in the set of seen fingerprints
+        # 检查请求的指纹是否在已见过的指纹集合中
        if request.fingerprint in self.fingerprints:
            return True
+
+        # Add the fingerprint to the set
+        # 将指纹添加到集合中
        self.fingerprints.add(request.fingerprint)
+
+        # If we're using a file, write the fingerprint to it
+        # 如果我们使用文件,则将指纹写入文件
        if self.file:
            self.file.write(request.fingerprint + '\n')
+
+        # The request has not been seen before
+        # 请求之前未被看到过
        return False
 
     async def close(self, reason: str = '') -> None:
+        """
+        Close the dupefilter.
+        关闭重复过滤器。
+
+        This method is called when the spider is closed. It closes the requests.seen
+        file if it was opened.
+        当爬虫关闭时调用此方法。如果requests.seen文件已打开,则关闭它。
+
+        Args:
+            reason: The reason why the spider was closed.
+                爬虫被关闭的原因。
+        """
+        # Close the file if it was opened
+        # 如果文件已打开,则关闭它
        if self.file:
            self.file.close()
 
 
+# Alias for backward compatibility
+# 用于向后兼容的别名
 RFPDupeFilter = DiskRFPDupeFilter
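For illustration, a small sketch (not from the package) exercising DiskRFPDupeFilter directly; the SimpleNamespace object is a stand-in for an aioscrapy Request and provides only the fingerprint attribute the filter reads, and the ./job_dir path mirrors the default used by from_crawler:

```python
import asyncio
import os
from types import SimpleNamespace

from aioscrapy.dupefilters.disk import DiskRFPDupeFilter


async def main():
    # The filter expects the directory to exist before opening requests.seen.
    os.makedirs('./job_dir', exist_ok=True)
    dupefilter = DiskRFPDupeFilter(path='./job_dir', debug=True)

    # Stand-in for a Request; only .fingerprint is needed by the filter.
    request = SimpleNamespace(fingerprint='d41d8cd98f00b204e9800998ecf8427e')

    print(await dupefilter.request_seen(request))  # False: first sighting
    print(await dupefilter.request_seen(request))  # True: duplicate

    # Fingerprints are appended to ./job_dir/requests.seen, so a later run
    # that reuses the same path starts with them already loaded.
    await dupefilter.close('finished')


asyncio.run(main())
```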