aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/db/aiomongo.py CHANGED
@@ -1,3 +1,14 @@
1
+ """
2
+ MongoDB connection manager for aioscrapy.
3
+ aioscrapy的MongoDB连接管理器。
4
+
5
+ This module provides classes for managing MongoDB connections in aioscrapy.
6
+ It includes a connection manager for creating and managing MongoDB clients, and an executor
7
+ for convenient access to MongoDB collections and operations.
8
+ 此模块提供了在aioscrapy中管理MongoDB连接的类。
9
+ 它包括一个用于创建和管理MongoDB客户端的连接管理器,以及一个用于方便访问MongoDB集合和操作的执行器。
10
+ """
11
+
1
12
  from motor.motor_asyncio import AsyncIOMotorClient
2
13
  from pymongo.errors import NetworkTimeout
3
14
 
@@ -7,86 +18,357 @@ from loguru import logger
7
18
 
8
19
 
9
20
  class MongoExecutor:
21
+ """
22
+ Executor for MongoDB operations.
23
+ MongoDB操作的执行器。
24
+
25
+ This class provides a convenient way to execute MongoDB operations on a specific
26
+ MongoDB client. It offers methods for inserting data and direct access to collections.
27
+ 此类提供了一种在特定MongoDB客户端上执行MongoDB操作的便捷方式。
28
+ 它提供了插入数据和直接访问集合的方法。
29
+ """
30
+
10
31
  def __init__(self, alias: str, pool_manager: "AioMongoManager"):
32
+ """
33
+ Initialize a MongoExecutor.
34
+ 初始化MongoExecutor。
35
+
36
+ Args:
37
+ alias: The alias of the MongoDB client to use.
38
+ 要使用的MongoDB客户端的别名。
39
+ pool_manager: The MongoDB manager that manages the client.
40
+ 管理客户端的MongoDB管理器。
41
+ """
11
42
  self.alias = alias
12
43
  self.pool_manager = pool_manager
13
44
 
14
- async def insert(self, table_name, values, db_name=None, ordered=False, retry_times=3):
45
+ async def insert(self, table_name: str, values: list, db_name=None, ordered=False, retry_times=3):
46
+ """
47
+ Insert multiple documents into a MongoDB collection.
48
+ 向MongoDB集合中插入多个文档。
49
+
50
+ This method inserts multiple documents into a MongoDB collection with retry
51
+ capability in case of network timeouts.
52
+ 此方法向MongoDB集合中插入多个文档,在网络超时的情况下具有重试功能。
53
+
54
+ Args:
55
+ table_name: The name of the collection to insert into.
56
+ 要插入的集合名称。
57
+ values: A list of documents (dictionaries) to insert.
58
+ 要插入的文档(字典)列表。
59
+ db_name: The name of the database to use. If None, uses the default database.
60
+ 要使用的数据库名称。如果为None,则使用默认数据库。
61
+ ordered: If True, performs an ordered insert operation, which stops on first error.
62
+ 如果为True,执行有序插入操作,在第一个错误时停止。
63
+ retry_times: Number of times to retry in case of network timeout.
64
+ 网络超时时重试的次数。
65
+
66
+ Returns:
67
+ InsertManyResult: The result of the insert operation.
68
+ 插入操作的结果。
69
+
70
+ Raises:
71
+ NetworkTimeout: If the operation times out after all retries.
72
+ 如果操作在所有重试后超时。
73
+ """
74
+ # Get the MongoDB client and default database name
75
+ # 获取MongoDB客户端和默认数据库名称
15
76
  client, db_name_default = self.pool_manager.get_pool(self.alias)
77
+
78
+ # Use the provided database name or fall back to the default
79
+ # 使用提供的数据库名称或回退到默认值
16
80
  db_name = db_name or db_name_default
81
+
82
+ # Retry the insert operation in case of network timeout
83
+ # 在网络超时的情况下重试插入操作
17
84
  for _ in range(retry_times):
18
85
  try:
19
86
  return await client[f'{db_name}'][f'{table_name}'].insert_many(values, ordered=ordered)
20
87
  except NetworkTimeout:
21
88
  logger.warning("mongo insert error by NetworkTimeout, retrying...")
22
89
 
90
+ # If all retries fail, raise the exception
91
+ # 如果所有重试都失败,则引发异常
23
92
  raise NetworkTimeout
24
93
 
25
94
  def __getattr__(self, table_name: str):
95
+ """
96
+ Access a MongoDB collection directly as an attribute.
97
+ 直接将MongoDB集合作为属性访问。
98
+
99
+ This method allows accessing MongoDB collections using attribute syntax:
100
+ executor.users, executor.products, etc.
101
+ 此方法允许使用属性语法访问MongoDB集合:
102
+ executor.users、executor.products等。
103
+
104
+ Args:
105
+ table_name: The name of the collection to access.
106
+ 要访问的集合名称。
107
+
108
+ Returns:
109
+ Collection: The MongoDB collection object.
110
+ MongoDB集合对象。
111
+ """
112
+ # Get the MongoDB client and default database name
113
+ # 获取MongoDB客户端和默认数据库名称
26
114
  client, db_name_default = self.pool_manager.get_pool(self.alias)
115
+
116
+ # Return the collection from the default database
117
+ # 从默认数据库返回集合
27
118
  return client[f'{db_name_default}'][f'{table_name}']
28
119
 
29
120
 
30
121
  class AioMongoManager(AbsDBPoolManager):
122
+ """
123
+ Manager for MongoDB connections.
124
+ MongoDB连接的管理器。
125
+
126
+ This class manages MongoDB clients and connections. It implements the
127
+ AbsDBPoolManager interface for MongoDB connections, providing methods for
128
+ creating, accessing, and closing MongoDB clients.
129
+ 此类管理MongoDB客户端和连接。它为MongoDB连接实现了AbsDBPoolManager接口,
130
+ 提供了创建、访问和关闭MongoDB客户端的方法。
131
+ """
132
+
133
+ # Dictionary to store MongoDB clients by alias
134
+ # 按别名存储MongoDB客户端的字典
31
135
  _clients = {}
32
136
 
33
137
  async def create(self, alias: str, params: dict):
138
+ """
139
+ Create a new MongoDB client.
140
+ 创建新的MongoDB客户端。
141
+
142
+ This method creates a new MongoDB client with the given alias and parameters.
143
+ If a client with the given alias already exists, it returns the existing client.
144
+ 此方法使用给定的别名和参数创建新的MongoDB客户端。
145
+ 如果具有给定别名的客户端已经存在,则返回现有客户端。
146
+
147
+ Args:
148
+ alias: The alias for the new MongoDB client.
149
+ 新MongoDB客户端的别名。
150
+ params: The parameters for creating the MongoDB client. Can include:
151
+ 创建MongoDB客户端的参数。可以包括:
152
+ - host: MongoDB server host or connection string
153
+ MongoDB服务器主机或连接字符串
154
+ - port: MongoDB server port
155
+ MongoDB服务器端口
156
+ - db: MongoDB database name (required)
157
+ MongoDB数据库名称(必需)
158
+ - username: MongoDB username
159
+ MongoDB用户名
160
+ - password: MongoDB password
161
+ MongoDB密码
162
+ - connecttimeoutms: Connection timeout in milliseconds
163
+ 连接超时(毫秒)
164
+ - and other parameters accepted by AsyncIOMotorClient
165
+ 以及AsyncIOMotorClient接受的其他参数
166
+
167
+ Returns:
168
+ tuple: A tuple containing (client, db_name).
169
+ 包含(客户端, 数据库名称)的元组。
170
+ """
171
+ # Return existing client if it exists
172
+ # 如果客户端已存在,则返回现有客户端
34
173
  if alias in self._clients:
35
174
  return self._clients[alias]
36
175
 
176
+ # Make a copy of params to avoid modifying the original
177
+ # 复制params以避免修改原始参数
37
178
  params = params.copy()
179
+
180
+ # Extract database name
181
+ # 提取数据库名称
38
182
  db_name = params.pop('db')
183
+
184
+ # Set default connection timeout
185
+ # 设置默认连接超时
39
186
  params.setdefault('connecttimeoutms', 30)
187
+
188
+ # Create the MongoDB client
189
+ # 创建MongoDB客户端
40
190
  client = AsyncIOMotorClient(**params)
191
+
192
+ # Store and return the client with its database name
193
+ # 存储并返回客户端及其数据库名称
41
194
  return self._clients.setdefault(alias, (client, db_name))
42
195
 
43
196
  def get_pool(self, alias: str):
197
+ """
198
+ Get a MongoDB client by its alias.
199
+ 通过别名获取MongoDB客户端。
200
+
201
+ This method retrieves an existing MongoDB client with the given alias.
202
+ 此方法检索具有给定别名的现有MongoDB客户端。
203
+
204
+ Args:
205
+ alias: The alias of the MongoDB client to retrieve.
206
+ 要检索的MongoDB客户端的别名。
207
+
208
+ Returns:
209
+ tuple: A tuple containing (client, db_name), or None if not found.
210
+ 包含(客户端, 数据库名称)的元组,如果未找到则为None。
211
+ """
44
212
  return self._clients.get(alias)
45
213
 
46
214
  def executor(self, alias: str) -> MongoExecutor:
47
- """Get RedisExecutor"""
215
+ """
216
+ Get a MongoExecutor for a specific MongoDB client.
217
+ 获取特定MongoDB客户端的MongoExecutor。
218
+
219
+ This method creates a MongoExecutor that provides a convenient way to
220
+ execute operations on the MongoDB client with the given alias.
221
+ 此方法创建一个MongoExecutor,提供了一种在具有给定别名的MongoDB客户端上
222
+ 执行操作的便捷方式。
223
+
224
+ Args:
225
+ alias: The alias of the MongoDB client to use.
226
+ 要使用的MongoDB客户端的别名。
227
+
228
+ Returns:
229
+ MongoExecutor: An executor for the MongoDB client.
230
+ MongoDB客户端的执行器。
231
+ """
48
232
  return MongoExecutor(alias, self)
49
233
 
50
234
  async def close(self, alias: str):
51
- """Close mongo pool named `alias`"""
52
- client, *_ = self._clients.pop(alias, None)
53
- if client:
235
+ """
236
+ Close a specific MongoDB client.
237
+ 关闭特定的MongoDB客户端。
238
+
239
+ This method closes the MongoDB client with the given alias and removes it
240
+ from the managed clients.
241
+ 此方法关闭具有给定别名的MongoDB客户端,并将其从管理的客户端中移除。
242
+
243
+ Args:
244
+ alias: The alias of the MongoDB client to close.
245
+ 要关闭的MongoDB客户端的别名。
246
+
247
+ Returns:
248
+ None
249
+ """
250
+ # Remove the client from the managed clients
251
+ # 从管理的客户端中移除客户端
252
+ client_tuple = self._clients.pop(alias, None)
253
+
254
+ # Close the client if it exists
255
+ # 如果客户端存在,则关闭它
256
+ if client_tuple:
257
+ client, *_ = client_tuple
54
258
  client.close()
55
259
 
56
260
  async def close_all(self):
57
- """Close all clients of mongo"""
261
+ """
262
+ Close all MongoDB clients.
263
+ 关闭所有MongoDB客户端。
264
+
265
+ This method closes all MongoDB clients managed by this manager.
266
+ 此方法关闭此管理器管理的所有MongoDB客户端。
267
+
268
+ Returns:
269
+ None
270
+ """
271
+ # Create a copy of the keys to avoid modifying the dictionary during iteration
272
+ # 创建键的副本,以避免在迭代期间修改字典
58
273
  for alias in list(self._clients.keys()):
59
274
  await self.close(alias)
60
275
 
61
276
  async def from_dict(self, db_args: dict):
62
- """Create mongo with dict"""
277
+ """
278
+ Initialize MongoDB clients from a configuration dictionary.
279
+ 从配置字典初始化MongoDB客户端。
280
+
281
+ This method creates MongoDB clients based on the configuration in db_args.
282
+ 此方法根据db_args中的配置创建MongoDB客户端。
283
+
284
+ Args:
285
+ db_args: A dictionary mapping aliases to MongoDB connection parameters.
286
+ 将别名映射到MongoDB连接参数的字典。
287
+ Example:
288
+ {
289
+ 'default': {'host': 'mongodb://localhost:27017', 'db': 'mydb'},
290
+ 'analytics': {'host': 'mongodb://analytics.example.com:27017', 'db': 'analytics'}
291
+ }
292
+
293
+ Returns:
294
+ None
295
+ """
63
296
  for alias, args in db_args.items():
64
297
  await self.create(alias, args)
65
298
 
66
299
  async def from_settings(self, settings: aioscrapy.Settings):
67
- """Create mongo with settings"""
300
+ """
301
+ Initialize MongoDB clients from aioscrapy settings.
302
+ 从aioscrapy设置初始化MongoDB客户端。
303
+
304
+ This method creates MongoDB clients based on the MONGO_ARGS setting.
305
+ 此方法根据MONGO_ARGS设置创建MongoDB客户端。
306
+
307
+ The MONGO_ARGS setting should be a dictionary mapping aliases to MongoDB
308
+ connection parameters, for example:
309
+ MONGO_ARGS设置应该是一个将别名映射到MongoDB连接参数的字典,例如:
310
+
311
+ ```python
312
+ MONGO_ARGS = {
313
+ 'default': {'host': 'mongodb://localhost:27017', 'db': 'mydb'},
314
+ 'analytics': {'host': 'mongodb://analytics.example.com:27017', 'db': 'analytics'}
315
+ }
316
+ ```
317
+
318
+ Args:
319
+ settings: The aioscrapy settings object.
320
+ aioscrapy设置对象。
321
+
322
+ Returns:
323
+ None
324
+ """
68
325
  for alias, args in settings.getdict('MONGO_ARGS').items():
69
326
  await self.create(alias, args)
70
327
 
71
328
 
329
+ # Singleton instance of AioMongoManager
330
+ # AioMongoManager的单例实例
72
331
  mongo_manager = AioMongoManager()
73
332
 
333
+ # Example usage
334
+ # 示例用法
74
335
  if __name__ == '__main__':
75
336
  import asyncio
76
337
 
77
338
 
78
339
  async def test():
340
+ """
341
+ Test function demonstrating the usage of the MongoDB manager.
342
+ 演示MongoDB管理器用法的测试函数。
343
+ """
344
+ # Create a MongoDB client with alias 'default'
345
+ # 创建别名为'default'的MongoDB客户端
79
346
  await mongo_manager.create('default', {
80
347
  'host': 'mongodb://root:root@192.168.234.128:27017',
81
348
  'db': 'test',
82
349
  })
350
+
351
+ # Get a MongoDB executor for the 'default' client
352
+ # 获取'default'客户端的MongoDB执行器
83
353
  mongo = mongo_manager.executor('default')
84
- result = await mongo.insert('user', [{'name': 'zhang', 'age': 18}, {'name': 'li', 'age': 20}])
85
- # print('inserted %d docs' % (len(result.inserted_ids),))
86
354
 
355
+ # Insert documents into the 'user' collection
356
+ # 向'user'集合插入文档
357
+ inserted = await mongo.insert('user', [{'name': 'zhang', 'age': 18}, {'name': 'li', 'age': 20}])
358
+ # Uncomment to print the number of inserted documents
359
+ # 取消注释以打印插入的文档数量
360
+ # print('inserted %d docs' % (len(inserted.inserted_ids),))
361
+
362
+ # Query a document from the 'user' collection
363
+ # 从'user'集合查询文档
87
364
  document = await mongo.user.find_one({'img_url': {'$gt': 19}})
88
365
  print(document)
366
+
367
+ # Close all MongoDB clients
368
+ # 关闭所有MongoDB客户端
89
369
  await mongo_manager.close_all()
90
370
 
91
371
 
372
+ # Run the test function
373
+ # 运行测试函数
92
374
  asyncio.get_event_loop().run_until_complete(test())