aio-scrapy 2.1.4 (py3-none-any.whl) → 2.1.7 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/db/aiopg.py CHANGED
@@ -1,3 +1,14 @@
1
+ """
2
+ PostgreSQL connection pool manager for aioscrapy.
3
+ aioscrapy的PostgreSQL连接池管理器。
4
+
5
+ This module provides classes for managing PostgreSQL connection pools in aioscrapy.
6
+ It includes a pool manager for creating and managing PostgreSQL connections, and an executor
7
+ for convenient execution of SQL queries.
8
+ 此模块提供了在aioscrapy中管理PostgreSQL连接池的类。
9
+ 它包括一个用于创建和管理PostgreSQL连接的池管理器,以及一个用于方便执行SQL查询的执行器。
10
+ """
11
+
1
12
  from contextlib import asynccontextmanager
2
13
 
3
14
  from asyncpg.pool import create_pool
@@ -7,75 +18,361 @@ from aioscrapy.db.absmanager import AbsDBPoolManager
7
18
 
8
19
 
9
20
  class PGExecutor:
21
+ """
22
+ Executor for PostgreSQL queries.
23
+ PostgreSQL查询的执行器。
24
+
25
+ This class provides a convenient way to execute SQL queries on a specific
26
+ PostgreSQL connection pool. It offers methods for inserting data, fetching results,
27
+ and executing queries.
28
+ 此类提供了一种在特定PostgreSQL连接池上执行SQL查询的便捷方式。
29
+ 它提供了插入数据、获取结果和执行查询的方法。
30
+ """
31
+
10
32
  def __init__(self, alias: str, pool_manager: "AioPGPoolManager"):
33
+ """
34
+ Initialize a PGExecutor.
35
+ 初始化PGExecutor。
36
+
37
+ Args:
38
+ alias: The alias of the PostgreSQL connection pool to use.
39
+ 要使用的PostgreSQL连接池的别名。
40
+ pool_manager: The PostgreSQL pool manager that manages the connection pool.
41
+ 管理连接池的PostgreSQL池管理器。
42
+ """
11
43
  self.alias = alias
12
44
  self.pool_manager = pool_manager
13
45
 
14
- async def insert(self, sql, value):
46
+ async def insert(self, sql: str, value: list):
47
+ """
48
+ Insert multiple rows into a PostgreSQL table.
49
+ 向PostgreSQL表中插入多行数据。
50
+
51
+ This method executes an INSERT statement with multiple sets of values.
52
+ It automatically handles transactions, rolling back on failure.
53
+ 此方法执行带有多组值的INSERT语句。
54
+ 它自动处理事务,在失败时回滚。
55
+
56
+ Args:
57
+ sql: The SQL INSERT statement with placeholders.
58
+ 带有占位符的SQL INSERT语句。
59
+ value: A list of tuples or lists, each containing values for one row.
60
+ 元组或列表的列表,每个包含一行的值。
61
+
62
+ Returns:
63
+ The result of the insert operation.
64
+ 插入操作的结果。
65
+
66
+ Raises:
67
+ Exception: If the query fails.
68
+ 如果查询失败。
69
+ """
15
70
  async with self.pool_manager.get(self.alias) as connect:
16
71
  try:
72
+ # Execute the query with multiple sets of values
73
+ # 使用多组值执行查询
17
74
  result = await connect.executemany(sql, value)
18
75
  return result
19
76
  except Exception as e:
77
+ # Roll back the transaction on error
78
+ # 出错时回滚事务
20
79
  await connect.rollback()
21
80
  raise Exception from e
22
81
 
23
82
  async def fetch(self, sql: str):
83
+ """
84
+ Execute a SQL query and fetch all results.
85
+ 执行SQL查询并获取所有结果。
86
+
87
+ This method executes a SQL query and returns all rows from the result.
88
+ 此方法执行SQL查询并返回结果中的所有行。
89
+
90
+ Args:
91
+ sql: The SQL query to execute.
92
+ 要执行的SQL查询。
93
+
94
+ Returns:
95
+ list: A list of records containing the query results.
96
+ 包含查询结果的记录列表。
97
+ """
24
98
  async with self.pool_manager.get(self.alias) as connect:
99
+ # Execute the query and fetch all results
100
+ # 执行查询并获取所有结果
25
101
  return await connect.fetch(sql)
26
102
 
27
103
  async def query(self, sql: str):
104
+ """
105
+ Alias for fetch method.
106
+ fetch方法的别名。
107
+
108
+ This method is a convenience alias for the fetch method.
109
+ 此方法是fetch方法的便捷别名。
110
+
111
+ Args:
112
+ sql: The SQL query to execute.
113
+ 要执行的SQL查询。
114
+
115
+ Returns:
116
+ list: A list of records containing the query results.
117
+ 包含查询结果的记录列表。
118
+ """
28
119
  return await self.fetch(sql)
29
120
 
30
121
 
31
122
  class AioPGPoolManager(AbsDBPoolManager):
123
+ """
124
+ Pool manager for PostgreSQL connections.
125
+ PostgreSQL连接的池管理器。
126
+
127
+ This class manages PostgreSQL connection pools. It implements the
128
+ AbsDBPoolManager interface for PostgreSQL connections, providing methods for
129
+ creating, accessing, and closing PostgreSQL connection pools.
130
+ 此类管理PostgreSQL连接池。它为PostgreSQL连接实现了AbsDBPoolManager接口,
131
+ 提供了创建、访问和关闭PostgreSQL连接池的方法。
132
+ """
133
+
134
+ # Dictionary to store PostgreSQL connection pools by alias
135
+ # 按别名存储PostgreSQL连接池的字典
32
136
  _clients = {}
33
137
 
34
138
  async def create(self, alias: str, params: dict):
139
+ """
140
+ Create a new PostgreSQL connection pool.
141
+ 创建新的PostgreSQL连接池。
142
+
143
+ This method creates a new PostgreSQL connection pool with the given alias and parameters.
144
+ If a pool with the given alias already exists, it returns the existing pool.
145
+ 此方法使用给定的别名和参数创建新的PostgreSQL连接池。
146
+ 如果具有给定别名的池已经存在,则返回现有池。
147
+
148
+ Args:
149
+ alias: The alias for the new PostgreSQL connection pool.
150
+ 新PostgreSQL连接池的别名。
151
+ params: The parameters for creating the PostgreSQL connection pool. Can include:
152
+ 创建PostgreSQL连接池的参数。可以包括:
153
+ - host: PostgreSQL server host
154
+ PostgreSQL服务器主机
155
+ - port: PostgreSQL server port
156
+ PostgreSQL服务器端口
157
+ - user: PostgreSQL username
158
+ PostgreSQL用户名
159
+ - password: PostgreSQL password
160
+ PostgreSQL密码
161
+ - database: PostgreSQL database name
162
+ PostgreSQL数据库名称
163
+ - timeout: Connection timeout in seconds
164
+ 连接超时(秒)
165
+ - and other parameters accepted by asyncpg.create_pool
166
+ 以及asyncpg.create_pool接受的其他参数
167
+
168
+ Returns:
169
+ Pool: The created or existing PostgreSQL connection pool.
170
+ 创建的或现有的PostgreSQL连接池。
171
+ """
172
+ # Return existing pool if it exists
173
+ # 如果池已存在,则返回现有池
35
174
  if alias in self._clients:
36
175
  return self._clients[alias]
37
176
 
177
+ # Make a copy of params to avoid modifying the original
178
+ # 复制params以避免修改原始参数
38
179
  params = params.copy()
180
+
181
+ # Set default connection timeout
182
+ # 设置默认连接超时
39
183
  params.setdefault('timeout', 30)
184
+
185
+ # Create the PostgreSQL connection pool
186
+ # 创建PostgreSQL连接池
40
187
  pg_pool = await create_pool(**params)
188
+
189
+ # Store and return the pool
190
+ # 存储并返回池
41
191
  return self._clients.setdefault(alias, pg_pool)
42
192
 
43
193
  def get_pool(self, alias: str):
194
+ """
195
+ Get a PostgreSQL connection pool by its alias.
196
+ 通过别名获取PostgreSQL连接池。
197
+
198
+ This method retrieves an existing PostgreSQL connection pool with the given alias.
199
+ 此方法检索具有给定别名的现有PostgreSQL连接池。
200
+
201
+ Args:
202
+ alias: The alias of the PostgreSQL connection pool to retrieve.
203
+ 要检索的PostgreSQL连接池的别名。
204
+
205
+ Returns:
206
+ Pool: The PostgreSQL connection pool with the given alias.
207
+ 具有给定别名的PostgreSQL连接池。
208
+
209
+ Raises:
210
+ AssertionError: If no PostgreSQL connection pool exists with the given alias.
211
+ 如果不存在具有给定别名的PostgreSQL连接池。
212
+ """
44
213
  pg_pool = self._clients.get(alias)
45
214
  assert pg_pool is not None, f"Dont create the PG pool named {alias}"
46
215
  return pg_pool
47
216
 
48
217
  @asynccontextmanager
49
218
  async def get(self, alias: str):
50
- """ Get connection of pg """
219
+ """
220
+ Get a PostgreSQL connection as an async context manager.
221
+ 获取PostgreSQL连接作为异步上下文管理器。
222
+
223
+ This method provides a convenient way to acquire a connection
224
+ from a PostgreSQL connection pool, and automatically release it when the
225
+ context is exited.
226
+ 此方法提供了一种从PostgreSQL连接池获取连接的便捷方式,
227
+ 并在退出上下文时自动释放它。
228
+
229
+ Example:
230
+ ```python
231
+ async with pg_manager.get('default') as conn:
232
+ result = await conn.fetch('SELECT * FROM users')
233
+ ```
234
+
235
+ Args:
236
+ alias: The alias of the PostgreSQL connection pool to use.
237
+ 要使用的PostgreSQL连接池的别名。
238
+
239
+ Yields:
240
+ Connection: A PostgreSQL connection.
241
+ PostgreSQL连接。
242
+ """
243
+ # Get the PostgreSQL connection pool
244
+ # 获取PostgreSQL连接池
51
245
  pg_pool = self.get_pool(alias)
246
+
247
+ # Acquire a connection from the pool
248
+ # 从池中获取连接
52
249
  conn = await pg_pool.acquire()
250
+
53
251
  try:
252
+ # Yield the connection to the caller
253
+ # 将连接传递给调用者
54
254
  yield conn
55
255
  finally:
256
+ # Always release the connection back to the pool
257
+ # 始终将连接释放回池
56
258
  await pg_pool.release(conn)
57
259
 
58
260
  def executor(self, alias: str) -> PGExecutor:
261
+ """
262
+ Get a PGExecutor for a specific PostgreSQL connection pool.
263
+ 获取特定PostgreSQL连接池的PGExecutor。
264
+
265
+ This method creates a PGExecutor that provides a convenient way to
266
+ execute SQL queries on the PostgreSQL connection pool with the given alias.
267
+ 此方法创建一个PGExecutor,提供了一种在具有给定别名的PostgreSQL连接池上
268
+ 执行SQL查询的便捷方式。
269
+
270
+ Args:
271
+ alias: The alias of the PostgreSQL connection pool to use.
272
+ 要使用的PostgreSQL连接池的别名。
273
+
274
+ Returns:
275
+ PGExecutor: An executor for the PostgreSQL connection pool.
276
+ PostgreSQL连接池的执行器。
277
+ """
59
278
  return PGExecutor(alias, self)
60
279
 
61
280
  async def close(self, alias: str):
281
+ """
282
+ Close a specific PostgreSQL connection pool.
283
+ 关闭特定的PostgreSQL连接池。
284
+
285
+ This method closes the PostgreSQL connection pool with the given alias and removes it
286
+ from the managed pools.
287
+ 此方法关闭具有给定别名的PostgreSQL连接池,并将其从管理的池中移除。
288
+
289
+ Args:
290
+ alias: The alias of the PostgreSQL connection pool to close.
291
+ 要关闭的PostgreSQL连接池的别名。
292
+
293
+ Returns:
294
+ None
295
+ """
296
+ # Remove the pool from the managed pools
297
+ # 从管理的池中移除池
62
298
  pg_pool = self._clients.pop(alias, None)
299
+
300
+ # Close the pool if it exists
301
+ # 如果池存在,则关闭它
63
302
  if pg_pool:
64
303
  await pg_pool.close()
65
304
 
66
305
  async def close_all(self):
306
+ """
307
+ Close all PostgreSQL connection pools.
308
+ 关闭所有PostgreSQL连接池。
309
+
310
+ This method closes all PostgreSQL connection pools managed by this manager.
311
+ 此方法关闭此管理器管理的所有PostgreSQL连接池。
312
+
313
+ Returns:
314
+ None
315
+ """
316
+ # Create a copy of the keys to avoid modifying the dictionary during iteration
317
+ # 创建键的副本,以避免在迭代期间修改字典
67
318
  for alias in list(self._clients.keys()):
68
319
  await self.close(alias)
69
320
 
70
321
  async def from_dict(self, db_args: dict):
322
+ """
323
+ Initialize PostgreSQL connection pools from a configuration dictionary.
324
+ 从配置字典初始化PostgreSQL连接池。
325
+
326
+ This method creates PostgreSQL connection pools based on the configuration in db_args.
327
+ 此方法根据db_args中的配置创建PostgreSQL连接池。
328
+
329
+ Args:
330
+ db_args: A dictionary mapping aliases to PostgreSQL connection parameters.
331
+ 将别名映射到PostgreSQL连接参数的字典。
332
+ Example:
333
+ {
334
+ 'default': {'host': 'localhost', 'user': 'postgres', 'password': 'password', 'database': 'mydb'},
335
+ 'analytics': {'host': 'analytics.example.com', 'user': 'analyst', 'password': 'password', 'database': 'analytics'}
336
+ }
337
+
338
+ Returns:
339
+ None
340
+ """
71
341
  for alias, pg_args in db_args.items():
72
342
  await self.create(alias, pg_args)
73
343
 
74
344
  async def from_settings(self, settings: aioscrapy.Settings):
345
+ """
346
+ Initialize PostgreSQL connection pools from aioscrapy settings.
347
+ 从aioscrapy设置初始化PostgreSQL连接池。
348
+
349
+ This method creates PostgreSQL connection pools based on the PG_ARGS setting.
350
+ 此方法根据PG_ARGS设置创建PostgreSQL连接池。
351
+
352
+ The PG_ARGS setting should be a dictionary mapping aliases to PostgreSQL
353
+ connection parameters, for example:
354
+ PG_ARGS设置应该是一个将别名映射到PostgreSQL连接参数的字典,例如:
355
+
356
+ ```python
357
+ PG_ARGS = {
358
+ 'default': {'host': 'localhost', 'user': 'postgres', 'password': 'password', 'database': 'mydb'},
359
+ 'analytics': {'host': 'analytics.example.com', 'user': 'analyst', 'password': 'password', 'database': 'analytics'}
360
+ }
361
+ ```
362
+
363
+ Args:
364
+ settings: The aioscrapy settings object.
365
+ aioscrapy设置对象。
366
+
367
+ Returns:
368
+ None
369
+ """
75
370
  for alias, pg_args in settings.getdict('PG_ARGS').items():
76
371
  await self.create(alias, pg_args)
77
372
 
78
373
 
374
+ # Singleton instance of AioPGPoolManager
375
+ # AioPGPoolManager的单例实例
79
376
  pg_manager = AioPGPoolManager()
80
377
 
81
378
  if __name__ == '__main__':