aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,14 @@
1
+ """
2
+ Database Pipeline Utilities for AioScrapy
3
+ AioScrapy的数据库管道实用工具
4
+
5
+ This module provides base classes and utilities for implementing database pipelines
6
+ in AioScrapy. It includes SQL formatting utilities for different database types
7
+ and a base pipeline class with caching functionality.
8
+ 此模块提供了在AioScrapy中实现数据库管道的基类和实用工具。
9
+ 它包括用于不同数据库类型的SQL格式化实用工具和具有缓存功能的基本管道类。
10
+ """
11
+
1
12
  import asyncio
2
13
 
3
14
  from aioscrapy.utils.log import logger
@@ -5,19 +16,91 @@ from aioscrapy.utils.tools import create_task
5
16
 
6
17
 
7
18
  class SqlFormat:
19
+ """
20
+ SQL query formatter for different database types.
21
+ 不同数据库类型的SQL查询格式化器。
22
+
23
+ This class provides static methods to generate SQL INSERT statements
24
+ for different database types (PostgreSQL, MySQL) and different insert types
25
+ (regular insert, ignore insert, update insert).
26
+ 此类提供静态方法,用于为不同的数据库类型(PostgreSQL、MySQL)
27
+ 和不同的插入类型(常规插入、忽略插入、更新插入)生成SQL INSERT语句。
28
+ """
8
29
 
9
30
  @staticmethod
10
31
  def pg_insert(table: str, fields: list, *args) -> str:
32
+ """
33
+ Generate a PostgreSQL INSERT statement.
34
+ 生成PostgreSQL INSERT语句。
35
+
36
+ Args:
37
+ table: The table name to insert into.
38
+ 要插入的表名。
39
+ fields: List of field names to insert.
40
+ 要插入的字段名列表。
41
+ *args: Additional arguments (not used).
42
+ 额外的参数(未使用)。
43
+
44
+ Returns:
45
+ str: The formatted PostgreSQL INSERT statement.
46
+ 格式化的PostgreSQL INSERT语句。
47
+ """
11
48
  placeholder = ','.join([f'${i + 1}' for i in range(len(fields))])
12
49
  return f'''INSERT INTO {table} ({",".join(fields)}) VALUES ({placeholder})'''
13
50
 
14
51
  @staticmethod
15
52
  def pg_ignore_insert(table: str, fields: list, *args) -> str:
53
+ """
54
+ Generate a PostgreSQL INSERT statement with ON CONFLICT DO NOTHING.
55
+ 生成带有ON CONFLICT DO NOTHING的PostgreSQL INSERT语句。
56
+
57
+ This type of insert will not raise an error if a duplicate key conflict occurs.
58
+ 如果发生重复键冲突,这种类型的插入不会引发错误。
59
+
60
+ Args:
61
+ table: The table name to insert into.
62
+ 要插入的表名。
63
+ fields: List of field names to insert.
64
+ 要插入的字段名列表。
65
+ *args: Additional arguments (not used).
66
+ 额外的参数(未使用)。
67
+
68
+ Returns:
69
+ str: The formatted PostgreSQL INSERT statement with conflict handling.
70
+ 带有冲突处理的格式化PostgreSQL INSERT语句。
71
+ """
16
72
  placeholder = ','.join([f'${i + 1}' for i in range(len(fields))])
17
73
  return f'''INSERT INTO {table} ({",".join(fields)}) VALUES ({placeholder}) ON CONFLICT DO NOTHING'''
18
74
 
19
75
  @staticmethod
20
76
  def pg_update_insert(table: str, fields: list, update_fields: list, on_conflict: str, *args) -> str:
77
+ """
78
+ Generate a PostgreSQL UPSERT statement (INSERT with ON CONFLICT UPDATE).
79
+ 生成PostgreSQL UPSERT语句(带有ON CONFLICT UPDATE的INSERT)。
80
+
81
+ This type of insert will update existing rows if a conflict occurs.
82
+ 如果发生冲突,这种类型的插入将更新现有行。
83
+
84
+ Args:
85
+ table: The table name to insert into.
86
+ 要插入的表名。
87
+ fields: List of field names to insert.
88
+ 要插入的字段名列表。
89
+ update_fields: List of fields to update on conflict. If empty, all fields will be updated.
90
+ 冲突时要更新的字段列表。如果为空,将更新所有字段。
91
+ on_conflict: The field name(s) that determine the conflict.
92
+ 确定冲突的字段名称。
93
+ *args: Additional arguments (not used).
94
+ 额外的参数(未使用)。
95
+
96
+ Returns:
97
+ str: The formatted PostgreSQL UPSERT statement.
98
+ 格式化的PostgreSQL UPSERT语句。
99
+
100
+ Raises:
101
+ AssertionError: If on_conflict is None.
102
+ 如果on_conflict为None。
103
+ """
21
104
  assert on_conflict is not None, "on_conflict must be str, eg: 'id'"
22
105
  placeholder = ','.join([f'${i + 1}' for i in range(len(fields))])
23
106
  if not update_fields:
@@ -27,18 +110,74 @@ class SqlFormat:
27
110
 
28
111
  @staticmethod
29
112
  def mysql_insert(table: str, fields: list, *args) -> str:
113
+ """
114
+ Generate a MySQL INSERT statement.
115
+ 生成MySQL INSERT语句。
116
+
117
+ Args:
118
+ table: The table name to insert into.
119
+ 要插入的表名。
120
+ fields: List of field names to insert.
121
+ 要插入的字段名列表。
122
+ *args: Additional arguments (not used).
123
+ 额外的参数(未使用)。
124
+
125
+ Returns:
126
+ str: The formatted MySQL INSERT statement.
127
+ 格式化的MySQL INSERT语句。
128
+ """
30
129
  placeholder = ','.join(['%s'] * len(fields))
31
130
  fields = ','.join(fields)
32
131
  return f'''INSERT INTO {table} ({fields}) VALUES ({placeholder})'''
33
132
 
34
133
  @staticmethod
35
134
  def mysql_ignore_insert(table: str, fields: list, *args) -> str:
135
+ """
136
+ Generate a MySQL INSERT IGNORE statement.
137
+ 生成MySQL INSERT IGNORE语句。
138
+
139
+ This type of insert will not raise an error if a duplicate key conflict occurs.
140
+ 如果发生重复键冲突,这种类型的插入不会引发错误。
141
+
142
+ Args:
143
+ table: The table name to insert into.
144
+ 要插入的表名。
145
+ fields: List of field names to insert.
146
+ 要插入的字段名列表。
147
+ *args: Additional arguments (not used).
148
+ 额外的参数(未使用)。
149
+
150
+ Returns:
151
+ str: The formatted MySQL INSERT IGNORE statement.
152
+ 格式化的MySQL INSERT IGNORE语句。
153
+ """
36
154
  placeholder = ','.join(['%s'] * len(fields))
37
155
  fields = ','.join(fields)
38
156
  return f'''INSERT IGNORE INTO {table} ({fields}) VALUES ({placeholder})'''
39
157
 
40
158
  @staticmethod
41
159
  def mysql_update_insert(table: str, fields: list, update_fields: list, *args) -> str:
160
+ """
161
+ Generate a MySQL INSERT ... ON DUPLICATE KEY UPDATE statement.
162
+ 生成MySQL INSERT ... ON DUPLICATE KEY UPDATE语句。
163
+
164
+ This type of insert will update existing rows if a duplicate key conflict occurs.
165
+ 如果发生重复键冲突,这种类型的插入将更新现有行。
166
+
167
+ Args:
168
+ table: The table name to insert into.
169
+ 要插入的表名。
170
+ fields: List of field names to insert.
171
+ 要插入的字段名列表。
172
+ update_fields: List of fields to update on duplicate key. If empty, all fields will be updated.
173
+ 重复键时要更新的字段列表。如果为空,将更新所有字段。
174
+ *args: Additional arguments (not used).
175
+ 额外的参数(未使用)。
176
+
177
+ Returns:
178
+ str: The formatted MySQL INSERT ... ON DUPLICATE KEY UPDATE statement.
179
+ 格式化的MySQL INSERT ... ON DUPLICATE KEY UPDATE语句。
180
+ """
42
181
  placeholder = ','.join(['%s'] * len(fields))
43
182
  if not update_fields:
44
183
  update_fields = fields
@@ -47,53 +186,177 @@ class SqlFormat:
47
186
  return f'INSERT INTO {table} ({fields}) VALUES ({placeholder}) ON DUPLICATE KEY UPDATE {update_fields}'
48
187
 
49
188
  def __call__(self, *args, db_type='mysql', insert_type='insert'):
189
+ """
190
+ Call the appropriate SQL formatting method based on database type and insert type.
191
+ 根据数据库类型和插入类型调用适当的SQL格式化方法。
192
+
193
+ This method makes the SqlFormat instance callable, allowing it to be used as a function.
194
+ 此方法使SqlFormat实例可调用,允许将其用作函数。
195
+
196
+ Args:
197
+ *args: Arguments to pass to the SQL formatting method.
198
+ 传递给SQL格式化方法的参数。
199
+ db_type: The database type ('mysql' or 'pg').
200
+ 数据库类型('mysql'或'pg')。
201
+ Defaults to 'mysql'.
202
+ 默认为'mysql'。
203
+ insert_type: The insert type ('insert', 'ignore_insert', or 'update_insert').
204
+ 插入类型('insert'、'ignore_insert'或'update_insert')。
205
+ Defaults to 'insert'.
206
+ 默认为'insert'。
207
+
208
+ Returns:
209
+ str: The formatted SQL statement.
210
+ 格式化的SQL语句。
211
+
212
+ Raises:
213
+ Exception: If the requested database type and insert type combination is not supported.
214
+ 如果不支持请求的数据库类型和插入类型组合。
215
+ """
50
216
  if getattr(self, f'{db_type}_{insert_type}'):
51
217
  return getattr(self, f'{db_type}_{insert_type}')(*args)
52
218
  raise Exception(f"This write type is not supported: {db_type}_{insert_type}")
53
219
 
54
220
 
221
+ # Shared module-level SqlFormat instance; call get_sql(*args, db_type=..., insert_type=...) to build a statement
55
222
  get_sql = SqlFormat()
56
223
 
57
224
 
58
225
  class ItemCacheMixin:
226
+ """
227
+ Mixin class for caching items before database insertion.
228
+ 用于在数据库插入前缓存项目的混入类。
229
+
230
+ This class provides functionality to cache items and their metadata
231
+ before batch insertion into a database. It helps optimize database
232
+ operations by reducing the number of database calls.
233
+ 此类提供了在批量插入数据库之前缓存项目及其元数据的功能。
234
+ 它通过减少数据库调用次数来帮助优化数据库操作。
235
+ """
236
+
59
237
  def __init__(self, db_type: str):
238
+ """
239
+ Initialize the ItemCacheMixin.
240
+ 初始化ItemCacheMixin。
241
+
242
+ Args:
243
+ db_type: The database type (e.g., 'mysql', 'pg', 'mongo').
244
+ 数据库类型(例如'mysql'、'pg'、'mongo')。
245
+ """
246
+ # The database type
247
+ # 数据库类型
60
248
  self.db_type = db_type
249
+
250
+ # Dictionary to cache items by cache key
251
+ # 按缓存键缓存项目的字典
61
252
  self.item_cache = {}
253
+
254
+ # Dictionary to cache field lists by cache key
255
+ # 按缓存键缓存字段列表的字典
62
256
  self.fields_cache = {}
257
+
258
+ # Dictionary to cache table names by cache key
259
+ # 按缓存键缓存表名的字典
63
260
  self.table_cache = {}
261
+
262
+ # Dictionary to cache SQL insert statements by cache key
263
+ # 按缓存键缓存SQL插入语句的字典
64
264
  self.insert_sql_cache = {}
265
+
266
+ # Dictionary to cache database aliases by cache key
267
+ # 按缓存键缓存数据库别名的字典
65
268
  self.db_alias_cache = {}
66
269
 
67
- def parse_item_to_cache(self, item: dict, save_info):
270
+ def parse_item_to_cache(self, item: dict, save_info: dict):
271
+ """
272
+ Parse an item and add it to the cache.
273
+ 解析项目并将其添加到缓存中。
274
+
275
+ This method extracts information from the save_info dictionary,
276
+ generates a cache key, and adds the item to the appropriate cache.
277
+ 此方法从save_info字典中提取信息,生成缓存键,并将项目添加到适当的缓存中。
278
+
279
+ Args:
280
+ item: The item to cache.
281
+ 要缓存的项目。
282
+ save_info: Dictionary containing information about how to save the item.
283
+ 包含有关如何保存项目的信息的字典。
284
+ Must contain 'table_name' and may contain 'insert_type',
285
+ 'update_fields', 'db_alias', and 'on_conflict'.
286
+ 必须包含'table_name',可能包含'insert_type'、
287
+ 'update_fields'、'db_alias'和'on_conflict'。
288
+
289
+ Returns:
290
+ tuple: A tuple containing the cache key and the number of items in the cache.
291
+ 包含缓存键和缓存中项目数量的元组。
292
+
293
+ Raises:
294
+ AssertionError: If table_name is not provided in save_info.
295
+ 如果在save_info中未提供table_name。
296
+ """
297
+ # Extract information from save_info
298
+ # 从save_info中提取信息
68
299
  table_name = save_info.get('table_name')
69
300
  assert table_name is not None, 'Missing table_name'
70
301
  insert_type = save_info.get('insert_type', 'insert')
71
302
  update_fields = save_info.get('update_fields', [])
72
303
  db_alias = save_info.get('db_alias', ['default'])
73
304
  on_conflict = save_info.get('on_conflict')
305
+
306
+ # Convert string db_alias to list
307
+ # 将字符串db_alias转换为列表
74
308
  if isinstance(db_alias, str):
75
309
  db_alias = [db_alias]
76
310
 
311
+ # Generate a unique cache key based on the item and save_info
312
+ # 根据项目和save_info生成唯一的缓存键
77
313
  fields = list(item.keys())
78
314
  cache_key = ''.join(fields + update_fields + db_alias) + insert_type + table_name + (on_conflict or '')
79
315
 
316
+ # If this is a new cache key, initialize the caches
317
+ # 如果这是一个新的缓存键,初始化缓存
80
318
  if self.fields_cache.get(cache_key) is None:
81
319
  self.db_alias_cache[cache_key] = db_alias
82
320
  self.table_cache[cache_key] = table_name
83
321
  self.fields_cache[cache_key] = fields
84
322
  self.item_cache[cache_key] = []
323
+
324
+ # Generate the SQL insert statement
325
+ # 生成SQL插入语句
85
326
  self.insert_sql_cache[cache_key] = get_sql(
86
327
  table_name, fields, update_fields, on_conflict,
87
328
  db_type=self.db_type,
88
329
  insert_type=insert_type,
89
330
  )
90
331
 
332
+ # Add the item values to the cache
333
+ # 将项目值添加到缓存
91
334
  self.item_cache[cache_key].append([item[field] for field in self.fields_cache[cache_key]])
335
+
336
+ # Return the cache key and the number of items in the cache
337
+ # 返回缓存键和缓存中的项目数量
92
338
  return cache_key, len(self.item_cache[cache_key])
93
339
 
94
340
 
95
341
  class DBPipelineBase(ItemCacheMixin):
342
+ """
343
+ Base class for database pipelines.
344
+ 数据库管道的基类。
345
+
346
+ This class provides common functionality for database pipelines, including
347
+ caching items and periodically saving them to the database.
348
+ 此类为数据库管道提供通用功能,包括缓存项目并定期将其保存到数据库。
349
+ """
350
+
96
351
  def __init__(self, settings, db_type: str):
352
+ """
353
+ Initialize the database pipeline.
354
+ 初始化数据库管道。
355
+
356
+ Args:
357
+ settings: The settings object. 设置对象。
358
+ db_type: The database type (e.g., 'mysql', 'pg', 'mongo'). 数据库类型(例如'mysql'、'pg'、'mongo')。
359
+ """
97
360
  super().__init__(db_type)
98
361
  self.cache_num = settings.getint('SAVE_CACHE_NUM', 500)
99
362
  self.save_cache_interval = settings.getint('SAVE_CACHE_INTERVAL', 10)
@@ -102,14 +365,46 @@ class DBPipelineBase(ItemCacheMixin):
102
365
  self.item_save_key: str = f'__{db_type}__'
103
366
 
104
367
  async def open_spider(self, spider):
368
+ """
369
+ Called when the spider is opened.
370
+ 当爬虫打开时调用。
371
+
372
+ This method starts the save heartbeat task.
373
+ 此方法启动保存心跳任务。
374
+
375
+ Args:
376
+ spider: The spider instance. 爬虫实例。
377
+ """
105
378
  create_task(self.save_heartbeat())
106
379
 
107
380
  async def save_heartbeat(self):
381
+ """
382
+ Periodically save cached items to the database.
383
+ 定期将缓存的项目保存到数据库。
384
+
385
+ This method runs in the background and saves cached items
386
+ every `save_cache_interval` seconds.
387
+ 此方法在后台运行,每隔`save_cache_interval`秒保存一次缓存的项目。
388
+ """
108
389
  while self.running:
109
390
  await asyncio.sleep(self.save_cache_interval)
110
391
  create_task(self.save_all())
111
392
 
112
393
  async def process_item(self, item, spider):
394
+ """
395
+ Process an item.
396
+ 处理一个项目。
397
+
398
+ This method is called for every item pipeline component.
399
+ 此方法对每个项目管道组件调用。
400
+
401
+ Args:
402
+ item: The item to process. 要处理的项目。
403
+ spider: The spider instance. 爬虫实例。
404
+
405
+ Returns:
406
+ The processed item. 处理后的项目。
407
+ """
113
408
  save_info = item.pop(self.item_save_key, None)
114
409
  if save_info is None:
115
410
  logger.warning(f"item Missing key {self.item_save_key}, not stored")
@@ -119,19 +414,67 @@ class DBPipelineBase(ItemCacheMixin):
119
414
  return item
120
415
 
121
416
  async def close_spider(self, spider):
417
+ """
418
+ Called when the spider is closed.
419
+ 当爬虫关闭时调用。
420
+
421
+ This method stops the save heartbeat task and saves all remaining items.
422
+ 此方法停止保存心跳任务并保存所有剩余项目。
423
+
424
+ Args:
425
+ spider: The spider instance. 爬虫实例。
426
+ """
122
427
  self.running = False
123
428
  await self.save_all()
124
429
 
125
430
  async def save_all(self):
431
+ """
432
+ Save all cached items to the database.
433
+ 将所有缓存的项目保存到数据库。
434
+
435
+ This method is called periodically by the save heartbeat task
436
+ and when the spider is closed.
437
+ 此方法由保存心跳任务定期调用,并在爬虫关闭时调用。
438
+ """
126
439
  async with self.lock:
127
440
  for cache_key, items in self.item_cache.items():
128
441
  items and await self._save(cache_key)
129
442
 
130
443
  async def save_item(self, item: dict, save_info: dict):
444
+ """
445
+ Save an item to the cache and possibly to the database.
446
+ 将项目保存到缓存,可能还会保存到数据库。
447
+
448
+ If the cache reaches the configured size, all cached items are saved to the database.
449
+ 如果缓存达到配置的大小,所有缓存的项目都会保存到数据库。
450
+
451
+ Args:
452
+ item: The item to save. 要保存的项目。
453
+ save_info: Information about how to save the item. 有关如何保存项目的信息。
454
+ """
131
455
  async with self.lock:
132
456
  cache_key, cache_count = self.parse_item_to_cache(item, save_info)
133
457
  if cache_count >= self.cache_num:
134
458
  await self._save(cache_key)
135
459
 
136
460
  async def _save(self, cache_key):
137
- raise NotImplementedError
461
+ """
462
+ Save cached items with the given cache key to the database.
463
+ 将具有给定缓存键的缓存项目保存到数据库。
464
+
465
+ This is an abstract method that must be implemented by subclasses.
466
+ It should retrieve the cached items using the cache_key, execute the
467
+ appropriate database operation, and then clear the cache.
468
+ 这是一个必须由子类实现的抽象方法。
469
+ 它应该使用cache_key检索缓存的项目,执行适当的数据库操作,然后清除缓存。
470
+
471
+ Args:
472
+ cache_key: The cache key used to retrieve the cached items, SQL statement,
473
+ and other metadata needed for the database operation.
474
+ 用于检索缓存项目、SQL语句和数据库操作所需的其他元数据的缓存键。
475
+
476
+ Raises:
477
+ NotImplementedError: This method must be implemented by subclasses.
478
+ 此方法必须由子类实现。
479
+ """
480
+ raise NotImplementedError("Subclasses must implement the _save method")