crawlo 1.4.5__py3-none-any.whl → 1.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of crawlo might be problematic.
Files changed (44)
  1. crawlo/__version__.py +1 -1
  2. crawlo/downloader/cffi_downloader.py +3 -1
  3. crawlo/middleware/proxy.py +171 -348
  4. crawlo/pipelines/mysql_pipeline.py +339 -188
  5. crawlo/settings/default_settings.py +38 -30
  6. crawlo/stats_collector.py +10 -1
  7. crawlo/templates/project/settings.py.tmpl +10 -55
  8. crawlo/templates/project/settings_distributed.py.tmpl +20 -22
  9. crawlo/templates/project/settings_gentle.py.tmpl +5 -0
  10. crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
  11. crawlo/templates/project/settings_minimal.py.tmpl +25 -1
  12. crawlo/templates/project/settings_simple.py.tmpl +5 -0
  13. crawlo/templates/run.py.tmpl +1 -8
  14. crawlo/templates/spider/spider.py.tmpl +5 -108
  15. crawlo/utils/db_helper.py +11 -5
  16. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/METADATA +1 -1
  17. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/RECORD +43 -29
  18. tests/authenticated_proxy_example.py +10 -6
  19. tests/explain_mysql_update_behavior.py +77 -0
  20. tests/simulate_mysql_update_test.py +140 -0
  21. tests/test_asyncmy_usage.py +57 -0
  22. tests/test_crawlo_proxy_integration.py +8 -2
  23. tests/test_downloader_proxy_compatibility.py +24 -20
  24. tests/test_mysql_pipeline_config.py +165 -0
  25. tests/test_mysql_pipeline_error.py +99 -0
  26. tests/test_mysql_pipeline_init_log.py +83 -0
  27. tests/test_mysql_pipeline_integration.py +133 -0
  28. tests/test_mysql_pipeline_refactor.py +144 -0
  29. tests/test_mysql_pipeline_refactor_simple.py +86 -0
  30. tests/test_mysql_pipeline_robustness.py +196 -0
  31. tests/test_mysql_pipeline_types.py +89 -0
  32. tests/test_mysql_update_columns.py +94 -0
  33. tests/test_proxy_middleware.py +104 -8
  34. tests/test_proxy_middleware_enhanced.py +1 -5
  35. tests/test_proxy_middleware_integration.py +7 -2
  36. tests/test_proxy_middleware_refactored.py +25 -2
  37. tests/test_proxy_only.py +84 -0
  38. tests/test_proxy_with_downloader.py +153 -0
  39. tests/test_real_scenario_proxy.py +17 -17
  40. tests/verify_mysql_warnings.py +110 -0
  41. crawlo/middleware/simple_proxy.py +0 -65
  42. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
  43. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
  44. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
crawlo/pipelines/mysql_pipeline.py
@@ -2,20 +2,28 @@
  import asyncio
  import aiomysql
  from asyncmy import create_pool
- from typing import Optional, List, Dict
+ from typing import Optional, List, Dict, Any
+ from abc import ABC, abstractmethod
+ import async_timeout

+ from crawlo.items import Item
  from crawlo.exceptions import ItemDiscard
  from crawlo.utils.db_helper import SQLBuilder
  from crawlo.utils.log import get_logger
  from . import BasePipeline


- class AsyncmyMySQLPipeline:
+ class BaseMySQLPipeline(BasePipeline, ABC):
+     """Base class for MySQL pipelines, encapsulating the shared functionality"""
+
      def __init__(self, crawler):
          self.crawler = crawler
          self.settings = crawler.settings
          self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))

+         # Log pipeline initialization
+         self.logger.info(f"MySQL pipeline initialized: {self.__class__.__name__}")
+
          # Use an async lock and an init flag to ensure thread safety
          self._pool_lock = asyncio.Lock()
          self._pool_initialized = False
@@ -30,46 +38,35 @@ class AsyncmyMySQLPipeline:
              spider_table_name or
              self.settings.get('MYSQL_TABLE') or
              getattr(crawler.spider, 'mysql_table', None) or
-             f"{crawler.spider.name}_items"
+             f"{getattr(crawler.spider, 'name', 'default')}_items"
          )
-
+
+         # Validate the table name
+         if not self.table_name or not isinstance(self.table_name, str):
+             raise ValueError(f"Invalid table name: {self.table_name}. Table name must be a non-empty string.")
+
+         # Sanitize the table name by replacing potentially illegal characters
+         self.table_name = self.table_name.strip().replace(' ', '_').replace('-', '_')
+
          # Batch insert configuration
-         self.batch_size = self.settings.get_int('MYSQL_BATCH_SIZE', 100)
+         self.batch_size = max(1, self.settings.get_int('MYSQL_BATCH_SIZE', 100))  # ensure it is at least 1
          self.use_batch = self.settings.get_bool('MYSQL_USE_BATCH', False)
          self.batch_buffer: List[Dict] = []  # batch buffer

+         # SQL generation configuration
+         self.auto_update = self.settings.get_bool('MYSQL_AUTO_UPDATE', False)
+         self.insert_ignore = self.settings.get_bool('MYSQL_INSERT_IGNORE', False)
+         self.update_columns = self.settings.get('MYSQL_UPDATE_COLUMNS', ())
+
+         # Validate that update_columns is a tuple or list
+         if self.update_columns and not isinstance(self.update_columns, (tuple, list)):
+             self.logger.warning(f"update_columns should be a tuple or list, got {type(self.update_columns)}. Converting to tuple.")
+             self.update_columns = (self.update_columns,)
+
          # Register the shutdown event
          crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')

-     @classmethod
-     def from_crawler(cls, crawler):
-         return cls(crawler)
-
-     async def _ensure_pool(self):
-         """Ensure the connection pool is initialized (thread-safe)"""
-         if self._pool_initialized:
-             return
-
-         async with self._pool_lock:
-             if not self._pool_initialized:  # double-check to avoid a race condition
-                 try:
-                     self.pool = await create_pool(
-                         host=self.settings.get('MYSQL_HOST', 'localhost'),
-                         port=self.settings.get_int('MYSQL_PORT', 3306),
-                         user=self.settings.get('MYSQL_USER', 'root'),
-                         password=self.settings.get('MYSQL_PASSWORD', ''),
-                         db=self.settings.get('MYSQL_DB', 'scrapy_db'),
-                         minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
-                         maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
-                         echo=self.settings.get_bool('MYSQL_ECHO', False)
-                     )
-                     self._pool_initialized = True
-                     self.logger.debug(f"MySQL connection pool initialized (table: {self.table_name})")
-                 except Exception as e:
-                     self.logger.error(f"Failed to initialize MySQL connection pool: {e}")
-                     raise
-
-     async def process_item(self, item, spider, kwargs=None) -> Optional[dict]:
+     async def process_item(self, item: Item, spider, kwargs: Dict[str, Any] = None) -> Item:
          """Core method for processing items"""
          kwargs = kwargs or {}
          spider_name = getattr(spider, 'name', 'unknown')  # get the spider name
@@ -87,8 +84,13 @@ class AsyncmyMySQLPipeline:
              # Single-insert path
              try:
                  await self._ensure_pool()
+
+                 # Check that the connection pool is valid
+                 if not self._pool_initialized or not self.pool:
+                     raise RuntimeError("Database connection pool is not initialized or invalid")
+
                  item_dict = dict(item)
-                 sql = SQLBuilder.make_insert(table=self.table_name, data=item_dict, **kwargs)
+                 sql = await self._make_insert_sql(item_dict, **kwargs)

                  rowcount = await self._execute_sql(sql=sql)
                  if rowcount > 1:
@@ -100,37 +102,38 @@ class AsyncmyMySQLPipeline:
                          f"Spider {spider_name} inserted a single record into table {self.table_name}"
                      )
                  else:
-                     self.logger.warning(
-                         f"Spider {spider_name}: SQL executed successfully but no new record was inserted - {sql[:100]}..."
-                     )
+                     # When MYSQL_UPDATE_COLUMNS is used and the updated column values are identical
+                     # to the existing record, MySQL does not actually update any data and rowcount is 0
+                     if self.update_columns:
+                         self.logger.info(
+                             f"Spider {spider_name}: SQL executed with update-column config {self.update_columns}, "
+                             f"data may not have actually been updated (column values unchanged)"
+                         )
+                     else:
+                         self.logger.warning(
+                             f"Spider {spider_name}: SQL executed successfully but no new record was inserted"
+                         )

                  # Stats counting moved here to stay consistent with AiomysqlMySQLPipeline
                  self.crawler.stats.inc_value('mysql/insert_success')
                  return item

              except Exception as e:
-                 self.logger.error(f"Error while processing item: {e}")
+                 # Add more debugging information
+                 error_msg = f"Processing failed: {str(e)}"
+                 self.logger.error(f"Error while processing item: {error_msg}")
                  self.crawler.stats.inc_value('mysql/insert_failed')
-                 raise ItemDiscard(f"Processing failed: {e}")
+                 raise ItemDiscard(error_msg)

+     @abstractmethod
      async def _execute_sql(self, sql: str, values: list = None) -> int:
-         """Execute an SQL statement and handle the result"""
-         async with self.pool.acquire() as conn:
-             async with conn.cursor() as cursor:
-                 try:
-                     # Choose the execution method depending on whether parameter values are given
-                     if values is not None:
-                         rowcount = await cursor.execute(sql, values)
-                     else:
-                         rowcount = await cursor.execute(sql)
+         """Execute an SQL statement and handle the result - subclasses must override this method"""
+         raise NotImplementedError("Subclasses must implement the _execute_sql method")

-                     await conn.commit()
-                     # Stats counting removed here
-                     return rowcount
-                 except Exception as e:
-                     await conn.rollback()
-                     # Stats counting removed here
-                     raise ItemDiscard(f"MySQL insert failed: {e}")
+     @abstractmethod
+     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+         """Execute a batch SQL statement - subclasses must override this method"""
+         raise NotImplementedError("Subclasses must implement the _execute_batch_sql method")

      async def _flush_batch(self, spider_name: str):
          """Flush the batch buffer and perform a batch insert"""
@@ -140,81 +143,260 @@ class AsyncmyMySQLPipeline:
          try:
              await self._ensure_pool()

-             # Use the batch SQL generation function
-             batch_result = SQLBuilder.make_batch(table=self.table_name, datas=self.batch_buffer)
-             if batch_result is None:
-                 self.logger.warning("Batch insert data is empty")
-                 self.batch_buffer.clear()
-                 return
+             # Check that the connection pool is valid
+             if not self._pool_initialized or not self.pool:
+                 raise RuntimeError("Database connection pool is not initialized or invalid")
+
+             # Use SQLBuilder to generate the batch insert SQL
+             batch_result = SQLBuilder.make_batch(
+                 table=self.table_name,
+                 datas=self.batch_buffer,
+                 auto_update=self.auto_update,
+                 update_columns=self.update_columns
+             )
+
+             if batch_result:
+                 sql, values_list = batch_result
+                 rowcount = await self._execute_batch_sql(sql=sql, values_list=values_list)
+
+                 if rowcount > 0:
+                     self.logger.info(
+                         f"Spider {spider_name} batch-inserted {len(self.batch_buffer)} records into table {self.table_name}, {rowcount} rows actually affected"
+                     )
+                 else:
+                     # When MYSQL_UPDATE_COLUMNS is used and the updated column values are identical
+                     # to the existing record, MySQL does not actually update any data and rowcount is 0
+                     if self.update_columns:
+                         self.logger.debug(
+                             f"Spider {spider_name}: batch SQL executed with update-column config {self.update_columns}, "
+                             f"data may not have actually been updated (column values unchanged)"
+                         )
+                     else:
+                         self.logger.warning(
+                             f"Spider {spider_name}: batch SQL executed but no new records were inserted"
+                         )

-                 sql, values_list = batch_result
+                 # Clear the buffer
+                 self.batch_buffer.clear()
+                 self.crawler.stats.inc_value('mysql/batch_insert_success')
+             else:
+                 self.logger.warning(f"Spider {spider_name}: batch data is empty, skipping insert")

-             async with self.pool.acquire() as conn:
-                 async with conn.cursor() as cursor:
-                     try:
-                         # Perform the batch insert
-                         rowcount = await cursor.executemany(sql, values_list)
-                         await conn.commit()
-
-                         self.logger.info(
-                             f"Spider {spider_name} batch-inserted {rowcount} records into table {self.table_name}"
-                         )
-                         # Update stats counters
-                         self.crawler.stats.inc_value('mysql/insert_success', rowcount)
-                         self.batch_buffer.clear()
-                     except Exception as e:
-                         await conn.rollback()
-                         self.crawler.stats.inc_value('mysql/insert_failed', len(self.batch_buffer))
-                         self.logger.error(f"Batch insert failed: {e}")
-                         raise ItemDiscard(f"Batch insert failed: {e}")
          except Exception as e:
-             self.logger.error(f"Error during batch insert: {e}")
-             raise ItemDiscard(f"Batch insert processing failed: {e}")
+             # Add more debugging information
+             error_msg = f"Batch insert failed: {str(e)}"
+             self.logger.error(f"Error during batch processing: {error_msg}")
+             self.crawler.stats.inc_value('mysql/batch_insert_failed')
+             # Do not clear the buffer, so a retry remains possible
+             # But if the error was caused by a data problem, clear the buffer to avoid infinite retries
+             if "Duplicate entry" in str(e) or "Data too long" in str(e):
+                 self.logger.warning("Error caused by a data problem, clearing the buffer to avoid infinite retries")
+                 self.batch_buffer.clear()
+             raise ItemDiscard(error_msg)

      async def spider_closed(self):
          """Clean up resources when the spider closes"""
          # Flush any remaining batch data before closing
          if self.use_batch and self.batch_buffer:
              spider_name = getattr(self.crawler.spider, 'name', 'unknown')
-             await self._flush_batch(spider_name)
+             try:
+                 await self._flush_batch(spider_name)
+             except Exception as e:
+                 self.logger.error(f"Failed to flush batch data while closing the spider: {e}")

          if self.pool:
-             self.pool.close()
-             await self.pool.wait_closed()
-             self.logger.info("MySQL connection pool closed")
+             try:
+                 pool_stats = {
+                     'size': getattr(self.pool, 'size', 'unknown'),
+                     'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                     'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                 }
+                 self.logger.info(f"Closing MySQL connection pool, current state: {pool_stats}")
+                 self.pool.close()
+                 await self.pool.wait_closed()
+                 self.logger.info("MySQL connection pool closed")
+             except Exception as e:
+                 self.logger.error(f"Error while closing the MySQL connection pool: {e}")
+
+     async def _make_insert_sql(self, item_dict: Dict, **kwargs) -> str:
+         """Generate the insert SQL statement; subclasses may override this method"""
+         # Merge the pipeline configuration with the kwargs that were passed in
+         sql_kwargs = {
+             'auto_update': self.auto_update,
+             'insert_ignore': self.insert_ignore,
+             'update_columns': self.update_columns
+         }
+         sql_kwargs.update(kwargs)
+
+         return SQLBuilder.make_insert(
+             table=self.table_name,
+             data=item_dict,
+             **sql_kwargs
+         )
+
+     @abstractmethod
+     async def _ensure_pool(self):
+         """Ensure the connection pool is initialized (thread-safe); subclasses must implement this method"""
+         pass


- class AiomysqlMySQLPipeline:
+ class AsyncmyMySQLPipeline(BaseMySQLPipeline):
+     """MySQL pipeline implementation based on the asyncmy library"""
+
      def __init__(self, crawler):
-         self.crawler = crawler
-         self.settings = crawler.settings
-         self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))
+         super().__init__(crawler)
+         self.logger.info(f"AsyncmyMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")

-         # Use an async lock and an init flag
-         self._pool_lock = asyncio.Lock()
-         self._pool_initialized = False
-         self.pool = None
-         self.table_name = (
-             self.settings.get('MYSQL_TABLE') or
-             getattr(crawler.spider, 'mysql_table', None) or
-             f"{crawler.spider.name}_items"
-         )
+     @classmethod
+     def from_crawler(cls, crawler):
+         return cls(crawler)

-         # Batch insert configuration
-         self.batch_size = self.settings.get_int('MYSQL_BATCH_SIZE', 100)
-         self.use_batch = self.settings.get_bool('MYSQL_USE_BATCH', False)
-         self.batch_buffer: List[Dict] = []  # batch buffer
+     async def _ensure_pool(self):
+         """Ensure the connection pool is initialized (thread-safe)"""
+         if self._pool_initialized:
+             # Check whether the connection pool is still valid
+             if self.pool and hasattr(self.pool, 'closed') and not self.pool.closed:
+                 return
+             else:
+                 self.logger.warning("Connection pool was initialized but is invalid, reinitializing")
+
+         async with self._pool_lock:
+             if not self._pool_initialized:  # double-check to avoid a race condition
+                 try:
+                     self.pool = await create_pool(
+                         host=self.settings.get('MYSQL_HOST', 'localhost'),
+                         port=self.settings.get_int('MYSQL_PORT', 3306),
+                         user=self.settings.get('MYSQL_USER', 'root'),
+                         password=self.settings.get('MYSQL_PASSWORD', ''),
+                         db=self.settings.get('MYSQL_DB', 'scrapy_db'),
+                         minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
+                         maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
+                         echo=self.settings.get_bool('MYSQL_ECHO', False)
+                     )
+                     self._pool_initialized = True
+                     pool_stats = {
+                         'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                         'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                     }
+                     self.logger.info(f"MySQL connection pool initialized (table: {self.table_name}, config: {pool_stats})")
+                 except Exception as e:
+                     self.logger.error(f"Failed to initialize MySQL connection pool: {e}")
+                     # Reset the state so initialization can be retried
+                     self._pool_initialized = False
+                     self.pool = None
+                     raise
+
+     async def _execute_sql(self, sql: str, values: list = None) -> int:
+         """Execute an SQL statement and handle the result, with a deadlock retry mechanism"""
+         max_retries = 3
+         timeout = 30  # 30-second timeout
+
+         for attempt in range(max_retries):
+             try:
+                 # Check the connection pool state
+                 if not self.pool:
+                     raise RuntimeError("Database connection pool is not available")
+
+                 # Use asyncmy's connection handling, with a timeout
+                 async with async_timeout.timeout(timeout):
+                     async with self.pool.acquire() as conn:
+                         async with conn.cursor() as cursor:
+                             # Choose the execution method depending on whether parameter values are given
+                             if values is not None:
+                                 rowcount = await cursor.execute(sql, values)
+                             else:
+                                 rowcount = await cursor.execute(sql)
+
+                             await conn.commit()
+                             return rowcount
+             except asyncio.TimeoutError:
+                 self.logger.error(f"SQL execution timed out ({timeout}s): {sql[:100]}...")
+                 raise ItemDiscard(f"MySQL operation timed out: {sql[:100]}...")
+             except Exception as e:
+                 # Check for a deadlock error
+                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                     self.logger.warning(f"Deadlock detected, retrying (attempt {attempt + 1}): {str(e)}")
+                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
+                     continue
+                 # Check for a connection error and try reinitializing the connection pool
+                 elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
+                     self.logger.warning(f"Connection error detected, reinitializing the connection pool and retrying: {str(e)}")
+                     self._pool_initialized = False
+                     self.pool = None
+                     await asyncio.sleep(0.5 * (attempt + 1))  # simple backoff
+                     continue
+                 else:
+                     # Add more debugging information
+                     error_msg = f"MySQL insert failed: {str(e)}"
+                     self.logger.error(f"Error while executing SQL: {error_msg}")
+                     # For parameterized operations, log the SQL and values for debugging
+                     if values:
+                         self.logger.debug(f"SQL: {sql[:200]}..., Values: {values[:5] if isinstance(values, list) else '...'}")
+                     raise ItemDiscard(error_msg)
+
+     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+         """Execute a batch SQL statement, with a deadlock retry mechanism"""
+         max_retries = 3
+         timeout = 60  # 60-second timeout; batch operations may take longer
+
+         for attempt in range(max_retries):
+             try:
+                 # Check the connection pool state
+                 if not self.pool:
+                     raise RuntimeError("Database connection pool is not available")
+
+                 # Batch execution with a timeout
+                 async with async_timeout.timeout(timeout):
+                     async with self.pool.acquire() as conn:
+                         async with conn.cursor() as cursor:
+                             # Perform the batch insert
+                             rowcount = await cursor.executemany(sql, values_list)
+                             await conn.commit()
+                             return rowcount
+             except asyncio.TimeoutError:
+                 self.logger.error(f"Batch SQL execution timed out ({timeout}s)")
+                 raise ItemDiscard(f"MySQL batch operation timed out")
+             except Exception as e:
+                 # Check for a deadlock error
+                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                     self.logger.warning(f"Deadlock detected during batch insert, retrying (attempt {attempt + 1}): {str(e)}")
+                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
+                     continue
+                 # Check for a connection error and try reinitializing the connection pool
+                 elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
+                     self.logger.warning(f"Connection error detected, reinitializing the connection pool and retrying: {str(e)}")
+                     self._pool_initialized = False
+                     self.pool = None
+                     await asyncio.sleep(0.5 * (attempt + 1))  # simple backoff
+                     continue
+                 else:
+                     # Add more debugging information
+                     error_msg = f"MySQL batch insert failed: {str(e)}"
+                     self.logger.error(f"Error while executing batch SQL: {error_msg}")
+                     # Log a summary of the SQL and values for debugging
+                     self.logger.debug(f"SQL: {sql[:200]}..., Values count: {len(values_list) if isinstance(values_list, list) else 'unknown'}")
+                     raise ItemDiscard(error_msg)

-         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
+
+ class AiomysqlMySQLPipeline(BaseMySQLPipeline):
+     """MySQL pipeline implementation based on the aiomysql library"""
+
+     def __init__(self, crawler):
+         super().__init__(crawler)
+         self.logger.info(f"AiomysqlMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")

      @classmethod
      def from_crawler(cls, crawler):
          return cls(crawler)

-     async def _init_pool(self):
+     async def _ensure_pool(self):
          """Lazily initialize the connection pool (thread-safe)"""
          if self._pool_initialized:
-             return
+             # Check whether the connection pool is still valid
+             if self.pool and hasattr(self.pool, 'closed') and not self.pool.closed:
+                 return
+             else:
+                 self.logger.warning("Connection pool was initialized but is invalid, reinitializing")

          async with self._pool_lock:
              if not self._pool_initialized:
@@ -231,96 +413,65 @@ class AiomysqlMySQLPipeline:
                          autocommit=False
                      )
                      self._pool_initialized = True
-                     self.logger.debug(f"aiomysql connection pool initialized (table: {self.table_name})")
+                     pool_stats = {
+                         'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                         'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                     }
+                     self.logger.info(f"aiomysql connection pool initialized (table: {self.table_name}, config: {pool_stats})")
                  except Exception as e:
                      self.logger.error(f"Failed to initialize aiomysql connection pool: {e}")
+                     # Reset the state so initialization can be retried
+                     self._pool_initialized = False
+                     self.pool = None
                      raise

-     async def process_item(self, item, spider) -> Optional[dict]:
-         """Method for processing items"""
-         # If batch insert is enabled, add the item to the buffer
-         if self.use_batch:
-             self.batch_buffer.append(dict(item))
-
-             # When the buffer reaches the batch size, perform the batch insert
-             if len(self.batch_buffer) >= self.batch_size:
-                 spider_name = getattr(spider, 'name', 'unknown')
-                 await self._flush_batch(spider_name)
-
-             return item
-         else:
-             # Single-insert path
+     async def _execute_sql(self, sql: str, values: list = None) -> int:
+         """Execute an SQL statement and handle the result, with a deadlock retry mechanism"""
+         max_retries = 3
+         for attempt in range(max_retries):
              try:
-                 await self._init_pool()
-
-                 item_dict = dict(item)
-                 # Use SQLBuilder to generate the SQL
-                 sql = SQLBuilder.make_insert(table=self.table_name, data=item_dict)
-
+                 # Use aiomysql's async context manager style
                  async with self.pool.acquire() as conn:
                      async with conn.cursor() as cursor:
-                     try:
-                         await cursor.execute(sql)
-                         await conn.commit()
-                         self.crawler.stats.inc_value('mysql/insert_success')
-                     except aiomysql.Error as e:
-                         await conn.rollback()
-                         self.crawler.stats.inc_value('mysql/insert_failed')
-                         raise ItemDiscard(f"MySQL error: {e.args[1]}")
-
-                 return item
+                         # Choose the execution method depending on whether parameter values are given
+                         if values is not None:
+                             rowcount = await cursor.execute(sql, values)
+                         else:
+                             rowcount = await cursor.execute(sql)

+                         await conn.commit()
+                         return rowcount
              except Exception as e:
-                 self.logger.error(f"Pipeline processing exception: {e}")
-                 raise ItemDiscard(f"Processing failed: {e}")
-
-     async def _flush_batch(self, spider_name: str):
-         """Flush the batch buffer and perform a batch insert"""
-         if not self.batch_buffer:
-             return
-
-         try:
-             await self._init_pool()
-
-             # Use the batch SQL generation function
-             batch_result = SQLBuilder.make_batch(table=self.table_name, datas=self.batch_buffer)
-             if batch_result is None:
-                 self.logger.warning("Batch insert data is empty")
-                 self.batch_buffer.clear()
-                 return
-
-             sql, values_list = batch_result
-
-             async with self.pool.acquire() as conn:
-                 async with conn.cursor() as cursor:
-                     try:
+                 # Check for a deadlock error
+                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                     self.logger.warning(f"Deadlock detected, retrying (attempt {attempt + 1}): {str(e)}")
+                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
+                     continue
+                 else:
+                     # Add more debugging information
+                     error_msg = f"MySQL insert failed: {str(e)}"
+                     self.logger.error(f"Error while executing SQL: {error_msg}")
+                     raise ItemDiscard(error_msg)
+
+     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+         """Execute a batch SQL statement, with a deadlock retry mechanism"""
+         max_retries = 3
+         for attempt in range(max_retries):
+             try:
+                 async with self.pool.acquire() as conn:
+                     async with conn.cursor() as cursor:
                          # Perform the batch insert
                          rowcount = await cursor.executemany(sql, values_list)
                          await conn.commit()
-
-                         self.logger.info(
-                             f"Spider {spider_name} batch-inserted {rowcount} records into table {self.table_name}"
-                         )
-                         # Update stats counters
-                         self.crawler.stats.inc_value('mysql/insert_success', rowcount)
-                         self.batch_buffer.clear()
-                     except Exception as e:
-                         await conn.rollback()
-                         self.crawler.stats.inc_value('mysql/insert_failed', len(self.batch_buffer))
-                         self.logger.error(f"Batch insert failed: {e}")
-                         raise ItemDiscard(f"Batch insert failed: {e}")
-         except Exception as e:
-             self.logger.error(f"Error during batch insert: {e}")
-             raise ItemDiscard(f"Batch insert processing failed: {e}")
-
-     async def spider_closed(self):
-         """Resource cleanup"""
-         # Flush any remaining batch data before closing
-         if self.use_batch and self.batch_buffer:
-             spider_name = getattr(self.crawler.spider, 'name', 'unknown')
-             await self._flush_batch(spider_name)
-
-         if self.pool:
-             self.pool.close()
-             await self.pool.wait_closed()
-             self.logger.info("aiomysql connection pool released")
+                         return rowcount
+             except Exception as e:
+                 # Check for a deadlock error
+                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                     self.logger.warning(f"Deadlock detected during batch insert, retrying (attempt {attempt + 1}): {str(e)}")
+                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
+                     continue
+                 else:
+                     # Add more debugging information
+                     error_msg = f"MySQL batch insert failed: {str(e)}"
+                     self.logger.error(f"Error while executing batch SQL: {error_msg}")
+                     raise ItemDiscard(error_msg)
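
The refactored BaseMySQLPipeline now reads its SQL-generation behavior from settings in __init__ (MYSQL_AUTO_UPDATE, MYSQL_INSERT_IGNORE, MYSQL_UPDATE_COLUMNS) alongside the existing connection, pool and batch options. The sketch below shows how a project's settings module might use them; it is an assumed usage example, the table name and enabled values are illustrative only, and the exact SQL produced for these flags depends on SQLBuilder in crawlo/utils/db_helper.py, which this diff does not show.

# Sketch of project settings for the 1.4.6 MySQL pipelines (assumed usage; values other than
# the defaults visible in this diff are examples, and pipeline registration is omitted).

MYSQL_HOST = "localhost"          # connection/pool options read in _ensure_pool
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWORD = ""
MYSQL_DB = "scrapy_db"
MYSQL_POOL_MIN = 3
MYSQL_POOL_MAX = 10

MYSQL_TABLE = "example_items"     # falls back to "<spider name>_items" when unset

MYSQL_USE_BATCH = True            # buffer items and flush them via _flush_batch
MYSQL_BATCH_SIZE = 100            # clamped to at least 1 by the pipeline

# New in 1.4.6: forwarded to SQLBuilder.make_insert (and, for auto_update/update_columns, make_batch)
MYSQL_AUTO_UPDATE = False
MYSQL_INSERT_IGNORE = False
MYSQL_UPDATE_COLUMNS = ("title", "updated_at")   # must be a tuple or list; a bare string is wrapped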