crawlo-1.4.5-py3-none-any.whl → crawlo-1.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/downloader/cffi_downloader.py +3 -1
- crawlo/middleware/proxy.py +171 -348
- crawlo/pipelines/mysql_pipeline.py +339 -188
- crawlo/settings/default_settings.py +38 -30
- crawlo/stats_collector.py +10 -1
- crawlo/templates/project/settings.py.tmpl +10 -55
- crawlo/templates/project/settings_distributed.py.tmpl +20 -22
- crawlo/templates/project/settings_gentle.py.tmpl +5 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- crawlo/templates/project/settings_minimal.py.tmpl +25 -1
- crawlo/templates/project/settings_simple.py.tmpl +5 -0
- crawlo/templates/run.py.tmpl +1 -8
- crawlo/templates/spider/spider.py.tmpl +5 -108
- crawlo/utils/db_helper.py +11 -5
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/METADATA +1 -1
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/RECORD +43 -29
- tests/authenticated_proxy_example.py +10 -6
- tests/explain_mysql_update_behavior.py +77 -0
- tests/simulate_mysql_update_test.py +140 -0
- tests/test_asyncmy_usage.py +57 -0
- tests/test_crawlo_proxy_integration.py +8 -2
- tests/test_downloader_proxy_compatibility.py +24 -20
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_proxy_middleware.py +104 -8
- tests/test_proxy_middleware_enhanced.py +1 -5
- tests/test_proxy_middleware_integration.py +7 -2
- tests/test_proxy_middleware_refactored.py +25 -2
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_real_scenario_proxy.py +17 -17
- tests/verify_mysql_warnings.py +110 -0
- crawlo/middleware/simple_proxy.py +0 -65
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
crawlo/pipelines/mysql_pipeline.py

@@ -2,20 +2,28 @@
 import asyncio
 import aiomysql
 from asyncmy import create_pool
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Any
+from abc import ABC, abstractmethod
+import async_timeout

+from crawlo.items import Item
 from crawlo.exceptions import ItemDiscard
 from crawlo.utils.db_helper import SQLBuilder
 from crawlo.utils.log import get_logger
 from . import BasePipeline


-class
+class BaseMySQLPipeline(BasePipeline, ABC):
+    """Base class for MySQL pipelines, encapsulating the shared functionality"""
+
     def __init__(self, crawler):
         self.crawler = crawler
         self.settings = crawler.settings
         self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))

+        # Log pipeline initialization
+        self.logger.info(f"MySQL pipeline initialized: {self.__class__.__name__}")
+
         # Use an async lock and an init flag to ensure thread safety
         self._pool_lock = asyncio.Lock()
         self._pool_initialized = False
@@ -30,46 +38,35 @@ class AsyncmyMySQLPipeline:
             spider_table_name or
             self.settings.get('MYSQL_TABLE') or
             getattr(crawler.spider, 'mysql_table', None) or
-            f"{crawler.spider
+            f"{getattr(crawler.spider, 'name', 'default')}_items"
         )
-
+
+        # Validate that the table name is usable
+        if not self.table_name or not isinstance(self.table_name, str):
+            raise ValueError(f"Invalid table name: {self.table_name}. Table name must be a non-empty string.")
+
+        # Sanitize the table name, replacing characters that may be invalid
+        self.table_name = self.table_name.strip().replace(' ', '_').replace('-', '_')
+
         # Batch insert configuration
-        self.batch_size = self.settings.get_int('MYSQL_BATCH_SIZE', 100)
+        self.batch_size = max(1, self.settings.get_int('MYSQL_BATCH_SIZE', 100))  # Ensure it is at least 1
         self.use_batch = self.settings.get_bool('MYSQL_USE_BATCH', False)
         self.batch_buffer: List[Dict] = []  # Batch buffer

+        # SQL generation configuration
+        self.auto_update = self.settings.get_bool('MYSQL_AUTO_UPDATE', False)
+        self.insert_ignore = self.settings.get_bool('MYSQL_INSERT_IGNORE', False)
+        self.update_columns = self.settings.get('MYSQL_UPDATE_COLUMNS', ())
+
+        # Validate that update_columns is a tuple or list
+        if self.update_columns and not isinstance(self.update_columns, (tuple, list)):
+            self.logger.warning(f"update_columns should be a tuple or list, got {type(self.update_columns)}. Converting to tuple.")
+            self.update_columns = (self.update_columns,)
+
         # Register the shutdown event
         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')

-
-    def from_crawler(cls, crawler):
-        return cls(crawler)
-
-    async def _ensure_pool(self):
-        """Ensure the connection pool is initialized (thread-safe)"""
-        if self._pool_initialized:
-            return
-
-        async with self._pool_lock:
-            if not self._pool_initialized:  # Double-check to avoid race conditions
-                try:
-                    self.pool = await create_pool(
-                        host=self.settings.get('MYSQL_HOST', 'localhost'),
-                        port=self.settings.get_int('MYSQL_PORT', 3306),
-                        user=self.settings.get('MYSQL_USER', 'root'),
-                        password=self.settings.get('MYSQL_PASSWORD', ''),
-                        db=self.settings.get('MYSQL_DB', 'scrapy_db'),
-                        minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
-                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
-                        echo=self.settings.get_bool('MYSQL_ECHO', False)
-                    )
-                    self._pool_initialized = True
-                    self.logger.debug(f"MySQL connection pool initialized (table: {self.table_name})")
-                except Exception as e:
-                    self.logger.error(f"Failed to initialize the MySQL connection pool: {e}")
-                    raise
-
-    async def process_item(self, item, spider, kwargs=None) -> Optional[dict]:
+    async def process_item(self, item: Item, spider, kwargs: Dict[str, Any] = None) -> Item:
         """Core method for processing an item"""
         kwargs = kwargs or {}
         spider_name = getattr(spider, 'name', 'unknown')  # Get the spider name
@@ -87,8 +84,13 @@ class AsyncmyMySQLPipeline:
             # Single-insert logic
             try:
                 await self._ensure_pool()
+
+                # Check that the connection pool is valid
+                if not self._pool_initialized or not self.pool:
+                    raise RuntimeError("Database connection pool is not initialized or invalid")
+
                 item_dict = dict(item)
-                sql =
+                sql = await self._make_insert_sql(item_dict, **kwargs)

                 rowcount = await self._execute_sql(sql=sql)
                 if rowcount > 1:
@@ -100,37 +102,38 @@ class AsyncmyMySQLPipeline:
                         f"Spider {spider_name} successfully inserted a single record into table {self.table_name}"
                     )
                 else:
-
-
-
+                    # When MYSQL_UPDATE_COLUMNS is used and the updated field values are identical to the existing record,
+                    # MySQL does not actually update any data, so rowcount will be 0
+                    if self.update_columns:
+                        self.logger.info(
+                            f"Spider {spider_name}: SQL executed with update-column config {self.update_columns}, "
+                            f"data may not have actually been updated (field values unchanged)"
+                        )
+                    else:
+                        self.logger.warning(
+                            f"Spider {spider_name}: SQL executed successfully but no new record was inserted"
+                        )

                 # Stats counting moved here to stay consistent with AiomysqlMySQLPipeline
                 self.crawler.stats.inc_value('mysql/insert_success')
                 return item

             except Exception as e:
-
+                # Add more debugging information
+                error_msg = f"Processing failed: {str(e)}"
+                self.logger.error(f"Error while processing the item: {error_msg}")
                 self.crawler.stats.inc_value('mysql/insert_failed')
-                raise ItemDiscard(
+                raise ItemDiscard(error_msg)

+    @abstractmethod
     async def _execute_sql(self, sql: str, values: list = None) -> int:
-        """Execute a SQL statement and handle the result"""
-
-        async with conn.cursor() as cursor:
-            try:
-                # Choose the execution method depending on whether parameter values are provided
-                if values is not None:
-                    rowcount = await cursor.execute(sql, values)
-                else:
-                    rowcount = await cursor.execute(sql)
+        """Execute a SQL statement and handle the result - subclasses must override this method"""
+        raise NotImplementedError("Subclasses must implement the _execute_sql method")

-
-
-
-
-                await conn.rollback()
-                # The stats counting here was removed
-                raise ItemDiscard(f"MySQL insert failed: {e}")
+    @abstractmethod
+    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+        """Execute a batch SQL statement - subclasses must override this method"""
+        raise NotImplementedError("Subclasses must implement the _execute_batch_sql method")

     async def _flush_batch(self, spider_name: str):
         """Flush the batch buffer and perform a batch insert"""
@@ -140,81 +143,260 @@ class AsyncmyMySQLPipeline:
         try:
             await self._ensure_pool()

-            #
-
-
-
-
-
+            # Check that the connection pool is valid
+            if not self._pool_initialized or not self.pool:
+                raise RuntimeError("Database connection pool is not initialized or invalid")
+
+            # Use SQLBuilder to generate the batch-insert SQL
+            batch_result = SQLBuilder.make_batch(
+                table=self.table_name,
+                datas=self.batch_buffer,
+                auto_update=self.auto_update,
+                update_columns=self.update_columns
+            )
+
+            if batch_result:
+                sql, values_list = batch_result
+                rowcount = await self._execute_batch_sql(sql=sql, values_list=values_list)
+
+                if rowcount > 0:
+                    self.logger.info(
+                        f"Spider {spider_name} batch-inserted {len(self.batch_buffer)} records into table {self.table_name}, {rowcount} rows actually affected"
+                    )
+                else:
+                    # When MYSQL_UPDATE_COLUMNS is used and the updated field values are identical to the existing record,
+                    # MySQL does not actually update any data, so rowcount will be 0
+                    if self.update_columns:
+                        self.logger.debug(
+                            f"Spider {spider_name}: batch SQL executed with update-column config {self.update_columns}, "
+                            f"data may not have actually been updated (field values unchanged)"
+                        )
+                    else:
+                        self.logger.warning(
+                            f"Spider {spider_name}: batch SQL executed but no new records were inserted"
+                        )

-
+                # Clear the buffer
+                self.batch_buffer.clear()
+                self.crawler.stats.inc_value('mysql/batch_insert_success')
+            else:
+                self.logger.warning(f"Spider {spider_name}: batch data is empty, skipping the insert")

-            async with self.pool.acquire() as conn:
-                async with conn.cursor() as cursor:
-                    try:
-                        # Perform the batch insert
-                        rowcount = await cursor.executemany(sql, values_list)
-                        await conn.commit()
-
-                        self.logger.info(
-                            f"Spider {spider_name} batch-inserted {rowcount} records into table {self.table_name}"
-                        )
-                        # Update the stats counters
-                        self.crawler.stats.inc_value('mysql/insert_success', rowcount)
-                        self.batch_buffer.clear()
-                    except Exception as e:
-                        await conn.rollback()
-                        self.crawler.stats.inc_value('mysql/insert_failed', len(self.batch_buffer))
-                        self.logger.error(f"Batch insert failed: {e}")
-                        raise ItemDiscard(f"Batch insert failed: {e}")
         except Exception as e:
-
-
+            # Add more debugging information
+            error_msg = f"Batch insert failed: {str(e)}"
+            self.logger.error(f"Error during batch processing: {error_msg}")
+            self.crawler.stats.inc_value('mysql/batch_insert_failed')
+            # Do not clear the buffer, so that a retry remains possible
+            # But if the error was caused by a data problem, clear the buffer to avoid endless retries
+            if "Duplicate entry" in str(e) or "Data too long" in str(e):
+                self.logger.warning("Error caused by a data problem; clearing the buffer to avoid endless retries")
+                self.batch_buffer.clear()
+            raise ItemDiscard(error_msg)

     async def spider_closed(self):
         """Clean up resources when the spider closes"""
         # Flush any remaining batch data before closing
         if self.use_batch and self.batch_buffer:
             spider_name = getattr(self.crawler.spider, 'name', 'unknown')
-
+            try:
+                await self._flush_batch(spider_name)
+            except Exception as e:
+                self.logger.error(f"Failed to flush batch data while closing the spider: {e}")

         if self.pool:
-
-
-
+            try:
+                pool_stats = {
+                    'size': getattr(self.pool, 'size', 'unknown'),
+                    'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                    'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                }
+                self.logger.info(f"Closing the MySQL connection pool, current state: {pool_stats}")
+                self.pool.close()
+                await self.pool.wait_closed()
+                self.logger.info("MySQL connection pool closed")
+            except Exception as e:
+                self.logger.error(f"Error while closing the MySQL connection pool: {e}")
+
+    async def _make_insert_sql(self, item_dict: Dict, **kwargs) -> str:
+        """Generate the insert SQL statement; subclasses may override this method"""
+        # Merge the pipeline configuration with the incoming kwargs
+        sql_kwargs = {
+            'auto_update': self.auto_update,
+            'insert_ignore': self.insert_ignore,
+            'update_columns': self.update_columns
+        }
+        sql_kwargs.update(kwargs)
+
+        return SQLBuilder.make_insert(
+            table=self.table_name,
+            data=item_dict,
+            **sql_kwargs
+        )
+
+    @abstractmethod
+    async def _ensure_pool(self):
+        """Ensure the connection pool is initialized (thread-safe); subclasses must implement this method"""
+        pass


-class
+class AsyncmyMySQLPipeline(BaseMySQLPipeline):
+    """MySQL pipeline implementation based on the asyncmy library"""
+
     def __init__(self, crawler):
-
-        self.settings
-        self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))
+        super().__init__(crawler)
+        self.logger.info(f"AsyncmyMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")

-
-
-
-        self.pool = None
-        self.table_name = (
-            self.settings.get('MYSQL_TABLE') or
-            getattr(crawler.spider, 'mysql_table', None) or
-            f"{crawler.spider.name}_items"
-        )
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)

-
-
-
-
+    async def _ensure_pool(self):
+        """Ensure the connection pool is initialized (thread-safe)"""
+        if self._pool_initialized:
+            # Check whether the connection pool is still valid
+            if self.pool and hasattr(self.pool, 'closed') and not self.pool.closed:
+                return
+            else:
+                self.logger.warning("Connection pool was initialized but is invalid, reinitializing")
+
+        async with self._pool_lock:
+            if not self._pool_initialized:  # Double-check to avoid race conditions
+                try:
+                    self.pool = await create_pool(
+                        host=self.settings.get('MYSQL_HOST', 'localhost'),
+                        port=self.settings.get_int('MYSQL_PORT', 3306),
+                        user=self.settings.get('MYSQL_USER', 'root'),
+                        password=self.settings.get('MYSQL_PASSWORD', ''),
+                        db=self.settings.get('MYSQL_DB', 'scrapy_db'),
+                        minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
+                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
+                        echo=self.settings.get_bool('MYSQL_ECHO', False)
+                    )
+                    self._pool_initialized = True
+                    pool_stats = {
+                        'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                        'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                    }
+                    self.logger.info(f"MySQL connection pool initialized (table: {self.table_name}, config: {pool_stats})")
+                except Exception as e:
+                    self.logger.error(f"Failed to initialize the MySQL connection pool: {e}")
+                    # Reset state so that a retry is possible
+                    self._pool_initialized = False
+                    self.pool = None
+                    raise
+
+    async def _execute_sql(self, sql: str, values: list = None) -> int:
+        """Execute a SQL statement and handle the result, with a deadlock retry mechanism"""
+        max_retries = 3
+        timeout = 30  # 30-second timeout
+
+        for attempt in range(max_retries):
+            try:
+                # Check the connection pool state
+                if not self.pool:
+                    raise RuntimeError("Database connection pool is not available")
+
+                # Use asyncmy's connection pattern, with a timeout
+                async with async_timeout.timeout(timeout):
+                    async with self.pool.acquire() as conn:
+                        async with conn.cursor() as cursor:
+                            # Choose the execution method depending on whether parameter values are provided
+                            if values is not None:
+                                rowcount = await cursor.execute(sql, values)
+                            else:
+                                rowcount = await cursor.execute(sql)
+
+                            await conn.commit()
+                            return rowcount
+            except asyncio.TimeoutError:
+                self.logger.error(f"SQL execution timed out ({timeout}s): {sql[:100]}...")
+                raise ItemDiscard(f"MySQL operation timed out: {sql[:100]}...")
+            except Exception as e:
+                # Check whether this is a deadlock error
+                if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                    self.logger.warning(f"Deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                    await asyncio.sleep(0.1 * (2 ** attempt))  # Exponential backoff
+                    continue
+                # Check whether this is a connection error and try to reinitialize the connection pool
+                elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
+                    self.logger.warning(f"Connection error detected, reinitializing the connection pool and retrying: {str(e)}")
+                    self._pool_initialized = False
+                    self.pool = None
+                    await asyncio.sleep(0.5 * (attempt + 1))  # Simple backoff
+                    continue
+                else:
+                    # Add more debugging information
+                    error_msg = f"MySQL insert failed: {str(e)}"
+                    self.logger.error(f"Error while executing SQL: {error_msg}")
+                    # For parameterized operations, log the SQL and values for debugging
+                    if values:
+                        self.logger.debug(f"SQL: {sql[:200]}..., Values: {values[:5] if isinstance(values, list) else '...'}")
+                    raise ItemDiscard(error_msg)
+
+    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+        """Execute a batch SQL statement, with a deadlock retry mechanism"""
+        max_retries = 3
+        timeout = 60  # 60-second timeout; batch operations may take longer
+
+        for attempt in range(max_retries):
+            try:
+                # Check the connection pool state
+                if not self.pool:
+                    raise RuntimeError("Database connection pool is not available")
+
+                # Batch execution with a timeout
+                async with async_timeout.timeout(timeout):
+                    async with self.pool.acquire() as conn:
+                        async with conn.cursor() as cursor:
+                            # Perform the batch insert
+                            rowcount = await cursor.executemany(sql, values_list)
+                            await conn.commit()
+                            return rowcount
+            except asyncio.TimeoutError:
+                self.logger.error(f"Batch SQL execution timed out ({timeout}s)")
+                raise ItemDiscard(f"MySQL batch operation timed out")
+            except Exception as e:
+                # Check whether this is a deadlock error
+                if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                    self.logger.warning(f"Batch-insert deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                    await asyncio.sleep(0.1 * (2 ** attempt))  # Exponential backoff
+                    continue
+                # Check whether this is a connection error and try to reinitialize the connection pool
+                elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
+                    self.logger.warning(f"Connection error detected, reinitializing the connection pool and retrying: {str(e)}")
+                    self._pool_initialized = False
+                    self.pool = None
+                    await asyncio.sleep(0.5 * (attempt + 1))  # Simple backoff
+                    continue
+                else:
+                    # Add more debugging information
+                    error_msg = f"MySQL batch insert failed: {str(e)}"
+                    self.logger.error(f"Error while executing batch SQL: {error_msg}")
+                    # Log a summary of the SQL and values for debugging
+                    self.logger.debug(f"SQL: {sql[:200]}..., Values count: {len(values_list) if isinstance(values_list, list) else 'unknown'}")
+                    raise ItemDiscard(error_msg)

-
+
+class AiomysqlMySQLPipeline(BaseMySQLPipeline):
+    """MySQL pipeline implementation based on the aiomysql library"""
+
+    def __init__(self, crawler):
+        super().__init__(crawler)
+        self.logger.info(f"AiomysqlMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")

     @classmethod
     def from_crawler(cls, crawler):
         return cls(crawler)

-    async def
+    async def _ensure_pool(self):
         """Lazily initialize the connection pool (thread-safe)"""
         if self._pool_initialized:
-
+            # Check whether the connection pool is still valid
+            if self.pool and hasattr(self.pool, 'closed') and not self.pool.closed:
+                return
+            else:
+                self.logger.warning("Connection pool was initialized but is invalid, reinitializing")

         async with self._pool_lock:
             if not self._pool_initialized:
@@ -231,96 +413,65 @@ class AiomysqlMySQLPipeline:
                         autocommit=False
                     )
                     self._pool_initialized = True
-
+                    pool_stats = {
+                        'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                        'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                    }
+                    self.logger.info(f"aiomysql connection pool initialized (table: {self.table_name}, config: {pool_stats})")
                 except Exception as e:
                     self.logger.error(f"Failed to initialize the aiomysql connection pool: {e}")
+                    # Reset state so that a retry is possible
+                    self._pool_initialized = False
+                    self.pool = None
                     raise

-    async def
-        """
-
-
-            self.batch_buffer.append(dict(item))
-
-            # If the buffer reaches the batch size, perform a batch insert
-            if len(self.batch_buffer) >= self.batch_size:
-                spider_name = getattr(spider, 'name', 'unknown')
-                await self._flush_batch(spider_name)
-
-            return item
-        else:
-            # Single-insert logic
+    async def _execute_sql(self, sql: str, values: list = None) -> int:
+        """Execute a SQL statement and handle the result, with a deadlock retry mechanism"""
+        max_retries = 3
+        for attempt in range(max_retries):
            try:
-
-
-                item_dict = dict(item)
-                # Use SQLBuilder to generate the SQL
-                sql = SQLBuilder.make_insert(table=self.table_name, data=item_dict)
-
+                # Use aiomysql's asynchronous context-manager pattern
                async with self.pool.acquire() as conn:
                    async with conn.cursor() as cursor:
-
-
-                        await
-
-
-                        await conn.rollback()
-                        self.crawler.stats.inc_value('mysql/insert_failed')
-                        raise ItemDiscard(f"MySQL error: {e.args[1]}")
-
-                return item
+                        # Choose the execution method depending on whether parameter values are provided
+                        if values is not None:
+                            rowcount = await cursor.execute(sql, values)
+                        else:
+                            rowcount = await cursor.execute(sql)

+                        await conn.commit()
+                        return rowcount
            except Exception as e:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            sql, values_list = batch_result
-
-            async with self.pool.acquire() as conn:
-                async with conn.cursor() as cursor:
-                    try:
+                # Check whether this is a deadlock error
+                if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                    self.logger.warning(f"Deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                    await asyncio.sleep(0.1 * (2 ** attempt))  # Exponential backoff
+                    continue
+                else:
+                    # Add more debugging information
+                    error_msg = f"MySQL insert failed: {str(e)}"
+                    self.logger.error(f"Error while executing SQL: {error_msg}")
+                    raise ItemDiscard(error_msg)
+
+    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+        """Execute a batch SQL statement, with a deadlock retry mechanism"""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                async with self.pool.acquire() as conn:
+                    async with conn.cursor() as cursor:
                         # Perform the batch insert
                         rowcount = await cursor.executemany(sql, values_list)
                         await conn.commit()
-
-
-
-
-
-
-
-
-
-
-
-
-        except Exception as e:
-            self.logger.error(f"An error occurred during the batch insert: {e}")
-            raise ItemDiscard(f"Batch insert handling failed: {e}")
-
-    async def spider_closed(self):
-        """Resource cleanup"""
-        # Flush any remaining batch data before closing
-        if self.use_batch and self.batch_buffer:
-            spider_name = getattr(self.crawler.spider, 'name', 'unknown')
-            await self._flush_batch(spider_name)
-
-        if self.pool:
-            self.pool.close()
-            await self.pool.wait_closed()
-            self.logger.info("aiomysql connection pool released")
+                        return rowcount
+            except Exception as e:
+                # Check whether this is a deadlock error
+                if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                    self.logger.warning(f"Batch-insert deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                    await asyncio.sleep(0.1 * (2 ** attempt))  # Exponential backoff
+                    continue
+                else:
+                    # Add more debugging information
+                    error_msg = f"MySQL batch insert failed: {str(e)}"
+                    self.logger.error(f"Error while executing batch SQL: {error_msg}")
+                    raise ItemDiscard(error_msg)
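
For orientation, the refactored pipelines above read all of their behaviour from crawler settings. The sketch below is illustrative only and is not part of this diff: it lists the setting keys visible in the pipeline code, with the default values shown in the diff; MYSQL_TABLE and the update-column names are hypothetical placeholders.

# Illustrative sketch of the settings keys read by BaseMySQLPipeline and its
# AsyncmyMySQLPipeline / AiomysqlMySQLPipeline subclasses (not part of this diff).
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DB = 'scrapy_db'
MYSQL_TABLE = 'example_items'      # placeholder; falls back to "<spider name>_items" when unset
MYSQL_POOL_MIN = 3
MYSQL_POOL_MAX = 10
MYSQL_ECHO = False

# Batch insertion
MYSQL_USE_BATCH = True
MYSQL_BATCH_SIZE = 100             # clamped to at least 1 by the pipeline

# SQL generation options passed through to SQLBuilder
MYSQL_AUTO_UPDATE = False
MYSQL_INSERT_IGNORE = False
MYSQL_UPDATE_COLUMNS = ('field_a', 'field_b')   # placeholder names; must be a tuple or list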