crawlo-1.4.7-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (348):
  1. crawlo/__init__.py +90 -90
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -140
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -379
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -320
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -451
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -290
  19. crawlo/crawler.py +698 -698
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -280
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -250
  25. crawlo/downloader/httpx_downloader.py +265 -265
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -425
  28. crawlo/downloader/selenium_downloader.py +486 -486
  29. crawlo/event.py +45 -45
  30. crawlo/exceptions.py +214 -214
  31. crawlo/extension/__init__.py +64 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -53
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -104
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +134 -134
  44. crawlo/filters/__init__.py +170 -170
  45. crawlo/filters/aioredis_filter.py +347 -347
  46. crawlo/filters/memory_filter.py +261 -261
  47. crawlo/framework.py +306 -306
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -391
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -240
  52. crawlo/initialization/phases.py +229 -229
  53. crawlo/initialization/registry.py +143 -143
  54. crawlo/initialization/utils.py +48 -48
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -42
  61. crawlo/logging/config.py +280 -276
  62. crawlo/logging/factory.py +175 -175
  63. crawlo/logging/manager.py +104 -104
  64. crawlo/middleware/__init__.py +87 -87
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -287
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +408 -376
  77. crawlo/network/response.py +598 -569
  78. crawlo/pipelines/__init__.py +52 -52
  79. crawlo/pipelines/base_pipeline.py +452 -452
  80. crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +196 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +104 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -139
  87. crawlo/pipelines/mysql_pipeline.py +468 -469
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -155
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +9 -9
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -591
  94. crawlo/queue/redis_priority_queue.py +518 -518
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +287 -284
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +658 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +1 -1
  104. crawlo/templates/project/items.py.tmpl +13 -13
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -35
  107. crawlo/templates/project/settings.py.tmpl +113 -109
  108. crawlo/templates/project/settings_distributed.py.tmpl +160 -156
  109. crawlo/templates/project/settings_gentle.py.tmpl +174 -170
  110. crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
  111. crawlo/templates/project/settings_minimal.py.tmpl +102 -98
  112. crawlo/templates/project/settings_simple.py.tmpl +172 -168
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -32
  116. crawlo/templates/spiders_init.py.tmpl +4 -4
  117. crawlo/tools/__init__.py +86 -86
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +74 -50
  123. crawlo/utils/batch_processor.py +276 -276
  124. crawlo/utils/config_manager.py +442 -442
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/encoding_helper.py +190 -0
  128. crawlo/utils/error_handler.py +410 -410
  129. crawlo/utils/fingerprint.py +121 -121
  130. crawlo/utils/func_tools.py +82 -82
  131. crawlo/utils/large_scale_helper.py +344 -344
  132. crawlo/utils/leak_detector.py +335 -335
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -157
  135. crawlo/utils/mysql_connection_pool.py +197 -197
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +90 -90
  139. crawlo/utils/redis_connection_pool.py +578 -578
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -278
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -337
  144. crawlo/utils/response_helper.py +113 -0
  145. crawlo/utils/selector_helper.py +138 -137
  146. crawlo/utils/singleton.py +69 -69
  147. crawlo/utils/spider_loader.py +201 -201
  148. crawlo/utils/text_helper.py +94 -94
  149. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
  150. crawlo-1.4.8.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -217
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -467
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -72
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  192. tests/ofweek_scrapy/scrapy.cfg +11 -11
  193. tests/optimized_performance_test.py +211 -211
  194. tests/performance_comparison.py +244 -244
  195. tests/queue_blocking_test.py +113 -113
  196. tests/queue_test.py +89 -89
  197. tests/redis_key_validation_demo.py +130 -130
  198. tests/request_params_example.py +150 -150
  199. tests/response_improvements_example.py +144 -144
  200. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  201. tests/scrapy_comparison/scrapy_test.py +133 -133
  202. tests/simple_cli_test.py +54 -54
  203. tests/simple_command_test.py +119 -119
  204. tests/simple_crawlo_test.py +126 -126
  205. tests/simple_follow_test.py +38 -38
  206. tests/simple_log_test2.py +137 -137
  207. tests/simple_optimization_test.py +128 -128
  208. tests/simple_queue_type_test.py +41 -41
  209. tests/simple_response_selector_test.py +94 -94
  210. tests/simple_selector_helper_test.py +154 -154
  211. tests/simple_selector_test.py +207 -207
  212. tests/simple_spider_test.py +49 -49
  213. tests/simple_url_test.py +73 -73
  214. tests/simulate_mysql_update_test.py +139 -139
  215. tests/spider_log_timing_test.py +177 -177
  216. tests/test_advanced_tools.py +148 -148
  217. tests/test_all_commands.py +230 -230
  218. tests/test_all_pipeline_fingerprints.py +133 -133
  219. tests/test_all_redis_key_configs.py +145 -145
  220. tests/test_asyncmy_usage.py +56 -56
  221. tests/test_batch_processor.py +178 -178
  222. tests/test_cleaners.py +54 -54
  223. tests/test_cli_arguments.py +118 -118
  224. tests/test_component_factory.py +174 -174
  225. tests/test_config_consistency.py +80 -80
  226. tests/test_config_merge.py +152 -152
  227. tests/test_config_validator.py +182 -182
  228. tests/test_controlled_spider_mixin.py +79 -79
  229. tests/test_crawler_process_import.py +38 -38
  230. tests/test_crawler_process_spider_modules.py +47 -47
  231. tests/test_crawlo_proxy_integration.py +114 -114
  232. tests/test_date_tools.py +123 -123
  233. tests/test_dedup_fix.py +220 -220
  234. tests/test_dedup_pipeline_consistency.py +124 -124
  235. tests/test_default_header_middleware.py +313 -313
  236. tests/test_distributed.py +65 -65
  237. tests/test_double_crawlo_fix.py +204 -204
  238. tests/test_double_crawlo_fix_simple.py +124 -124
  239. tests/test_download_delay_middleware.py +221 -221
  240. tests/test_downloader_proxy_compatibility.py +272 -272
  241. tests/test_edge_cases.py +305 -305
  242. tests/test_encoding_core.py +56 -56
  243. tests/test_encoding_detection.py +126 -126
  244. tests/test_enhanced_error_handler.py +270 -270
  245. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  246. tests/test_error_handler_compatibility.py +112 -112
  247. tests/test_factories.py +252 -252
  248. tests/test_factory_compatibility.py +196 -196
  249. tests/test_final_validation.py +153 -153
  250. tests/test_fingerprint_consistency.py +135 -135
  251. tests/test_fingerprint_simple.py +51 -51
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_helper.py +235 -235
  257. tests/test_logging_enhancements.py +374 -374
  258. tests/test_logging_final.py +184 -184
  259. tests/test_logging_integration.py +312 -312
  260. tests/test_logging_system.py +282 -282
  261. tests/test_middleware_debug.py +141 -141
  262. tests/test_mode_consistency.py +51 -51
  263. tests/test_multi_directory.py +67 -67
  264. tests/test_multiple_spider_modules.py +80 -80
  265. tests/test_mysql_pipeline_config.py +164 -164
  266. tests/test_mysql_pipeline_error.py +98 -98
  267. tests/test_mysql_pipeline_init_log.py +82 -82
  268. tests/test_mysql_pipeline_integration.py +132 -132
  269. tests/test_mysql_pipeline_refactor.py +143 -143
  270. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  271. tests/test_mysql_pipeline_robustness.py +195 -195
  272. tests/test_mysql_pipeline_types.py +88 -88
  273. tests/test_mysql_update_columns.py +93 -93
  274. tests/test_offsite_middleware.py +244 -244
  275. tests/test_offsite_middleware_simple.py +203 -203
  276. tests/test_optimized_selector_naming.py +100 -100
  277. tests/test_parsel.py +29 -29
  278. tests/test_performance.py +327 -327
  279. tests/test_performance_monitor.py +115 -115
  280. tests/test_pipeline_fingerprint_consistency.py +86 -86
  281. tests/test_priority_behavior.py +211 -211
  282. tests/test_priority_consistency.py +151 -151
  283. tests/test_priority_consistency_fixed.py +249 -249
  284. tests/test_proxy_health_check.py +32 -32
  285. tests/test_proxy_middleware.py +217 -217
  286. tests/test_proxy_middleware_enhanced.py +212 -212
  287. tests/test_proxy_middleware_integration.py +142 -142
  288. tests/test_proxy_middleware_refactored.py +207 -207
  289. tests/test_proxy_only.py +83 -83
  290. tests/test_proxy_providers.py +56 -56
  291. tests/test_proxy_stats.py +19 -19
  292. tests/test_proxy_strategies.py +59 -59
  293. tests/test_proxy_with_downloader.py +152 -152
  294. tests/test_queue_empty_check.py +41 -41
  295. tests/test_queue_manager_double_crawlo.py +173 -173
  296. tests/test_queue_manager_redis_key.py +179 -179
  297. tests/test_queue_naming.py +154 -154
  298. tests/test_queue_type.py +106 -106
  299. tests/test_queue_type_redis_config_consistency.py +130 -130
  300. tests/test_random_headers_default.py +322 -322
  301. tests/test_random_headers_necessity.py +308 -308
  302. tests/test_random_user_agent.py +72 -72
  303. tests/test_redis_config.py +28 -28
  304. tests/test_redis_connection_pool.py +294 -294
  305. tests/test_redis_key_naming.py +181 -181
  306. tests/test_redis_key_validator.py +123 -123
  307. tests/test_redis_queue.py +224 -224
  308. tests/test_redis_queue_name_fix.py +175 -175
  309. tests/test_redis_queue_type_fallback.py +129 -129
  310. tests/test_request_ignore_middleware.py +182 -182
  311. tests/test_request_params.py +111 -111
  312. tests/test_request_serialization.py +70 -70
  313. tests/test_response_code_middleware.py +349 -349
  314. tests/test_response_filter_middleware.py +427 -427
  315. tests/test_response_follow.py +104 -104
  316. tests/test_response_improvements.py +152 -152
  317. tests/test_response_selector_methods.py +92 -92
  318. tests/test_response_url_methods.py +70 -70
  319. tests/test_response_urljoin.py +86 -86
  320. tests/test_retry_middleware.py +333 -333
  321. tests/test_retry_middleware_realistic.py +273 -273
  322. tests/test_scheduler.py +252 -252
  323. tests/test_scheduler_config_update.py +133 -133
  324. tests/test_scrapy_style_encoding.py +112 -112
  325. tests/test_selector_helper.py +100 -100
  326. tests/test_selector_optimizations.py +146 -146
  327. tests/test_simple_response.py +61 -61
  328. tests/test_spider_loader.py +49 -49
  329. tests/test_spider_loader_comprehensive.py +69 -69
  330. tests/test_spider_modules.py +84 -84
  331. tests/test_spiders/test_spider.py +9 -9
  332. tests/test_telecom_spider_redis_key.py +205 -205
  333. tests/test_template_content.py +87 -87
  334. tests/test_template_redis_key.py +134 -134
  335. tests/test_tools.py +159 -159
  336. tests/test_user_agent_randomness.py +176 -176
  337. tests/test_user_agents.py +96 -96
  338. tests/untested_features_report.md +138 -138
  339. tests/verify_debug.py +51 -51
  340. tests/verify_distributed.py +117 -117
  341. tests/verify_log_fix.py +111 -111
  342. tests/verify_mysql_warnings.py +109 -109
  343. crawlo/utils/log.py +0 -80
  344. crawlo/utils/url_utils.py +0 -40
  345. crawlo-1.4.7.dist-info/RECORD +0 -347
  346. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  347. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  348. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/pipelines/mysql_pipeline.py
@@ -1,470 +1,469 @@
- # -*- coding: utf-8 -*-
- import asyncio
- from abc import ABC, abstractmethod
- from typing import List, Dict, Any
-
- import async_timeout
-
- from crawlo.exceptions import ItemDiscard
- from crawlo.items import Item
- from crawlo.utils.db_helper import SQLBuilder
- from crawlo.logging import get_logger
- from crawlo.utils.mysql_connection_pool import MySQLConnectionPoolManager
- from . import BasePipeline
-
-
- class BaseMySQLPipeline(BasePipeline, ABC):
-     """Base class for MySQL pipelines, encapsulating shared functionality."""
-
-     def __init__(self, crawler):
-         self.crawler = crawler
-         self.settings = crawler.settings
-         self.logger = get_logger(self.__class__.__name__)
-
-         # Log pipeline initialization
-         self.logger.info(f"MySQL pipeline initialized: {self.__class__.__name__}")
-
-         # Use an async lock plus an initialization flag to keep pool setup thread-safe
-         self._pool_lock = asyncio.Lock()
-         self._pool_initialized = False
-         self.pool = None
-
-         # Prefer the table name from the spider's custom_settings, falling back to defaults
-         spider_table_name = None
-         if hasattr(crawler, 'spider') and crawler.spider and hasattr(crawler.spider, 'custom_settings'):
-             spider_table_name = crawler.spider.custom_settings.get('MYSQL_TABLE')
-
-         self.table_name = (
-             spider_table_name or
-             self.settings.get('MYSQL_TABLE') or
-             getattr(crawler.spider, 'mysql_table', None) or
-             f"{getattr(crawler.spider, 'name', 'default')}_items"
-         )
-
-         # Validate the table name
-         if not self.table_name or not isinstance(self.table_name, str):
-             raise ValueError(f"Invalid table name: {self.table_name}. Table name must be a non-empty string.")
-
-         # Sanitize the table name, replacing potentially illegal characters
-         self.table_name = self.table_name.strip().replace(' ', '_').replace('-', '_')
-
-         # Batch insert configuration
-         self.batch_size = max(1, self.settings.get_int('MYSQL_BATCH_SIZE', 100))  # ensure it is at least 1
-         self.use_batch = self.settings.get_bool('MYSQL_USE_BATCH', False)
-         self.batch_buffer: List[Dict] = []  # batch buffer
-
-         # SQL generation configuration
-         self.auto_update = self.settings.get_bool('MYSQL_AUTO_UPDATE', False)
-         self.insert_ignore = self.settings.get_bool('MYSQL_INSERT_IGNORE', False)
-         self.update_columns = self.settings.get('MYSQL_UPDATE_COLUMNS', ())
-
-         # Validate that update_columns is a tuple or list
-         if self.update_columns and not isinstance(self.update_columns, (tuple, list)):
-             self.logger.warning(f"update_columns should be a tuple or list, got {type(self.update_columns)}. Converting to tuple.")
-             self.update_columns = (self.update_columns,)
-
-         # Register the shutdown event
-         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
-
-     async def process_item(self, item: Item, spider, kwargs: Dict[str, Any] = None) -> Item:
-         """Core item-processing method."""
-         kwargs = kwargs or {}
-         spider_name = getattr(spider, 'name', 'unknown')  # get the spider name
-
-         # If batch mode is enabled, append the item to the buffer
-         if self.use_batch:
-             self.batch_buffer.append(dict(item))
-
-             # When the buffer reaches the batch size, flush it
-             if len(self.batch_buffer) >= self.batch_size:
-                 await self._flush_batch(spider_name)
-
-             return item
-         else:
-             # Single-row insert logic
-             try:
-                 await self._ensure_pool()
-
-                 # Verify the connection pool is valid
-                 if not self._pool_initialized or not self.pool:
-                     raise RuntimeError("Database connection pool is not initialized or invalid")
-
-                 item_dict = dict(item)
-                 sql = await self._make_insert_sql(item_dict, **kwargs)
-
-                 rowcount = await self._execute_sql(sql=sql)
-                 if rowcount > 1:
-                     self.logger.info(
-                         f"Spider {spider_name} inserted {rowcount} records into table {self.table_name}"
-                     )
-                 elif rowcount == 1:
-                     self.logger.debug(
-                         f"Spider {spider_name} inserted one record into table {self.table_name}"
-                     )
-                 else:
-                     # With MYSQL_UPDATE_COLUMNS, if the updated column values match the
-                     # existing record, MySQL modifies nothing and rowcount is 0
-                     if self.update_columns:
-                         self.logger.info(
-                             f"Spider {spider_name}: SQL executed with update columns {self.update_columns}; "
-                             f"data may not have actually changed (column values identical)"
-                         )
-                     else:
-                         self.logger.warning(
-                             f"Spider {spider_name}: SQL executed successfully but no new record was inserted"
-                         )
-
-                 # Stats counting lives here for consistency with AiomysqlMySQLPipeline
-                 self.crawler.stats.inc_value('mysql/insert_success')
-                 return item
-
-             except Exception as e:
-                 # Add extra debugging information
-                 error_msg = f"Processing failed: {str(e)}"
-                 self.logger.error(f"Error while processing item: {error_msg}")
-                 self.crawler.stats.inc_value('mysql/insert_failed')
-                 raise ItemDiscard(error_msg)
-
-     @abstractmethod
-     async def _execute_sql(self, sql: str, values: list = None) -> int:
-         """Execute a SQL statement and handle the result - subclasses must override."""
-         raise NotImplementedError("Subclasses must implement _execute_sql")
-
-     @abstractmethod
-     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
-         """Execute a batch SQL statement - subclasses must override."""
-         raise NotImplementedError("Subclasses must implement _execute_batch_sql")
-
-     async def _flush_batch(self, spider_name: str):
-         """Flush the batch buffer and perform a batch insert."""
-         if not self.batch_buffer:
-             return
-
-         try:
-             await self._ensure_pool()
-
-             # Verify the connection pool is valid
-             if not self._pool_initialized or not self.pool:
-                 raise RuntimeError("Database connection pool is not initialized or invalid")
-
-             # Use SQLBuilder to generate the batch insert SQL
-             batch_result = SQLBuilder.make_batch(
-                 table=self.table_name,
-                 datas=self.batch_buffer,
-                 auto_update=self.auto_update,
-                 update_columns=self.update_columns
-             )
-
-             if batch_result:
-                 sql, values_list = batch_result
-                 rowcount = await self._execute_batch_sql(sql=sql, values_list=values_list)
-
-                 if rowcount > 0:
-                     self.logger.info(
-                         f"Spider {spider_name} batch-inserted {len(self.batch_buffer)} records into table {self.table_name}, {rowcount} rows affected"
-                     )
-                 else:
-                     # With MYSQL_UPDATE_COLUMNS, if the updated column values match the
-                     # existing record, MySQL modifies nothing and rowcount is 0
-                     if self.update_columns:
-                         self.logger.debug(
-                             f"Spider {spider_name}: batch SQL executed with update columns {self.update_columns}; "
-                             f"data may not have actually changed (column values identical)"
-                         )
-                     else:
-                         self.logger.warning(
-                             f"Spider {spider_name}: batch SQL executed but no new records were inserted"
-                         )
-
-                 # Clear the buffer
-                 self.batch_buffer.clear()
-                 self.crawler.stats.inc_value('mysql/batch_insert_success')
-             else:
-                 self.logger.warning(f"Spider {spider_name}: batch data is empty, skipping insert")
-
-         except Exception as e:
-             # Add extra debugging information
-             error_msg = f"Batch insert failed: {str(e)}"
-             self.logger.error(f"Error during batch processing: {error_msg}")
-             self.crawler.stats.inc_value('mysql/batch_insert_failed')
-             # Keep the buffer so a retry remains possible, but clear it for
-             # data-level errors to avoid retrying forever
-             if "Duplicate entry" in str(e) or "Data too long" in str(e):
-                 self.logger.warning("Data-level error; clearing the buffer to avoid endless retries")
-                 self.batch_buffer.clear()
-             raise ItemDiscard(error_msg)
-
-     async def spider_closed(self):
-         """Clean up resources when the spider closes."""
-         # Flush any remaining batched data before shutdown
-         if self.use_batch and self.batch_buffer:
-             spider_name = getattr(self.crawler.spider, 'name', 'unknown')
-             try:
-                 await self._flush_batch(spider_name)
-             except Exception as e:
-                 self.logger.error(f"Failed to flush batched data on spider close: {e}")
-
-         # Note: the pool is not closed here because it is globally shared;
-         # pool shutdown is managed centrally by MySQLConnectionPoolManager.close_all_pools()
-         if self.pool:
-             self.logger.info(
-                 "MySQL pipeline closed; keeping the globally shared connection pool for other spiders"
-             )
-
-     async def _make_insert_sql(self, item_dict: Dict, **kwargs) -> str:
-         """Build the insert SQL statement; subclasses may override."""
-         # Merge pipeline configuration with the kwargs passed in
-         sql_kwargs = {
-             'auto_update': self.auto_update,
-             'insert_ignore': self.insert_ignore,
-             'update_columns': self.update_columns
-         }
-         sql_kwargs.update(kwargs)
-
-         return SQLBuilder.make_insert(
-             table=self.table_name,
-             data=item_dict,
-             **sql_kwargs
-         )
-
-     @abstractmethod
-     async def _ensure_pool(self):
-         """Ensure the connection pool is initialized (thread-safe); subclasses must implement."""
-         pass
-
-
- class AsyncmyMySQLPipeline(BaseMySQLPipeline):
-     """MySQL pipeline implementation using the asyncmy library."""
-
-     def __init__(self, crawler):
-         super().__init__(crawler)
-         self.logger.info(f"AsyncmyMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")
-
-     @classmethod
-     def from_crawler(cls, crawler):
-         return cls(crawler)
-
-     async def _ensure_pool(self):
-         """Ensure the connection pool is initialized (thread-safe)."""
-         if self._pool_initialized and self.pool:
-             # Check whether the pool is still valid
-             if hasattr(self.pool, 'closed') and not self.pool.closed:
-                 return
-             else:
-                 self.logger.warning("Connection pool was initialized but is invalid; reinitializing")
-
-         async with self._pool_lock:
-             if not self._pool_initialized:  # double-check to avoid a race condition
-                 try:
-                     # Use the singleton connection pool manager
-                     self.pool = await MySQLConnectionPoolManager.get_pool(
-                         pool_type='asyncmy',
-                         host=self.settings.get('MYSQL_HOST', 'localhost'),
-                         port=self.settings.get_int('MYSQL_PORT', 3306),
-                         user=self.settings.get('MYSQL_USER', 'root'),
-                         password=self.settings.get('MYSQL_PASSWORD', ''),
-                         db=self.settings.get('MYSQL_DB', 'scrapy_db'),
-                         minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
-                         maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
-                         echo=self.settings.get_bool('MYSQL_ECHO', False)
-                     )
-                     self._pool_initialized = True
-                     self.logger.info(
-                         f"MySQL connection pool initialized (table: {self.table_name}, "
-                         f"using the globally shared pool)"
-                     )
-                 except Exception as e:
-                     self.logger.error(f"MySQL connection pool initialization failed: {e}")
-                     # Reset state so initialization can be retried
-                     self._pool_initialized = False
-                     self.pool = None
-                     raise
-
-     async def _execute_sql(self, sql: str, values: list = None) -> int:
-         """Execute a SQL statement and handle the result, with deadlock retry."""
-         max_retries = 3
-         timeout = 30  # 30-second timeout
-
-         for attempt in range(max_retries):
-             try:
-                 # Check the connection pool state
-                 if not self.pool:
-                     raise RuntimeError("Database connection pool is not available")
-
-                 # Use asyncmy's connection pattern, with a timeout
-                 async with async_timeout.timeout(timeout):
-                     async with self.pool.acquire() as conn:
-                         async with conn.cursor() as cursor:
-                             # Choose the execution method depending on whether values are provided
-                             if values is not None:
-                                 rowcount = await cursor.execute(sql, values)
-                             else:
-                                 rowcount = await cursor.execute(sql)
-
-                             await conn.commit()
-                             return rowcount
-             except asyncio.TimeoutError:
-                 self.logger.error(f"SQL execution timed out ({timeout}s): {sql[:100]}...")
-                 raise ItemDiscard(f"MySQL operation timed out: {sql[:100]}...")
-             except Exception as e:
-                 # Check for a deadlock error
-                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
-                     self.logger.warning(f"Deadlock detected, starting retry {attempt + 1}: {str(e)}")
-                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
-                     continue
-                 # Check for a connection error and try to reinitialize the pool
-                 elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
-                     self.logger.warning(f"Connection error detected; reinitializing the pool and retrying: {str(e)}")
-                     self._pool_initialized = False
-                     self.pool = None
-                     await asyncio.sleep(0.5 * (attempt + 1))  # simple backoff
-                     continue
-                 else:
-                     # Add extra debugging information
-                     error_msg = f"MySQL insert failed: {str(e)}"
-                     self.logger.error(f"Error while executing SQL: {error_msg}")
-                     # For parameterized statements, log the SQL and values for debugging
-                     if values:
-                         self.logger.debug(f"SQL: {sql[:200]}..., Values: {values[:5] if isinstance(values, list) else '...'}")
-                     raise ItemDiscard(error_msg)
-
-     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
-         """Execute a batch SQL statement, with deadlock retry."""
-         max_retries = 3
-         timeout = 60  # 60-second timeout; batch operations can take longer
-
-         for attempt in range(max_retries):
-             try:
-                 # Check the connection pool state
-                 if not self.pool:
-                     raise RuntimeError("Database connection pool is not available")
-
-                 # Batch execution with a timeout
-                 async with async_timeout.timeout(timeout):
-                     async with self.pool.acquire() as conn:
-                         async with conn.cursor() as cursor:
-                             # Execute the batch insert
-                             rowcount = await cursor.executemany(sql, values_list)
-                             await conn.commit()
-                             return rowcount
-             except asyncio.TimeoutError:
-                 self.logger.error(f"Batch SQL execution timed out ({timeout}s)")
-                 raise ItemDiscard("MySQL batch operation timed out")
-             except Exception as e:
-                 # Check for a deadlock error
-                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
-                     self.logger.warning(f"Batch-insert deadlock detected, starting retry {attempt + 1}: {str(e)}")
-                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
-                     continue
-                 # Check for a connection error and try to reinitialize the pool
-                 elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
-                     self.logger.warning(f"Connection error detected; reinitializing the pool and retrying: {str(e)}")
-                     self._pool_initialized = False
-                     self.pool = None
-                     await asyncio.sleep(0.5 * (attempt + 1))  # simple backoff
-                     continue
-                 else:
-                     # Add extra debugging information
-                     error_msg = f"MySQL batch insert failed: {str(e)}"
-                     self.logger.error(f"Error while executing batch SQL: {error_msg}")
-                     # Log a summary of the SQL and values for debugging
-                     self.logger.debug(f"SQL: {sql[:200]}..., Values count: {len(values_list) if isinstance(values_list, list) else 'unknown'}")
-                     raise ItemDiscard(error_msg)
-
-
- class AiomysqlMySQLPipeline(BaseMySQLPipeline):
-     """MySQL pipeline implementation using the aiomysql library."""
-
-     def __init__(self, crawler):
-         super().__init__(crawler)
-         self.logger.info(f"AiomysqlMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")
-
-     @classmethod
-     def from_crawler(cls, crawler):
-         return cls(crawler)
-
-     async def _ensure_pool(self):
-         """Lazily initialize the connection pool (thread-safe)."""
-         if self._pool_initialized and self.pool:
-             # Check whether the pool is still valid
-             if hasattr(self.pool, 'closed') and not self.pool.closed:
-                 return
-             else:
-                 self.logger.warning("Connection pool was initialized but is invalid; reinitializing")
-
-         async with self._pool_lock:
-             if not self._pool_initialized:
-                 try:
-                     # Use the singleton connection pool manager
-                     self.pool = await MySQLConnectionPoolManager.get_pool(
-                         pool_type='aiomysql',
-                         host=self.settings.get('MYSQL_HOST', 'localhost'),
-                         port=self.settings.get_int('MYSQL_PORT', 3306),
-                         user=self.settings.get('MYSQL_USER', 'root'),
-                         password=self.settings.get('MYSQL_PASSWORD', ''),
-                         db=self.settings.get('MYSQL_DB', 'scrapy_db'),
-                         minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
-                         maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5)
-                     )
-                     self._pool_initialized = True
-                     self.logger.info(
-                         f"aiomysql connection pool initialized (table: {self.table_name}, "
-                         f"using the globally shared pool)"
-                     )
-                 except Exception as e:
-                     self.logger.error(f"aiomysql connection pool initialization failed: {e}")
-                     # Reset state so initialization can be retried
-                     self._pool_initialized = False
-                     self.pool = None
-                     raise
-
-     async def _execute_sql(self, sql: str, values: list = None) -> int:
-         """Execute a SQL statement and handle the result, with deadlock retry."""
-         max_retries = 3
-         for attempt in range(max_retries):
-             try:
-                 # Use aiomysql's async context manager pattern
-                 async with self.pool.acquire() as conn:
-                     async with conn.cursor() as cursor:
-                         # Choose the execution method depending on whether values are provided
-                         if values is not None:
-                             rowcount = await cursor.execute(sql, values)
-                         else:
-                             rowcount = await cursor.execute(sql)
-
-                         await conn.commit()
-                         return rowcount
-             except Exception as e:
-                 # Check for a deadlock error
-                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
-                     self.logger.warning(f"Deadlock detected, starting retry {attempt + 1}: {str(e)}")
-                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
-                     continue
-                 else:
-                     # Add extra debugging information
-                     error_msg = f"MySQL insert failed: {str(e)}"
-                     self.logger.error(f"Error while executing SQL: {error_msg}")
-                     raise ItemDiscard(error_msg)
-
-     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
-         """Execute a batch SQL statement, with deadlock retry."""
-         max_retries = 3
-         for attempt in range(max_retries):
-             try:
-                 async with self.pool.acquire() as conn:
-                     async with conn.cursor() as cursor:
-                         # Execute the batch insert
-                         rowcount = await cursor.executemany(sql, values_list)
-                         await conn.commit()
-                         return rowcount
-             except Exception as e:
-                 # Check for a deadlock error
-                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
-                     self.logger.warning(f"Batch-insert deadlock detected, starting retry {attempt + 1}: {str(e)}")
-                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
-                     continue
-                 else:
-                     # Add extra debugging information
-                     error_msg = f"MySQL batch insert failed: {str(e)}"
-                     self.logger.error(f"Error while executing batch SQL: {error_msg}")
+ # -*- coding: utf-8 -*-
+ import asyncio
+ import async_timeout
+ from abc import ABC, abstractmethod
+ from typing import List, Dict, Any
+
+ from crawlo.exceptions import ItemDiscard
+ from crawlo.items import Item
+ from crawlo.utils.db_helper import SQLBuilder
+ from crawlo.logging import get_logger
+ from crawlo.utils.mysql_connection_pool import MySQLConnectionPoolManager
+ from . import BasePipeline
+
+
+ class BaseMySQLPipeline(BasePipeline, ABC):
+     """Base class for MySQL pipelines, encapsulating shared functionality."""
+
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.settings = crawler.settings
+         self.logger = get_logger(self.__class__.__name__)
+
+         # Log pipeline initialization
+         self.logger.info(f"MySQL pipeline initialized: {self.__class__.__name__}")
+
+         # Use an async lock plus an initialization flag to keep pool setup thread-safe
+         self._pool_lock = asyncio.Lock()
+         self._pool_initialized = False
+         self.pool = None
+
+         # Prefer the table name from the spider's custom_settings, falling back to defaults
+         spider_table_name = None
+         if hasattr(crawler, 'spider') and crawler.spider and hasattr(crawler.spider, 'custom_settings'):
+             spider_table_name = crawler.spider.custom_settings.get('MYSQL_TABLE')
+
+         self.table_name = (
+             spider_table_name or
+             self.settings.get('MYSQL_TABLE') or
+             getattr(crawler.spider, 'mysql_table', None) or
+             f"{getattr(crawler.spider, 'name', 'default')}_items"
+         )
+
+         # Validate the table name
+         if not self.table_name or not isinstance(self.table_name, str):
+             raise ValueError(f"Invalid table name: {self.table_name}. Table name must be a non-empty string.")
+
+         # Sanitize the table name, replacing potentially illegal characters
+         self.table_name = self.table_name.strip().replace(' ', '_').replace('-', '_')
+
+         # Batch insert configuration
+         self.batch_size = max(1, self.settings.get_int('MYSQL_BATCH_SIZE', 100))  # ensure it is at least 1
+         self.use_batch = self.settings.get_bool('MYSQL_USE_BATCH', False)
+         self.batch_buffer: List[Dict] = []  # batch buffer
+
+         # SQL generation configuration
+         self.auto_update = self.settings.get_bool('MYSQL_AUTO_UPDATE', False)
+         self.insert_ignore = self.settings.get_bool('MYSQL_INSERT_IGNORE', False)
+         self.update_columns = self.settings.get('MYSQL_UPDATE_COLUMNS', ())
+
+         # Validate that update_columns is a tuple or list
+         if self.update_columns and not isinstance(self.update_columns, (tuple, list)):
+             self.logger.warning(f"update_columns should be a tuple or list, got {type(self.update_columns)}. Converting to tuple.")
+             self.update_columns = (self.update_columns,)
+
+         # Register the shutdown event
+         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
+
+     async def process_item(self, item: Item, spider, kwargs: Dict[str, Any] = None) -> Item:
+         """Core item-processing method."""
+         kwargs = kwargs or {}
+         spider_name = getattr(spider, 'name', 'unknown')  # get the spider name
+
+         # If batch mode is enabled, append the item to the buffer
+         if self.use_batch:
+             self.batch_buffer.append(dict(item))
+
+             # When the buffer reaches the batch size, flush it
+             if len(self.batch_buffer) >= self.batch_size:
+                 await self._flush_batch(spider_name)
+
+             return item
+         else:
+             # Single-row insert logic
+             try:
+                 await self._ensure_pool()
+
+                 # Verify the connection pool is valid
+                 if not self._pool_initialized or not self.pool:
+                     raise RuntimeError("Database connection pool is not initialized or invalid")
+
+                 item_dict = dict(item)
+                 sql = await self._make_insert_sql(item_dict, **kwargs)
+
+                 rowcount = await self._execute_sql(sql=sql)
+                 if rowcount > 1:
+                     self.logger.info(
+                         f"Spider {spider_name} inserted {rowcount} records into table {self.table_name}"
+                     )
+                 elif rowcount == 1:
+                     self.logger.debug(
+                         f"Spider {spider_name} inserted one record into table {self.table_name}"
+                     )
+                 else:
+                     # With MYSQL_UPDATE_COLUMNS, if the updated column values match the
+                     # existing record, MySQL modifies nothing and rowcount is 0
+                     if self.update_columns:
+                         self.logger.info(
+                             f"Spider {spider_name}: SQL executed with update columns {self.update_columns}; "
+                             f"data may not have actually changed (column values identical)"
+                         )
+                     else:
+                         self.logger.warning(
+                             f"Spider {spider_name}: SQL executed successfully but no new record was inserted"
+                         )
+
+                 # Stats counting lives here for consistency with AiomysqlMySQLPipeline
+                 self.crawler.stats.inc_value('mysql/insert_success')
+                 return item
+
+             except Exception as e:
+                 # Add extra debugging information
+                 error_msg = f"Processing failed: {str(e)}"
+                 self.logger.error(f"Error while processing item: {error_msg}")
+                 self.crawler.stats.inc_value('mysql/insert_failed')
+                 raise ItemDiscard(error_msg)
+
+     @abstractmethod
+     async def _execute_sql(self, sql: str, values: list = None) -> int:
+         """Execute a SQL statement and handle the result - subclasses must override."""
+         raise NotImplementedError("Subclasses must implement _execute_sql")
+
+     @abstractmethod
+     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+         """Execute a batch SQL statement - subclasses must override."""
+         raise NotImplementedError("Subclasses must implement _execute_batch_sql")
+
+     async def _flush_batch(self, spider_name: str):
+         """Flush the batch buffer and perform a batch insert."""
+         if not self.batch_buffer:
+             return
+
+         try:
+             await self._ensure_pool()
+
+             # Verify the connection pool is valid
+             if not self._pool_initialized or not self.pool:
+                 raise RuntimeError("Database connection pool is not initialized or invalid")
+
+             # Use SQLBuilder to generate the batch insert SQL
+             batch_result = SQLBuilder.make_batch(
+                 table=self.table_name,
+                 datas=self.batch_buffer,
+                 auto_update=self.auto_update,
+                 update_columns=self.update_columns
+             )
+
+             if batch_result:
+                 sql, values_list = batch_result
+                 rowcount = await self._execute_batch_sql(sql=sql, values_list=values_list)
+
+                 if rowcount > 0:
+                     self.logger.info(
+                         f"Spider {spider_name} batch-inserted {len(self.batch_buffer)} records into table {self.table_name}, {rowcount} rows affected"
+                     )
+                 else:
+                     # With MYSQL_UPDATE_COLUMNS, if the updated column values match the
+                     # existing record, MySQL modifies nothing and rowcount is 0
+                     if self.update_columns:
+                         self.logger.debug(
+                             f"Spider {spider_name}: batch SQL executed with update columns {self.update_columns}; "
+                             f"data may not have actually changed (column values identical)"
+                         )
+                     else:
+                         self.logger.warning(
+                             f"Spider {spider_name}: batch SQL executed but no new records were inserted"
+                         )
+
+                 # Clear the buffer
+                 self.batch_buffer.clear()
+                 self.crawler.stats.inc_value('mysql/batch_insert_success')
+             else:
+                 self.logger.warning(f"Spider {spider_name}: batch data is empty, skipping insert")
+
+         except Exception as e:
+             # Add extra debugging information
+             error_msg = f"Batch insert failed: {str(e)}"
+             self.logger.error(f"Error during batch processing: {error_msg}")
+             self.crawler.stats.inc_value('mysql/batch_insert_failed')
+             # Keep the buffer so a retry remains possible, but clear it for
+             # data-level errors to avoid retrying forever
+             if "Duplicate entry" in str(e) or "Data too long" in str(e):
+                 self.logger.warning("Data-level error; clearing the buffer to avoid endless retries")
+                 self.batch_buffer.clear()
+             raise ItemDiscard(error_msg)
+
+     async def spider_closed(self):
+         """Clean up resources when the spider closes."""
+         # Flush any remaining batched data before shutdown
+         if self.use_batch and self.batch_buffer:
+             spider_name = getattr(self.crawler.spider, 'name', 'unknown')
+             try:
+                 await self._flush_batch(spider_name)
+             except Exception as e:
+                 self.logger.error(f"Failed to flush batched data on spider close: {e}")
+
+         # Note: the pool is not closed here because it is globally shared;
+         # pool shutdown is managed centrally by MySQLConnectionPoolManager.close_all_pools()
+         if self.pool:
+             self.logger.info(
+                 "MySQL pipeline closed; keeping the globally shared connection pool for other spiders"
+             )
+
+     async def _make_insert_sql(self, item_dict: Dict, **kwargs) -> str:
+         """Build the insert SQL statement; subclasses may override."""
+         # Merge pipeline configuration with the kwargs passed in
+         sql_kwargs = {
+             'auto_update': self.auto_update,
+             'insert_ignore': self.insert_ignore,
+             'update_columns': self.update_columns
+         }
+         sql_kwargs.update(kwargs)
+
+         return SQLBuilder.make_insert(
+             table=self.table_name,
+             data=item_dict,
+             **sql_kwargs
+         )
+
+     @abstractmethod
+     async def _ensure_pool(self):
+         """Ensure the connection pool is initialized (thread-safe); subclasses must implement."""
+         pass
+
+
+ class AsyncmyMySQLPipeline(BaseMySQLPipeline):
+     """MySQL pipeline implementation using the asyncmy library."""
+
+     def __init__(self, crawler):
+         super().__init__(crawler)
+         self.logger.info(f"AsyncmyMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         return cls(crawler)
+
+     async def _ensure_pool(self):
+         """Ensure the connection pool is initialized (thread-safe)."""
+         if self._pool_initialized and self.pool:
+             # Check whether the pool is still valid
+             if hasattr(self.pool, 'closed') and not self.pool.closed:
+                 return
+             else:
+                 self.logger.warning("Connection pool was initialized but is invalid; reinitializing")
+
+         async with self._pool_lock:
+             if not self._pool_initialized:  # double-check to avoid a race condition
+                 try:
+                     # Use the singleton connection pool manager
+                     self.pool = await MySQLConnectionPoolManager.get_pool(
+                         pool_type='asyncmy',
+                         host=self.settings.get('MYSQL_HOST', 'localhost'),
+                         port=self.settings.get_int('MYSQL_PORT', 3306),
+                         user=self.settings.get('MYSQL_USER', 'root'),
+                         password=self.settings.get('MYSQL_PASSWORD', ''),
+                         db=self.settings.get('MYSQL_DB', 'scrapy_db'),
+                         minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
+                         maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
+                         echo=self.settings.get_bool('MYSQL_ECHO', False)
+                     )
+                     self._pool_initialized = True
+                     self.logger.info(
+                         f"MySQL connection pool initialized (table: {self.table_name}, "
+                         f"using the globally shared pool)"
+                     )
+                 except Exception as e:
+                     self.logger.error(f"MySQL connection pool initialization failed: {e}")
+                     # Reset state so initialization can be retried
+                     self._pool_initialized = False
+                     self.pool = None
+                     raise
+
+     async def _execute_sql(self, sql: str, values: list = None) -> int:
+         """Execute a SQL statement and handle the result, with deadlock retry."""
+         max_retries = 3
+         timeout = 30  # 30-second timeout
+
+         for attempt in range(max_retries):
+             try:
+                 # Check the connection pool state
+                 if not self.pool:
+                     raise RuntimeError("Database connection pool is not available")
+
+                 # Use asyncmy's connection pattern, with a timeout
+                 async with async_timeout.timeout(timeout):
+                     async with self.pool.acquire() as conn:
+                         async with conn.cursor() as cursor:
+                             # Choose the execution method depending on whether values are provided
+                             if values is not None:
+                                 rowcount = await cursor.execute(sql, values)
+                             else:
+                                 rowcount = await cursor.execute(sql)
+
+                             await conn.commit()
+                             return rowcount
+             except asyncio.TimeoutError:
+                 self.logger.error(f"SQL execution timed out ({timeout}s): {sql[:100]}...")
+                 raise ItemDiscard(f"MySQL operation timed out: {sql[:100]}...")
+             except Exception as e:
+                 # Check for a deadlock error
+                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                     self.logger.warning(f"Deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
+                     continue
+                 # Check for a connection error and try to reinitialize the pool
+                 elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
+                     self.logger.warning(f"Connection error detected; reinitializing the pool and retrying: {str(e)}")
+                     self._pool_initialized = False
+                     self.pool = None
+                     await asyncio.sleep(0.5 * (attempt + 1))  # simple backoff
+                     continue
+                 else:
+                     # Add extra debugging information
+                     error_msg = f"MySQL insert failed: {str(e)}"
+                     self.logger.error(f"Error while executing SQL: {error_msg}")
+                     # For parameterized statements, log the SQL and values for debugging
+                     if values:
+                         self.logger.debug(f"SQL: {sql[:200]}..., Values: {values[:5] if isinstance(values, list) else '...'}")
+                     raise ItemDiscard(error_msg)
+
+     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+         """Execute a batch SQL statement, with deadlock retry."""
+         max_retries = 3
+         timeout = 60  # 60-second timeout; batch operations can take longer
+
+         for attempt in range(max_retries):
+             try:
+                 # Check the connection pool state
+                 if not self.pool:
+                     raise RuntimeError("Database connection pool is not available")
+
+                 # Batch execution with a timeout
+                 async with async_timeout.timeout(timeout):
+                     async with self.pool.acquire() as conn:
+                         async with conn.cursor() as cursor:
+                             # Execute the batch insert
+                             rowcount = await cursor.executemany(sql, values_list)
+                             await conn.commit()
+                             return rowcount
+             except asyncio.TimeoutError:
+                 self.logger.error(f"Batch SQL execution timed out ({timeout}s)")
+                 raise ItemDiscard("MySQL batch operation timed out")
+             except Exception as e:
+                 # Check for a deadlock error
+                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                     self.logger.warning(f"Batch-insert deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
+                     continue
+                 # Check for a connection error and try to reinitialize the pool
+                 elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
+                     self.logger.warning(f"Connection error detected; reinitializing the pool and retrying: {str(e)}")
+                     self._pool_initialized = False
+                     self.pool = None
+                     await asyncio.sleep(0.5 * (attempt + 1))  # simple backoff
+                     continue
+                 else:
+                     # Add extra debugging information
+                     error_msg = f"MySQL batch insert failed: {str(e)}"
+                     self.logger.error(f"Error while executing batch SQL: {error_msg}")
+                     # Log a summary of the SQL and values for debugging
+                     self.logger.debug(f"SQL: {sql[:200]}..., Values count: {len(values_list) if isinstance(values_list, list) else 'unknown'}")
+                     raise ItemDiscard(error_msg)
+
+
+ class AiomysqlMySQLPipeline(BaseMySQLPipeline):
+     """MySQL pipeline implementation using the aiomysql library."""
+
+     def __init__(self, crawler):
+         super().__init__(crawler)
+         self.logger.info(f"AiomysqlMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         return cls(crawler)
+
+     async def _ensure_pool(self):
+         """Lazily initialize the connection pool (thread-safe)."""
+         if self._pool_initialized and self.pool:
+             # Check whether the pool is still valid
+             if hasattr(self.pool, 'closed') and not self.pool.closed:
+                 return
+             else:
+                 self.logger.warning("Connection pool was initialized but is invalid; reinitializing")
+
+         async with self._pool_lock:
+             if not self._pool_initialized:
+                 try:
+                     # Use the singleton connection pool manager
+                     self.pool = await MySQLConnectionPoolManager.get_pool(
+                         pool_type='aiomysql',
+                         host=self.settings.get('MYSQL_HOST', 'localhost'),
+                         port=self.settings.get_int('MYSQL_PORT', 3306),
+                         user=self.settings.get('MYSQL_USER', 'root'),
+                         password=self.settings.get('MYSQL_PASSWORD', ''),
+                         db=self.settings.get('MYSQL_DB', 'scrapy_db'),
+                         minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
+                         maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5)
+                     )
+                     self._pool_initialized = True
+                     self.logger.info(
+                         f"aiomysql connection pool initialized (table: {self.table_name}, "
+                         f"using the globally shared pool)"
+                     )
+                 except Exception as e:
+                     self.logger.error(f"aiomysql connection pool initialization failed: {e}")
+                     # Reset state so initialization can be retried
+                     self._pool_initialized = False
+                     self.pool = None
+                     raise
+
+     async def _execute_sql(self, sql: str, values: list = None) -> int:
+         """Execute a SQL statement and handle the result, with deadlock retry."""
+         max_retries = 3
+         for attempt in range(max_retries):
+             try:
+                 # Use aiomysql's async context manager pattern
+                 async with self.pool.acquire() as conn:
+                     async with conn.cursor() as cursor:
+                         # Choose the execution method depending on whether values are provided
+                         if values is not None:
+                             rowcount = await cursor.execute(sql, values)
+                         else:
+                             rowcount = await cursor.execute(sql)
+
+                         await conn.commit()
+                         return rowcount
+             except Exception as e:
+                 # Check for a deadlock error
+                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                     self.logger.warning(f"Deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
+                     continue
+                 else:
+                     # Add extra debugging information
+                     error_msg = f"MySQL insert failed: {str(e)}"
+                     self.logger.error(f"Error while executing SQL: {error_msg}")
+                     raise ItemDiscard(error_msg)
+
+     async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+         """Execute a batch SQL statement, with deadlock retry."""
+         max_retries = 3
+         for attempt in range(max_retries):
+             try:
+                 async with self.pool.acquire() as conn:
+                     async with conn.cursor() as cursor:
+                         # Execute the batch insert
+                         rowcount = await cursor.executemany(sql, values_list)
+                         await conn.commit()
+                         return rowcount
+             except Exception as e:
+                 # Check for a deadlock error
+                 if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                     self.logger.warning(f"Batch-insert deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                     await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
+                     continue
+                 else:
+                     # Add extra debugging information
+                     error_msg = f"MySQL batch insert failed: {str(e)}"
+                     self.logger.error(f"Error while executing batch SQL: {error_msg}")
                      raise ItemDiscard(error_msg)
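
Every knob read in BaseMySQLPipeline.__init__ and _ensure_pool above comes from crawler.settings. The sketch below is a minimal, hypothetical project configuration exercising the batch and upsert paths: the PIPELINES key and the class path are assumptions inferred from the file layout above (check the generated settings template for the actual registration key), while the MYSQL_* names are taken verbatim from the diff.

    # settings.py -- hypothetical crawlo project config (a sketch, not verified
    # against crawlo's settings template; the PIPELINES key is an assumption).
    PIPELINES = [
        "crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline",  # or AiomysqlMySQLPipeline
    ]

    MYSQL_HOST = "localhost"
    MYSQL_PORT = 3306
    MYSQL_USER = "root"
    MYSQL_PASSWORD = ""
    MYSQL_DB = "scrapy_db"

    # Table resolution order (see BaseMySQLPipeline.__init__): the spider's
    # custom_settings['MYSQL_TABLE'], then this setting, then spider.mysql_table,
    # then '<spider_name>_items'.
    MYSQL_TABLE = "news_items"

    # Batch mode: items are buffered and flushed via executemany() once
    # MYSQL_BATCH_SIZE items have accumulated (and again on spider_closed).
    MYSQL_USE_BATCH = True
    MYSQL_BATCH_SIZE = 100

    # Upsert behaviour forwarded to SQLBuilder.make_insert()/make_batch().
    MYSQL_AUTO_UPDATE = False
    MYSQL_INSERT_IGNORE = False
    MYSQL_UPDATE_COLUMNS = ("title", "updated_at")  # rowcount can be 0 if values are unchanged

    # Connection pool shared process-wide via MySQLConnectionPoolManager.
    MYSQL_POOL_MIN = 3
    MYSQL_POOL_MAX = 10
    MYSQL_ECHO = False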
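
The retry loops in _execute_sql and _execute_batch_sql cap at max_retries = 3, so at most two sleeps occur before the final attempt raises ItemDiscard; the aiomysql variant retries only on deadlocks, while the asyncmy variant also retries connection errors on a linear schedule. A tiny sketch of the two backoff schedules used above:

    # Backoff schedules from the retry loops above (values in seconds).
    deadlock_backoff = [0.1 * (2 ** attempt) for attempt in range(2)]    # [0.1, 0.2] -- exponential
    connection_backoff = [0.5 * (attempt + 1) for attempt in range(2)]   # [0.5, 1.0] -- linear
    print(deadlock_backoff, connection_backoff)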