crawlo-1.4.7-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in the supported public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
@@ -1,470 +1,469 @@
(removed: the previous 470-line version of this file, replaced in full by the version below)
# -*- coding: utf-8 -*-
import asyncio
import async_timeout
from abc import ABC, abstractmethod
from typing import List, Dict, Any

from crawlo.exceptions import ItemDiscard
from crawlo.items import Item
from crawlo.utils.db_helper import SQLBuilder
from crawlo.logging import get_logger
from crawlo.utils.mysql_connection_pool import MySQLConnectionPoolManager
from . import BasePipeline


class BaseMySQLPipeline(BasePipeline, ABC):
    """Base class for MySQL pipelines; encapsulates the shared functionality."""

    def __init__(self, crawler):
        self.crawler = crawler
        self.settings = crawler.settings
        self.logger = get_logger(self.__class__.__name__)

        # Log pipeline initialization
        self.logger.info(f"MySQL pipeline initialized: {self.__class__.__name__}")

        # Use an asyncio lock plus an init flag to keep pool setup thread-safe
        self._pool_lock = asyncio.Lock()
        self._pool_initialized = False
        self.pool = None

        # Prefer the table name from the spider's custom_settings; otherwise fall back to defaults
        spider_table_name = None
        if hasattr(crawler, 'spider') and crawler.spider and hasattr(crawler.spider, 'custom_settings'):
            spider_table_name = crawler.spider.custom_settings.get('MYSQL_TABLE')

        self.table_name = (
            spider_table_name or
            self.settings.get('MYSQL_TABLE') or
            getattr(crawler.spider, 'mysql_table', None) or
            f"{getattr(crawler.spider, 'name', 'default')}_items"
        )

        # Validate the table name
        if not self.table_name or not isinstance(self.table_name, str):
            raise ValueError(f"Invalid table name: {self.table_name}. Table name must be a non-empty string.")

        # Sanitize the table name, replacing characters that could be illegal
        self.table_name = self.table_name.strip().replace(' ', '_').replace('-', '_')

        # Batch insert configuration
        self.batch_size = max(1, self.settings.get_int('MYSQL_BATCH_SIZE', 100))  # ensure at least 1
        self.use_batch = self.settings.get_bool('MYSQL_USE_BATCH', False)
        self.batch_buffer: List[Dict] = []  # batch buffer

        # SQL generation configuration
        self.auto_update = self.settings.get_bool('MYSQL_AUTO_UPDATE', False)
        self.insert_ignore = self.settings.get_bool('MYSQL_INSERT_IGNORE', False)
        self.update_columns = self.settings.get('MYSQL_UPDATE_COLUMNS', ())

        # Validate that update_columns is a tuple or list
        if self.update_columns and not isinstance(self.update_columns, (tuple, list)):
            self.logger.warning(f"update_columns should be a tuple or list, got {type(self.update_columns)}. Converting to tuple.")
            self.update_columns = (self.update_columns,)

        # Register the shutdown event
        crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')

    async def process_item(self, item: Item, spider, kwargs: Dict[str, Any] = None) -> Item:
        """Core item-processing method."""
        kwargs = kwargs or {}
        spider_name = getattr(spider, 'name', 'unknown')  # spider name for logging

        # With batching enabled, append the item to the buffer
        if self.use_batch:
            self.batch_buffer.append(dict(item))

            # Flush once the buffer reaches the batch size
            if len(self.batch_buffer) >= self.batch_size:
                await self._flush_batch(spider_name)

            return item
        else:
            # Single-row insert path
            try:
                await self._ensure_pool()

                # Verify the connection pool is usable
                if not self._pool_initialized or not self.pool:
                    raise RuntimeError("Database connection pool is not initialized or invalid")

                item_dict = dict(item)
                sql = await self._make_insert_sql(item_dict, **kwargs)

                rowcount = await self._execute_sql(sql=sql)
                if rowcount > 1:
                    self.logger.info(
                        f"爬虫 {spider_name} 成功插入 {rowcount} 条记录到表 {self.table_name}"
                    )
                elif rowcount == 1:
                    self.logger.debug(
                        f"爬虫 {spider_name} 成功插入单条记录到表 {self.table_name}"
                    )
                else:
                    # With MYSQL_UPDATE_COLUMNS, if the updated field values equal the existing
                    # record, MySQL does not actually update anything and rowcount is 0
                    if self.update_columns:
                        self.logger.info(
                            f"爬虫 {spider_name}: SQL执行完成,使用更新列配置 {self.update_columns},"
                            f"可能未实际更新数据(字段值未变化)"
                        )
                    else:
                        self.logger.warning(
                            f"爬虫 {spider_name}: SQL执行成功但未插入新记录"
                        )

                # Stats counting lives here, consistent with AiomysqlMySQLPipeline
                self.crawler.stats.inc_value('mysql/insert_success')
                return item

            except Exception as e:
                # Add extra debugging context
                error_msg = f"处理失败: {str(e)}"
                self.logger.error(f"处理item时发生错误: {error_msg}")
                self.crawler.stats.inc_value('mysql/insert_failed')
                raise ItemDiscard(error_msg)

    @abstractmethod
    async def _execute_sql(self, sql: str, values: list = None) -> int:
        """Execute a SQL statement and handle the result - subclasses must override."""
        raise NotImplementedError("子类必须实现 _execute_sql 方法")

    @abstractmethod
    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
        """Execute a batch SQL statement - subclasses must override."""
        raise NotImplementedError("子类必须实现 _execute_batch_sql 方法")

    async def _flush_batch(self, spider_name: str):
        """Flush the batch buffer and run the batch insert."""
        if not self.batch_buffer:
            return

        try:
            await self._ensure_pool()

            # Verify the connection pool is usable
            if not self._pool_initialized or not self.pool:
                raise RuntimeError("Database connection pool is not initialized or invalid")

            # Build the batch insert SQL with SQLBuilder
            batch_result = SQLBuilder.make_batch(
                table=self.table_name,
                datas=self.batch_buffer,
                auto_update=self.auto_update,
                update_columns=self.update_columns
            )

            if batch_result:
                sql, values_list = batch_result
                rowcount = await self._execute_batch_sql(sql=sql, values_list=values_list)

                if rowcount > 0:
                    self.logger.info(
                        f"爬虫 {spider_name} 批量插入 {len(self.batch_buffer)} 条记录到表 {self.table_name},实际影响 {rowcount} 行"
                    )
                else:
                    # With MYSQL_UPDATE_COLUMNS, if the updated field values equal the existing
                    # records, MySQL does not actually update anything and rowcount is 0
                    if self.update_columns:
                        self.logger.debug(
                            f"爬虫 {spider_name}: 批量SQL执行完成,使用更新列配置 {self.update_columns},"
                            f"可能未实际更新数据(字段值未变化)"
                        )
                    else:
                        self.logger.warning(
                            f"爬虫 {spider_name}: 批量SQL执行完成但未插入新记录"
                        )

                # Clear the buffer
                self.batch_buffer.clear()
                self.crawler.stats.inc_value('mysql/batch_insert_success')
            else:
                self.logger.warning(f"爬虫 {spider_name}: 批量数据为空,跳过插入")

        except Exception as e:
            # Add extra debugging context
            error_msg = f"批量插入失败: {str(e)}"
            self.logger.error(f"批量处理时发生错误: {error_msg}")
            self.crawler.stats.inc_value('mysql/batch_insert_failed')
            # Keep the buffer so a retry is possible,
            # but clear it when the error was caused by the data itself to avoid endless retries
            if "Duplicate entry" in str(e) or "Data too long" in str(e):
                self.logger.warning("由于数据问题导致的错误,清空缓冲区以避免无限重试")
                self.batch_buffer.clear()
            raise ItemDiscard(error_msg)

    async def spider_closed(self):
        """Clean up resources when the spider closes."""
        # Flush any remaining batched data before shutting down
        if self.use_batch and self.batch_buffer:
            spider_name = getattr(self.crawler.spider, 'name', 'unknown')
            try:
                await self._flush_batch(spider_name)
            except Exception as e:
                self.logger.error(f"关闭爬虫时刷新批量数据失败: {e}")

        # Note: the pool is no longer closed here because it is globally shared;
        # pool shutdown is managed centrally by MySQLConnectionPoolManager.close_all_pools()
        if self.pool:
            self.logger.info(
                f"MySQL Pipeline 关闭,但保留全局共享连接池以供其他爬虫使用"
            )

    async def _make_insert_sql(self, item_dict: Dict, **kwargs) -> str:
        """Build the insert SQL statement; subclasses may override."""
        # Merge pipeline configuration with the kwargs passed in
        sql_kwargs = {
            'auto_update': self.auto_update,
            'insert_ignore': self.insert_ignore,
            'update_columns': self.update_columns
        }
        sql_kwargs.update(kwargs)

        return SQLBuilder.make_insert(
            table=self.table_name,
            data=item_dict,
            **sql_kwargs
        )

    @abstractmethod
    async def _ensure_pool(self):
        """Ensure the connection pool is initialized (thread-safe); subclasses must implement."""
        pass


class AsyncmyMySQLPipeline(BaseMySQLPipeline):
    """MySQL pipeline implementation backed by the asyncmy library."""

    def __init__(self, crawler):
        super().__init__(crawler)
        self.logger.info(f"AsyncmyMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    async def _ensure_pool(self):
        """Ensure the connection pool is initialized (thread-safe)."""
        if self._pool_initialized and self.pool:
            # Check whether the pool is still valid
            if hasattr(self.pool, 'closed') and not self.pool.closed:
                return
            else:
                self.logger.warning("连接池已初始化但无效,重新初始化")

        async with self._pool_lock:
            if not self._pool_initialized:  # double-check to avoid race conditions
                try:
                    # Use the singleton connection pool manager
                    self.pool = await MySQLConnectionPoolManager.get_pool(
                        pool_type='asyncmy',
                        host=self.settings.get('MYSQL_HOST', 'localhost'),
                        port=self.settings.get_int('MYSQL_PORT', 3306),
                        user=self.settings.get('MYSQL_USER', 'root'),
                        password=self.settings.get('MYSQL_PASSWORD', ''),
                        db=self.settings.get('MYSQL_DB', 'scrapy_db'),
                        minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
                        echo=self.settings.get_bool('MYSQL_ECHO', False)
                    )
                    self._pool_initialized = True
                    self.logger.info(
                        f"MySQL连接池初始化完成(表: {self.table_name}, "
                        f"使用全局共享连接池)"
                    )
                except Exception as e:
                    self.logger.error(f"MySQL连接池初始化失败: {e}")
                    # Reset state so a retry is possible
                    self._pool_initialized = False
                    self.pool = None
                    raise

    async def _execute_sql(self, sql: str, values: list = None) -> int:
        """Execute a SQL statement and handle the result, with deadlock retries."""
        max_retries = 3
        timeout = 30  # 30-second timeout

        for attempt in range(max_retries):
            try:
                # Check the pool state
                if not self.pool:
                    raise RuntimeError("Database connection pool is not available")

                # asyncmy-style connection handling, with a timeout
                async with async_timeout.timeout(timeout):
                    async with self.pool.acquire() as conn:
                        async with conn.cursor() as cursor:
                            # Choose the execution method depending on whether values were given
                            if values is not None:
                                rowcount = await cursor.execute(sql, values)
                            else:
                                rowcount = await cursor.execute(sql)

                            await conn.commit()
                            return rowcount
            except asyncio.TimeoutError:
                self.logger.error(f"执行SQL超时 ({timeout}秒): {sql[:100]}...")
                raise ItemDiscard(f"MySQL操作超时: {sql[:100]}...")
            except Exception as e:
                # Is it a deadlock error?
                if "Deadlock found" in str(e) and attempt < max_retries - 1:
                    self.logger.warning(f"检测到死锁,正在进行第 {attempt + 1} 次重试: {str(e)}")
                    await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
                    continue
                # Is it a connection error? If so, reinitialize the pool and retry
                elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
                    self.logger.warning(f"检测到连接错误,尝试重新初始化连接池并重试: {str(e)}")
                    self._pool_initialized = False
                    self.pool = None
                    await asyncio.sleep(0.5 * (attempt + 1))  # simple backoff
                    continue
                else:
                    # Add extra debugging context
                    error_msg = f"MySQL插入失败: {str(e)}"
                    self.logger.error(f"执行SQL时发生错误: {error_msg}")
                    # For parameterized statements, log the SQL and values for debugging
                    if values:
                        self.logger.debug(f"SQL: {sql[:200]}..., Values: {values[:5] if isinstance(values, list) else '...'}")
                    raise ItemDiscard(error_msg)

    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
        """Execute a batch SQL statement, with deadlock retries."""
        max_retries = 3
        timeout = 60  # 60-second timeout; batch operations may take longer

        for attempt in range(max_retries):
            try:
                # Check the pool state
                if not self.pool:
                    raise RuntimeError("Database connection pool is not available")

                # Batch execution with a timeout
                async with async_timeout.timeout(timeout):
                    async with self.pool.acquire() as conn:
                        async with conn.cursor() as cursor:
                            # Run the batch insert
                            rowcount = await cursor.executemany(sql, values_list)
                            await conn.commit()
                            return rowcount
            except asyncio.TimeoutError:
                self.logger.error(f"执行批量SQL超时 ({timeout}秒)")
                raise ItemDiscard(f"MySQL批量操作超时")
            except Exception as e:
                # Is it a deadlock error?
                if "Deadlock found" in str(e) and attempt < max_retries - 1:
                    self.logger.warning(f"检测到批量插入死锁,正在进行第 {attempt + 1} 次重试: {str(e)}")
                    await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
                    continue
                # Is it a connection error? If so, reinitialize the pool and retry
                elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
                    self.logger.warning(f"检测到连接错误,尝试重新初始化连接池并重试: {str(e)}")
                    self._pool_initialized = False
                    self.pool = None
                    await asyncio.sleep(0.5 * (attempt + 1))  # simple backoff
                    continue
                else:
                    # Add extra debugging context
                    error_msg = f"MySQL批量插入失败: {str(e)}"
                    self.logger.error(f"执行批量SQL时发生错误: {error_msg}")
                    # Log a summary of the SQL and values for debugging
                    self.logger.debug(f"SQL: {sql[:200]}..., Values count: {len(values_list) if isinstance(values_list, list) else 'unknown'}")
                    raise ItemDiscard(error_msg)


class AiomysqlMySQLPipeline(BaseMySQLPipeline):
    """MySQL pipeline implementation backed by the aiomysql library."""

    def __init__(self, crawler):
        super().__init__(crawler)
        self.logger.info(f"AiomysqlMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    async def _ensure_pool(self):
        """Lazily initialize the connection pool (thread-safe)."""
        if self._pool_initialized and self.pool:
            # Check whether the pool is still valid
            if hasattr(self.pool, 'closed') and not self.pool.closed:
                return
            else:
                self.logger.warning("连接池已初始化但无效,重新初始化")

        async with self._pool_lock:
            if not self._pool_initialized:
                try:
                    # Use the singleton connection pool manager
                    self.pool = await MySQLConnectionPoolManager.get_pool(
                        pool_type='aiomysql',
                        host=self.settings.get('MYSQL_HOST', 'localhost'),
                        port=self.settings.get_int('MYSQL_PORT', 3306),
                        user=self.settings.get('MYSQL_USER', 'root'),
                        password=self.settings.get('MYSQL_PASSWORD', ''),
                        db=self.settings.get('MYSQL_DB', 'scrapy_db'),
                        minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5)
                    )
                    self._pool_initialized = True
                    self.logger.info(
                        f"aiomysql连接池已初始化(表: {self.table_name}, "
                        f"使用全局共享连接池)"
                    )
                except Exception as e:
                    self.logger.error(f"aiomysql连接池初始化失败: {e}")
                    # Reset state so a retry is possible
                    self._pool_initialized = False
                    self.pool = None
                    raise

    async def _execute_sql(self, sql: str, values: list = None) -> int:
        """Execute a SQL statement and handle the result, with deadlock retries."""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                # Use aiomysql's async context-manager style
                async with self.pool.acquire() as conn:
                    async with conn.cursor() as cursor:
                        # Choose the execution method depending on whether values were given
                        if values is not None:
                            rowcount = await cursor.execute(sql, values)
                        else:
                            rowcount = await cursor.execute(sql)

                        await conn.commit()
                        return rowcount
            except Exception as e:
                # Is it a deadlock error?
                if "Deadlock found" in str(e) and attempt < max_retries - 1:
                    self.logger.warning(f"检测到死锁,正在进行第 {attempt + 1} 次重试: {str(e)}")
                    await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
                    continue
                else:
                    # Add extra debugging context
                    error_msg = f"MySQL插入失败: {str(e)}"
                    self.logger.error(f"执行SQL时发生错误: {error_msg}")
                    raise ItemDiscard(error_msg)

    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
        """Execute a batch SQL statement, with deadlock retries."""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                async with self.pool.acquire() as conn:
                    async with conn.cursor() as cursor:
                        # Run the batch insert
                        rowcount = await cursor.executemany(sql, values_list)
                        await conn.commit()
                        return rowcount
            except Exception as e:
                # Is it a deadlock error?
                if "Deadlock found" in str(e) and attempt < max_retries - 1:
                    self.logger.warning(f"检测到批量插入死锁,正在进行第 {attempt + 1} 次重试: {str(e)}")
                    await asyncio.sleep(0.1 * (2 ** attempt))  # exponential backoff
                    continue
                else:
                    # Add extra debugging context
                    error_msg = f"MySQL批量插入失败: {str(e)}"
                    self.logger.error(f"执行批量SQL时发生错误: {error_msg}")
                    raise ItemDiscard(error_msg)
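For context, the pipelines above read their configuration through crawler.settings: MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB, MYSQL_TABLE, MYSQL_USE_BATCH, MYSQL_BATCH_SIZE, MYSQL_AUTO_UPDATE, MYSQL_INSERT_IGNORE, MYSQL_UPDATE_COLUMNS, MYSQL_POOL_MIN, MYSQL_POOL_MAX, and MYSQL_ECHO. A minimal sketch of a project settings module wiring up the asyncmy-backed pipeline is shown below; the PIPELINES key and the module path are assumptions based on the file list above, not something this diff confirms.

# Hypothetical crawlo project settings sketch. The MYSQL_* options are the ones the
# pipelines read; PIPELINES and the dotted path are assumed, not confirmed by this diff.
PIPELINES = [
    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # assumed module path
]

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DB = 'scrapy_db'
MYSQL_TABLE = 'news_items'        # falls back to '<spider name>_items' when unset
MYSQL_USE_BATCH = True            # buffer items and insert them with executemany()
MYSQL_BATCH_SIZE = 100            # flush threshold for the batch buffer
MYSQL_AUTO_UPDATE = False
MYSQL_INSERT_IGNORE = False
MYSQL_UPDATE_COLUMNS = ('title',)  # columns refreshed on duplicate key; rowcount may be 0 if values are unchanged
MYSQL_POOL_MIN = 3
MYSQL_POOL_MAX = 10
MYSQL_ECHO = False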