crawlo 1.4.5__py3-none-any.whl → 1.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (375)
  1. crawlo/__init__.py +90 -89
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -341
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -438
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -291
  19. crawlo/crawler.py +698 -657
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -276
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -245
  25. crawlo/downloader/httpx_downloader.py +265 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -402
  28. crawlo/downloader/selenium_downloader.py +486 -472
  29. crawlo/event.py +45 -11
  30. crawlo/exceptions.py +215 -82
  31. crawlo/extension/__init__.py +65 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +135 -0
  44. crawlo/filters/__init__.py +170 -153
  45. crawlo/filters/aioredis_filter.py +348 -264
  46. crawlo/filters/memory_filter.py +261 -276
  47. crawlo/framework.py +306 -292
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -434
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -194
  52. crawlo/initialization/phases.py +230 -149
  53. crawlo/initialization/registry.py +143 -145
  54. crawlo/initialization/utils.py +49 -0
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -46
  61. crawlo/logging/config.py +277 -197
  62. crawlo/logging/factory.py +175 -171
  63. crawlo/logging/manager.py +104 -112
  64. crawlo/middleware/__init__.py +87 -24
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -386
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -253
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +375 -379
  77. crawlo/network/response.py +569 -664
  78. crawlo/pipelines/__init__.py +53 -22
  79. crawlo/pipelines/base_pipeline.py +452 -0
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -132
  87. crawlo/pipelines/mysql_pipeline.py +470 -326
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +10 -0
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -525
  94. crawlo/queue/redis_priority_queue.py +519 -370
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +285 -270
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +657 -657
  99. crawlo/stats_collector.py +82 -73
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +2 -4
  104. crawlo/templates/project/items.py.tmpl +13 -17
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -36
  107. crawlo/templates/project/settings.py.tmpl +110 -157
  108. crawlo/templates/project/settings_distributed.py.tmpl +156 -161
  109. crawlo/templates/project/settings_gentle.py.tmpl +170 -171
  110. crawlo/templates/project/settings_high_performance.py.tmpl +171 -172
  111. crawlo/templates/project/settings_minimal.py.tmpl +99 -77
  112. crawlo/templates/project/settings_simple.py.tmpl +168 -169
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -30
  115. crawlo/templates/spider/spider.py.tmpl +33 -144
  116. crawlo/templates/spiders_init.py.tmpl +5 -10
  117. crawlo/tools/__init__.py +86 -189
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +50 -50
  123. crawlo/utils/batch_processor.py +276 -259
  124. crawlo/utils/config_manager.py +442 -0
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -244
  127. crawlo/utils/error_handler.py +410 -410
  128. crawlo/utils/fingerprint.py +121 -121
  129. crawlo/utils/func_tools.py +82 -82
  130. crawlo/utils/large_scale_helper.py +344 -344
  131. crawlo/utils/leak_detector.py +335 -0
  132. crawlo/utils/log.py +79 -79
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -0
  135. crawlo/utils/mysql_connection_pool.py +197 -0
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +91 -0
  139. crawlo/utils/redis_connection_pool.py +578 -388
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -256
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -0
  144. crawlo/utils/selector_helper.py +137 -137
  145. crawlo/utils/singleton.py +70 -0
  146. crawlo/utils/spider_loader.py +201 -201
  147. crawlo/utils/text_helper.py +94 -94
  148. crawlo/utils/{url.py → url_utils.py} +39 -39
  149. crawlo-1.4.7.dist-info/METADATA +689 -0
  150. crawlo-1.4.7.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -275
  154. tests/authenticated_proxy_example.py +110 -106
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -0
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +77 -0
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -0
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/scrapy.cfg +11 -11
  192. tests/optimized_performance_test.py +211 -211
  193. tests/performance_comparison.py +244 -244
  194. tests/queue_blocking_test.py +113 -113
  195. tests/queue_test.py +89 -89
  196. tests/redis_key_validation_demo.py +130 -130
  197. tests/request_params_example.py +150 -150
  198. tests/response_improvements_example.py +144 -144
  199. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  200. tests/scrapy_comparison/scrapy_test.py +133 -133
  201. tests/simple_cli_test.py +55 -0
  202. tests/simple_command_test.py +119 -119
  203. tests/simple_crawlo_test.py +126 -126
  204. tests/simple_follow_test.py +38 -38
  205. tests/simple_log_test2.py +137 -137
  206. tests/simple_optimization_test.py +128 -128
  207. tests/simple_queue_type_test.py +41 -41
  208. tests/simple_response_selector_test.py +94 -94
  209. tests/simple_selector_helper_test.py +154 -154
  210. tests/simple_selector_test.py +207 -207
  211. tests/simple_spider_test.py +49 -49
  212. tests/simple_url_test.py +73 -73
  213. tests/simulate_mysql_update_test.py +140 -0
  214. tests/spider_log_timing_test.py +177 -177
  215. tests/test_advanced_tools.py +148 -148
  216. tests/test_all_commands.py +230 -230
  217. tests/test_all_pipeline_fingerprints.py +133 -133
  218. tests/test_all_redis_key_configs.py +145 -145
  219. tests/test_asyncmy_usage.py +57 -0
  220. tests/test_batch_processor.py +178 -178
  221. tests/test_cleaners.py +54 -54
  222. tests/test_cli_arguments.py +119 -0
  223. tests/test_component_factory.py +174 -174
  224. tests/test_config_consistency.py +80 -80
  225. tests/test_config_merge.py +152 -152
  226. tests/test_config_validator.py +182 -182
  227. tests/test_controlled_spider_mixin.py +79 -79
  228. tests/test_crawler_process_import.py +38 -38
  229. tests/test_crawler_process_spider_modules.py +47 -47
  230. tests/test_crawlo_proxy_integration.py +114 -108
  231. tests/test_date_tools.py +123 -123
  232. tests/test_dedup_fix.py +220 -220
  233. tests/test_dedup_pipeline_consistency.py +124 -124
  234. tests/test_default_header_middleware.py +313 -313
  235. tests/test_distributed.py +65 -65
  236. tests/test_double_crawlo_fix.py +204 -204
  237. tests/test_double_crawlo_fix_simple.py +124 -124
  238. tests/test_download_delay_middleware.py +221 -221
  239. tests/test_downloader_proxy_compatibility.py +272 -268
  240. tests/test_edge_cases.py +305 -305
  241. tests/test_encoding_core.py +56 -56
  242. tests/test_encoding_detection.py +126 -126
  243. tests/test_enhanced_error_handler.py +270 -270
  244. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  245. tests/test_error_handler_compatibility.py +112 -112
  246. tests/test_factories.py +252 -252
  247. tests/test_factory_compatibility.py +196 -196
  248. tests/test_final_validation.py +153 -153
  249. tests/test_fingerprint_consistency.py +135 -135
  250. tests/test_fingerprint_simple.py +51 -51
  251. tests/test_get_component_logger.py +83 -83
  252. tests/test_hash_performance.py +99 -99
  253. tests/test_integration.py +169 -169
  254. tests/test_item_dedup_redis_key.py +122 -122
  255. tests/test_large_scale_helper.py +235 -235
  256. tests/test_logging_enhancements.py +374 -374
  257. tests/test_logging_final.py +184 -184
  258. tests/test_logging_integration.py +312 -312
  259. tests/test_logging_system.py +282 -282
  260. tests/test_middleware_debug.py +141 -141
  261. tests/test_mode_consistency.py +51 -51
  262. tests/test_multi_directory.py +67 -67
  263. tests/test_multiple_spider_modules.py +80 -80
  264. tests/test_mysql_pipeline_config.py +165 -0
  265. tests/test_mysql_pipeline_error.py +99 -0
  266. tests/test_mysql_pipeline_init_log.py +83 -0
  267. tests/test_mysql_pipeline_integration.py +133 -0
  268. tests/test_mysql_pipeline_refactor.py +144 -0
  269. tests/test_mysql_pipeline_refactor_simple.py +86 -0
  270. tests/test_mysql_pipeline_robustness.py +196 -0
  271. tests/test_mysql_pipeline_types.py +89 -0
  272. tests/test_mysql_update_columns.py +94 -0
  273. tests/test_offsite_middleware.py +244 -244
  274. tests/test_offsite_middleware_simple.py +203 -203
  275. tests/test_optimized_selector_naming.py +100 -100
  276. tests/test_parsel.py +29 -29
  277. tests/test_performance.py +327 -327
  278. tests/test_performance_monitor.py +115 -115
  279. tests/test_pipeline_fingerprint_consistency.py +86 -86
  280. tests/test_priority_behavior.py +211 -211
  281. tests/test_priority_consistency.py +151 -151
  282. tests/test_priority_consistency_fixed.py +249 -249
  283. tests/test_proxy_health_check.py +32 -32
  284. tests/test_proxy_middleware.py +217 -121
  285. tests/test_proxy_middleware_enhanced.py +212 -216
  286. tests/test_proxy_middleware_integration.py +142 -137
  287. tests/test_proxy_middleware_refactored.py +207 -184
  288. tests/test_proxy_only.py +84 -0
  289. tests/test_proxy_providers.py +56 -56
  290. tests/test_proxy_stats.py +19 -19
  291. tests/test_proxy_strategies.py +59 -59
  292. tests/test_proxy_with_downloader.py +153 -0
  293. tests/test_queue_empty_check.py +41 -41
  294. tests/test_queue_manager_double_crawlo.py +173 -173
  295. tests/test_queue_manager_redis_key.py +179 -179
  296. tests/test_queue_naming.py +154 -154
  297. tests/test_queue_type.py +106 -106
  298. tests/test_queue_type_redis_config_consistency.py +130 -130
  299. tests/test_random_headers_default.py +322 -322
  300. tests/test_random_headers_necessity.py +308 -308
  301. tests/test_random_user_agent.py +72 -72
  302. tests/test_redis_config.py +28 -28
  303. tests/test_redis_connection_pool.py +294 -294
  304. tests/test_redis_key_naming.py +181 -181
  305. tests/test_redis_key_validator.py +123 -123
  306. tests/test_redis_queue.py +224 -224
  307. tests/test_redis_queue_name_fix.py +175 -175
  308. tests/test_redis_queue_type_fallback.py +129 -129
  309. tests/test_request_ignore_middleware.py +182 -182
  310. tests/test_request_params.py +111 -111
  311. tests/test_request_serialization.py +70 -70
  312. tests/test_response_code_middleware.py +349 -349
  313. tests/test_response_filter_middleware.py +427 -427
  314. tests/test_response_follow.py +104 -104
  315. tests/test_response_improvements.py +152 -152
  316. tests/test_response_selector_methods.py +92 -92
  317. tests/test_response_url_methods.py +70 -70
  318. tests/test_response_urljoin.py +86 -86
  319. tests/test_retry_middleware.py +333 -333
  320. tests/test_retry_middleware_realistic.py +273 -273
  321. tests/test_scheduler.py +252 -252
  322. tests/test_scheduler_config_update.py +133 -133
  323. tests/test_scrapy_style_encoding.py +112 -112
  324. tests/test_selector_helper.py +100 -100
  325. tests/test_selector_optimizations.py +146 -146
  326. tests/test_simple_response.py +61 -61
  327. tests/test_spider_loader.py +49 -49
  328. tests/test_spider_loader_comprehensive.py +69 -69
  329. tests/test_spider_modules.py +84 -84
  330. tests/test_spiders/test_spider.py +9 -9
  331. tests/test_telecom_spider_redis_key.py +205 -205
  332. tests/test_template_content.py +87 -87
  333. tests/test_template_redis_key.py +134 -134
  334. tests/test_tools.py +159 -159
  335. tests/test_user_agent_randomness.py +176 -176
  336. tests/test_user_agents.py +96 -96
  337. tests/untested_features_report.md +138 -138
  338. tests/verify_debug.py +51 -51
  339. tests/verify_distributed.py +117 -117
  340. tests/verify_log_fix.py +111 -111
  341. tests/verify_mysql_warnings.py +110 -0
  342. crawlo/logging/async_handler.py +0 -181
  343. crawlo/logging/monitor.py +0 -153
  344. crawlo/logging/sampler.py +0 -167
  345. crawlo/middleware/simple_proxy.py +0 -65
  346. crawlo/tools/authenticated_proxy.py +0 -241
  347. crawlo/tools/data_formatter.py +0 -226
  348. crawlo/tools/data_validator.py +0 -181
  349. crawlo/tools/encoding_converter.py +0 -127
  350. crawlo/tools/network_diagnostic.py +0 -365
  351. crawlo/tools/request_tools.py +0 -83
  352. crawlo/tools/retry_mechanism.py +0 -224
  353. crawlo/utils/env_config.py +0 -143
  354. crawlo/utils/large_scale_config.py +0 -287
  355. crawlo/utils/system.py +0 -11
  356. crawlo/utils/tools.py +0 -5
  357. crawlo-1.4.5.dist-info/METADATA +0 -329
  358. crawlo-1.4.5.dist-info/RECORD +0 -347
  359. tests/env_config_example.py +0 -134
  360. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  361. tests/test_authenticated_proxy.py +0 -142
  362. tests/test_comprehensive.py +0 -147
  363. tests/test_dynamic_downloaders_proxy.py +0 -125
  364. tests/test_dynamic_proxy.py +0 -93
  365. tests/test_dynamic_proxy_config.py +0 -147
  366. tests/test_dynamic_proxy_real.py +0 -110
  367. tests/test_env_config.py +0 -122
  368. tests/test_framework_env_usage.py +0 -104
  369. tests/test_large_scale_config.py +0 -113
  370. tests/test_proxy_api.py +0 -265
  371. tests/test_real_scenario_proxy.py +0 -196
  372. tests/tools_example.py +0 -261
  373. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
  374. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
  375. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
@@ -1,389 +1,579 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- Redis连接池优化工具
5
- 提供优化的Redis连接池管理和配置
6
- """
7
- from contextlib import asynccontextmanager
8
- from typing import Dict, Any, Optional
9
-
10
- import redis.asyncio as aioredis
11
-
12
- # 延迟导入避免循环依赖
13
- # from crawlo.utils.error_handler import ErrorHandler
14
- # from crawlo.utils.log import get_logger
15
-
16
-
17
- class OptimizedRedisConnectionPool:
18
- """优化的Redis连接池管理器"""
19
-
20
- # 默认连接池配置
21
- DEFAULT_CONFIG = {
22
- 'max_connections': 50,
23
- 'socket_connect_timeout': 5,
24
- 'socket_timeout': 30,
25
- 'socket_keepalive': True,
26
- 'health_check_interval': 30,
27
- 'retry_on_timeout': True,
28
- 'encoding': 'utf-8',
29
- 'decode_responses': False,
30
- }
31
-
32
- def __init__(self, redis_url: str, **kwargs):
33
- self.redis_url = redis_url
34
- self.config = {**self.DEFAULT_CONFIG, **kwargs}
35
-
36
- # 延迟初始化logger和error_handler
37
- self._logger = None
38
- self._error_handler = None
39
-
40
- # 连接池实例
41
- self._connection_pool: Optional[aioredis.ConnectionPool] = None
42
- self._redis_client: Optional[aioredis.Redis] = None
43
- self._connection_tested = False # 标记是否已测试连接
44
-
45
- # 连接池统计信息
46
- self._stats = {
47
- 'created_connections': 0,
48
- 'active_connections': 0,
49
- 'idle_connections': 0,
50
- 'errors': 0
51
- }
52
-
53
- # 初始化连接池
54
- self._initialize_pool()
55
-
56
- @property
57
- def logger(self):
58
- """延迟初始化logger"""
59
- if self._logger is None:
60
- from crawlo.utils.log import get_logger
61
- self._logger = get_logger(self.__class__.__name__)
62
- return self._logger
63
-
64
- @property
65
- def error_handler(self):
66
- """延迟初始化error_handler"""
67
- if self._error_handler is None:
68
- from crawlo.utils.error_handler import ErrorHandler
69
- self._error_handler = ErrorHandler(self.__class__.__name__)
70
- return self._error_handler
71
-
72
- def _initialize_pool(self):
73
- """初始化连接池"""
74
- try:
75
- self._connection_pool = aioredis.ConnectionPool.from_url(
76
- self.redis_url,
77
- **self.config
78
- )
79
-
80
- self._redis_client = aioredis.Redis(
81
- connection_pool=self._connection_pool
82
- )
83
-
84
- # 只在调试模式下输出详细连接池信息
85
- self.logger.debug(f"Redis连接池初始化成功: {self.redis_url}")
86
- self.logger.debug(f" 连接池配置: {self.config}")
87
-
88
- except Exception as e:
89
- self.error_handler.handle_error(
90
- e,
91
- context="Redis连接池初始化失败",
92
- raise_error=True
93
- )
94
-
95
- async def _test_connection(self):
96
- """测试Redis连接"""
97
- if self._redis_client and not self._connection_tested:
98
- try:
99
- await self._redis_client.ping()
100
- self._connection_tested = True
101
- # 只在调试模式下输出连接测试成功信息
102
- self.logger.debug(f"Redis连接测试成功: {self.redis_url}")
103
- except Exception as e:
104
- self.logger.error(f"Redis连接测试失败: {self.redis_url} - {e}")
105
- raise
106
-
107
- async def get_connection(self) -> aioredis.Redis:
108
- """
109
- 获取Redis连接实例
110
-
111
- Returns:
112
- Redis连接实例
113
- """
114
- if not self._redis_client:
115
- self._initialize_pool()
116
-
117
- # 确保连接有效
118
- await self._test_connection()
119
-
120
- self._stats['active_connections'] += 1
121
- return self._redis_client
122
-
123
- async def ping(self) -> bool:
124
- """
125
- 检查Redis连接是否正常
126
-
127
- Returns:
128
- 连接是否正常
129
- """
130
- try:
131
- if self._redis_client:
132
- await self._redis_client.ping()
133
- return True
134
- return False
135
- except Exception as e:
136
- self.logger.warning(f"Redis连接检查失败: {e}")
137
- return False
138
-
139
- async def close(self):
140
- """关闭连接池"""
141
- try:
142
- if self._redis_client:
143
- await self._redis_client.close()
144
- self._redis_client = None
145
-
146
- if self._connection_pool:
147
- await self._connection_pool.disconnect()
148
- self._connection_pool = None
149
-
150
- self.logger.info("Redis连接池已关闭")
151
- except Exception as e:
152
- self.error_handler.handle_error(
153
- e,
154
- context="关闭Redis连接池失败",
155
- raise_error=False
156
- )
157
-
158
- def get_stats(self) -> Dict[str, Any]:
159
- """
160
- 获取连接池统计信息
161
-
162
- Returns:
163
- 统计信息字典
164
- """
165
- if self._connection_pool:
166
- pool_stats = {
167
- 'max_connections': self._connection_pool.max_connections,
168
- 'created_connections': self._connection_pool.created_connections,
169
- 'available_connections': len(self._connection_pool._available_connections),
170
- 'in_use_connections': len(self._connection_pool._in_use_connections),
171
- }
172
- self._stats.update(pool_stats)
173
-
174
- return self._stats.copy()
175
-
176
- @asynccontextmanager
177
- async def connection_context(self):
178
- """
179
- 连接上下文管理器
180
-
181
- Yields:
182
- Redis连接实例
183
- """
184
- connection = await self.get_connection()
185
- try:
186
- yield connection
187
- finally:
188
- self._stats['active_connections'] -= 1
189
- self._stats['idle_connections'] += 1
190
-
191
-
192
- class RedisBatchOperationHelper:
193
- """Redis批量操作助手"""
194
-
195
- def __init__(self, redis_client: aioredis.Redis, batch_size: int = 100):
196
- self.redis_client = redis_client
197
- self.batch_size = batch_size
198
-
199
- # 延迟初始化logger和error_handler
200
- self._logger = None
201
- self._error_handler = None
202
-
203
- @property
204
- def logger(self):
205
- """延迟初始化logger"""
206
- if self._logger is None:
207
- from crawlo.utils.log import get_logger
208
- self._logger = get_logger(self.__class__.__name__)
209
- return self._logger
210
-
211
- @property
212
- def error_handler(self):
213
- """延迟初始化error_handler"""
214
- if self._error_handler is None:
215
- from crawlo.utils.error_handler import ErrorHandler
216
- self._error_handler = ErrorHandler(self.__class__.__name__)
217
- return self._error_handler
218
-
219
- async def batch_execute(self, operations: list, batch_size: Optional[int] = None) -> list:
220
- """
221
- 批量执行Redis操作
222
-
223
- Args:
224
- operations: 操作列表,每个操作是一个包含(command, *args)的元组
225
- batch_size: 批次大小(如果为None则使用实例的batch_size)
226
-
227
- Returns:
228
- 执行结果列表
229
- """
230
- actual_batch_size = batch_size or self.batch_size
231
- results = []
232
-
233
- try:
234
- for i in range(0, len(operations), actual_batch_size):
235
- batch = operations[i:i + actual_batch_size]
236
- self.logger.debug(f"执行批次 {i//actual_batch_size + 1}/{(len(operations)-1)//actual_batch_size + 1}")
237
-
238
- try:
239
- pipe = self.redis_client.pipeline()
240
- for operation in batch:
241
- command, *args = operation
242
- getattr(pipe, command)(*args)
243
-
244
- batch_results = await pipe.execute()
245
- results.extend(batch_results)
246
-
247
- except Exception as e:
248
- self.logger.error(f"执行批次失败: {e}")
249
- # 继续执行下一个批次而不是中断
250
-
251
- except Exception as e:
252
- self.error_handler.handle_error(
253
- e,
254
- context="Redis批量操作执行失败",
255
- raise_error=False
256
- )
257
-
258
- return results
259
-
260
- async def batch_set_hash(self, hash_key: str, items: Dict[str, Any]) -> int:
261
- """
262
- 批量设置Hash字段
263
-
264
- Args:
265
- hash_key: Hash键名
266
- items: 要设置的字段字典
267
-
268
- Returns:
269
- 成功设置的字段数量
270
- """
271
- try:
272
- if not items:
273
- return 0
274
-
275
- pipe = self.redis_client.pipeline()
276
- count = 0
277
-
278
- for key, value in items.items():
279
- pipe.hset(hash_key, key, value)
280
- count += 1
281
-
282
- # 每达到批次大小就执行一次
283
- if count % self.batch_size == 0:
284
- await pipe.execute()
285
- pipe = self.redis_client.pipeline()
286
-
287
- # 执行剩余的操作
288
- if count % self.batch_size != 0:
289
- await pipe.execute()
290
-
291
- self.logger.debug(f"批量设置Hash {count} 个字段")
292
- return count
293
-
294
- except Exception as e:
295
- self.error_handler.handle_error(
296
- e,
297
- context="Redis批量设置Hash失败",
298
- raise_error=False
299
- )
300
- return 0
301
-
302
- async def batch_get_hash(self, hash_key: str, fields: list) -> Dict[str, Any]:
303
- """
304
- 批量获取Hash字段值
305
-
306
- Args:
307
- hash_key: Hash键名
308
- fields: 要获取的字段列表
309
-
310
- Returns:
311
- 字段值字典
312
- """
313
- try:
314
- if not fields:
315
- return {}
316
-
317
- # 使用管道批量获取
318
- pipe = self.redis_client.pipeline()
319
- for field in fields:
320
- pipe.hget(hash_key, field)
321
-
322
- results = await pipe.execute()
323
-
324
- # 构建结果字典
325
- result_dict = {}
326
- for i, field in enumerate(fields):
327
- if results[i] is not None:
328
- result_dict[field] = results[i]
329
-
330
- self.logger.debug(f"批量获取Hash {len(result_dict)} 个字段")
331
- return result_dict
332
-
333
- except Exception as e:
334
- self.error_handler.handle_error(
335
- e,
336
- context="Redis批量获取Hash失败",
337
- raise_error=False
338
- )
339
- return {}
340
-
341
-
342
- # 全局连接池管理器
343
- _connection_pools: Dict[str, OptimizedRedisConnectionPool] = {}
344
-
345
-
346
- def get_redis_pool(redis_url: str, **kwargs) -> OptimizedRedisConnectionPool:
347
- """
348
- 获取Redis连接池实例(单例模式)
349
-
350
- Args:
351
- redis_url: Redis URL
352
- **kwargs: 连接池配置参数
353
-
354
- Returns:
355
- Redis连接池实例
356
- """
357
- if redis_url not in _connection_pools:
358
- _connection_pools[redis_url] = OptimizedRedisConnectionPool(redis_url, **kwargs)
359
-
360
- return _connection_pools[redis_url]
361
-
362
-
363
- async def close_all_pools():
364
- """关闭所有连接池"""
365
- global _connection_pools
366
-
367
- for pool in _connection_pools.values():
368
- await pool.close()
369
-
370
- _connection_pools.clear()
371
-
372
-
373
- # 便捷函数
374
- async def execute_redis_batch(redis_url: str, operations: list, batch_size: int = 100) -> list:
375
- """
376
- 便捷函数:执行Redis批量操作
377
-
378
- Args:
379
- redis_url: Redis URL
380
- operations: 操作列表
381
- batch_size: 批次大小
382
-
383
- Returns:
384
- 执行结果列表
385
- """
386
- pool = get_redis_pool(redis_url)
387
- redis_client = await pool.get_connection()
388
- helper = RedisBatchOperationHelper(redis_client, batch_size)
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ Redis连接池工具
5
+ 提供Redis连接池管理和配置
6
+ """
7
+ from contextlib import asynccontextmanager
8
+ from typing import Dict, Any, Optional, List, Union, TYPE_CHECKING
9
+ import re
10
+
11
+ import redis.asyncio as aioredis
12
+
13
+ # 尝试导入Redis集群支持
14
+ try:
15
+ from redis.asyncio.cluster import RedisCluster
16
+ from redis.asyncio.cluster import ClusterNode
17
+ REDIS_CLUSTER_AVAILABLE = True
18
+ except ImportError:
19
+ RedisCluster = None
20
+ ClusterNode = None
21
+ REDIS_CLUSTER_AVAILABLE = False
22
+
23
+
24
+ if TYPE_CHECKING:
25
+ from crawlo.utils.error_handler import ErrorHandler
26
+
27
+
28
+ class RedisConnectionPool:
29
+ """Redis连接池管理器"""
30
+
31
+ # 默认连接池配置
32
+ DEFAULT_CONFIG = {
33
+ 'max_connections': 50,
34
+ 'socket_connect_timeout': 5,
35
+ 'socket_timeout': 30,
36
+ 'socket_keepalive': True,
37
+ 'health_check_interval': 30,
38
+ 'retry_on_timeout': True,
39
+ 'encoding': 'utf-8',
40
+ 'decode_responses': False,
41
+ }
42
+
43
+ # Redis集群不支持的配置参数
44
+ CLUSTER_UNSUPPORTED_CONFIG = {
45
+ 'retry_on_timeout',
46
+ 'health_check_interval',
47
+ 'socket_keepalive'
48
+ }
49
+
50
+ def __init__(self, redis_url: str, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None, **kwargs):
51
+ self.redis_url = redis_url
52
+ self.is_cluster = is_cluster
53
+ self.cluster_nodes = cluster_nodes
54
+ self.config = {**self.DEFAULT_CONFIG, **kwargs}
55
+
56
+ # 延迟初始化logger和error_handler
57
+ self._logger = None
58
+ self._error_handler: Optional["ErrorHandler"] = None
59
+
60
+ # 连接池实例
61
+ self._connection_pool: Optional[aioredis.ConnectionPool] = None
62
+ self._redis_client = None
63
+ self._connection_tested = False # 标记是否已测试连接
64
+
65
+ # 连接池统计信息
66
+ self._stats = {
67
+ 'created_connections': 0,
68
+ 'active_connections': 0,
69
+ 'idle_connections': 0,
70
+ 'errors': 0
71
+ }
72
+
73
+ # 初始化连接池
74
+ self._initialize_pool()
75
+
76
+ @property
77
+ def logger(self):
78
+ """延迟初始化logger"""
79
+ if self._logger is None:
80
+ from crawlo.logging import get_logger
81
+ self._logger = get_logger(self.__class__.__name__)
82
+ return self._logger
83
+
84
+ @property
85
+ def error_handler(self):
86
+ """延迟初始化error_handler"""
87
+ if self._error_handler is None:
88
+ from crawlo.utils.error_handler import ErrorHandler
89
+ self._error_handler = ErrorHandler(self.__class__.__name__)
90
+ return self._error_handler
91
+
92
+ def _is_cluster_url(self) -> bool:
93
+ """判断是否为集群URL格式"""
94
+ if self.cluster_nodes:
95
+ return True
96
+ # 检查URL是否包含多个节点(逗号分隔)
97
+ if ',' in self.redis_url:
98
+ return True
99
+ # 检查URL是否为集群格式
100
+ if 'redis-cluster://' in self.redis_url or 'rediss-cluster://' in self.redis_url:
101
+ return True
102
+ return False
103
+
104
+ def _parse_cluster_nodes(self) -> List[Dict[str, Union[str, int]]]:
105
+ """解析集群节点"""
106
+ nodes = []
107
+ if self.cluster_nodes:
108
+ node_list = self.cluster_nodes
109
+ else:
110
+ # 从URL中解析节点
111
+ # 支持格式: redis://host1:port1,host2:port2,host3:port3
112
+ # 或: host1:port1,host2:port2,host3:port3
113
+ url_part = self.redis_url.replace('redis://', '').replace('rediss://', '')
114
+ node_list = url_part.split(',')
115
+
116
+ for node in node_list:
117
+ # 解析host:port格式
118
+ if ':' in node:
119
+ host, port = node.rsplit(':', 1)
120
+ try:
121
+ nodes.append({
122
+ 'host': str(host.strip()),
123
+ 'port': int(port.strip())
124
+ })
125
+ except ValueError:
126
+ self.logger.warning(f"无效的节点格式: {node}")
127
+ else:
128
+ # 默认端口
129
+ nodes.append({
130
+ 'host': str(node.strip()),
131
+ 'port': 6379
132
+ })
133
+
134
+ return nodes
135
+
136
+ def _get_cluster_config(self) -> Dict[str, Any]:
137
+ """获取适用于Redis集群的配置"""
138
+ # 移除集群不支持的配置参数
139
+ cluster_config = self.config.copy()
140
+ for unsupported_key in self.CLUSTER_UNSUPPORTED_CONFIG:
141
+ cluster_config.pop(unsupported_key, None)
142
+ return cluster_config
143
+
144
+ def _initialize_pool(self):
145
+ """初始化连接池"""
146
+ try:
147
+ # 智能检测是否应该使用集群模式
148
+ should_use_cluster = self.is_cluster or self._is_cluster_url()
149
+
150
+ if should_use_cluster and REDIS_CLUSTER_AVAILABLE and RedisCluster is not None and ClusterNode is not None:
151
+ # 使用Redis集群
152
+ nodes = self._parse_cluster_nodes()
153
+ cluster_config = self._get_cluster_config()
154
+
155
+ if nodes:
156
+ if len(nodes) == 1:
157
+ # 单节点集群
158
+ self._redis_client = RedisCluster(
159
+ host=str(nodes[0]['host']),
160
+ port=int(nodes[0]['port']),
161
+ **cluster_config
162
+ )
163
+ else:
164
+ # 多节点集群
165
+ cluster_node_objects = [ClusterNode(str(node['host']), int(node['port'])) for node in nodes]
166
+ self._redis_client = RedisCluster(
167
+ startup_nodes=cluster_node_objects,
168
+ **cluster_config
169
+ )
170
+ self.logger.info(f"Redis集群连接池初始化成功: {len(nodes)} 个节点")
171
+ else:
172
+ # 回退到单实例模式
173
+ self._connection_pool = aioredis.ConnectionPool.from_url(
174
+ self.redis_url,
175
+ **self.config
176
+ )
177
+ self._redis_client = aioredis.Redis(
178
+ connection_pool=self._connection_pool
179
+ )
180
+ self.logger.warning("无法解析集群节点,回退到单实例模式")
181
+ else:
182
+ # 使用单实例Redis
183
+ self._connection_pool = aioredis.ConnectionPool.from_url(
184
+ self.redis_url,
185
+ **self.config
186
+ )
187
+
188
+ self._redis_client = aioredis.Redis(
189
+ connection_pool=self._connection_pool
190
+ )
191
+
192
+ # 只在调试模式下输出详细连接池信息
193
+ if should_use_cluster and REDIS_CLUSTER_AVAILABLE:
194
+ self.logger.debug(f"Redis集群连接池初始化成功: {self.redis_url}")
195
+ else:
196
+ self.logger.debug(f"Redis连接池初始化成功: {self.redis_url}")
197
+ self.logger.debug(f" 连接池配置: {self.config}")
198
+
199
+ except Exception as e:
200
+ from crawlo.utils.error_handler import ErrorContext
201
+ error_context = ErrorContext(context="Redis连接池初始化失败")
202
+ self.error_handler.handle_error(
203
+ e,
204
+ context=error_context,
205
+ raise_error=True
206
+ )
207
+
208
+ async def _test_connection(self):
209
+ """测试Redis连接"""
210
+ if self._redis_client and not self._connection_tested:
211
+ try:
212
+ await self._redis_client.ping()
213
+ self._connection_tested = True
214
+ # 只在调试模式下输出连接测试成功信息
215
+ if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None and isinstance(self._redis_client, RedisCluster):
216
+ self.logger.debug(f"Redis集群连接测试成功: {self.redis_url}")
217
+ else:
218
+ self.logger.debug(f"Redis连接测试成功: {self.redis_url}")
219
+ except Exception as e:
220
+ self.logger.error(f"Redis连接测试失败: {self.redis_url} - {e}")
221
+ raise
222
+
223
+ async def get_connection(self):
224
+ """
225
+ 获取Redis连接实例
226
+
227
+ Returns:
228
+ Redis连接实例
229
+ """
230
+ if not self._redis_client:
231
+ self._initialize_pool()
232
+
233
+ # 确保连接有效
234
+ await self._test_connection()
235
+
236
+ self._stats['active_connections'] += 1
237
+ return self._redis_client
238
+
239
+ async def ping(self) -> bool:
240
+ """
241
+ 检查Redis连接是否正常
242
+
243
+ Returns:
244
+ 连接是否正常
245
+ """
246
+ try:
247
+ if self._redis_client:
248
+ await self._redis_client.ping()
249
+ return True
250
+ return False
251
+ except Exception as e:
252
+ self.logger.warning(f"Redis连接检查失败: {e}")
253
+ return False
254
+
255
+ async def close(self):
256
+ """关闭连接池"""
257
+ try:
258
+ if self._redis_client:
259
+ await self._redis_client.close()
260
+ self._redis_client = None
261
+
262
+ if self._connection_pool:
263
+ await self._connection_pool.disconnect()
264
+ self._connection_pool = None
265
+
266
+ self.logger.info("Redis连接池已关闭")
267
+ except Exception as e:
268
+ from crawlo.utils.error_handler import ErrorContext
269
+ error_context = ErrorContext(context="关闭Redis连接池失败")
270
+ self.error_handler.handle_error(
271
+ e,
272
+ context=error_context,
273
+ raise_error=False
274
+ )
275
+
276
+ def get_stats(self) -> Dict[str, Any]:
277
+ """
278
+ 获取连接池统计信息
279
+
280
+ Returns:
281
+ 统计信息字典
282
+ """
283
+ if self._connection_pool and hasattr(self._connection_pool, 'max_connections'):
284
+ pool_stats = {
285
+ 'max_connections': self._connection_pool.max_connections,
286
+ 'available_connections': len(self._connection_pool._available_connections) if hasattr(self._connection_pool, '_available_connections') else 0,
287
+ 'in_use_connections': len(self._connection_pool._in_use_connections) if hasattr(self._connection_pool, '_in_use_connections') else 0,
288
+ }
289
+ self._stats.update(pool_stats)
290
+
291
+ return self._stats.copy()
292
+
293
+ @asynccontextmanager
294
+ async def connection_context(self):
295
+ """
296
+ 连接上下文管理器
297
+
298
+ Yields:
299
+ Redis连接实例
300
+ """
301
+ connection = await self.get_connection()
302
+ try:
303
+ yield connection
304
+ finally:
305
+ self._stats['active_connections'] -= 1
306
+ self._stats['idle_connections'] += 1
307
+
308
+
309
class RedisBatchOperationHelper:
    """Helper that executes Redis commands in batches.

    Supports both plain clients (pipeline-based batching) and cluster
    clients whose cross-slot pipelines are unreliable (sequential path,
    selected by the absence of a ``pipeline`` attribute).
    """

    def __init__(self, redis_client, batch_size: int = 100):
        self.redis_client = redis_client
        self.batch_size = batch_size

        # Logger / error handler are created lazily to avoid import cycles.
        self._logger = None
        self._error_handler = None

    @property
    def logger(self):
        """Lazily create and cache the logger."""
        if self._logger is None:
            from crawlo.logging import get_logger
            self._logger = get_logger(type(self).__name__)
        return self._logger

    @property
    def error_handler(self):
        """Lazily create and cache the error handler."""
        if self._error_handler is None:
            from crawlo.utils.error_handler import ErrorHandler
            self._error_handler = ErrorHandler(type(self).__name__)
        return self._error_handler

    async def batch_execute(self, operations: list, batch_size: Optional[int] = None) -> list:
        """Execute *operations* (``(command, *args)`` tuples) in batches.

        A failing batch is logged and skipped, so the returned list may be
        shorter than *operations* when errors occur.

        Args:
            operations: list of ``(command, *args)`` tuples.
            batch_size: overrides the instance batch size when given.

        Returns:
            Flat list of per-command results.
        """
        size = batch_size or self.batch_size
        results: list = []

        try:
            for start in range(0, len(operations), size):
                chunk = operations[start:start + size]
                self.logger.debug(f"执行批次 {start//size + 1}/{(len(operations)-1)//size + 1}")

                try:
                    if hasattr(self.redis_client, 'pipeline'):
                        # Queue the whole chunk on one pipeline, execute once.
                        pipe = self.redis_client.pipeline()
                        for command, *args in chunk:
                            getattr(pipe, command)(*args)
                        results.extend(await pipe.execute())
                    else:
                        # NOTE(review): cluster clients may reject cross-slot
                        # pipelines, so commands are issued one by one; a
                        # failing chunk discards its partial results, matching
                        # the pipeline branch.
                        chunk_results = []
                        for command, *args in chunk:
                            chunk_results.append(
                                await getattr(self.redis_client, command)(*args)
                            )
                        results.extend(chunk_results)
                except Exception as e:
                    # Best-effort: keep going with the next batch.
                    self.logger.error(f"执行批次失败: {e}")

        except Exception as e:
            from crawlo.utils.error_handler import ErrorContext
            error_context = ErrorContext(context="Redis批量操作执行失败")
            self.error_handler.handle_error(
                e,
                context=error_context,
                raise_error=False
            )

        return results

    async def batch_set_hash(self, hash_key: str, items: Dict[str, Any]) -> int:
        """Set multiple Hash fields, flushing every ``batch_size`` writes.

        Args:
            hash_key: name of the Hash key.
            items: field -> value mapping to store.

        Returns:
            Number of fields written; 0 on error or empty input.
        """
        try:
            if not items:
                return 0

            if hasattr(self.redis_client, 'pipeline'):
                pipe = self.redis_client.pipeline()
                count = 0
                for field, value in items.items():
                    pipe.hset(hash_key, field, value)
                    count += 1
                    # Flush a full batch and start a fresh pipeline.
                    if count % self.batch_size == 0:
                        await pipe.execute()
                        pipe = self.redis_client.pipeline()
                # Flush the final partial batch, if any.
                if count % self.batch_size != 0:
                    await pipe.execute()
            else:
                # Sequential path for cluster clients; a brief pause between
                # batches keeps request pressure down.
                count = 0
                in_batch = 0
                for field, value in items.items():
                    await self.redis_client.hset(hash_key, field, value)
                    count += 1
                    in_batch += 1
                    if in_batch % self.batch_size == 0:
                        import asyncio
                        await asyncio.sleep(0.001)
                        in_batch = 0

            self.logger.debug(f"批量设置Hash {count} 个字段")
            return count

        except Exception as e:
            from crawlo.utils.error_handler import ErrorContext
            error_context = ErrorContext(context="Redis批量设置Hash失败")
            self.error_handler.handle_error(
                e,
                context=error_context,
                raise_error=False
            )
            return 0

    async def batch_get_hash(self, hash_key: str, fields: list) -> Dict[str, Any]:
        """Fetch multiple Hash fields; missing fields are omitted.

        Args:
            hash_key: name of the Hash key.
            fields: field names to fetch.

        Returns:
            Mapping of field -> value for fields that exist; {} on error.
        """
        try:
            if not fields:
                return {}

            if hasattr(self.redis_client, 'pipeline'):
                # One pipeline round-trip for the whole field list.
                pipe = self.redis_client.pipeline()
                for field in fields:
                    pipe.hget(hash_key, field)
                values = await pipe.execute()
            else:
                # Sequential path for cluster clients.
                values = []
                for field in fields:
                    values.append(await self.redis_client.hget(hash_key, field))

            result = {
                field: value
                for field, value in zip(fields, values)
                if value is not None
            }
            self.logger.debug(f"批量获取Hash {len(result)} 个字段")
            return result

        except Exception as e:
            from crawlo.utils.error_handler import ErrorContext
            error_context = ErrorContext(context="Redis批量获取Hash失败")
            self.error_handler.handle_error(
                e,
                context=error_context,
                raise_error=False
            )
            return {}
497
+
498
+
499
# Global registry of pools, keyed by URL + cluster settings (singleton map).
_connection_pools: Dict[str, RedisConnectionPool] = {}
501
+
502
+
503
def get_redis_pool(redis_url: str, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None, **kwargs) -> RedisConnectionPool:
    """Return a cached Redis connection pool for the given target (singleton).

    Pools are cached per ``(url, cluster flag, node list)``.  Note that the
    extra ``kwargs`` are only applied when the pool is first created; later
    calls with different kwargs return the existing pool unchanged.

    Args:
        redis_url: Redis URL.
        is_cluster: whether to connect in cluster mode.
        cluster_nodes: explicit cluster node list.
        **kwargs: pool configuration, used on first creation only.

    Returns:
        The shared RedisConnectionPool instance.
    """
    node_part = ','.join(cluster_nodes) if cluster_nodes else ''
    pool_key = f"{redis_url}_{is_cluster}_{node_part}"

    pool = _connection_pools.get(pool_key)
    if pool is None:
        pool = RedisConnectionPool(redis_url, is_cluster, cluster_nodes, **kwargs)
        _connection_pools[pool_key] = pool
    return pool
523
+
524
+
525
async def close_all_pools():
    """Close every registered Redis connection pool and clear the registry.

    Pools are closed concurrently; individual failures are logged rather
    than raised, so this is safe to call during shutdown.
    """
    import asyncio
    global _connection_pools

    from crawlo.logging import get_logger
    logger = get_logger('RedisConnectionPool')

    if not _connection_pools:
        logger.debug("No Redis connection pools to close")
        return

    logger.info(f"Closing {len(_connection_pools)} Redis connection pool(s)...")

    pending = []
    for pool_key, pool in _connection_pools.items():
        try:
            pending.append(pool.close())
        except Exception as e:
            logger.error(f"Error scheduling close for pool {pool_key}: {e}")

    # Close every pool concurrently; collect exceptions instead of raising.
    if pending:
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

        error_count = sum(isinstance(o, Exception) for o in outcomes)
        if error_count > 0:
            logger.warning(f"Failed to close {error_count} pool(s)")
        else:
            logger.info("All Redis connection pools closed successfully")

    _connection_pools.clear()
    logger.debug("Redis connection pools registry cleared")
559
+
560
+
561
+ # 便捷函数
562
async def execute_redis_batch(redis_url: str, operations: list, batch_size: int = 100, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None) -> list:
    """Convenience wrapper: run a batch of Redis operations against *redis_url*.

    Args:
        redis_url: Redis URL.
        operations: list of ``(command, *args)`` tuples.
        batch_size: batch size for pipelined execution.
        is_cluster: whether the target is a Redis cluster.
        cluster_nodes: explicit cluster node list.

    Returns:
        List of per-command results.
    """
    pool = get_redis_pool(redis_url, is_cluster, cluster_nodes)
    client = await pool.get_connection()
    return await RedisBatchOperationHelper(client, batch_size).batch_execute(operations)