crawlo-1.4.7-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (348)
  1. crawlo/__init__.py +90 -90
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -140
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -379
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -320
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -451
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -290
  19. crawlo/crawler.py +698 -698
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -280
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -250
  25. crawlo/downloader/httpx_downloader.py +265 -265
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -425
  28. crawlo/downloader/selenium_downloader.py +486 -486
  29. crawlo/event.py +45 -45
  30. crawlo/exceptions.py +214 -214
  31. crawlo/extension/__init__.py +64 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -53
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -104
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +134 -134
  44. crawlo/filters/__init__.py +170 -170
  45. crawlo/filters/aioredis_filter.py +347 -347
  46. crawlo/filters/memory_filter.py +261 -261
  47. crawlo/framework.py +306 -306
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -391
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -240
  52. crawlo/initialization/phases.py +229 -229
  53. crawlo/initialization/registry.py +143 -143
  54. crawlo/initialization/utils.py +48 -48
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -42
  61. crawlo/logging/config.py +280 -276
  62. crawlo/logging/factory.py +175 -175
  63. crawlo/logging/manager.py +104 -104
  64. crawlo/middleware/__init__.py +87 -87
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -287
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +408 -376
  77. crawlo/network/response.py +598 -569
  78. crawlo/pipelines/__init__.py +52 -52
  79. crawlo/pipelines/base_pipeline.py +452 -452
  80. crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +196 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +104 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -139
  87. crawlo/pipelines/mysql_pipeline.py +468 -469
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -155
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +9 -9
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -591
  94. crawlo/queue/redis_priority_queue.py +518 -518
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +287 -284
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +658 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +1 -1
  104. crawlo/templates/project/items.py.tmpl +13 -13
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -35
  107. crawlo/templates/project/settings.py.tmpl +113 -109
  108. crawlo/templates/project/settings_distributed.py.tmpl +160 -156
  109. crawlo/templates/project/settings_gentle.py.tmpl +174 -170
  110. crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
  111. crawlo/templates/project/settings_minimal.py.tmpl +102 -98
  112. crawlo/templates/project/settings_simple.py.tmpl +172 -168
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -32
  116. crawlo/templates/spiders_init.py.tmpl +4 -4
  117. crawlo/tools/__init__.py +86 -86
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +74 -50
  123. crawlo/utils/batch_processor.py +276 -276
  124. crawlo/utils/config_manager.py +442 -442
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/encoding_helper.py +190 -0
  128. crawlo/utils/error_handler.py +410 -410
  129. crawlo/utils/fingerprint.py +121 -121
  130. crawlo/utils/func_tools.py +82 -82
  131. crawlo/utils/large_scale_helper.py +344 -344
  132. crawlo/utils/leak_detector.py +335 -335
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -157
  135. crawlo/utils/mysql_connection_pool.py +197 -197
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +90 -90
  139. crawlo/utils/redis_connection_pool.py +578 -578
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -278
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -337
  144. crawlo/utils/response_helper.py +113 -0
  145. crawlo/utils/selector_helper.py +138 -137
  146. crawlo/utils/singleton.py +69 -69
  147. crawlo/utils/spider_loader.py +201 -201
  148. crawlo/utils/text_helper.py +94 -94
  149. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
  150. crawlo-1.4.8.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -217
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -467
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -72
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  192. tests/ofweek_scrapy/scrapy.cfg +11 -11
  193. tests/optimized_performance_test.py +211 -211
  194. tests/performance_comparison.py +244 -244
  195. tests/queue_blocking_test.py +113 -113
  196. tests/queue_test.py +89 -89
  197. tests/redis_key_validation_demo.py +130 -130
  198. tests/request_params_example.py +150 -150
  199. tests/response_improvements_example.py +144 -144
  200. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  201. tests/scrapy_comparison/scrapy_test.py +133 -133
  202. tests/simple_cli_test.py +54 -54
  203. tests/simple_command_test.py +119 -119
  204. tests/simple_crawlo_test.py +126 -126
  205. tests/simple_follow_test.py +38 -38
  206. tests/simple_log_test2.py +137 -137
  207. tests/simple_optimization_test.py +128 -128
  208. tests/simple_queue_type_test.py +41 -41
  209. tests/simple_response_selector_test.py +94 -94
  210. tests/simple_selector_helper_test.py +154 -154
  211. tests/simple_selector_test.py +207 -207
  212. tests/simple_spider_test.py +49 -49
  213. tests/simple_url_test.py +73 -73
  214. tests/simulate_mysql_update_test.py +139 -139
  215. tests/spider_log_timing_test.py +177 -177
  216. tests/test_advanced_tools.py +148 -148
  217. tests/test_all_commands.py +230 -230
  218. tests/test_all_pipeline_fingerprints.py +133 -133
  219. tests/test_all_redis_key_configs.py +145 -145
  220. tests/test_asyncmy_usage.py +56 -56
  221. tests/test_batch_processor.py +178 -178
  222. tests/test_cleaners.py +54 -54
  223. tests/test_cli_arguments.py +118 -118
  224. tests/test_component_factory.py +174 -174
  225. tests/test_config_consistency.py +80 -80
  226. tests/test_config_merge.py +152 -152
  227. tests/test_config_validator.py +182 -182
  228. tests/test_controlled_spider_mixin.py +79 -79
  229. tests/test_crawler_process_import.py +38 -38
  230. tests/test_crawler_process_spider_modules.py +47 -47
  231. tests/test_crawlo_proxy_integration.py +114 -114
  232. tests/test_date_tools.py +123 -123
  233. tests/test_dedup_fix.py +220 -220
  234. tests/test_dedup_pipeline_consistency.py +124 -124
  235. tests/test_default_header_middleware.py +313 -313
  236. tests/test_distributed.py +65 -65
  237. tests/test_double_crawlo_fix.py +204 -204
  238. tests/test_double_crawlo_fix_simple.py +124 -124
  239. tests/test_download_delay_middleware.py +221 -221
  240. tests/test_downloader_proxy_compatibility.py +272 -272
  241. tests/test_edge_cases.py +305 -305
  242. tests/test_encoding_core.py +56 -56
  243. tests/test_encoding_detection.py +126 -126
  244. tests/test_enhanced_error_handler.py +270 -270
  245. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  246. tests/test_error_handler_compatibility.py +112 -112
  247. tests/test_factories.py +252 -252
  248. tests/test_factory_compatibility.py +196 -196
  249. tests/test_final_validation.py +153 -153
  250. tests/test_fingerprint_consistency.py +135 -135
  251. tests/test_fingerprint_simple.py +51 -51
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_helper.py +235 -235
  257. tests/test_logging_enhancements.py +374 -374
  258. tests/test_logging_final.py +184 -184
  259. tests/test_logging_integration.py +312 -312
  260. tests/test_logging_system.py +282 -282
  261. tests/test_middleware_debug.py +141 -141
  262. tests/test_mode_consistency.py +51 -51
  263. tests/test_multi_directory.py +67 -67
  264. tests/test_multiple_spider_modules.py +80 -80
  265. tests/test_mysql_pipeline_config.py +164 -164
  266. tests/test_mysql_pipeline_error.py +98 -98
  267. tests/test_mysql_pipeline_init_log.py +82 -82
  268. tests/test_mysql_pipeline_integration.py +132 -132
  269. tests/test_mysql_pipeline_refactor.py +143 -143
  270. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  271. tests/test_mysql_pipeline_robustness.py +195 -195
  272. tests/test_mysql_pipeline_types.py +88 -88
  273. tests/test_mysql_update_columns.py +93 -93
  274. tests/test_offsite_middleware.py +244 -244
  275. tests/test_offsite_middleware_simple.py +203 -203
  276. tests/test_optimized_selector_naming.py +100 -100
  277. tests/test_parsel.py +29 -29
  278. tests/test_performance.py +327 -327
  279. tests/test_performance_monitor.py +115 -115
  280. tests/test_pipeline_fingerprint_consistency.py +86 -86
  281. tests/test_priority_behavior.py +211 -211
  282. tests/test_priority_consistency.py +151 -151
  283. tests/test_priority_consistency_fixed.py +249 -249
  284. tests/test_proxy_health_check.py +32 -32
  285. tests/test_proxy_middleware.py +217 -217
  286. tests/test_proxy_middleware_enhanced.py +212 -212
  287. tests/test_proxy_middleware_integration.py +142 -142
  288. tests/test_proxy_middleware_refactored.py +207 -207
  289. tests/test_proxy_only.py +83 -83
  290. tests/test_proxy_providers.py +56 -56
  291. tests/test_proxy_stats.py +19 -19
  292. tests/test_proxy_strategies.py +59 -59
  293. tests/test_proxy_with_downloader.py +152 -152
  294. tests/test_queue_empty_check.py +41 -41
  295. tests/test_queue_manager_double_crawlo.py +173 -173
  296. tests/test_queue_manager_redis_key.py +179 -179
  297. tests/test_queue_naming.py +154 -154
  298. tests/test_queue_type.py +106 -106
  299. tests/test_queue_type_redis_config_consistency.py +130 -130
  300. tests/test_random_headers_default.py +322 -322
  301. tests/test_random_headers_necessity.py +308 -308
  302. tests/test_random_user_agent.py +72 -72
  303. tests/test_redis_config.py +28 -28
  304. tests/test_redis_connection_pool.py +294 -294
  305. tests/test_redis_key_naming.py +181 -181
  306. tests/test_redis_key_validator.py +123 -123
  307. tests/test_redis_queue.py +224 -224
  308. tests/test_redis_queue_name_fix.py +175 -175
  309. tests/test_redis_queue_type_fallback.py +129 -129
  310. tests/test_request_ignore_middleware.py +182 -182
  311. tests/test_request_params.py +111 -111
  312. tests/test_request_serialization.py +70 -70
  313. tests/test_response_code_middleware.py +349 -349
  314. tests/test_response_filter_middleware.py +427 -427
  315. tests/test_response_follow.py +104 -104
  316. tests/test_response_improvements.py +152 -152
  317. tests/test_response_selector_methods.py +92 -92
  318. tests/test_response_url_methods.py +70 -70
  319. tests/test_response_urljoin.py +86 -86
  320. tests/test_retry_middleware.py +333 -333
  321. tests/test_retry_middleware_realistic.py +273 -273
  322. tests/test_scheduler.py +252 -252
  323. tests/test_scheduler_config_update.py +133 -133
  324. tests/test_scrapy_style_encoding.py +112 -112
  325. tests/test_selector_helper.py +100 -100
  326. tests/test_selector_optimizations.py +146 -146
  327. tests/test_simple_response.py +61 -61
  328. tests/test_spider_loader.py +49 -49
  329. tests/test_spider_loader_comprehensive.py +69 -69
  330. tests/test_spider_modules.py +84 -84
  331. tests/test_spiders/test_spider.py +9 -9
  332. tests/test_telecom_spider_redis_key.py +205 -205
  333. tests/test_template_content.py +87 -87
  334. tests/test_template_redis_key.py +134 -134
  335. tests/test_tools.py +159 -159
  336. tests/test_user_agent_randomness.py +176 -176
  337. tests/test_user_agents.py +96 -96
  338. tests/untested_features_report.md +138 -138
  339. tests/verify_debug.py +51 -51
  340. tests/verify_distributed.py +117 -117
  341. tests/verify_log_fix.py +111 -111
  342. tests/verify_mysql_warnings.py +109 -109
  343. crawlo/utils/log.py +0 -80
  344. crawlo/utils/url_utils.py +0 -40
  345. crawlo-1.4.7.dist-info/RECORD +0 -347
  346. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  347. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  348. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/redis_connection_pool.py
@@ -1,579 +1,579 @@
(The file was rewritten wholesale; the removed and added sides of the hunk are textually identical in this rendering, so the content is shown once below.)
#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Redis connection pool utilities
Provide Redis connection pool management and configuration
"""
from contextlib import asynccontextmanager
from typing import Dict, Any, Optional, List, Union, TYPE_CHECKING
import re

import redis.asyncio as aioredis

# Try to import Redis Cluster support
try:
    from redis.asyncio.cluster import RedisCluster
    from redis.asyncio.cluster import ClusterNode
    REDIS_CLUSTER_AVAILABLE = True
except ImportError:
    RedisCluster = None
    ClusterNode = None
    REDIS_CLUSTER_AVAILABLE = False


if TYPE_CHECKING:
    from crawlo.utils.error_handler import ErrorHandler


class RedisConnectionPool:
    """Redis connection pool manager"""

    # Default connection pool configuration
    DEFAULT_CONFIG = {
        'max_connections': 50,
        'socket_connect_timeout': 5,
        'socket_timeout': 30,
        'socket_keepalive': True,
        'health_check_interval': 30,
        'retry_on_timeout': True,
        'encoding': 'utf-8',
        'decode_responses': False,
    }

    # Configuration parameters not supported by Redis Cluster
    CLUSTER_UNSUPPORTED_CONFIG = {
        'retry_on_timeout',
        'health_check_interval',
        'socket_keepalive'
    }

    def __init__(self, redis_url: str, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None, **kwargs):
        self.redis_url = redis_url
        self.is_cluster = is_cluster
        self.cluster_nodes = cluster_nodes
        self.config = {**self.DEFAULT_CONFIG, **kwargs}

        # Lazily initialized logger and error handler
        self._logger = None
        self._error_handler: Optional["ErrorHandler"] = None

        # Connection pool instances
        self._connection_pool: Optional[aioredis.ConnectionPool] = None
        self._redis_client = None
        self._connection_tested = False  # Marks whether the connection has been tested

        # Connection pool statistics
        self._stats = {
            'created_connections': 0,
            'active_connections': 0,
            'idle_connections': 0,
            'errors': 0
        }

        # Initialize the connection pool
        self._initialize_pool()

    @property
    def logger(self):
        """Lazily initialize the logger"""
        if self._logger is None:
            from crawlo.logging import get_logger
            self._logger = get_logger(self.__class__.__name__)
        return self._logger

    @property
    def error_handler(self):
        """Lazily initialize the error handler"""
        if self._error_handler is None:
            from crawlo.utils.error_handler import ErrorHandler
            self._error_handler = ErrorHandler(self.__class__.__name__)
        return self._error_handler

    def _is_cluster_url(self) -> bool:
        """Determine whether the URL is in cluster format"""
        if self.cluster_nodes:
            return True
        # Check whether the URL contains multiple nodes (comma-separated)
        if ',' in self.redis_url:
            return True
        # Check whether the URL uses a cluster scheme
        if 'redis-cluster://' in self.redis_url or 'rediss-cluster://' in self.redis_url:
            return True
        return False

    def _parse_cluster_nodes(self) -> List[Dict[str, Union[str, int]]]:
        """Parse cluster nodes"""
        nodes = []
        if self.cluster_nodes:
            node_list = self.cluster_nodes
        else:
            # Parse nodes from the URL
            # Supported formats: redis://host1:port1,host2:port2,host3:port3
            # or: host1:port1,host2:port2,host3:port3
            url_part = self.redis_url.replace('redis://', '').replace('rediss://', '')
            node_list = url_part.split(',')

        for node in node_list:
            # Parse host:port format
            if ':' in node:
                host, port = node.rsplit(':', 1)
                try:
                    nodes.append({
                        'host': str(host.strip()),
                        'port': int(port.strip())
                    })
                except ValueError:
                    self.logger.warning(f"Invalid node format: {node}")
            else:
                # Default port
                nodes.append({
                    'host': str(node.strip()),
                    'port': 6379
                })

        return nodes

    def _get_cluster_config(self) -> Dict[str, Any]:
        """Get the configuration applicable to Redis Cluster"""
        # Remove the parameters the cluster client does not support
        cluster_config = self.config.copy()
        for unsupported_key in self.CLUSTER_UNSUPPORTED_CONFIG:
            cluster_config.pop(unsupported_key, None)
        return cluster_config

    def _initialize_pool(self):
        """Initialize the connection pool"""
        try:
            # Detect whether cluster mode should be used
            should_use_cluster = self.is_cluster or self._is_cluster_url()

            if should_use_cluster and REDIS_CLUSTER_AVAILABLE and RedisCluster is not None and ClusterNode is not None:
                # Use Redis Cluster
                nodes = self._parse_cluster_nodes()
                cluster_config = self._get_cluster_config()

                if nodes:
                    if len(nodes) == 1:
                        # Single-node cluster
                        self._redis_client = RedisCluster(
                            host=str(nodes[0]['host']),
                            port=int(nodes[0]['port']),
                            **cluster_config
                        )
                    else:
                        # Multi-node cluster
                        cluster_node_objects = [ClusterNode(str(node['host']), int(node['port'])) for node in nodes]
                        self._redis_client = RedisCluster(
                            startup_nodes=cluster_node_objects,
                            **cluster_config
                        )
                    self.logger.info(f"Redis cluster connection pool initialized: {len(nodes)} node(s)")
                else:
                    # Fall back to single-instance mode
                    self._connection_pool = aioredis.ConnectionPool.from_url(
                        self.redis_url,
                        **self.config
                    )
                    self._redis_client = aioredis.Redis(
                        connection_pool=self._connection_pool
                    )
                    self.logger.warning("Unable to parse cluster nodes; falling back to single-instance mode")
            else:
                # Use a single Redis instance
                self._connection_pool = aioredis.ConnectionPool.from_url(
                    self.redis_url,
                    **self.config
                )

                self._redis_client = aioredis.Redis(
                    connection_pool=self._connection_pool
                )

            # Only log detailed pool information in debug mode
            if should_use_cluster and REDIS_CLUSTER_AVAILABLE:
                self.logger.debug(f"Redis cluster connection pool initialized: {self.redis_url}")
            else:
                self.logger.debug(f"Redis connection pool initialized: {self.redis_url}")
                self.logger.debug(f" Pool configuration: {self.config}")

        except Exception as e:
            from crawlo.utils.error_handler import ErrorContext
            error_context = ErrorContext(context="Failed to initialize Redis connection pool")
            self.error_handler.handle_error(
                e,
                context=error_context,
                raise_error=True
            )

    async def _test_connection(self):
        """Test the Redis connection"""
        if self._redis_client and not self._connection_tested:
            try:
                await self._redis_client.ping()
                self._connection_tested = True
                # Only log successful connection tests in debug mode
                if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None and isinstance(self._redis_client, RedisCluster):
                    self.logger.debug(f"Redis cluster connection test succeeded: {self.redis_url}")
                else:
                    self.logger.debug(f"Redis connection test succeeded: {self.redis_url}")
            except Exception as e:
                self.logger.error(f"Redis connection test failed: {self.redis_url} - {e}")
                raise

    async def get_connection(self):
        """
        Get a Redis connection instance

        Returns:
            Redis connection instance
        """
        if not self._redis_client:
            self._initialize_pool()

        # Ensure the connection is valid
        await self._test_connection()

        self._stats['active_connections'] += 1
        return self._redis_client

    async def ping(self) -> bool:
        """
        Check whether the Redis connection is healthy

        Returns:
            Whether the connection is healthy
        """
        try:
            if self._redis_client:
                await self._redis_client.ping()
                return True
            return False
        except Exception as e:
            self.logger.warning(f"Redis connection check failed: {e}")
            return False

    async def close(self):
        """Close the connection pool"""
        try:
            if self._redis_client:
                await self._redis_client.close()
                self._redis_client = None

            if self._connection_pool:
                await self._connection_pool.disconnect()
                self._connection_pool = None

            self.logger.info("Redis connection pool closed")
        except Exception as e:
            from crawlo.utils.error_handler import ErrorContext
            error_context = ErrorContext(context="Failed to close Redis connection pool")
            self.error_handler.handle_error(
                e,
                context=error_context,
                raise_error=False
            )

    def get_stats(self) -> Dict[str, Any]:
        """
        Get connection pool statistics

        Returns:
            Dictionary of statistics
        """
        if self._connection_pool and hasattr(self._connection_pool, 'max_connections'):
            pool_stats = {
                'max_connections': self._connection_pool.max_connections,
                'available_connections': len(self._connection_pool._available_connections) if hasattr(self._connection_pool, '_available_connections') else 0,
                'in_use_connections': len(self._connection_pool._in_use_connections) if hasattr(self._connection_pool, '_in_use_connections') else 0,
            }
            self._stats.update(pool_stats)

        return self._stats.copy()

    @asynccontextmanager
    async def connection_context(self):
        """
        Connection context manager

        Yields:
            Redis connection instance
        """
        connection = await self.get_connection()
        try:
            yield connection
        finally:
            self._stats['active_connections'] -= 1
            self._stats['idle_connections'] += 1


class RedisBatchOperationHelper:
    """Redis batch operation helper"""

    def __init__(self, redis_client, batch_size: int = 100):
        self.redis_client = redis_client
        self.batch_size = batch_size

        # Lazily initialized logger and error handler
        self._logger = None
        self._error_handler = None

    @property
    def logger(self):
        """Lazily initialize the logger"""
        if self._logger is None:
            from crawlo.logging import get_logger
            self._logger = get_logger(self.__class__.__name__)
        return self._logger

    @property
    def error_handler(self):
        """Lazily initialize the error handler"""
        if self._error_handler is None:
            from crawlo.utils.error_handler import ErrorHandler
            self._error_handler = ErrorHandler(self.__class__.__name__)
        return self._error_handler

    async def batch_execute(self, operations: list, batch_size: Optional[int] = None) -> list:
        """
        Execute Redis operations in batches

        Args:
            operations: list of operations; each operation is a (command, *args) tuple
            batch_size: batch size (if None, the instance's batch_size is used)

        Returns:
            List of execution results
        """
        actual_batch_size = batch_size or self.batch_size
        results = []

        try:
            for i in range(0, len(operations), actual_batch_size):
                batch = operations[i:i + actual_batch_size]
                self.logger.debug(f"Executing batch {i//actual_batch_size + 1}/{(len(operations)-1)//actual_batch_size + 1}")

                try:
                    # Handle pipelined operations (cluster mode may lack them)
                    if hasattr(self.redis_client, 'pipeline'):
                        pipe = self.redis_client.pipeline()
                        for operation in batch:
                            command, *args = operation
                            getattr(pipe, command)(*args)

                        batch_results = await pipe.execute()
                        results.extend(batch_results)
                    else:
                        # Cluster mode may not support cross-slot pipelines; execute one by one
                        batch_results = []
                        for operation in batch:
                            command, *args = operation
                            result = await getattr(self.redis_client, command)(*args)
                            batch_results.append(result)
                        results.extend(batch_results)

                except Exception as e:
                    self.logger.error(f"Batch execution failed: {e}")
                    # Continue with the next batch instead of aborting

        except Exception as e:
            from crawlo.utils.error_handler import ErrorContext
            error_context = ErrorContext(context="Redis batch operation failed")
            self.error_handler.handle_error(
                e,
                context=error_context,
                raise_error=False
            )

        return results

    async def batch_set_hash(self, hash_key: str, items: Dict[str, Any]) -> int:
        """
        Set hash fields in batches

        Args:
            hash_key: hash key name
            items: dictionary of fields to set

        Returns:
            Number of fields set successfully
        """
        try:
            if not items:
                return 0

            # Handle cluster mode
            if hasattr(self.redis_client, 'pipeline'):
                pipe = self.redis_client.pipeline()
                count = 0

                for key, value in items.items():
                    pipe.hset(hash_key, key, value)
                    count += 1

                    # Execute once per full batch
                    if count % self.batch_size == 0:
                        await pipe.execute()
                        pipe = self.redis_client.pipeline()

                # Execute the remaining operations
                if count % self.batch_size != 0:
                    await pipe.execute()
            else:
                # In cluster mode, execute one by one
                count = 0
                batch_count = 0
                for key, value in items.items():
                    await self.redis_client.hset(hash_key, key, value)
                    count += 1
                    batch_count += 1

                    # Pause briefly after each full batch
                    if batch_count % self.batch_size == 0:
                        import asyncio
                        await asyncio.sleep(0.001)  # Avoid issuing requests too rapidly
                        batch_count = 0

            self.logger.debug(f"Batch-set {count} hash field(s)")
            return count

        except Exception as e:
            from crawlo.utils.error_handler import ErrorContext
            error_context = ErrorContext(context="Redis batch hash set failed")
            self.error_handler.handle_error(
                e,
                context=error_context,
                raise_error=False
            )
            return 0

    async def batch_get_hash(self, hash_key: str, fields: list) -> Dict[str, Any]:
        """
        Get hash field values in batches

        Args:
            hash_key: hash key name
            fields: list of fields to fetch

        Returns:
            Dictionary of field values
        """
        try:
            if not fields:
                return {}

            # Handle cluster mode
            if hasattr(self.redis_client, 'pipeline'):
                # Fetch in bulk via a pipeline
                pipe = self.redis_client.pipeline()
                for field in fields:
                    pipe.hget(hash_key, field)

                results = await pipe.execute()
            else:
                # In cluster mode, fetch one by one
                results = []
                for field in fields:
                    result = await self.redis_client.hget(hash_key, field)
                    results.append(result)

            # Build the result dictionary
            result_dict = {}
            for i, field in enumerate(fields):
                if results[i] is not None:
                    result_dict[field] = results[i]

            self.logger.debug(f"Batch-got {len(result_dict)} hash field(s)")
            return result_dict

        except Exception as e:
            from crawlo.utils.error_handler import ErrorContext
            error_context = ErrorContext(context="Redis batch hash get failed")
            self.error_handler.handle_error(
                e,
                context=error_context,
                raise_error=False
            )
            return {}


# Global connection pool registry
_connection_pools: Dict[str, RedisConnectionPool] = {}


def get_redis_pool(redis_url: str, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None, **kwargs) -> RedisConnectionPool:
    """
    Get a Redis connection pool instance (singleton per configuration)

    Args:
        redis_url: Redis URL
        is_cluster: whether to use cluster mode
        cluster_nodes: list of cluster nodes
        **kwargs: connection pool configuration parameters

    Returns:
        Redis connection pool instance
    """
    # Build a unique key that includes cluster-related information
    pool_key = f"{redis_url}_{is_cluster}_{','.join(cluster_nodes) if cluster_nodes else ''}"

    if pool_key not in _connection_pools:
        _connection_pools[pool_key] = RedisConnectionPool(redis_url, is_cluster, cluster_nodes, **kwargs)

    return _connection_pools[pool_key]


async def close_all_pools():
    """Close all connection pools"""
    import asyncio
    global _connection_pools

    from crawlo.logging import get_logger
    logger = get_logger('RedisConnectionPool')

    if not _connection_pools:
        logger.debug("No Redis connection pools to close")
        return

    logger.info(f"Closing {len(_connection_pools)} Redis connection pool(s)...")

    close_tasks = []
    for pool_key, pool in _connection_pools.items():
        try:
            close_tasks.append(pool.close())
        except Exception as e:
            logger.error(f"Error scheduling close for pool {pool_key}: {e}")

    # Close all pools concurrently
    if close_tasks:
        results = await asyncio.gather(*close_tasks, return_exceptions=True)

        # Check the results
        error_count = sum(1 for r in results if isinstance(r, Exception))
        if error_count > 0:
            logger.warning(f"Failed to close {error_count} pool(s)")
        else:
            logger.info("All Redis connection pools closed successfully")

    _connection_pools.clear()
    logger.debug("Redis connection pools registry cleared")


# Convenience function
async def execute_redis_batch(redis_url: str, operations: list, batch_size: int = 100, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None) -> list:
    """
    Convenience function: execute Redis operations in batches

    Args:
        redis_url: Redis URL
        operations: list of operations
        batch_size: batch size
        is_cluster: whether to use cluster mode
        cluster_nodes: list of cluster nodes

    Returns:
        List of execution results
    """
    pool = get_redis_pool(redis_url, is_cluster, cluster_nodes)
    redis_client = await pool.get_connection()
    helper = RedisBatchOperationHelper(redis_client, batch_size)
    return await helper.batch_execute(operations)
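
For context on how this module is meant to be driven, the sketch below exercises the public surface shown in the listing above: get_redis_pool, connection_context, get_stats, execute_redis_batch, and close_all_pools. It is a minimal illustration, not part of the package; the Redis URL and key names are placeholder assumptions, and it presumes a reachable single-instance Redis server.

import asyncio

from crawlo.utils.redis_connection_pool import (
    get_redis_pool,
    execute_redis_batch,
    close_all_pools,
)

REDIS_URL = "redis://localhost:6379"  # assumption: a local single-instance Redis


async def main():
    # Pools are singletons per (url, is_cluster, cluster_nodes) configuration;
    # extra kwargs are merged into DEFAULT_CONFIG.
    pool = get_redis_pool(REDIS_URL, max_connections=10)

    # Borrow a client through the context manager so the pool's
    # active/idle counters stay balanced.
    async with pool.connection_context() as client:
        await client.hset("demo:hash", "field", "value")
        print(await client.hget("demo:hash", "field"))

    # Batch execution: each operation is a (command, *args) tuple;
    # operations are pipelined in chunks of batch_size.
    results = await execute_redis_batch(
        REDIS_URL,
        operations=[("set", f"demo:key:{i}", i) for i in range(250)],
        batch_size=100,
    )
    print(f"{len(results)} operations executed")

    print(pool.get_stats())
    await close_all_pools()


if __name__ == "__main__":
    asyncio.run(main())

Note that with decode_responses=False (the module's default), hget returns bytes rather than str, so callers decode values themselves if needed.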