crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/redis_connection_pool.py CHANGED
@@ -1,21 +1,32 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
  """
- Redis connection pool optimization utility
- Provides optimized Redis connection pool management and configuration
+ Redis connection pool utility
+ Provides Redis connection pool management and configuration
  """
  from contextlib import asynccontextmanager
- from typing import Dict, Any, Optional
+ from typing import Dict, Any, Optional, List, Union, TYPE_CHECKING
+ import re

  import redis.asyncio as aioredis

- # Deferred imports to avoid circular dependencies
- # from crawlo.utils.error_handler import ErrorHandler
- # from crawlo.utils.log import get_logger
+ # Try to import Redis cluster support
+ try:
+     from redis.asyncio.cluster import RedisCluster
+     from redis.asyncio.cluster import ClusterNode
+     REDIS_CLUSTER_AVAILABLE = True
+ except ImportError:
+     RedisCluster = None
+     ClusterNode = None
+     REDIS_CLUSTER_AVAILABLE = False


- class OptimizedRedisConnectionPool:
-     """Optimized Redis connection pool manager"""
+ if TYPE_CHECKING:
+     from crawlo.utils.error_handler import ErrorHandler
+
+
+ class RedisConnectionPool:
+     """Redis connection pool manager"""

      # Default connection pool configuration
      DEFAULT_CONFIG = {
@@ -29,17 +40,26 @@ class OptimizedRedisConnectionPool:
          'decode_responses': False,
      }

-     def __init__(self, redis_url: str, **kwargs):
+     # Configuration parameters not supported by Redis cluster
+     CLUSTER_UNSUPPORTED_CONFIG = {
+         'retry_on_timeout',
+         'health_check_interval',
+         'socket_keepalive'
+     }
+
+     def __init__(self, redis_url: str, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None, **kwargs):
          self.redis_url = redis_url
+         self.is_cluster = is_cluster
+         self.cluster_nodes = cluster_nodes
          self.config = {**self.DEFAULT_CONFIG, **kwargs}

          # Lazily initialize logger and error_handler
          self._logger = None
-         self._error_handler = None
+         self._error_handler: Optional["ErrorHandler"] = None

          # Connection pool instance
          self._connection_pool: Optional[aioredis.ConnectionPool] = None
-         self._redis_client: Optional[aioredis.Redis] = None
+         self._redis_client = None
          self._connection_tested = False  # Whether the connection has been tested

          # Connection pool statistics
@@ -57,7 +77,7 @@ class OptimizedRedisConnectionPool:
      def logger(self):
          """Lazily initialize the logger"""
          if self._logger is None:
-             from crawlo.utils.log import get_logger
+             from crawlo.logging import get_logger
              self._logger = get_logger(self.__class__.__name__)
          return self._logger

@@ -69,26 +89,119 @@ class OptimizedRedisConnectionPool:
              self._error_handler = ErrorHandler(self.__class__.__name__)
          return self._error_handler

+     def _is_cluster_url(self) -> bool:
+         """Determine whether the URL is in cluster format"""
+         if self.cluster_nodes:
+             return True
+         # Check whether the URL contains multiple comma-separated nodes
+         if ',' in self.redis_url:
+             return True
+         # Check whether the URL uses a cluster scheme
+         if 'redis-cluster://' in self.redis_url or 'rediss-cluster://' in self.redis_url:
+             return True
+         return False
+
+     def _parse_cluster_nodes(self) -> List[Dict[str, Union[str, int]]]:
+         """Parse cluster nodes"""
+         nodes = []
+         if self.cluster_nodes:
+             node_list = self.cluster_nodes
+         else:
+             # Parse nodes from the URL
+             # Supported formats: redis://host1:port1,host2:port2,host3:port3
+             # or: host1:port1,host2:port2,host3:port3
+             url_part = self.redis_url.replace('redis://', '').replace('rediss://', '')
+             node_list = url_part.split(',')
+
+         for node in node_list:
+             # Parse the host:port format
+             if ':' in node:
+                 host, port = node.rsplit(':', 1)
+                 try:
+                     nodes.append({
+                         'host': str(host.strip()),
+                         'port': int(port.strip())
+                     })
+                 except ValueError:
+                     self.logger.warning(f"Invalid node format: {node}")
+             else:
+                 # Default port
+                 nodes.append({
+                     'host': str(node.strip()),
+                     'port': 6379
+                 })
+
+         return nodes
+
+     def _get_cluster_config(self) -> Dict[str, Any]:
+         """Get a configuration suitable for a Redis cluster"""
+         # Remove configuration parameters the cluster client does not support
+         cluster_config = self.config.copy()
+         for unsupported_key in self.CLUSTER_UNSUPPORTED_CONFIG:
+             cluster_config.pop(unsupported_key, None)
+         return cluster_config
+
      def _initialize_pool(self):
          """Initialize the connection pool"""
          try:
-             self._connection_pool = aioredis.ConnectionPool.from_url(
-                 self.redis_url,
-                 **self.config
-             )
+             # Detect whether cluster mode should be used
+             should_use_cluster = self.is_cluster or self._is_cluster_url()

-             self._redis_client = aioredis.Redis(
-                 connection_pool=self._connection_pool
-             )
+             if should_use_cluster and REDIS_CLUSTER_AVAILABLE and RedisCluster is not None and ClusterNode is not None:
+                 # Use a Redis cluster
+                 nodes = self._parse_cluster_nodes()
+                 cluster_config = self._get_cluster_config()
+
+                 if nodes:
+                     if len(nodes) == 1:
+                         # Single-node cluster
+                         self._redis_client = RedisCluster(
+                             host=str(nodes[0]['host']),
+                             port=int(nodes[0]['port']),
+                             **cluster_config
+                         )
+                     else:
+                         # Multi-node cluster
+                         cluster_node_objects = [ClusterNode(str(node['host']), int(node['port'])) for node in nodes]
+                         self._redis_client = RedisCluster(
+                             startup_nodes=cluster_node_objects,
+                             **cluster_config
+                         )
+                     self.logger.info(f"Redis cluster connection pool initialized: {len(nodes)} node(s)")
+                 else:
+                     # Fall back to single-instance mode
+                     self._connection_pool = aioredis.ConnectionPool.from_url(
+                         self.redis_url,
+                         **self.config
+                     )
+                     self._redis_client = aioredis.Redis(
+                         connection_pool=self._connection_pool
+                     )
+                     self.logger.warning("Could not parse cluster nodes, falling back to single-instance mode")
+             else:
+                 # Use a single Redis instance
+                 self._connection_pool = aioredis.ConnectionPool.from_url(
+                     self.redis_url,
+                     **self.config
+                 )
+
+                 self._redis_client = aioredis.Redis(
+                     connection_pool=self._connection_pool
+                 )

              # Log detailed connection pool info only in debug mode
-             self.logger.debug(f"Redis connection pool initialized: {self.redis_url}")
-             self.logger.debug(f" Pool configuration: {self.config}")
-
+             if should_use_cluster and REDIS_CLUSTER_AVAILABLE:
+                 self.logger.debug(f"Redis cluster connection pool initialized: {self.redis_url}")
+             else:
+                 self.logger.debug(f"Redis connection pool initialized: {self.redis_url}")
+                 self.logger.debug(f" Pool configuration: {self.config}")
+
          except Exception as e:
+             from crawlo.utils.error_handler import ErrorContext
+             error_context = ErrorContext(context="Redis connection pool initialization failed")
              self.error_handler.handle_error(
                  e,
-                 context="Redis connection pool initialization failed",
+                 context=error_context,
                  raise_error=True
              )

@@ -99,12 +212,15 @@ class OptimizedRedisConnectionPool:
              await self._redis_client.ping()
              self._connection_tested = True
              # Log a successful connection test only in debug mode
-             self.logger.debug(f"Redis connection test succeeded: {self.redis_url}")
+             if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None and isinstance(self._redis_client, RedisCluster):
+                 self.logger.debug(f"Redis cluster connection test succeeded: {self.redis_url}")
+             else:
+                 self.logger.debug(f"Redis connection test succeeded: {self.redis_url}")
          except Exception as e:
              self.logger.error(f"Redis connection test failed: {self.redis_url} - {e}")
              raise

-     async def get_connection(self) -> aioredis.Redis:
+     async def get_connection(self):
          """
          Get a Redis connection instance

@@ -149,9 +265,11 @@

              self.logger.info("Redis connection pool closed")
          except Exception as e:
+             from crawlo.utils.error_handler import ErrorContext
+             error_context = ErrorContext(context="Failed to close Redis connection pool")
              self.error_handler.handle_error(
                  e,
-                 context="Failed to close Redis connection pool",
+                 context=error_context,
                  raise_error=False
              )

@@ -162,12 +280,11 @@
          Returns:
              Statistics dictionary
          """
-         if self._connection_pool:
+         if self._connection_pool and hasattr(self._connection_pool, 'max_connections'):
              pool_stats = {
                  'max_connections': self._connection_pool.max_connections,
-                 'created_connections': self._connection_pool.created_connections,
-                 'available_connections': len(self._connection_pool._available_connections),
-                 'in_use_connections': len(self._connection_pool._in_use_connections),
+                 'available_connections': len(self._connection_pool._available_connections) if hasattr(self._connection_pool, '_available_connections') else 0,
+                 'in_use_connections': len(self._connection_pool._in_use_connections) if hasattr(self._connection_pool, '_in_use_connections') else 0,
              }
              self._stats.update(pool_stats)

@@ -192,7 +309,7 @@
  class RedisBatchOperationHelper:
      """Redis batch operation helper"""

-     def __init__(self, redis_client: aioredis.Redis, batch_size: int = 100):
+     def __init__(self, redis_client, batch_size: int = 100):
          self.redis_client = redis_client
          self.batch_size = batch_size

@@ -204,7 +321,7 @@
      def logger(self):
          """Lazily initialize the logger"""
          if self._logger is None:
-             from crawlo.utils.log import get_logger
+             from crawlo.logging import get_logger
              self._logger = get_logger(self.__class__.__name__)
          return self._logger

@@ -236,22 +353,34 @@
                  self.logger.debug(f"Executing batch {i//actual_batch_size + 1}/{(len(operations)-1)//actual_batch_size + 1}")

                  try:
-                     pipe = self.redis_client.pipeline()
-                     for operation in batch:
-                         command, *args = operation
-                         getattr(pipe, command)(*args)
-
-                     batch_results = await pipe.execute()
-                     results.extend(batch_results)
+                     # Handle pipeline operations in cluster mode
+                     if hasattr(self.redis_client, 'pipeline'):
+                         pipe = self.redis_client.pipeline()
+                         for operation in batch:
+                             command, *args = operation
+                             getattr(pipe, command)(*args)
+
+                         batch_results = await pipe.execute()
+                         results.extend(batch_results)
+                     else:
+                         # Cluster mode may not support cross-slot pipelines; execute one by one
+                         batch_results = []
+                         for operation in batch:
+                             command, *args = operation
+                             result = await getattr(self.redis_client, command)(*args)
+                             batch_results.append(result)
+                         results.extend(batch_results)

                  except Exception as e:
                      self.logger.error(f"Batch execution failed: {e}")
                      # Continue with the next batch instead of aborting

          except Exception as e:
+             from crawlo.utils.error_handler import ErrorContext
+             error_context = ErrorContext(context="Redis batch operation failed")
              self.error_handler.handle_error(
                  e,
-                 context="Redis batch operation failed",
+                 context=error_context,
                  raise_error=False
              )

@@ -272,29 +401,47 @@
              if not items:
                  return 0

-             pipe = self.redis_client.pipeline()
-             count = 0
-
-             for key, value in items.items():
-                 pipe.hset(hash_key, key, value)
-                 count += 1
+             # Handle cluster mode
+             if hasattr(self.redis_client, 'pipeline'):
+                 pipe = self.redis_client.pipeline()
+                 count = 0
+
+                 for key, value in items.items():
+                     pipe.hset(hash_key, key, value)
+                     count += 1
+
+                     # Execute once every batch_size operations
+                     if count % self.batch_size == 0:
+                         await pipe.execute()
+                         pipe = self.redis_client.pipeline()

-                 # Execute once every batch_size operations
-                 if count % self.batch_size == 0:
+                 # Execute the remaining operations
+                 if count % self.batch_size != 0:
                      await pipe.execute()
-                     pipe = self.redis_client.pipeline()
-
-             # Execute the remaining operations
-             if count % self.batch_size != 0:
-                 await pipe.execute()
+             else:
+                 # Execute one by one in cluster mode
+                 count = 0
+                 batch_count = 0
+                 for key, value in items.items():
+                     await self.redis_client.hset(hash_key, key, value)
+                     count += 1
+                     batch_count += 1
+
+                     # Pause briefly every batch_size operations
+                     if batch_count % self.batch_size == 0:
+                         import asyncio
+                         await asyncio.sleep(0.001)  # Avoid overly frequent requests
+                         batch_count = 0

              self.logger.debug(f"Batch-set {count} hash field(s)")
              return count

          except Exception as e:
+             from crawlo.utils.error_handler import ErrorContext
+             error_context = ErrorContext(context="Redis batch hash set failed")
              self.error_handler.handle_error(
                  e,
-                 context="Redis batch hash set failed",
+                 context=error_context,
                  raise_error=False
              )
              return 0
@@ -314,12 +461,20 @@
              if not fields:
                  return {}

-             # Batch get using a pipeline
-             pipe = self.redis_client.pipeline()
-             for field in fields:
-                 pipe.hget(hash_key, field)
-
-             results = await pipe.execute()
+             # Handle cluster mode
+             if hasattr(self.redis_client, 'pipeline'):
+                 # Batch get using a pipeline
+                 pipe = self.redis_client.pipeline()
+                 for field in fields:
+                     pipe.hget(hash_key, field)
+
+                 results = await pipe.execute()
+             else:
+                 # Get one by one in cluster mode
+                 results = []
+                 for field in fields:
+                     result = await self.redis_client.hget(hash_key, field)
+                     results.append(result)

              # Build the result dictionary
              result_dict = {}
@@ -331,47 +486,80 @@
              return result_dict

          except Exception as e:
+             from crawlo.utils.error_handler import ErrorContext
+             error_context = ErrorContext(context="Redis batch hash get failed")
              self.error_handler.handle_error(
                  e,
-                 context="Redis batch hash get failed",
+                 context=error_context,
                  raise_error=False
              )
              return {}


  # Global connection pool manager
- _connection_pools: Dict[str, OptimizedRedisConnectionPool] = {}
+ _connection_pools: Dict[str, RedisConnectionPool] = {}


- def get_redis_pool(redis_url: str, **kwargs) -> OptimizedRedisConnectionPool:
+ def get_redis_pool(redis_url: str, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None, **kwargs) -> RedisConnectionPool:
      """
      Get a Redis connection pool instance (singleton pattern)

      Args:
          redis_url: Redis URL
+         is_cluster: whether cluster mode is enabled
+         cluster_nodes: list of cluster nodes
          **kwargs: connection pool configuration parameters

      Returns:
          Redis connection pool instance
      """
-     if redis_url not in _connection_pools:
-         _connection_pools[redis_url] = OptimizedRedisConnectionPool(redis_url, **kwargs)
+     # Build a unique key that includes cluster-related information
+     pool_key = f"{redis_url}_{is_cluster}_{','.join(cluster_nodes) if cluster_nodes else ''}"

-     return _connection_pools[redis_url]
+     if pool_key not in _connection_pools:
+         _connection_pools[pool_key] = RedisConnectionPool(redis_url, is_cluster, cluster_nodes, **kwargs)
+
+     return _connection_pools[pool_key]


  async def close_all_pools():
      """Close all connection pools"""
+     import asyncio
      global _connection_pools

-     for pool in _connection_pools.values():
-         await pool.close()
+     from crawlo.logging import get_logger
+     logger = get_logger('RedisConnectionPool')
+
+     if not _connection_pools:
+         logger.debug("No Redis connection pools to close")
+         return
+
+     logger.info(f"Closing {len(_connection_pools)} Redis connection pool(s)...")
+
+     close_tasks = []
+     for pool_key, pool in _connection_pools.items():
+         try:
+             close_tasks.append(pool.close())
+         except Exception as e:
+             logger.error(f"Error scheduling close for pool {pool_key}: {e}")
+
+     # Close all pools concurrently
+     if close_tasks:
+         results = await asyncio.gather(*close_tasks, return_exceptions=True)
+
+         # Check the results
+         error_count = sum(1 for r in results if isinstance(r, Exception))
+         if error_count > 0:
+             logger.warning(f"Failed to close {error_count} pool(s)")
+         else:
+             logger.info("All Redis connection pools closed successfully")

      _connection_pools.clear()
+     logger.debug("Redis connection pools registry cleared")


  # Convenience function
- async def execute_redis_batch(redis_url: str, operations: list, batch_size: int = 100) -> list:
+ async def execute_redis_batch(redis_url: str, operations: list, batch_size: int = 100, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None) -> list:
      """
      Convenience function: execute Redis batch operations

@@ -379,11 +567,13 @@ async def execute_redis_batch(redis_url: str, operations: list, batch_size: int
          redis_url: Redis URL
          operations: list of operations
          batch_size: batch size
+         is_cluster: whether cluster mode is enabled
+         cluster_nodes: list of cluster nodes

      Returns:
          List of execution results
      """
-     pool = get_redis_pool(redis_url)
+     pool = get_redis_pool(redis_url, is_cluster, cluster_nodes)
      redis_client = await pool.get_connection()
      helper = RedisBatchOperationHelper(redis_client, batch_size)
      return await helper.batch_execute(operations)
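
The snippet below is a minimal usage sketch of the cluster-aware API added above (get_redis_pool, execute_redis_batch, close_all_pools). The Redis URLs, node addresses, and key names are illustrative placeholders, not values taken from the package, and actually getting a cluster client also depends on redis-py cluster support being installed.

    import asyncio

    from crawlo.utils.redis_connection_pool import (
        close_all_pools,
        execute_redis_batch,
        get_redis_pool,
    )

    async def main():
        # Single-instance pool; pools are cached per (url, cluster) key
        pool = get_redis_pool("redis://127.0.0.1:6379")
        client = await pool.get_connection()
        await client.set("example:key", "value")

        # Cluster mode: pass is_cluster=True and/or a comma-separated node URL
        cluster_pool = get_redis_pool(
            "redis://10.0.0.1:7000,10.0.0.2:7000",
            is_cluster=True,
        )
        # (requests through cluster_pool use RedisCluster when it is importable)

        # Batch operations are (command, *args) tuples handled by RedisBatchOperationHelper
        results = await execute_redis_batch(
            "redis://127.0.0.1:6379",
            operations=[("set", "k1", "v1"), ("get", "k1")],
            batch_size=100,
        )
        print(results)

        await close_all_pools()

    asyncio.run(main())
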
crawlo/utils/redis_key_validator.py CHANGED
@@ -7,7 +7,7 @@ Redis key validation utility
  """
  from typing import List, Tuple

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class RedisKeyValidator:
crawlo/utils/request.py CHANGED
@@ -73,12 +73,34 @@ def request_fingerprint(
  ) -> str:
      """
      Generate a request fingerprint based on the method, normalized URL, body, and optional headers.
-     Uses the SHA256 hash algorithm for better security.
-
+
+     .. deprecated:: 1.0.0
+         This function is deprecated. Use :class:`crawlo.utils.fingerprint.FingerprintGenerator` instead:
+
+         .. code-block:: python
+
+             from crawlo.utils.fingerprint import FingerprintGenerator
+
+             fp = FingerprintGenerator.request_fingerprint(
+                 method=request.method,
+                 url=request.url,
+                 body=request.body or b'',
+                 headers=dict(request.headers) if hasattr(request, 'headers') else {}
+             )
+
+     This function is kept only for backward compatibility and will be removed in version 2.0.0.
+
      :param request: Request object (must include method, url, body, headers)
      :param include_headers: list of header names (str or bytes) to include in the fingerprint
      :return: request fingerprint (hex string)
      """
+     import warnings
+     warnings.warn(
+         "request_fingerprint() is deprecated. "
+         "Use FingerprintGenerator.request_fingerprint() instead.",
+         DeprecationWarning,
+         stacklevel=2
+     )
      from crawlo.utils.fingerprint import FingerprintGenerator

      # Prepare the request data
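
For callers migrating off the deprecated helper, here is a small, self-contained sketch of both code paths; the SimpleNamespace stand-in is only for illustration, a real crawlo Request exposes the same attributes.

    import warnings
    from types import SimpleNamespace

    from crawlo.utils.fingerprint import FingerprintGenerator
    from crawlo.utils.request import request_fingerprint

    # Illustrative stand-in for a Request (method, url, body, headers)
    request = SimpleNamespace(method="GET", url="https://example.com", body=b"", headers={})

    # Old helper: still works in 1.4.8, but now emits a DeprecationWarning
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        fp_old = request_fingerprint(request)

    # Replacement recommended by the docstring above
    fp_new = FingerprintGenerator.request_fingerprint(
        method=request.method,
        url=request.url,
        body=request.body or b"",
        headers=dict(request.headers) if hasattr(request, "headers") else {},
    )
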
crawlo/utils/request_serializer.py CHANGED
@@ -8,7 +8,7 @@ import gc
  import logging
  import pickle

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class RequestSerializer:
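
The swap from crawlo.utils.log to crawlo.logging seen in the last few hunks recurs throughout this release, and crawlo/utils/log.py itself is removed (see the file list above). A hedged sketch of the corresponding one-line change in downstream code, assuming get_logger keeps the same name-based signature:

    # Before (crawlo 1.4.6)
    # from crawlo.utils.log import get_logger

    # After (crawlo 1.4.8): crawlo/utils/log.py no longer exists
    from crawlo.logging import get_logger

    logger = get_logger(__name__)
    logger.info("logging through crawlo.logging")
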