crawlo 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crawlo has been flagged as possibly problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0

crawlo/utils/redis_connection_pool.py
CHANGED
@@ -1,21 +1,32 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
 """
-Redis
-
+Redis connection pool utilities
+Provides Redis connection pool management and configuration
 """
 from contextlib import asynccontextmanager
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional, List, Union, TYPE_CHECKING
+import re
 
 import redis.asyncio as aioredis
 
-#
-
-
+# Try to import Redis cluster support
+try:
+    from redis.asyncio.cluster import RedisCluster
+    from redis.asyncio.cluster import ClusterNode
+    REDIS_CLUSTER_AVAILABLE = True
+except ImportError:
+    RedisCluster = None
+    ClusterNode = None
+    REDIS_CLUSTER_AVAILABLE = False
 
 
-
-
+if TYPE_CHECKING:
+    from crawlo.utils.error_handler import ErrorHandler
+
+
+class RedisConnectionPool:
+    """Redis connection pool manager"""
 
     # Default connection pool configuration
     DEFAULT_CONFIG = {
@@ -29,17 +40,26 @@ class OptimizedRedisConnectionPool:
         'decode_responses': False,
     }
 
-
+    # Configuration options not supported by Redis Cluster
+    CLUSTER_UNSUPPORTED_CONFIG = {
+        'retry_on_timeout',
+        'health_check_interval',
+        'socket_keepalive'
+    }
+
+    def __init__(self, redis_url: str, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None, **kwargs):
         self.redis_url = redis_url
+        self.is_cluster = is_cluster
+        self.cluster_nodes = cluster_nodes
         self.config = {**self.DEFAULT_CONFIG, **kwargs}
 
         # Lazy initialization of logger and error_handler
         self._logger = None
-        self._error_handler = None
+        self._error_handler: Optional["ErrorHandler"] = None
 
         # Connection pool instances
         self._connection_pool: Optional[aioredis.ConnectionPool] = None
-        self._redis_client
+        self._redis_client = None
         self._connection_tested = False  # Whether the connection has been tested
 
         # Connection pool statistics
@@ -57,7 +77,7 @@ class OptimizedRedisConnectionPool:
     def logger(self):
         """Lazily initialize the logger"""
         if self._logger is None:
-            from crawlo.
+            from crawlo.logging import get_logger
             self._logger = get_logger(self.__class__.__name__)
         return self._logger
 
@@ -69,26 +89,119 @@ class OptimizedRedisConnectionPool:
             self._error_handler = ErrorHandler(self.__class__.__name__)
         return self._error_handler
 
+    def _is_cluster_url(self) -> bool:
+        """Determine whether the URL is in cluster format"""
+        if self.cluster_nodes:
+            return True
+        # Check whether the URL contains multiple nodes (comma-separated)
+        if ',' in self.redis_url:
+            return True
+        # Check whether the URL uses a cluster scheme
+        if 'redis-cluster://' in self.redis_url or 'rediss-cluster://' in self.redis_url:
+            return True
+        return False
+
+    def _parse_cluster_nodes(self) -> List[Dict[str, Union[str, int]]]:
+        """Parse cluster nodes"""
+        nodes = []
+        if self.cluster_nodes:
+            node_list = self.cluster_nodes
+        else:
+            # Parse nodes from the URL
+            # Supported format: redis://host1:port1,host2:port2,host3:port3
+            # or: host1:port1,host2:port2,host3:port3
+            url_part = self.redis_url.replace('redis://', '').replace('rediss://', '')
+            node_list = url_part.split(',')
+
+        for node in node_list:
+            # Parse host:port format
+            if ':' in node:
+                host, port = node.rsplit(':', 1)
+                try:
+                    nodes.append({
+                        'host': str(host.strip()),
+                        'port': int(port.strip())
+                    })
+                except ValueError:
+                    self.logger.warning(f"Invalid node format: {node}")
+            else:
+                # Default port
+                nodes.append({
+                    'host': str(node.strip()),
+                    'port': 6379
+                })
+
+        return nodes
+
+    def _get_cluster_config(self) -> Dict[str, Any]:
+        """Get a configuration suitable for Redis Cluster"""
+        # Remove configuration options the cluster client does not support
+        cluster_config = self.config.copy()
+        for unsupported_key in self.CLUSTER_UNSUPPORTED_CONFIG:
+            cluster_config.pop(unsupported_key, None)
+        return cluster_config
+
     def _initialize_pool(self):
         """Initialize the connection pool"""
         try:
-
-
-                **self.config
-            )
+            # Detect whether cluster mode should be used
+            should_use_cluster = self.is_cluster or self._is_cluster_url()
 
-
-
-
+            if should_use_cluster and REDIS_CLUSTER_AVAILABLE and RedisCluster is not None and ClusterNode is not None:
+                # Use Redis Cluster
+                nodes = self._parse_cluster_nodes()
+                cluster_config = self._get_cluster_config()
+
+                if nodes:
+                    if len(nodes) == 1:
+                        # Single-node cluster
+                        self._redis_client = RedisCluster(
+                            host=str(nodes[0]['host']),
+                            port=int(nodes[0]['port']),
+                            **cluster_config
+                        )
+                    else:
+                        # Multi-node cluster
+                        cluster_node_objects = [ClusterNode(str(node['host']), int(node['port'])) for node in nodes]
+                        self._redis_client = RedisCluster(
+                            startup_nodes=cluster_node_objects,
+                            **cluster_config
+                        )
+                    self.logger.info(f"Redis cluster connection pool initialized: {len(nodes)} node(s)")
+                else:
+                    # Fall back to single-instance mode
+                    self._connection_pool = aioredis.ConnectionPool.from_url(
+                        self.redis_url,
+                        **self.config
+                    )
+                    self._redis_client = aioredis.Redis(
+                        connection_pool=self._connection_pool
+                    )
+                    self.logger.warning("Could not parse cluster nodes; falling back to single-instance mode")
+            else:
+                # Use a single Redis instance
+                self._connection_pool = aioredis.ConnectionPool.from_url(
+                    self.redis_url,
+                    **self.config
+                )
+
+                self._redis_client = aioredis.Redis(
+                    connection_pool=self._connection_pool
+                )
 
             # Only log detailed pool information in debug mode
-
-
-
+            if should_use_cluster and REDIS_CLUSTER_AVAILABLE:
+                self.logger.debug(f"Redis cluster connection pool initialized: {self.redis_url}")
+            else:
+                self.logger.debug(f"Redis connection pool initialized: {self.redis_url}")
+            self.logger.debug(f"  Pool configuration: {self.config}")
+
         except Exception as e:
+            from crawlo.utils.error_handler import ErrorContext
+            error_context = ErrorContext(context="Failed to initialize Redis connection pool")
             self.error_handler.handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=True
             )
 
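The cluster detection added above accepts either an explicit cluster_nodes list or a comma-separated node URL. As a rough standalone illustration of the formats the parser accepts (this sketch is not crawlo code; it only mirrors the host:port splitting shown in the hunk above, including the 6379 default port):

# Standalone sketch mirroring the host:port parsing shown above; not part of crawlo.
def parse_nodes(redis_url: str) -> list:
    url_part = redis_url.replace('redis://', '').replace('rediss://', '')
    nodes = []
    for node in url_part.split(','):
        host, _, port = node.strip().partition(':')
        nodes.append({'host': host, 'port': int(port) if port else 6379})  # 6379 when no port is given
    return nodes

print(parse_nodes('redis://10.0.0.1:7000,10.0.0.2:7001,10.0.0.3'))
# [{'host': '10.0.0.1', 'port': 7000}, {'host': '10.0.0.2', 'port': 7001}, {'host': '10.0.0.3', 'port': 6379}]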
@@ -99,12 +212,15 @@ class OptimizedRedisConnectionPool:
             await self._redis_client.ping()
             self._connection_tested = True
             # Only log connection test success in debug mode
-            self.
+            if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None and isinstance(self._redis_client, RedisCluster):
+                self.logger.debug(f"Redis cluster connection test succeeded: {self.redis_url}")
+            else:
+                self.logger.debug(f"Redis connection test succeeded: {self.redis_url}")
         except Exception as e:
             self.logger.error(f"Redis connection test failed: {self.redis_url} - {e}")
             raise
 
-    async def get_connection(self)
+    async def get_connection(self):
         """
         Get a Redis connection instance
 
@@ -149,9 +265,11 @@ class OptimizedRedisConnectionPool:
 
             self.logger.info("Redis connection pool closed")
         except Exception as e:
+            from crawlo.utils.error_handler import ErrorContext
+            error_context = ErrorContext(context="Failed to close the Redis connection pool")
             self.error_handler.handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
 
@@ -162,12 +280,11 @@ class OptimizedRedisConnectionPool:
         Returns:
             Statistics dictionary
         """
-        if self._connection_pool:
+        if self._connection_pool and hasattr(self._connection_pool, 'max_connections'):
             pool_stats = {
                 'max_connections': self._connection_pool.max_connections,
-                '
-                '
-                'in_use_connections': len(self._connection_pool._in_use_connections),
+                'available_connections': len(self._connection_pool._available_connections) if hasattr(self._connection_pool, '_available_connections') else 0,
+                'in_use_connections': len(self._connection_pool._in_use_connections) if hasattr(self._connection_pool, '_in_use_connections') else 0,
             }
             self._stats.update(pool_stats)
 
@@ -192,7 +309,7 @@ class OptimizedRedisConnectionPool:
 class RedisBatchOperationHelper:
     """Redis batch operation helper"""
 
-    def __init__(self, redis_client
+    def __init__(self, redis_client, batch_size: int = 100):
        self.redis_client = redis_client
        self.batch_size = batch_size
 
@@ -204,7 +321,7 @@ class RedisBatchOperationHelper:
     def logger(self):
         """Lazily initialize the logger"""
         if self._logger is None:
-            from crawlo.
+            from crawlo.logging import get_logger
             self._logger = get_logger(self.__class__.__name__)
         return self._logger
 
@@ -236,22 +353,34 @@ class RedisBatchOperationHelper:
                 self.logger.debug(f"Executing batch {i//actual_batch_size + 1}/{(len(operations)-1)//actual_batch_size + 1}")
 
                 try:
-
-
-
-
-
-
-
+                    # Handle pipeline operations (cluster-mode aware)
+                    if hasattr(self.redis_client, 'pipeline'):
+                        pipe = self.redis_client.pipeline()
+                        for operation in batch:
+                            command, *args = operation
+                            getattr(pipe, command)(*args)
+
+                        batch_results = await pipe.execute()
+                        results.extend(batch_results)
+                    else:
+                        # Cluster mode may not support cross-slot pipelines; execute one by one
+                        batch_results = []
+                        for operation in batch:
+                            command, *args = operation
+                            result = await getattr(self.redis_client, command)(*args)
+                            batch_results.append(result)
+                        results.extend(batch_results)
 
                 except Exception as e:
                     self.logger.error(f"Batch execution failed: {e}")
                     # Continue with the next batch instead of aborting
 
         except Exception as e:
+            from crawlo.utils.error_handler import ErrorContext
+            error_context = ErrorContext(context="Redis batch operation failed")
             self.error_handler.handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
 
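Judging from the unpacking command, *args = operation in the hunk above, each entry handed to batch_execute is a tuple of a Redis command name followed by its arguments. A hypothetical call through the execute_redis_batch convenience wrapper (added further down in this file's diff) might look like the following; the key names and the local Redis URL are placeholders:

import asyncio
from crawlo.utils.redis_connection_pool import execute_redis_batch

async def main():
    # Placeholder operations: (command name, *arguments) tuples.
    operations = [
        ('set', 'crawlo:task:1', 'pending'),
        ('hset', 'crawlo:stats', 'pages', 42),
        ('expire', 'crawlo:task:1', 3600),
    ]
    results = await execute_redis_batch('redis://localhost:6379', operations, batch_size=100)
    print(results)

asyncio.run(main())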
@@ -272,29 +401,47 @@ class RedisBatchOperationHelper:
             if not items:
                 return 0
 
-
-
-
-
-
-
+            # Handle cluster mode
+            if hasattr(self.redis_client, 'pipeline'):
+                pipe = self.redis_client.pipeline()
+                count = 0
+
+                for key, value in items.items():
+                    pipe.hset(hash_key, key, value)
+                    count += 1
+
+                    # Execute once the batch size is reached
+                    if count % self.batch_size == 0:
+                        await pipe.execute()
+                        pipe = self.redis_client.pipeline()
 
-                #
-                if count % self.batch_size
+                # Execute the remaining operations
+                if count % self.batch_size != 0:
                     await pipe.execute()
-
-
-
-
-
+            else:
+                # Cluster mode: execute one by one
+                count = 0
+                batch_count = 0
+                for key, value in items.items():
+                    await self.redis_client.hset(hash_key, key, value)
+                    count += 1
+                    batch_count += 1
+
+                    # Pause briefly after each full batch
+                    if batch_count % self.batch_size == 0:
+                        import asyncio
+                        await asyncio.sleep(0.001)  # avoid overly frequent requests
+                        batch_count = 0
 
             self.logger.debug(f"Batch set {count} hash field(s)")
             return count
 
         except Exception as e:
+            from crawlo.utils.error_handler import ErrorContext
+            error_context = ErrorContext(context="Redis batch hash set failed")
             self.error_handler.handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
             return 0
@@ -314,12 +461,20 @@ class RedisBatchOperationHelper:
             if not fields:
                 return {}
 
-            #
-
-
-            pipe.
-
-
+            # Handle cluster mode
+            if hasattr(self.redis_client, 'pipeline'):
+                # Fetch in bulk via a pipeline
+                pipe = self.redis_client.pipeline()
+                for field in fields:
+                    pipe.hget(hash_key, field)
+
+                results = await pipe.execute()
+            else:
+                # Cluster mode: fetch one by one
+                results = []
+                for field in fields:
+                    result = await self.redis_client.hget(hash_key, field)
+                    results.append(result)
 
             # Build the result dictionary
             result_dict = {}
@@ -331,47 +486,80 @@ class RedisBatchOperationHelper:
             return result_dict
 
         except Exception as e:
+            from crawlo.utils.error_handler import ErrorContext
+            error_context = ErrorContext(context="Redis batch hash get failed")
             self.error_handler.handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
             return {}
 
 
 # Global connection pool registry
-_connection_pools: Dict[str,
+_connection_pools: Dict[str, RedisConnectionPool] = {}
 
 
-def get_redis_pool(redis_url: str, **kwargs) ->
+def get_redis_pool(redis_url: str, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None, **kwargs) -> RedisConnectionPool:
     """
     Get a Redis connection pool instance (singleton pattern)
 
     Args:
         redis_url: Redis URL
+        is_cluster: whether to use cluster mode
+        cluster_nodes: list of cluster nodes
         **kwargs: connection pool configuration options
 
     Returns:
         Redis connection pool instance
     """
-
-
+    # Build a unique key that includes the cluster-related information
+    pool_key = f"{redis_url}_{is_cluster}_{','.join(cluster_nodes) if cluster_nodes else ''}"
 
-
+    if pool_key not in _connection_pools:
+        _connection_pools[pool_key] = RedisConnectionPool(redis_url, is_cluster, cluster_nodes, **kwargs)
+
+    return _connection_pools[pool_key]
 
 
 async def close_all_pools():
     """Close all connection pools"""
+    import asyncio
     global _connection_pools
 
-
-
+    from crawlo.logging import get_logger
+    logger = get_logger('RedisConnectionPool')
+
+    if not _connection_pools:
+        logger.debug("No Redis connection pools to close")
+        return
+
+    logger.info(f"Closing {len(_connection_pools)} Redis connection pool(s)...")
+
+    close_tasks = []
+    for pool_key, pool in _connection_pools.items():
+        try:
+            close_tasks.append(pool.close())
+        except Exception as e:
+            logger.error(f"Error scheduling close for pool {pool_key}: {e}")
+
+    # Close all connection pools concurrently
+    if close_tasks:
+        results = await asyncio.gather(*close_tasks, return_exceptions=True)
+
+        # Check the results
+        error_count = sum(1 for r in results if isinstance(r, Exception))
+        if error_count > 0:
+            logger.warning(f"Failed to close {error_count} pool(s)")
+        else:
+            logger.info("All Redis connection pools closed successfully")
 
     _connection_pools.clear()
+    logger.debug("Redis connection pools registry cleared")
 
 
 # Convenience functions
-async def execute_redis_batch(redis_url: str, operations: list, batch_size: int = 100) -> list:
+async def execute_redis_batch(redis_url: str, operations: list, batch_size: int = 100, is_cluster: bool = False, cluster_nodes: Optional[List[str]] = None) -> list:
     """
     Convenience function: execute Redis operations in batches
 
@@ -379,11 +567,13 @@ async def execute_redis_batch(redis_url: str, operations: list, batch_size: int
         redis_url: Redis URL
         operations: list of operations
         batch_size: batch size
+        is_cluster: whether to use cluster mode
+        cluster_nodes: list of cluster nodes
 
     Returns:
         List of execution results
     """
-    pool = get_redis_pool(redis_url)
+    pool = get_redis_pool(redis_url, is_cluster, cluster_nodes)
     redis_client = await pool.get_connection()
    helper = RedisBatchOperationHelper(redis_client, batch_size)
    return await helper.batch_execute(operations)
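Taken together, the new entry points in this file (get_redis_pool, RedisConnectionPool.get_connection, close_all_pools) suggest a usage pattern along the following lines. This is a minimal sketch based only on the signatures visible in the diff; the node addresses are placeholders, and client.set is an ordinary redis-py asyncio call rather than a crawlo API:

import asyncio
from crawlo.utils.redis_connection_pool import get_redis_pool, close_all_pools

async def main():
    # Placeholder addresses; a real deployment supplies its own cluster nodes.
    pool = get_redis_pool(
        'redis://10.0.0.1:7000,10.0.0.2:7000,10.0.0.3:7000',
        is_cluster=True,
    )
    client = await pool.get_connection()
    await client.set('crawlo:heartbeat', '1')
    await close_all_pools()

asyncio.run(main())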
crawlo/utils/request.py
CHANGED
@@ -73,12 +73,34 @@ def request_fingerprint(
 ) -> str:
     """
     Generate a request fingerprint based on method, normalized URL, body, and optional headers.
-
-
+
+    .. deprecated:: 1.0.0
+        This function is deprecated. Use :class:`crawlo.utils.fingerprint.FingerprintGenerator` instead:
+
+        .. code-block:: python
+
+            from crawlo.utils.fingerprint import FingerprintGenerator
+
+            fp = FingerprintGenerator.request_fingerprint(
+                method=request.method,
+                url=request.url,
+                body=request.body or b'',
+                headers=dict(request.headers) if hasattr(request, 'headers') else {}
+            )
+
+        This function is kept only for backward compatibility and will be removed in version 2.0.0.
+
     :param request: Request object (must provide method, url, body, headers)
     :param include_headers: list of header names (str or bytes) to include in the fingerprint calculation
     :return: request fingerprint (hex string)
     """
+    import warnings
+    warnings.warn(
+        "request_fingerprint() is deprecated. "
+        "Use FingerprintGenerator.request_fingerprint() instead.",
+        DeprecationWarning,
+        stacklevel=2
+    )
     from crawlo.utils.fingerprint import FingerprintGenerator
 
     # Prepare request data