crawlo-1.2.5-py3-none-any.whl → crawlo-1.2.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlo/__version__.py +1 -1
- crawlo/core/engine.py +3 -1
- crawlo/core/scheduler.py +102 -6
- crawlo/filters/aioredis_filter.py +44 -91
- crawlo/queue/queue_manager.py +47 -8
- crawlo/queue/redis_priority_queue.py +9 -2
- crawlo/settings/default_settings.py +5 -7
- crawlo/templates/project/settings.py.tmpl +3 -39
- crawlo/templates/project/settings_distributed.py.tmpl +4 -1
- crawlo/templates/project/settings_gentle.py.tmpl +59 -86
- crawlo/templates/project/settings_high_performance.py.tmpl +84 -99
- crawlo/templates/project/settings_simple.py.tmpl +72 -76
- crawlo/templates/run.py.tmpl +1 -3
- crawlo/utils/redis_connection_pool.py +19 -2
- {crawlo-1.2.5.dist-info → crawlo-1.2.6.dist-info}/METADATA +1 -1
- {crawlo-1.2.5.dist-info → crawlo-1.2.6.dist-info}/RECORD +19 -19
- {crawlo-1.2.5.dist-info → crawlo-1.2.6.dist-info}/WHEEL +0 -0
- {crawlo-1.2.5.dist-info → crawlo-1.2.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.5.dist-info → crawlo-1.2.6.dist-info}/top_level.txt +0 -0
crawlo/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.5"
+__version__ = "1.2.6"
crawlo/core/engine.py
CHANGED
@@ -88,8 +88,9 @@ class Engine(object):
         self.downloader = downloader_cls(self.crawler)
         if hasattr(self.downloader, 'open'):
             if asyncio.iscoroutinefunction(self.downloader.open):
-                await self.downloader.open()
+                self.downloader.open()
             else:
+                # DownloaderBase.open() is a synchronous method; call it directly instead of awaiting it
                 self.downloader.open()
 
         self.processor = Processor(self.crawler)
@@ -97,6 +98,7 @@ class Engine(object):
             if asyncio.iscoroutinefunction(self.processor.open):
                 await self.processor.open()
             else:
+                # Processor.open() is a synchronous method
                 self.processor.open()
 
         self.start_requests = iter(spider.start_requests())
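Note: the pattern behind both hunks is the same, probing a component's open() with asyncio.iscoroutinefunction and awaiting only the coroutine variant. A minimal, runnable sketch of that dispatch, using hypothetical stand-in components rather than Crawlo's real downloader and processor:

import asyncio

class SyncComponent:
    def open(self):
        # Plain method: call it directly; awaiting it would fail.
        print("sync open")

class AsyncComponent:
    async def open(self):
        # Coroutine: must be awaited, or it never actually runs.
        print("async open")

async def open_component(component) -> None:
    """Call open() correctly whether it is sync or async."""
    if hasattr(component, 'open'):
        if asyncio.iscoroutinefunction(component.open):
            await component.open()
        else:
            component.open()

async def main() -> None:
    await open_component(SyncComponent())
    await open_component(AsyncComponent())

asyncio.run(main())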
crawlo/core/scheduler.py
CHANGED
@@ -1,12 +1,13 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
 from typing import Optional, Callable
+import traceback
 
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import set_request
 from crawlo.utils.request_serializer import RequestSerializer
 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.queue.queue_manager import QueueManager, QueueConfig
+from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
 from crawlo.project import load_class, common_call
 
 
@@ -46,19 +47,114 @@ class Scheduler:
 
             # Initialize the queue
             self.logger.info("Starting queue manager initialization...")
-            success = await self.queue_manager.initialize()
-            if not success:
-                raise RuntimeError("Queue initialization failed")
+            needs_config_update = await self.queue_manager.initialize()
 
-
+            self.logger.info(f"Queue initialization finished, needs_config_update: {needs_config_update}")
+            self.logger.info(f"Current queue type: {self.queue_manager._queue_type}")
+
+            # Check whether the filter configuration needs to be updated
+            if needs_config_update:
+                # True means the queue type changed; inspect the current queue type to decide the direction of the update
+                self.logger.info("Configuration update required...")
+                if self.queue_manager._queue_type == QueueType.REDIS:
+                    self.logger.info("Updating to the Redis configuration...")
+                    self._update_filter_config_for_redis()
+                else:
+                    self.logger.info("Updating to the memory configuration...")
+                    self._update_filter_config_if_needed()
+            else:
+                # Check whether an update is needed even though the queue manager did not request one
+                self.logger.debug("Checking whether the configuration needs updating...")
+                if self.queue_manager._queue_type == QueueType.REDIS:
+                    # Check whether the current filter is the memory filter
+                    current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+                    if 'memory_filter' in current_filter_class:
+                        self.logger.info("Detected that an update to the Redis configuration is needed...")
+                        self._update_filter_config_for_redis()
+                elif self.queue_manager._queue_type == QueueType.MEMORY:
+                    # Check whether the current filter is a Redis filter
+                    current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+                    if 'aioredis_filter' in current_filter_class or 'redis_filter' in current_filter_class:
+                        self.logger.info("Detected that an update to the memory configuration is needed...")
+                        self._update_filter_config_if_needed()
+
+            # Only recreate the filter instance when the configuration was actually updated
+            # Check whether a configuration update really took place
+            filter_updated = (
+                (self.queue_manager._queue_type == QueueType.REDIS and 'memory_filter' in self.crawler.settings.get('FILTER_CLASS', '')) or
+                (self.queue_manager._queue_type == QueueType.MEMORY and ('aioredis_filter' in self.crawler.settings.get('FILTER_CLASS', '') or 'redis_filter' in self.crawler.settings.get('FILTER_CLASS', '')))
+            )
+
+            if needs_config_update or filter_updated:
+                # Recreate the filter instance so it uses the updated configuration
+                self.logger.debug("Recreating the filter instance...")
+                filter_cls = load_class(self.crawler.settings.get('FILTER_CLASS'))
+                self.dupe_filter = filter_cls.create_instance(self.crawler)
+                self.logger.info(f"✅ Filter instance updated to: {type(self.dupe_filter).__name__}")
+            else:
+                self.logger.debug("Filter configuration needs no update; skipping recreation")
+
+            # Log the queue status and configuration details
             status = self.queue_manager.get_status()
+            current_filter = self.crawler.settings.get('FILTER_CLASS')
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
+
             self.logger.info(f'Queue type: {status["type"]}, status: {status["health"]}')
-            self.logger.info(f'
+            self.logger.info(f'Current filter: {type(self.dupe_filter).__name__} ({current_filter})')
+            self.logger.info(f'Current dedup pipeline: {current_dedup_pipeline}')
             self.logger.info("Scheduler initialization complete")
         except Exception as e:
             self.logger.error(f"❌ Scheduler initialization failed: {e}")
             self.logger.debug(f"Detailed error:\n{traceback.format_exc()}")
             raise
+
+    def _update_filter_config_if_needed(self):
+        """Update the filter configuration when the queue type switched to memory mode"""
+        if self.queue_manager and self.queue_manager._queue_type == QueueType.MEMORY:
+            # Check whether the current filter is a Redis filter
+            current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+            if 'aioredis_filter' in current_filter_class or 'redis_filter' in current_filter_class:
+                # Switch to the memory filter
+                self.crawler.settings.set('FILTER_CLASS', 'crawlo.filters.memory_filter.MemoryFilter')
+                self.logger.info("✅ Filter configuration updated to memory mode")
+
+            # Check whether the current dedup pipeline is the Redis dedup pipeline
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+            if 'redis_dedup_pipeline' in current_dedup_pipeline:
+                # Switch to the memory dedup pipeline
+                self.crawler.settings.set('DEFAULT_DEDUP_PIPELINE', 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline')
+                # Also update the dedup pipeline entry in the PIPELINES list
+                pipelines = self.crawler.settings.get('PIPELINES', [])
+                if current_dedup_pipeline in pipelines:
+                    # Find the Redis dedup pipeline and replace it with the memory one
+                    index = pipelines.index(current_dedup_pipeline)
+                    pipelines[index] = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
+                    self.crawler.settings.set('PIPELINES', pipelines)
+                self.logger.info("✅ Dedup pipeline configuration updated to memory mode")
+
+    def _update_filter_config_for_redis(self):
+        """Update the filter configuration to the Redis implementations when the queue type is Redis"""
+        if self.queue_manager and self.queue_manager._queue_type == QueueType.REDIS:
+            # Check whether the current filter is the memory filter
+            current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+            if 'memory_filter' in current_filter_class:
+                # Switch to the Redis filter
+                self.crawler.settings.set('FILTER_CLASS', 'crawlo.filters.aioredis_filter.AioRedisFilter')
+                self.logger.info("✅ Filter configuration updated to Redis mode")
+
+            # Check whether the current dedup pipeline is the memory dedup pipeline
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+            if 'memory_dedup_pipeline' in current_dedup_pipeline:
+                # Switch to the Redis dedup pipeline
+                self.crawler.settings.set('DEFAULT_DEDUP_PIPELINE', 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline')
+                # Also update the dedup pipeline entry in the PIPELINES list
+                pipelines = self.crawler.settings.get('PIPELINES', [])
+                if current_dedup_pipeline in pipelines:
+                    # Find the memory dedup pipeline and replace it with the Redis one
+                    index = pipelines.index(current_dedup_pipeline)
+                    pipelines[index] = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
+                    self.crawler.settings.set('PIPELINES', pipelines)
+                self.logger.info("✅ Dedup pipeline configuration updated to Redis mode")
 
     async def next_request(self):
         """Fetch the next request"""
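Note: both _update_filter_config_* helpers rely on one idiom, rewriting a settings key and then swapping the matching entry of the PIPELINES list in place so the pipeline keeps its position. A standalone sketch of that swap against a plain dict (a stand-in for crawler.settings, which exposes get()/set() instead):

# Stand-in for crawler.settings; only the shape matters here.
settings = {
    'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
    'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
    'PIPELINES': ['crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'],
}

def switch_dedup_to_redis(settings: dict) -> None:
    """Swap the memory dedup components for their Redis counterparts, mirroring the scheduler's logic."""
    old_pipeline = settings['DEFAULT_DEDUP_PIPELINE']
    new_pipeline = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
    settings['FILTER_CLASS'] = 'crawlo.filters.aioredis_filter.AioRedisFilter'
    settings['DEFAULT_DEDUP_PIPELINE'] = new_pipeline
    pipelines = settings['PIPELINES']
    if old_pipeline in pipelines:
        # Replace in place so the dedup pipeline keeps its position (and thus its priority).
        pipelines[pipelines.index(old_pipeline)] = new_pipeline

switch_dedup_to_redis(settings)
assert settings['PIPELINES'] == ['crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline']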
crawlo/filters/aioredis_filter.py
CHANGED
@@ -1,18 +1,6 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-Redis filter implementation
-=================
-Provides Redis-based distributed request deduplication.
-
-Features:
-- Distributed support: multiple nodes share dedup data
-- TTL support: automatic expiry and cleanup
-- High performance: optimized with Redis pipelines
-- Fault tolerance: automatic retry on network errors
-"""
-import redis.asyncio as aioredis
 from typing import Optional
+import redis.asyncio as aioredis
+
 from crawlo.filters import BaseFilter
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import request_fingerprint
@@ -70,6 +58,9 @@ class AioRedisFilter(BaseFilter):
         # Performance counters
         self._redis_operations = 0
         self._pipeline_operations = 0
+
+        # Connection-state flag so we do not repeatedly retry a Redis that failed to connect
+        self._connection_failed = False
 
     @classmethod
     def create_instance(cls, crawler) -> 'BaseFilter':
@@ -123,8 +114,17 @@ class AioRedisFilter(BaseFilter):
 
     async def _get_redis_client(self):
         """Get the Redis client instance (lazy initialization)"""
+        # If a previous connection attempt failed, return None immediately
+        if self._connection_failed:
+            return None
+
         if self.redis is None and self._redis_pool is not None:
-            self.redis = await self._redis_pool.get_connection()
+            try:
+                self.redis = await self._redis_pool.get_connection()
+            except Exception as e:
+                self._connection_failed = True
+                self.logger.error(f"Redis connection failed, falling back to local dedup: {e}")
+                return None
         return self.redis
 
     async def requested(self, request) -> bool:
@@ -136,13 +136,17 @@ class AioRedisFilter(BaseFilter):
         """
         try:
            # Make sure the Redis client has been initialized
-            await self._get_redis_client()
+            redis_client = await self._get_redis_client()
+
+            # If Redis is unavailable, return False (not a duplicate) so requests are not lost
+            if redis_client is None:
+                return False
 
             fp = str(request_fingerprint(request))
             self._redis_operations += 1
 
             # Use a pipeline for better performance
-            pipe = self.redis.pipeline()
+            pipe = redis_client.pipeline()
             pipe.sismember(self.redis_key, fp)
 
             results = await pipe.execute()
@@ -173,12 +177,16 @@ class AioRedisFilter(BaseFilter):
         """
         try:
             # Make sure the Redis client has been initialized
-            await self._get_redis_client()
+            redis_client = await self._get_redis_client()
+
+            # If Redis is unavailable, return False to signal that the add failed
+            if redis_client is None:
+                return False
 
             fp = str(fp)
 
             # Use a pipeline for better performance
-            pipe = self.redis.pipeline()
+            pipe = redis_client.pipeline()
             pipe.sadd(self.redis_key, fp)
 
             if self.ttl and self.ttl > 0:
@@ -197,85 +205,30 @@ class AioRedisFilter(BaseFilter):
         except Exception as e:
             self.logger.error(f"Failed to add fingerprint: {fp[:20]}... - {e}")
             return False
-
-    def __contains__(self,
+
+    async def __contains__(self, fp: str) -> bool:
         """
-
+        Check whether a fingerprint exists in the Redis set
 
-        :param
-        :return:
+        :param fp: request fingerprint string
+        :return: whether the fingerprint exists
         """
-        # This is a synchronous method and cannot invoke asynchronous Redis operations directly
-        # Use the requested() method instead
-        raise NotImplementedError("Use the requested() method for asynchronous checks")
-
-    async def get_stats(self) -> dict:
-        """Get detailed filter statistics"""
         try:
             # Make sure the Redis client has been initialized
-            await self._get_redis_client()
-
-            count = await self.redis.scard(self.redis_key)
+            redis_client = await self._get_redis_client()
 
-            #
-
-
-            remaining_ttl = await self.redis.ttl(self.redis_key)
-            if remaining_ttl > 0:
-                ttl_info = f"{remaining_ttl} seconds remaining"
-            else:
-                ttl_info = f"{self.ttl} seconds configured"
-
-            stats = {
-                'filter_type': 'AioRedisFilter',
-                'fingerprint_count': count,
-                'redis_key': self.redis_key,
-                'ttl_config': ttl_info,
-                'redis_operations': self._redis_operations,
-                'pipeline_operations': self._pipeline_operations,
-                'optimization_rate': f"{self._pipeline_operations / max(1, self._redis_operations) * 100:.1f}%"
-            }
-
-            # Merge the base-class statistics
-            base_stats = super().get_stats()
-            stats.update(base_stats)
-
-            return stats
+            # If Redis is unavailable, return False (not present)
+            if redis_client is None:
+                return False
 
+            # Check whether the fingerprint exists
+            exists = await redis_client.sismember(self.redis_key, str(fp))
+            return exists
         except Exception as e:
-            self.logger.error(f"
-
-
-    async def clear_all(self) -> int:
-        """Clear all fingerprint data"""
-        try:
-            # Make sure the Redis client has been initialized
-            await self._get_redis_client()
-
-            deleted = await self.redis.delete(self.redis_key)
-            self.logger.info(f"Fingerprints cleared: {deleted}")
-            return deleted
-        except Exception as e:
-            self.logger.error("Failed to clear fingerprints")
-            raise
+            self.logger.error(f"Failed to check fingerprint existence: {fp[:20]}... - {e}")
+            # Return False on network errors so requests are not lost
+            return False
 
-    async def closed(self, reason: Optional[str] = None) -> None:
-        """Cleanup when the spider closes"""
-        try:
-            # Make sure the Redis client has been initialized
-            await self._get_redis_client()
-
-            if self.cleanup_fp:
-                deleted = await self.redis.delete(self.redis_key)
-                self.logger.info(f"Spider-close cleanup: deleted {deleted} fingerprints")
-            else:
-                count = await self.redis.scard(self.redis_key)
-                ttl_info = f"{self.ttl} seconds" if self.ttl else "persistent"
-                self.logger.info(f"Fingerprints kept: {count} (TTL: {ttl_info})")
-        finally:
-            await self._close_redis()
 
-
-
-        # The connection pool manages connections automatically; no explicit close is needed here
-        self.logger.debug("Redis connection released")
+# Export the class explicitly for compatibility
+__all__ = ['AioRedisFilter']
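Note: the error handling above is deliberately "fail-open": whenever Redis cannot be reached, every check answers "not seen", so requests keep flowing at the cost of possible duplicates. A reduced sketch of that policy (not the real class; the pool object and its get_connection() call are assumed stand-ins):

class FailOpenFilter:
    """Dedup filter that degrades to 'never a duplicate' when Redis is down."""

    def __init__(self, pool):
        self._pool = pool
        self._client = None
        self._connection_failed = False  # set once, so a dead Redis is never retried

    async def _get_client(self):
        if self._connection_failed:
            return None
        if self._client is None:
            try:
                self._client = await self._pool.get_connection()
            except Exception:
                self._connection_failed = True
                return None
        return self._client

    async def seen(self, fingerprint: str) -> bool:
        client = await self._get_client()
        if client is None:
            # Fail open: report "not seen" so the request is not lost.
            return False
        return await client.sismember("dedup:fingerprints", fingerprint)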
crawlo/queue/queue_manager.py
CHANGED
@@ -4,11 +4,11 @@
 Unified queue manager
 Provides a clean, consistent queue interface and handles the differences between queue types automatically
 """
-
-from enum import Enum
+import os
 import asyncio
 import traceback
-import
+from typing import Optional, Dict, Any, Union
+from enum import Enum
 
 from crawlo.utils.log import get_logger
 from crawlo.utils.request_serializer import RequestSerializer
@@ -103,11 +103,24 @@ class QueueManager:
             self._queue_type = queue_type
 
             # Test the queue's health
-            await self._health_check()
+            health_check_result = await self._health_check()
 
             self.logger.info(f"✅ Queue initialized successfully: {queue_type.value}")
-
-
+            # Only log detailed configuration info in debug mode
+            self.logger.debug(f"📊 Queue configuration: {self._get_queue_info()}")
+
+            # If the health check returned True, the queue type was switched and the configuration must be updated
+            if health_check_result:
+                return True
+
+            # If the queue type is Redis, check whether the configuration needs updating
+            if queue_type == QueueType.REDIS:
+                # This check has to happen in the scheduler, because the queue manager cannot access crawler.settings
+                # We must not always return True here, only when an update is actually needed
+                # The scheduler performs the more detailed check
+                pass
+
+            return False  # By default no configuration update is needed
 
         except Exception as e:
             # Log the detailed error message and stack trace
@@ -265,7 +278,15 @@ class QueueManager:
                 raise RuntimeError("Redis queue unavailable: the redis dependency is not installed")
             if not self.config.redis_url:
                 raise RuntimeError("Redis queue unavailable: REDIS_URL is not configured")
-            return QueueType.REDIS
+            # Test the Redis connection
+            try:
+                test_queue = RedisPriorityQueue(self.config.redis_url)
+                await test_queue.connect()
+                await test_queue.close()
+                return QueueType.REDIS
+            except Exception as e:
+                # Redis was requested explicitly, so a failed connection is an error
+                raise RuntimeError(f"Redis queue unavailable: cannot connect to Redis ({e})")
 
         elif self.config.queue_type == QueueType.MEMORY:
             return QueueType.MEMORY
@@ -307,7 +328,7 @@ class QueueManager:
         else:
             raise ValueError(f"Unsupported queue type: {queue_type}")
 
-    async def _health_check(self) ->
+    async def _health_check(self) -> bool:
         """Health check"""
         try:
             if self._queue_type == QueueType.REDIS:
@@ -317,9 +338,27 @@ class QueueManager:
             else:
                 # The memory queue is always healthy
                 self._health_status = "healthy"
+                return False  # The memory queue needs no configuration update
         except Exception as e:
             self.logger.warning(f"Queue health check failed: {e}")
             self._health_status = "unhealthy"
+            # If this is the Redis queue and the health check failed, try switching to the memory queue
+            if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
+                self.logger.info("Redis queue unavailable, trying to switch to the memory queue...")
+                try:
+                    await self._queue.close()
+                except:
+                    pass
+                self._queue = None
+                # Recreate the queue in memory
+                self._queue = await self._create_queue(QueueType.MEMORY)
+                self._queue_type = QueueType.MEMORY
+                self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
+                self._health_status = "healthy"
+                self.logger.info("✅ Switched to the memory queue")
+                # Return a signal that the filter and dedup pipeline configuration must be updated
+                return True
+            return False
 
     def _get_queue_info(self) -> Dict[str, Any]:
         """Get queue configuration info"""
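Note: the health check now doubles as a fallback trigger. When the Redis check fails in auto mode, the manager rebuilds the queue in memory and returns True, and initialize() propagates that signal so the scheduler can swap the filter and dedup pipeline as well. A condensed, runnable sketch of the control flow, with stand-in queue classes that are not Crawlo's:

import asyncio

class MemoryQueue:
    async def ping(self):
        return True

class FlakyRedisQueue:
    async def ping(self):
        raise ConnectionError("redis unreachable")

class Manager:
    def __init__(self, queue, auto_mode=True):
        self.queue = queue
        self.auto_mode = auto_mode

    async def health_check(self) -> bool:
        """Return True when the queue type was switched and the config must follow."""
        try:
            await self.queue.ping()
            return False
        except Exception:
            if self.auto_mode:
                self.queue = MemoryQueue()  # rebuild the queue in memory
                return True  # signal: filter/dedup configuration must be updated
            return False

async def main():
    mgr = Manager(FlakyRedisQueue())
    needs_config_update = await mgr.health_check()
    print(needs_config_update)  # True -> caller switches to the memory filter/dedup

asyncio.run(main())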
crawlo/queue/redis_priority_queue.py
CHANGED
@@ -77,7 +77,13 @@ class RedisPriorityQueue:
         """Connect to Redis asynchronously, with retry support"""
         async with self._lock:
             if self._redis is not None:
-                return self._redis
+                # Already connected: test whether the connection is still valid
+                try:
+                    await self._redis.ping()
+                    return self._redis
+                except Exception:
+                    # The connection went stale; reconnect
+                    self._redis = None
 
             for attempt in range(max_retries):
                 try:
@@ -97,7 +103,8 @@ class RedisPriorityQueue:
 
                     # Test the connection
                     await self._redis.ping()
-                    logger.info(f"✅ Redis connection successful (Module: {self.module_name})")
+                    # Only log detailed connection info in debug mode
+                    logger.debug(f"✅ Redis connection successful (Module: {self.module_name})")
                     return self._redis
                 except Exception as e:
                     error_msg = f"⚠️ Redis connection failed (attempt {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"
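Note: validating a cached client with PING before reuse is a standard redis-py pattern. A minimal sketch using redis.asyncio (the URL and class here are illustrative assumptions, not the real RedisPriorityQueue):

import redis.asyncio as aioredis

class ConnectionHolder:
    def __init__(self, url: str = "redis://127.0.0.1:6379"):
        self._url = url
        self._redis = None

    async def connect(self):
        """Reuse the cached client only if it still answers PING."""
        if self._redis is not None:
            try:
                await self._redis.ping()
                return self._redis  # still alive, reuse it
            except Exception:
                self._redis = None  # stale; fall through and reconnect
        self._redis = aioredis.from_url(self._url)
        await self._redis.ping()  # fail fast if the server is unreachable
        return self._redis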
crawlo/settings/default_settings.py
CHANGED
@@ -3,7 +3,6 @@
 Default configuration file
 Contains all default settings of the Crawlo framework
 """
-import os
 
 # Import the environment-variable configuration helpers
 from crawlo.utils.env_config import get_redis_config, get_runtime_config
@@ -13,9 +12,6 @@ from crawlo.utils.env_config import get_redis_config, get_runtime_config
 # Project name (used for logging, Redis keys, and other identifiers)
 PROJECT_NAME = get_runtime_config()['PROJECT_NAME']
 
-# Framework version
-VERSION = 1.0
-
 # Run mode: standalone/distributed/auto
 RUN_MODE = get_runtime_config()['CRAWLO_MODE']
 
@@ -46,9 +42,11 @@ SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
 # Queue type: memory/redis/auto
 QUEUE_TYPE = 'auto'
 
-
-
-
+
+# Default to the memory filter and dedup pipeline so the framework runs even without Redis
+# In auto mode, if Redis is available, the framework automatically upgrades to the Redis implementations for stronger dedup
+DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
+FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
 
 # --- Redis filter configuration ---
 # Use the environment-variable helpers to obtain the Redis configuration
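Note: the decision the new defaults encode (memory-first, upgraded to Redis when available) can be written out as a small helper. This is a hypothetical illustration of the selection logic, not a function the framework exposes:

def pick_dedup_components(redis_available: bool) -> dict:
    """Choose filter/dedup implementations the way auto mode does."""
    if redis_available:
        return {
            'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
            'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
        }
    return {
        'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
        'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
    }

print(pick_dedup_components(redis_available=False)['FILTER_CLASS'])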
crawlo/templates/project/settings.py.tmpl
CHANGED
@@ -75,43 +75,6 @@ INTERVAL = 5
 DEPTH_PRIORITY = 1
 MAX_RUNNING_SPIDERS = 3
 
-# ============================== Run-mode selection ==============================
-# Run mode: 'standalone', 'distributed', or 'auto' (auto-detect)
-#
-# Best use cases for the three run modes:
-#
-# 1. standalone:
-#    - Use cases: development and debugging, small-scale scraping, personal projects
-#    - Traits: simple to use, low resource footprint, no extra dependencies
-#    - Suggested configuration:
-#      * QUEUE_TYPE = 'auto' (pick the queue type automatically)
-#      * FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter' (memory filter)
-#      * DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline' (memory dedup)
-#    - Hybrid configuration (recommended):
-#      * QUEUE_TYPE = 'auto' (automatic selection)
-#      * FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter' (Redis filter)
-#      * DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline' (Redis dedup)
-#      * Benefit: durable Redis-based dedup while keeping deployment simple
-#
-# 2. distributed:
-#    - Use cases: large-scale scraping, multi-node cooperation, high-concurrency needs
-#    - Traits: scales across nodes, high-concurrency processing, requires Redis
-#    - Suggested configuration:
-#      * QUEUE_TYPE = 'redis' (Redis queue)
-#      * FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter' (Redis filter)
-#      * DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline' (Redis dedup)
-#    - Deployment requirement: Redis server connection parameters must be configured
-#
-# 3. auto:
-#    - Use cases: let the framework pick the best way to run for the environment
-#    - Traits: detects the environment and selects the run mode automatically
-#    - Suggested configuration:
-#      * The framework chooses the queue type based on Redis availability
-#      * The memory filter and dedup pipeline are used by default
-#    - Fits situations where one configuration must work across environments
-
-RUN_MODE = 'standalone'  # standalone by default; simple to use
-
 # ============================== Queue configuration (distributed-ready) ==============================
 
 # Queue type: 'auto' (automatic), 'memory' (in-memory queue), 'redis' (distributed queue)
@@ -157,8 +120,9 @@ MONGO_USE_BATCH = False  # whether batch inserts are enabled
 REQUEST_DIR = '.'
 
 # Configure the default dedup pipeline and filter explicitly to avoid redundant if-else checks
-
-
+# In standalone mode, use Redis dedup when Redis is available, otherwise memory dedup
+DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
+FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
 
 # --- Redis configuration (for distributed dedup and queues) ---
 REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
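Note: the REDIS_HOST line above extends naturally to a full connection URL built from environment variables. REDIS_PORT and the db index here are assumptions for illustration, not values shown in this diff:

import os

# Read Redis connection settings from the environment, with local defaults.
REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))  # assumed variable name
REDIS_URL = f"redis://{REDIS_HOST}:{REDIS_PORT}/0"  # db 0 assumed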
crawlo/templates/project/settings_distributed.py.tmpl
CHANGED
@@ -12,7 +12,10 @@ from crawlo.config import CrawloConfig
 PROJECT_NAME = '{{project_name}}'
 
 # ============================== Distributed configuration notes ==============================
-
+RUN_MODE = 'distributed'
+QUEUE_TYPE = 'redis'
+FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
+DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
 # This template is designed for distributed deployments and fits the following scenarios:
 # - Large-scale data collection tasks
 # - Projects that need multiple nodes working together