crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/queue/redis_priority_queue.py
CHANGED

@@ -1,19 +1,26 @@
 import asyncio
-import asyncio
 import pickle
 import time
 import traceback
-from typing import Optional, TYPE_CHECKING
+from typing import Optional, TYPE_CHECKING, List, Union, Any

 import redis.asyncio as aioredis

+# Try to import Redis cluster support
+try:
+    from redis.asyncio.cluster import RedisCluster
+    REDIS_CLUSTER_AVAILABLE = True
+except ImportError:
+    RedisCluster = None
+    REDIS_CLUSTER_AVAILABLE = False
+
 # Use TYPE_CHECKING to avoid circular imports at runtime
 if TYPE_CHECKING:
     from crawlo import Request

-from crawlo.utils.error_handler import ErrorHandler
-from crawlo.
-from crawlo.utils.redis_connection_pool import get_redis_pool,
+from crawlo.utils.error_handler import ErrorHandler, ErrorContext
+from crawlo.logging import get_logger
+from crawlo.utils.redis_connection_pool import get_redis_pool, RedisConnectionPool
 from crawlo.utils.request_serializer import RequestSerializer

 # Lazy initialization to avoid circular dependencies
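The try/except import guard above makes cluster support optional. A minimal, self-contained sketch of how such an optional-dependency flag is typically consumed when building a client (illustrative only; the make_client helper below is hypothetical and not part of crawlo, and it assumes redis-py >= 4.3 with asyncio support):

# Illustrative sketch only - not crawlo code.
import redis.asyncio as aioredis

try:
    from redis.asyncio.cluster import RedisCluster, ClusterNode
    REDIS_CLUSTER_AVAILABLE = True
except ImportError:
    RedisCluster = None
    ClusterNode = None
    REDIS_CLUSTER_AVAILABLE = False


def make_client(redis_url, cluster_nodes=None):
    """Return a cluster client when nodes are given and cluster support is installed,
    otherwise fall back to a single-instance client."""
    if cluster_nodes and REDIS_CLUSTER_AVAILABLE:
        startup = [ClusterNode(host, int(port))
                   for host, port in (node.split(":") for node in cluster_nodes)]
        return RedisCluster(startup_nodes=startup, decode_responses=False)
    return aioredis.from_url(redis_url, decode_responses=False)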
@@ -42,14 +49,16 @@ class RedisPriorityQueue:

     def __init__(
         self,
-        redis_url: str = None,
-        queue_name: str = None,  # default changed to None
-        processing_queue: str = None,  # default changed to None
-        failed_queue: str = None,  # default changed to None
+        redis_url: Optional[str] = None,
+        queue_name: Optional[str] = None,  # default changed to None
+        processing_queue: Optional[str] = None,  # default changed to None
+        failed_queue: Optional[str] = None,  # default changed to None
         max_retries: int = 3,
         timeout: int = 300,  # task processing timeout (seconds)
         max_connections: int = 10,  # connection pool size
-        module_name: str = "default"  # module_name parameter
+        module_name: str = "default",  # module_name parameter
+        is_cluster: bool = False,  # whether to run in cluster mode
+        cluster_nodes: Optional[List[str]] = None  # list of cluster nodes
     ):
         # Do not read os.getenv() directly; redis_url must be passed in as a parameter
         if redis_url is None:

@@ -58,6 +67,8 @@ class RedisPriorityQueue:

         self.redis_url = redis_url
         self.module_name = module_name  # store module_name
+        self.is_cluster = is_cluster
+        self.cluster_nodes = cluster_nodes

         # If queue_name is not provided, derive it from module_name
         if queue_name is None:

@@ -87,8 +98,8 @@ class RedisPriorityQueue:
         self.max_retries = max_retries
         self.timeout = timeout
         self.max_connections = max_connections
-        self._redis_pool: Optional[
-        self._redis: Optional[
+        self._redis_pool: Optional[RedisConnectionPool] = None
+        self._redis: Optional[Any] = None
         self._lock = asyncio.Lock()  # lock used for connection initialization
         self.request_serializer = RequestSerializer()  # handles serialization

@@ -150,6 +161,8 @@ class RedisPriorityQueue:
             # Use the optimized connection pool; decode_responses=False avoids encoding issues
             self._redis_pool = get_redis_pool(
                 self.redis_url,
+                is_cluster=self.is_cluster,
+                cluster_nodes=self.cluster_nodes,
                 max_connections=self.max_connections,
                 socket_connect_timeout=5,
                 socket_timeout=30,

@@ -162,9 +175,8 @@ class RedisPriorityQueue:
                 self._redis = await self._redis_pool.get_connection()

                 # Test the connection
-
-
-                # get_module_logger().debug(f"Redis 连接成功 (Module: {self.module_name})")  # duplicate log entry, commented out
+                if self._redis:
+                    await self._redis.ping()
                 return self._redis
             except Exception as e:
                 error_msg = f"Redis 连接失败 (尝试 {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"

@@ -180,16 +192,28 @@ class RedisPriorityQueue:
         if self._redis is None:
             await self.connect()
         try:
-
+            if self._redis:
+                await self._redis.ping()
         except Exception as e:
             get_module_logger().warning(f"Redis 连接失效 (Module: {self.module_name}),尝试重连...: {e}")
             self._redis = None
             await self.connect()

+    def _is_cluster_mode(self) -> bool:
+        """Check whether the client is running in cluster mode"""
+        if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None:
+            # Check whether _redis is a RedisCluster instance
+            if self._redis is not None and isinstance(self._redis, RedisCluster):
+                return True
+        return False
+
     async def put(self, request, priority: int = 0) -> bool:
         """Put a request onto the queue"""
         try:
             await self._ensure_connection()
+            if not self._redis:
+                return False
+
             # Fix priority ordering consistency
             # Before: score = -priority (requests with a larger priority value were popped first)
             # Now: score = priority (smaller priority values pop first, consistent with the memory queue)

@@ -208,18 +232,34 @@ class RedisPriorityQueue:
                 get_module_logger().error(f"请求序列化验证失败 (Module: {self.module_name}): {serialize_error}")
                 return False

-
-
-
-
+            # Cluster-mode handling
+            if self._is_cluster_mode():
+                # In cluster mode all keys must land in the same slot,
+                # which is achieved by adding the same hash tag to every key name
+                hash_tag = "{queue}"  # the hash tag keeps the keys in one slot
+                queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                data_key_with_tag = f"{self.queue_name}:data{hash_tag}"
+
+                pipe = self._redis.pipeline()
+                pipe.zadd(queue_name_with_tag, {key: score})
+                pipe.hset(data_key_with_tag, key, serialized)
+                result = await pipe.execute()
+            else:
+                pipe = self._redis.pipeline()
+                pipe.zadd(self.queue_name, {key: score})
+                pipe.hset(f"{self.queue_name}:data", key, serialized)
+                result = await pipe.execute()

             if result[0] > 0:
-                get_module_logger().debug(f"成功入队 (Module: {self.module_name}): {request.url}")
+                get_module_logger().debug(f"成功入队 (Module: {self.module_name}): {request.url}")
             return result[0] > 0
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"放入队列失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
             return False
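The "{queue}" hash tag above is what makes the ZADD/HSET pipeline legal on a cluster: Redis Cluster only allows multi-key operations when all keys hash to the same slot, and a shared "{...}" tag forces exactly that. A standalone sketch of the same technique (illustrative only; key names and payload are made up, and it assumes redis-py's asyncio client):

# Standalone illustration of the hash-tag trick; assumes redis-py's asyncio client.
import asyncio
import redis.asyncio as aioredis


async def enqueue(url: str) -> None:
    r = aioredis.from_url("redis://localhost:6379/0", decode_responses=False)
    # Both keys share the literal "{queue}" tag, so Redis Cluster hashes them
    # to the same slot and the pipelined multi-key write is allowed.
    zset_key = "demo:requests{queue}"
    hash_key = "demo:requests:data{queue}"
    pipe = r.pipeline()
    pipe.zadd(zset_key, {url: 0})          # priority score
    pipe.hset(hash_key, url, b"payload")   # serialized request body
    await pipe.execute()
    await r.close()


asyncio.run(enqueue("https://example.com"))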
@@ -231,24 +271,54 @@ class RedisPriorityQueue:
         """
         try:
             await self._ensure_connection()
+            if not self._redis:
+                return None
+
             start_time = asyncio.get_event_loop().time()

             while True:
                 # Try to fetch a task
-
+                if self._is_cluster_mode():
+                    # Cluster-mode handling
+                    hash_tag = "{queue}"
+                    queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                    result = await self._redis.zpopmin(queue_name_with_tag, count=1)
+                else:
+                    result = await self._redis.zpopmin(self.queue_name, count=1)
+
                 if result:
                     key, score = result[0]
-
+                    data_key = f"{self.queue_name}:data"
+                    if self._is_cluster_mode():
+                        hash_tag = "{queue}"
+                        data_key = f"{self.queue_name}:data{hash_tag}"
+
+                    serialized = await self._redis.hget(data_key, key)
                     if not serialized:
                         continue

                     # Move the task to the processing queue
                     processing_key = f"{key}:{int(time.time())}"
-
-
-
-
-
+                    processing_queue = self.processing_queue
+                    processing_data_key = f"{self.processing_queue}:data"
+
+                    if self._is_cluster_mode():
+                        hash_tag = "{queue}"
+                        processing_queue = f"{self.processing_queue}{hash_tag}"
+                        processing_data_key = f"{self.processing_queue}:data{hash_tag}"
+
+                    if self._is_cluster_mode():
+                        pipe = self._redis.pipeline()
+                        pipe.zadd(processing_queue, {processing_key: time.time() + self.timeout})
+                        pipe.hset(processing_data_key, processing_key, serialized)
+                        pipe.hdel(data_key, key)
+                        await pipe.execute()
+                    else:
+                        pipe = self._redis.pipeline()
+                        pipe.zadd(processing_queue, {processing_key: time.time() + self.timeout})
+                        pipe.hset(processing_data_key, processing_key, serialized)
+                        pipe.hdel(data_key, key)
+                        await pipe.execute()

                     # Safer deserialization
                     try:

@@ -263,8 +333,12 @@ class RedisPriorityQueue:
                     # If pickle deserialization fails, log the error and skip this task
                         get_module_logger().error(f"无法反序列化请求数据 (Module: {self.module_name}): {pickle_error}")
                         # Remove this invalid task from the processing queue
-
-
+                        if self._is_cluster_mode():
+                            await self._redis.zrem(processing_queue, processing_key)
+                            await self._redis.hdel(processing_data_key, processing_key)
+                        else:
+                            await self._redis.zrem(processing_queue, processing_key)
+                            await self._redis.hdel(processing_data_key, processing_key)
                         # Move on to the next task
                         continue

@@ -276,9 +350,12 @@ class RedisPriorityQueue:
                 await asyncio.sleep(0.001)  # reduced from 0.01 to 0.001

         except Exception as e:
+            error_context = ErrorContext(
+                context=f"获取队列任务失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
             return None

@@ -287,22 +364,46 @@ class RedisPriorityQueue:
         """Acknowledge that a task has completed"""
         try:
             await self._ensure_connection()
+            if not self._redis:
+                return
+
             key = self._get_request_key(request)
+            processing_queue = self.processing_queue
+            processing_data_key = f"{self.processing_queue}:data"
+
+            if self._is_cluster_mode():
+                hash_tag = "{queue}"
+                processing_queue = f"{self.processing_queue}{hash_tag}"
+                processing_data_key = f"{self.processing_queue}:data{hash_tag}"
+
             cursor = 0
             while True:
-
+                if self._is_cluster_mode():
+                    cursor, keys = await self._redis.zscan(processing_queue, cursor, match=f"{key}:*")
+                else:
+                    cursor, keys = await self._redis.zscan(processing_queue, cursor, match=f"{key}:*")
                 if keys:
-
-
-
-
+                    if self._is_cluster_mode():
+                        pipe = self._redis.pipeline()
+                        for k in keys:
+                            pipe.zrem(processing_queue, k)
+                            pipe.hdel(processing_data_key, k)
+                        await pipe.execute()
+                    else:
+                        pipe = self._redis.pipeline()
+                        for k in keys:
+                            pipe.zrem(processing_queue, k)
+                            pipe.hdel(processing_data_key, k)
+                        await pipe.execute()
                 if cursor == 0:
                     break
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"确认任务完成失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )

@@ -310,10 +411,20 @@ class RedisPriorityQueue:
         """Mark a task as failed"""
         try:
             await self._ensure_connection()
+            if not self._redis:
+                return
+
             key = self._get_request_key(request)
             await self.ack(request)

             retry_key = f"{self.failed_queue}:retries:{key}"
+            failed_queue = self.failed_queue
+
+            if self._is_cluster_mode():
+                hash_tag = "{queue}"
+                retry_key = f"{self.failed_queue}:retries:{key}{hash_tag}"
+                failed_queue = f"{self.failed_queue}{hash_tag}"
+
             retries = await self._redis.incr(retry_key)
             await self._redis.expire(retry_key, 86400)

@@ -329,12 +440,15 @@ class RedisPriorityQueue:
                     "failed_at": time.time(),
                     "request_pickle": pickle.dumps(request).hex(),  # optional: keep the full request
                 }
-                await self._redis.lpush(
+                await self._redis.lpush(failed_queue, pickle.dumps(failed_data))
                 get_module_logger().error(f"任务彻底失败 [{retries}次] (Module: {self.module_name}): {request.url}")
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"标记任务失败失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )

@@ -346,11 +460,22 @@ class RedisPriorityQueue:
         """Get queue size"""
         try:
             await self._ensure_connection()
-
+            if not self._redis:
+                return 0
+
+            if self._is_cluster_mode():
+                hash_tag = "{queue}"
+                queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                return await self._redis.zcard(queue_name_with_tag)
+            else:
+                return await self._redis.zcard(self.queue_name)
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"Failed to get queue size (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
             return 0

@@ -358,13 +483,37 @@ class RedisPriorityQueue:
     async def close(self):
         """Close the connection"""
         try:
-            #
-            self._redis
+            # Explicitly close the Redis connection
+            if self._redis is not None:
+                try:
+                    # Try to close the connection
+                    if hasattr(self._redis, 'close'):
+                        close_result = self._redis.close()
+                        if asyncio.iscoroutine(close_result):
+                            await close_result
+
+                    # Wait for the connection to finish closing
+                    if hasattr(self._redis, 'wait_closed'):
+                        wait_result = self._redis.wait_closed()
+                        if asyncio.iscoroutine(wait_result):
+                            await wait_result
+                except Exception as close_error:
+                    get_module_logger().warning(
+                        f"Error closing Redis connection (Module: {self.module_name}): {close_error}"
+                    )
+                finally:
+                    self._redis = None
+
+            # Drop the connection-pool reference (the pool itself is owned by the global manager)
             self._redis_pool = None
+
             get_module_logger().debug(f"Redis 连接已释放 (Module: {self.module_name})")
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"释放 Redis 连接失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
-            )
+            )
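The close() rewrite above guards every call with hasattr and asyncio.iscoroutine so it works whether the underlying client exposes a synchronous or an asynchronous close()/wait_closed(). A compact sketch of that defensive pattern in isolation (illustrative only, not crawlo code):

# Generic "maybe await" helper: call an optional method and await it only if it
# returned a coroutine. Works with both sync and async client implementations.
import asyncio


async def call_and_maybe_await(obj, method_name: str) -> None:
    method = getattr(obj, method_name, None)
    if method is None:
        return                      # the client simply doesn't have this method
    result = method()
    if asyncio.iscoroutine(result):
        await result                # async client: wait for the coroutine to finish


async def shutdown(client) -> None:
    # Mirrors the order used above: close first, then wait for the close to settle.
    await call_and_maybe_await(client, "close")
    await call_and_maybe_await(client, "wait_closed")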
crawlo/settings/default_settings.py
CHANGED

@@ -4,7 +4,7 @@
 Contains all default settings of the Crawlo framework
 """
 # Import the environment-variable configuration helper
-from crawlo.utils.
+from crawlo.utils.config_manager import EnvConfigManager

 # --------------------------------- 1. Framework base configuration ------------------------------------

@@ -19,9 +19,9 @@ FRAMEWORK_INIT_ORDER = [
 FRAMEWORK_INIT_STATE = 'uninitialized'

 # Project base configuration
-runtime_config = get_runtime_config()
+runtime_config = EnvConfigManager.get_runtime_config()
 PROJECT_NAME = runtime_config['PROJECT_NAME']  # project name (used for logging, Redis keys, etc.)
-VERSION = get_version()  # project version - read from the framework's __version__.py, falling back to a default
+VERSION = EnvConfigManager.get_version()  # project version - read from the framework's __version__.py, falling back to a default
 RUN_MODE = runtime_config['CRAWLO_MODE']  # run mode: standalone/distributed/auto
 CONCURRENCY = runtime_config['CONCURRENCY']  # concurrency setting

@@ -70,12 +70,19 @@ MYSQL_INSERT_IGNORE = False  # whether to use INSERT IGNORE (ignore duplicate rows)
 MYSQL_UPDATE_COLUMNS = ()  # columns to update on conflict; when set, MYSQL_AUTO_UPDATE is ignored

 # Redis configuration
-redis_config = get_redis_config()
+redis_config = EnvConfigManager.get_redis_config()
 REDIS_HOST = redis_config['REDIS_HOST']
 REDIS_PORT = redis_config['REDIS_PORT']
 REDIS_PASSWORD = redis_config['REDIS_PASSWORD']
 REDIS_DB = redis_config['REDIS_DB']

+# Redis cluster support:
+# The Crawlo framework switches intelligently between Redis single-instance and cluster mode.
+# Cluster mode can be configured in two ways:
+# 1. A comma-separated node list: '192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+# 2. A cluster URL: 'redis-cluster://192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+# The framework detects the URL format automatically and picks the appropriate mode.
+
 # Build a different URL format depending on whether a password is configured
 if REDIS_PASSWORD:
     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
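The cluster note above describes two accepted spellings for the node list. A small sketch of how a 'redis-cluster://' style value could be reduced to a host:port list (the helper below is hypothetical and only illustrates the format; it is not crawlo's detection code):

# Hypothetical parser for the two cluster formats described above.
from typing import List, Tuple


def parse_cluster_nodes(value: str) -> Tuple[bool, List[str]]:
    """Return (is_cluster, nodes) for a comma-separated list or a redis-cluster:// URL."""
    if value.startswith("redis-cluster://"):
        value = value[len("redis-cluster://"):]
        is_cluster = True
    else:
        # A plain comma-separated list is also treated as a cluster definition.
        is_cluster = "," in value
    nodes = [node.strip() for node in value.split(",") if node.strip()]
    return is_cluster, nodes


# Example:
# parse_cluster_nodes('redis-cluster://192.168.1.100:7000,192.168.1.101:7000')
# -> (True, ['192.168.1.100:7000', '192.168.1.101:7000'])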
@@ -142,8 +149,11 @@ STATS_DUMP = True  # whether to dump statistics periodically
 LOG_FILE = None  # log file path, set in the project configuration
 LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
 LOG_ENCODING = 'utf-8'
-LOG_MAX_BYTES = 10 * 1024 * 1024  #
-LOG_BACKUP_COUNT = 5  #
+LOG_MAX_BYTES = 10 * 1024 * 1024  # log rotation size in bytes; 20MB recommended in production
+LOG_BACKUP_COUNT = 5  # number of log backups; 10 recommended in production
+# To disable log rotation entirely, set LOG_MAX_BYTES = 0.
+# Note: when LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation never happens and the log file keeps growing;
+# manage disk space by other means, such as a system-level rotation tool (logrotate, etc.).

 # Log interval configuration
 INTERVAL = 60  # log output interval (seconds)
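The rotation semantics described above match Python's standard logging.handlers.RotatingFileHandler, where rollover never occurs if maxBytes or backupCount is zero. A minimal sketch wiring these settings to such a handler (assumed mapping for illustration; the framework's actual wiring lives in its logging package):

# Sketch: map the LOG_* settings onto a stdlib RotatingFileHandler.
import logging
import os
from logging.handlers import RotatingFileHandler

LOG_FILE = "logs/crawlo.log"
LOG_FORMAT = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
LOG_ENCODING = "utf-8"
LOG_MAX_BYTES = 20 * 1024 * 1024
LOG_BACKUP_COUNT = 10

os.makedirs("logs", exist_ok=True)
handler = RotatingFileHandler(
    LOG_FILE,
    maxBytes=LOG_MAX_BYTES,        # 0 disables size-based rollover
    backupCount=LOG_BACKUP_COUNT,  # 0 keeps a single, ever-growing file
    encoding=LOG_ENCODING,
)
handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger("crawlo").addHandler(handler)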
|
crawlo/spider/__init__.py
CHANGED
|
@@ -28,10 +28,11 @@ Crawlo Spider Module
|
|
|
28
28
|
yield Item(data=response.json())
|
|
29
29
|
"""
|
|
30
30
|
from __future__ import annotations
|
|
31
|
-
from typing import Type, Any, Optional, List, Dict, Union, Iterator, AsyncIterator
|
|
32
|
-
from ..network.request import Request
|
|
33
|
-
from ..utils.log import get_logger
|
|
34
31
|
|
|
32
|
+
from typing import Type, Any, Optional, List, Dict, Iterator
|
|
33
|
+
|
|
34
|
+
from ..logging import get_logger
|
|
35
|
+
from ..network.request import Request
|
|
35
36
|
|
|
36
37
|
# 全局爬虫注册表
|
|
37
38
|
_DEFAULT_SPIDER_REGISTRY: dict[str, Type[Spider]] = {}
|
|
@@ -79,7 +80,7 @@ class SpiderMeta(type):
|
|
|
79
80
|
_DEFAULT_SPIDER_REGISTRY[spider_name] = cls
|
|
80
81
|
# 延迟初始化logger避免模块级别阻塞
|
|
81
82
|
try:
|
|
82
|
-
from crawlo.
|
|
83
|
+
from crawlo.logging import get_logger
|
|
83
84
|
get_logger(__name__).debug(f"自动注册爬虫: {spider_name} -> {cls.__name__}")
|
|
84
85
|
except:
|
|
85
86
|
# 如果日志系统未初始化,静默失败
|
|
@@ -171,7 +172,7 @@ class Spider(metaclass=SpiderMeta):
|
|
|
171
172
|
def logger(self):
|
|
172
173
|
"""延迟初始化logger"""
|
|
173
174
|
if self._logger is None:
|
|
174
|
-
from crawlo.
|
|
175
|
+
from crawlo.logging import get_logger
|
|
175
176
|
self._logger = get_logger(self.name)
|
|
176
177
|
return self._logger
|
|
177
178
|
|
crawlo/stats_collector.py
CHANGED

@@ -6,7 +6,7 @@
 # @Desc   : statistics collector
 """
 from pprint import pformat
-from crawlo.
+from crawlo.logging import get_logger


 class StatsCollector(object):

@@ -15,7 +15,7 @@ class StatsCollector(object):
         self.crawler = crawler
         self._dump = self.crawler.settings.get_bool('STATS_DUMP')
         self._stats = {}
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)

     def inc_value(self, key, count=1, start=0):
         self._stats[key] = self._stats.setdefault(key, start) + count
crawlo/task_manager.py
CHANGED
crawlo/templates/crawlo.cfg.tmpl
CHANGED

crawlo/templates/project/items.py.tmpl
CHANGED

@@ -1,17 +1,13 @@
 # -*- coding: UTF-8 -*-
 """
-
-======================
-Define the data structures you scrape.
+Item definitions
 """

 from crawlo.items import Item, Field


 class {{project_name|title}}Item(Item):
-    """
-    Items for the {{project_name}} project.
-    """
+    """Item definition"""
     id = Field()
     # price = Field()
     # description = Field()
crawlo/templates/project/settings.py.tmpl
CHANGED

@@ -4,13 +4,13 @@
 =============================
 Spider project configuration based on the Crawlo framework.

-This configuration uses CrawloConfig.
-
+This configuration is created with the CrawloConfig.auto() factory in auto-detect mode:
+the framework probes Redis availability and uses distributed mode when Redis is reachable, otherwise standalone mode.
 """

 from crawlo.config import CrawloConfig

-#
+# Create the configuration with the auto-detect configuration factory
 config = CrawloConfig.auto(
     project_name='{{project_name}}',
     concurrency=8,
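For readers unfamiliar with the auto-detect idea, the sketch below shows the general shape of such a probe: try to reach Redis and fall back to standalone mode on failure. It is purely illustrative and does not reflect CrawloConfig's actual implementation:

# Illustrative only - a generic "is Redis reachable?" probe, not CrawloConfig internals.
import redis


def detect_run_mode(redis_url: str = "redis://127.0.0.1:6379/0") -> str:
    """Return 'distributed' when Redis answers a PING, otherwise 'standalone'."""
    client = redis.Redis.from_url(redis_url, socket_connect_timeout=2)
    try:
        client.ping()
        return "distributed"
    except redis.exceptions.RedisError:
        return "standalone"
    finally:
        client.close()


print(detect_run_mode())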
@@ -25,12 +25,10 @@ locals().update(config.to_dict())
 # Spider module configuration
 SPIDER_MODULES = ['{{project_name}}.spiders']

-#
-# Default request headers for DefaultHeaderMiddleware
+# Default request headers
 # DEFAULT_REQUEST_HEADERS = {}

 # Allowed domains
-# Allowed domains for OffsiteMiddleware
 # ALLOWED_DOMAINS = []

 # Data pipelines

@@ -58,6 +56,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8'  # explicitly set the log file encoding
+LOG_MAX_BYTES = 20 * 1024 * 1024  # 20MB, recommended value
+LOG_BACKUP_COUNT = 10  # 10 backup files, recommended value
+# To disable log rotation, set LOG_MAX_BYTES = 0
+# When LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation is disabled and the log file keeps growing
 STATS_DUMP = True

 # Output configuration

@@ -103,10 +105,10 @@ MONGO_USE_BATCH = False  # whether to enable batch inserts

 # =================================== Proxy configuration ===================================

-#
-#
+# Simple proxy (SimpleProxyMiddleware)
+# The middleware is enabled automatically once a proxy list is configured
 # PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]

-#
-#
+# Dynamic proxy (ProxyMiddleware)
+# The middleware is enabled automatically once a proxy API URL is configured
 # PROXY_API_URL = "http://your-proxy-api.com/get-proxy"