crawlo 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/queue/redis_priority_queue.py CHANGED
@@ -1,19 +1,26 @@
  import asyncio
- import asyncio
  import pickle
  import time
  import traceback
- from typing import Optional, TYPE_CHECKING
+ from typing import Optional, TYPE_CHECKING, List, Union, Any

  import redis.asyncio as aioredis

+ # 尝试导入Redis集群支持
+ try:
+     from redis.asyncio.cluster import RedisCluster
+     REDIS_CLUSTER_AVAILABLE = True
+ except ImportError:
+     RedisCluster = None
+     REDIS_CLUSTER_AVAILABLE = False
+
  # 使用 TYPE_CHECKING 避免运行时循环导入
  if TYPE_CHECKING:
      from crawlo import Request

- from crawlo.utils.error_handler import ErrorHandler
- from crawlo.utils.log import get_logger
- from crawlo.utils.redis_connection_pool import get_redis_pool, OptimizedRedisConnectionPool
+ from crawlo.utils.error_handler import ErrorHandler, ErrorContext
+ from crawlo.logging import get_logger
+ from crawlo.utils.redis_connection_pool import get_redis_pool, RedisConnectionPool
  from crawlo.utils.request_serializer import RequestSerializer

  # 延迟初始化避免循环依赖
@@ -42,14 +49,16 @@ class RedisPriorityQueue:

      def __init__(
          self,
-         redis_url: str = None,
-         queue_name: str = None,  # 修改默认值为 None
-         processing_queue: str = None,  # 修改默认值为 None
-         failed_queue: str = None,  # 修改默认值为 None
+         redis_url: Optional[str] = None,
+         queue_name: Optional[str] = None,  # 修改默认值为 None
+         processing_queue: Optional[str] = None,  # 修改默认值为 None
+         failed_queue: Optional[str] = None,  # 修改默认值为 None
          max_retries: int = 3,
          timeout: int = 300,  # 任务处理超时时间(秒)
          max_connections: int = 10,  # 连接池大小
-         module_name: str = "default"  # 添加 module_name 参数
+         module_name: str = "default",  # 添加 module_name 参数
+         is_cluster: bool = False,  # 是否为集群模式
+         cluster_nodes: Optional[List[str]] = None  # 集群节点列表
      ):
          # 移除直接使用 os.getenv(),要求通过参数传递 redis_url
          if redis_url is None:
@@ -58,6 +67,8 @@ class RedisPriorityQueue:

          self.redis_url = redis_url
          self.module_name = module_name  # 保存 module_name
+         self.is_cluster = is_cluster
+         self.cluster_nodes = cluster_nodes

          # 如果未提供 queue_name,则根据 module_name 自动生成
          if queue_name is None:
@@ -87,8 +98,8 @@ class RedisPriorityQueue:
          self.max_retries = max_retries
          self.timeout = timeout
          self.max_connections = max_connections
-         self._redis_pool: Optional[OptimizedRedisConnectionPool] = None
-         self._redis: Optional[aioredis.Redis] = None
+         self._redis_pool: Optional[RedisConnectionPool] = None
+         self._redis: Optional[Any] = None
          self._lock = asyncio.Lock()  # 用于连接初始化的锁
          self.request_serializer = RequestSerializer()  # 处理序列化

@@ -150,6 +161,8 @@ class RedisPriorityQueue:
              # 使用优化的连接池,确保 decode_responses=False 以避免编码问题
              self._redis_pool = get_redis_pool(
                  self.redis_url,
+                 is_cluster=self.is_cluster,
+                 cluster_nodes=self.cluster_nodes,
                  max_connections=self.max_connections,
                  socket_connect_timeout=5,
                  socket_timeout=30,
@@ -162,9 +175,8 @@ class RedisPriorityQueue:

              self._redis = await self._redis_pool.get_connection()

              # 测试连接
-             await self._redis.ping()
-             # 只在调试模式下输出详细连接信息
-             # get_module_logger().debug(f"Redis 连接成功 (Module: {self.module_name})")  # 注释掉重复的日志
+             if self._redis:
+                 await self._redis.ping()
              return self._redis
          except Exception as e:
              error_msg = f"Redis 连接失败 (尝试 {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"
@@ -180,16 +192,28 @@ class RedisPriorityQueue:
          if self._redis is None:
              await self.connect()
          try:
-             await self._redis.ping()
+             if self._redis:
+                 await self._redis.ping()
          except Exception as e:
              get_module_logger().warning(f"Redis 连接失效 (Module: {self.module_name}),尝试重连...: {e}")
              self._redis = None
              await self.connect()

+     def _is_cluster_mode(self) -> bool:
+         """检查是否为集群模式"""
+         if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None:
+             # 检查 _redis 是否为 RedisCluster 实例
+             if self._redis is not None and isinstance(self._redis, RedisCluster):
+                 return True
+         return False
+
      async def put(self, request, priority: int = 0) -> bool:
          """放入请求到队列"""
          try:
              await self._ensure_connection()
+             if not self._redis:
+                 return False
+
              # 修复优先级行为一致性问题
              # 原来: score = -priority (导致priority大的先出队)
              # 现在: score = priority (确保priority小的先出队,与内存队列一致)
@@ -208,18 +232,34 @@ class RedisPriorityQueue:
                  get_module_logger().error(f"请求序列化验证失败 (Module: {self.module_name}): {serialize_error}")
                  return False

-             pipe = self._redis.pipeline()
-             pipe.zadd(self.queue_name, {key: score})
-             pipe.hset(f"{self.queue_name}:data", key, serialized)
-             result = await pipe.execute()
+             # 处理集群模式下的操作
+             if self._is_cluster_mode():
+                 # 在集群模式下,确保所有键都在同一个slot中
+                 # 可以通过在键名中添加相同的哈希标签来实现
+                 hash_tag = "{queue}"  # 使用哈希标签确保键在同一个slot
+                 queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                 data_key_with_tag = f"{self.queue_name}:data{hash_tag}"
+
+                 pipe = self._redis.pipeline()
+                 pipe.zadd(queue_name_with_tag, {key: score})
+                 pipe.hset(data_key_with_tag, key, serialized)
+                 result = await pipe.execute()
+             else:
+                 pipe = self._redis.pipeline()
+                 pipe.zadd(self.queue_name, {key: score})
+                 pipe.hset(f"{self.queue_name}:data", key, serialized)
+                 result = await pipe.execute()

              if result[0] > 0:
-                 get_module_logger().debug(f"成功入队 (Module: {self.module_name}): {request.url}")  # 注释掉重复的日志
+                 get_module_logger().debug(f"成功入队 (Module: {self.module_name}): {request.url}")
              return result[0] > 0
          except Exception as e:
+             error_context = ErrorContext(
+                 context=f"放入队列失败 (Module: {self.module_name})"
+             )
              get_module_error_handler().handle_error(
                  e,
-                 context=f"放入队列失败 (Module: {self.module_name})",
+                 context=error_context,
                  raise_error=False
              )
              return False
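The cluster branch above relies on Redis hash tags: only the substring inside {...} is hashed, so keys like my:queue{queue} and my:queue:data{queue} land in the same slot and can be updated in one pipeline. The score change also means ZPOPMIN now returns the smallest priority value first. A minimal sketch of both behaviours against a plain single-node Redis; the connection URL and key names are placeholders, not framework settings:

import asyncio
import redis.asyncio as aioredis

async def demo() -> None:
    # Placeholder connection URL; any reachable Redis works for this illustration.
    r = aioredis.from_url("redis://localhost:6379/0")
    # Hash tag: only "{queue}" is hashed, so both keys would share one cluster slot
    # and can safely be touched in a single pipeline even on a cluster.
    await r.zadd("demo:requests{queue}", {"req-low": 1, "req-high": 5})
    await r.hset("demo:requests:data{queue}", mapping={"req-low": b"...", "req-high": b"..."})
    # score == priority, so the smallest priority pops first ("req-low" here).
    print(await r.zpopmin("demo:requests{queue}", count=1))
    await r.close()

asyncio.run(demo())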
@@ -231,24 +271,54 @@ class RedisPriorityQueue:
          """
          try:
              await self._ensure_connection()
+             if not self._redis:
+                 return None
+
              start_time = asyncio.get_event_loop().time()

              while True:
                  # 尝试获取任务
-                 result = await self._redis.zpopmin(self.queue_name, count=1)
+                 if self._is_cluster_mode():
+                     # 集群模式处理
+                     hash_tag = "{queue}"
+                     queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                     result = await self._redis.zpopmin(queue_name_with_tag, count=1)
+                 else:
+                     result = await self._redis.zpopmin(self.queue_name, count=1)
+
                  if result:
                      key, score = result[0]
-                     serialized = await self._redis.hget(f"{self.queue_name}:data", key)
+                     data_key = f"{self.queue_name}:data"
+                     if self._is_cluster_mode():
+                         hash_tag = "{queue}"
+                         data_key = f"{self.queue_name}:data{hash_tag}"
+
+                     serialized = await self._redis.hget(data_key, key)
                      if not serialized:
                          continue

                      # 移动到 processing
                      processing_key = f"{key}:{int(time.time())}"
-                     pipe = self._redis.pipeline()
-                     pipe.zadd(self.processing_queue, {processing_key: time.time() + self.timeout})
-                     pipe.hset(f"{self.processing_queue}:data", processing_key, serialized)
-                     pipe.hdel(f"{self.queue_name}:data", key)
-                     await pipe.execute()
+                     processing_queue = self.processing_queue
+                     processing_data_key = f"{self.processing_queue}:data"
+
+                     if self._is_cluster_mode():
+                         hash_tag = "{queue}"
+                         processing_queue = f"{self.processing_queue}{hash_tag}"
+                         processing_data_key = f"{self.processing_queue}:data{hash_tag}"
+
+                     if self._is_cluster_mode():
+                         pipe = self._redis.pipeline()
+                         pipe.zadd(processing_queue, {processing_key: time.time() + self.timeout})
+                         pipe.hset(processing_data_key, processing_key, serialized)
+                         pipe.hdel(data_key, key)
+                         await pipe.execute()
+                     else:
+                         pipe = self._redis.pipeline()
+                         pipe.zadd(processing_queue, {processing_key: time.time() + self.timeout})
+                         pipe.hset(processing_data_key, processing_key, serialized)
+                         pipe.hdel(data_key, key)
+                         await pipe.execute()

                      # 更安全的反序列化方式
                      try:
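get() follows a lease-style reliable-queue pattern: the popped member is parked in a processing zset whose score is a deadline (now + timeout), so expired leases can later be detected and re-enqueued. A simplified sketch of that move, with made-up key names and no cluster handling:

import time
import redis.asyncio as aioredis

async def pop_with_lease(r: aioredis.Redis, queue: str, timeout: int = 300):
    """Pop the lowest-score member and lease it until time.time() + timeout.

    Simplified illustration of the pattern in get(); key names are not the framework's.
    """
    popped = await r.zpopmin(queue, count=1)
    if not popped:
        return None
    member, _priority = popped[0]
    payload = await r.hget(f"{queue}:data", member)
    pipe = r.pipeline()
    pipe.zadd(f"{queue}:processing", {member: time.time() + timeout})  # lease deadline as score
    pipe.hset(f"{queue}:processing:data", member, payload or b"")
    pipe.hdel(f"{queue}:data", member)
    await pipe.execute()
    return member, payload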
@@ -263,8 +333,12 @@ class RedisPriorityQueue:
                          # 如果pickle反序列化失败,记录错误并跳过这个任务
                          get_module_logger().error(f"无法反序列化请求数据 (Module: {self.module_name}): {pickle_error}")
                          # 从processing队列中移除这个无效的任务
-                         await self._redis.zrem(self.processing_queue, processing_key)
-                         await self._redis.hdel(f"{self.processing_queue}:data", processing_key)
+                         if self._is_cluster_mode():
+                             await self._redis.zrem(processing_queue, processing_key)
+                             await self._redis.hdel(processing_data_key, processing_key)
+                         else:
+                             await self._redis.zrem(processing_queue, processing_key)
+                             await self._redis.hdel(processing_data_key, processing_key)
                          # 继续尝试下一个任务
                          continue

@@ -276,9 +350,12 @@ class RedisPriorityQueue:
                  await asyncio.sleep(0.001)  # 从0.01减少到0.001

          except Exception as e:
+             error_context = ErrorContext(
+                 context=f"获取队列任务失败 (Module: {self.module_name})"
+             )
              get_module_error_handler().handle_error(
                  e,
-                 context=f"获取队列任务失败 (Module: {self.module_name})",
+                 context=error_context,
                  raise_error=False
              )
              return None
@@ -287,22 +364,46 @@ class RedisPriorityQueue:
          """确认任务完成"""
          try:
              await self._ensure_connection()
+             if not self._redis:
+                 return
+
              key = self._get_request_key(request)
+             processing_queue = self.processing_queue
+             processing_data_key = f"{self.processing_queue}:data"
+
+             if self._is_cluster_mode():
+                 hash_tag = "{queue}"
+                 processing_queue = f"{self.processing_queue}{hash_tag}"
+                 processing_data_key = f"{self.processing_queue}:data{hash_tag}"
+
              cursor = 0
              while True:
-                 cursor, keys = await self._redis.zscan(self.processing_queue, cursor, match=f"{key}:*")
+                 if self._is_cluster_mode():
+                     cursor, keys = await self._redis.zscan(processing_queue, cursor, match=f"{key}:*")
+                 else:
+                     cursor, keys = await self._redis.zscan(processing_queue, cursor, match=f"{key}:*")
                  if keys:
-                     pipe = self._redis.pipeline()
-                     for k in keys:
-                         pipe.zrem(self.processing_queue, k)
-                         pipe.hdel(f"{self.processing_queue}:data", k)
-                     await pipe.execute()
+                     if self._is_cluster_mode():
+                         pipe = self._redis.pipeline()
+                         for k in keys:
+                             pipe.zrem(processing_queue, k)
+                             pipe.hdel(processing_data_key, k)
+                         await pipe.execute()
+                     else:
+                         pipe = self._redis.pipeline()
+                         for k in keys:
+                             pipe.zrem(processing_queue, k)
+                             pipe.hdel(processing_data_key, k)
+                         await pipe.execute()
                  if cursor == 0:
                      break
          except Exception as e:
+             error_context = ErrorContext(
+                 context=f"确认任务完成失败 (Module: {self.module_name})"
+             )
              get_module_error_handler().handle_error(
                  e,
-                 context=f"确认任务完成失败 (Module: {self.module_name})",
+                 context=error_context,
                  raise_error=False
              )

@@ -310,10 +411,20 @@ class RedisPriorityQueue:
          """标记任务失败"""
          try:
              await self._ensure_connection()
+             if not self._redis:
+                 return
+
              key = self._get_request_key(request)
              await self.ack(request)

              retry_key = f"{self.failed_queue}:retries:{key}"
+             failed_queue = self.failed_queue
+
+             if self._is_cluster_mode():
+                 hash_tag = "{queue}"
+                 retry_key = f"{self.failed_queue}:retries:{key}{hash_tag}"
+                 failed_queue = f"{self.failed_queue}{hash_tag}"
+
              retries = await self._redis.incr(retry_key)
              await self._redis.expire(retry_key, 86400)

@@ -329,12 +440,15 @@ class RedisPriorityQueue:
                      "failed_at": time.time(),
                      "request_pickle": pickle.dumps(request).hex(),  # 可选:保存完整请求
                  }
-                 await self._redis.lpush(self.failed_queue, pickle.dumps(failed_data))
+                 await self._redis.lpush(failed_queue, pickle.dumps(failed_data))
                  get_module_logger().error(f"任务彻底失败 [{retries}次] (Module: {self.module_name}): {request.url}")
          except Exception as e:
+             error_context = ErrorContext(
+                 context=f"标记任务失败失败 (Module: {self.module_name})"
+             )
              get_module_error_handler().handle_error(
                  e,
-                 context=f"标记任务失败失败 (Module: {self.module_name})",
+                 context=error_context,
                  raise_error=False
              )

@@ -346,11 +460,22 @@ class RedisPriorityQueue:
          """Get queue size"""
          try:
              await self._ensure_connection()
-             return await self._redis.zcard(self.queue_name)
+             if not self._redis:
+                 return 0
+
+             if self._is_cluster_mode():
+                 hash_tag = "{queue}"
+                 queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                 return await self._redis.zcard(queue_name_with_tag)
+             else:
+                 return await self._redis.zcard(self.queue_name)
          except Exception as e:
+             error_context = ErrorContext(
+                 context=f"Failed to get queue size (Module: {self.module_name})"
+             )
              get_module_error_handler().handle_error(
                  e,
-                 context=f"Failed to get queue size (Module: {self.module_name})",
+                 context=error_context,
                  raise_error=False
              )
              return 0
@@ -358,13 +483,37 @@ class RedisPriorityQueue:
      async def close(self):
          """关闭连接"""
          try:
-             # 连接池会自动管理连接,这里不需要显式关闭单个连接
-             self._redis = None
+             # 显式关闭Redis连接
+             if self._redis is not None:
+                 try:
+                     # 尝试关闭连接
+                     if hasattr(self._redis, 'close'):
+                         close_result = self._redis.close()
+                         if asyncio.iscoroutine(close_result):
+                             await close_result
+
+                     # 等待连接关闭完成
+                     if hasattr(self._redis, 'wait_closed'):
+                         wait_result = self._redis.wait_closed()
+                         if asyncio.iscoroutine(wait_result):
+                             await wait_result
+                 except Exception as close_error:
+                     get_module_logger().warning(
+                         f"Error closing Redis connection (Module: {self.module_name}): {close_error}"
+                     )
+                 finally:
+                     self._redis = None
+
+             # 释放连接池引用(连接池由全局管理器管理)
              self._redis_pool = None
+
              get_module_logger().debug(f"Redis 连接已释放 (Module: {self.module_name})")
          except Exception as e:
+             error_context = ErrorContext(
+                 context=f"释放 Redis 连接失败 (Module: {self.module_name})"
+             )
              get_module_error_handler().handle_error(
                  e,
-                 context=f"释放 Redis 连接失败 (Module: {self.module_name})",
+                 context=error_context,
                  raise_error=False
-             )
+             )
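The rewritten close() probes the client for close() and wait_closed() and awaits the result only when it is a coroutine, so the same teardown works for both sync-style and async clients. The duck-typing reduced to a few lines (the logging and the finally-reset of the reference are omitted here):

import asyncio

async def close_client(client) -> None:
    """Call close()/wait_closed() if present, awaiting them only when they return coroutines."""
    for name in ("close", "wait_closed"):
        method = getattr(client, name, None)
        if method is None:
            continue
        result = method()
        if asyncio.iscoroutine(result):
            await result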
crawlo/settings/default_settings.py CHANGED
@@ -4,7 +4,7 @@
  包含 Crawlo 框架的所有默认设置项
  """
  # 添加环境变量配置工具导入
- from crawlo.utils.env_config import get_redis_config, get_runtime_config, get_version
+ from crawlo.utils.config_manager import EnvConfigManager

  # --------------------------------- 1. 框架基础配置 ------------------------------------

@@ -19,9 +19,9 @@ FRAMEWORK_INIT_ORDER = [
  FRAMEWORK_INIT_STATE = 'uninitialized'

  # 项目基础配置
- runtime_config = get_runtime_config()
+ runtime_config = EnvConfigManager.get_runtime_config()
  PROJECT_NAME = runtime_config['PROJECT_NAME']  # 项目名称(用于日志、Redis Key 等标识)
- VERSION = get_version()  # 项目版本号 - 从框架的__version__.py文件中读取,如果不存在则使用默认值
+ VERSION = EnvConfigManager.get_version()  # 项目版本号 - 从框架的__version__.py文件中读取,如果不存在则使用默认值
  RUN_MODE = runtime_config['CRAWLO_MODE']  # 运行模式:standalone/distributed/auto
  CONCURRENCY = runtime_config['CONCURRENCY']  # 并发数配置

@@ -70,12 +70,19 @@ MYSQL_INSERT_IGNORE = False  # 是否使用 INSERT IGNORE(忽略重复数据)
  MYSQL_UPDATE_COLUMNS = ()  # 冲突时需更新的列名;指定后 MYSQL_AUTO_UPDATE 失效

  # Redis配置
- redis_config = get_redis_config()
+ redis_config = EnvConfigManager.get_redis_config()
  REDIS_HOST = redis_config['REDIS_HOST']
  REDIS_PORT = redis_config['REDIS_PORT']
  REDIS_PASSWORD = redis_config['REDIS_PASSWORD']
  REDIS_DB = redis_config['REDIS_DB']

+ # Redis集群支持说明:
+ # Crawlo框架支持Redis单实例和集群模式的智能切换
+ # 集群模式配置方式:
+ # 1. 使用逗号分隔的节点列表:'192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+ # 2. 使用集群URL格式:'redis-cluster://192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+ # 框架会自动检测URL格式并选择合适的模式
+
  # 根据是否有密码生成不同的 URL 格式
  if REDIS_PASSWORD:
      REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
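The new comment block documents two ways of pointing the framework at a Redis cluster. The comments do not name the setting explicitly; the fragment below assumes it is the connection URL (REDIS_URL) built in the surrounding code, and the node addresses are the placeholders taken from the comments:

# Single instance (default): built from REDIS_HOST / REDIS_PORT / REDIS_DB as above.
REDIS_URL = 'redis://127.0.0.1:6379/0'

# Cluster, variant 1: comma-separated node list (placeholder addresses).
# REDIS_URL = '192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'

# Cluster, variant 2: explicit cluster URL scheme, auto-detected by the framework.
# REDIS_URL = 'redis-cluster://192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'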
@@ -142,8 +149,11 @@ STATS_DUMP = True  # 是否周期性输出统计信息
  LOG_FILE = None  # 日志文件路径,将在项目配置中设置
  LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
  LOG_ENCODING = 'utf-8'
- LOG_MAX_BYTES = 10 * 1024 * 1024  # 日志轮转大小(字节)
- LOG_BACKUP_COUNT = 5  # 日志备份数量
+ LOG_MAX_BYTES = 10 * 1024 * 1024  # 日志轮转大小(字节),推荐20MB用于生产环境
+ LOG_BACKUP_COUNT = 5  # 日志备份数量,推荐10个用于生产环境
+ # 如果用户不想要日志轮转,可以设置 LOG_MAX_BYTES = 0 来禁用轮转功能
+ # 注意:当LOG_MAX_BYTES或LOG_BACKUP_COUNT为0时,日志轮转永远不会发生,日志文件会持续增长
+ # 需要通过其他方式管理磁盘空间,如系统级日志轮转工具(logrotate等)

  # 日志间隔配置
  INTERVAL = 60  # 日志输出间隔(秒)
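Per the comments above, rotation is governed entirely by these two settings. A production-leaning configuration and the no-rotation variant, using the values recommended in those comments, would look like this:

# Rotation enabled with the values the comments recommend for production:
LOG_MAX_BYTES = 20 * 1024 * 1024   # rotate once the file reaches 20 MB
LOG_BACKUP_COUNT = 10              # keep 10 rotated files

# Rotation disabled: the file grows indefinitely, so pair this with an external
# tool such as logrotate.
# LOG_MAX_BYTES = 0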
crawlo/spider/__init__.py CHANGED
@@ -28,10 +28,11 @@ Crawlo Spider Module
          yield Item(data=response.json())
  """
  from __future__ import annotations
- from typing import Type, Any, Optional, List, Dict, Union, Iterator, AsyncIterator
- from ..network.request import Request
- from ..utils.log import get_logger

+ from typing import Type, Any, Optional, List, Dict, Iterator
+
+ from ..logging import get_logger
+ from ..network.request import Request

  # 全局爬虫注册表
  _DEFAULT_SPIDER_REGISTRY: dict[str, Type[Spider]] = {}
@@ -79,7 +80,7 @@ class SpiderMeta(type):
              _DEFAULT_SPIDER_REGISTRY[spider_name] = cls
              # 延迟初始化logger避免模块级别阻塞
              try:
-                 from crawlo.utils.log import get_logger
+                 from crawlo.logging import get_logger
                  get_logger(__name__).debug(f"自动注册爬虫: {spider_name} -> {cls.__name__}")
              except:
                  # 如果日志系统未初始化,静默失败
@@ -171,7 +172,7 @@ class Spider(metaclass=SpiderMeta):
      def logger(self):
          """延迟初始化logger"""
          if self._logger is None:
-             from crawlo.utils.log import get_logger
+             from crawlo.logging import get_logger
              self._logger = get_logger(self.name)
          return self._logger

crawlo/stats_collector.py CHANGED
@@ -6,7 +6,7 @@
  # @Desc : 统计信息收集器
  """
  from pprint import pformat
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class StatsCollector(object):
@@ -15,7 +15,7 @@ class StatsCollector(object):
          self.crawler = crawler
          self._dump = self.crawler.settings.get_bool('STATS_DUMP')
          self._stats = {}
-         self.logger = get_logger(self.__class__.__name__, "INFO")
+         self.logger = get_logger(self.__class__.__name__)

      def inc_value(self, key, count=1, start=0):
          self._stats[key] = self._stats.setdefault(key, start) + count
crawlo/task_manager.py CHANGED
@@ -5,7 +5,7 @@ import asyncio
  from typing import Set, Final
  from collections import deque
  from asyncio import Task, Future, Semaphore
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class DynamicSemaphore(Semaphore):
crawlo/templates/crawlo.cfg.tmpl CHANGED
@@ -1,11 +1,11 @@
  # crawlo.cfg
- # 项目的配置文件。
+ # 项目配置文件

  [settings]
- # 指定 settings 模块的导入路径
+ # settings 模块路径
  default = {{project_name}}.settings

  [deploy]
- # (可选)用于部署配置
+ # 部署配置(可选)
  # url = http://localhost:6800/
  # project = {{project_name}}
crawlo/templates/project/__init__.py.tmpl CHANGED
@@ -1,4 +1,2 @@
  # -*- coding: UTF-8 -*-
- """
- {{project_name}} 项目包
- """
+ """{{project_name}} 项目"""
crawlo/templates/project/items.py.tmpl CHANGED
@@ -1,17 +1,13 @@
  # -*- coding: UTF-8 -*-
  """
- {{project_name}}.items
- ======================
- 定义你抓取的数据结构。
+ 数据项定义
  """

  from crawlo.items import Item, Field


  class {{project_name|title}}Item(Item):
-     """
-     {{project_name}} 项目的数据项。
-     """
+     """数据项"""
      id = Field()
      # price = Field()
      # description = Field()
crawlo/templates/project/middlewares.py.tmpl CHANGED
@@ -6,7 +6,7 @@
  """

  from crawlo.network import Request, Response
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class {{project_name|title}}Middleware:
crawlo/templates/project/pipelines.py.tmpl CHANGED
@@ -5,8 +5,7 @@
  数据管道示例
  """

- from crawlo.exceptions import DropItem
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class {{project_name|title}}Pipeline:
crawlo/templates/project/settings.py.tmpl CHANGED
@@ -4,13 +4,13 @@
  =============================
  基于 Crawlo 框架的爬虫项目配置。

- 此配置使用 CrawloConfig.standalone() 工厂方法创建单机模式配置,
- 适用于开发测试和中小规模数据采集任务。
+ 此配置使用 CrawloConfig.auto() 工厂方法创建自动检测模式配置,
+ 框架会自动检测Redis可用性,可用则使用分布式模式,否则使用单机模式。
  """

  from crawlo.config import CrawloConfig

- # 使用单机模式配置工厂创建配置
+ # 使用自动检测模式配置工厂创建配置
  config = CrawloConfig.auto(
      project_name='{{project_name}}',
      concurrency=8,
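A settings module generated from this template therefore boots its configuration roughly as sketched below; only the calls visible in this diff are used, the project name is a placeholder, and the template's remaining keyword arguments are elided:

from crawlo.config import CrawloConfig

# auto(): pick distributed mode when Redis is reachable, standalone otherwise.
config = CrawloConfig.auto(
    project_name='myproject',   # placeholder; the template injects {{project_name}}
    concurrency=8,
)

# Expose every generated option as a module-level setting, as the template does.
locals().update(config.to_dict())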
@@ -25,12 +25,10 @@ locals().update(config.to_dict())
  # 爬虫模块配置
  SPIDER_MODULES = ['{{project_name}}.spiders']

- # 默认请求头配置
- # 为DefaultHeaderMiddleware配置默认请求头
+ # 默认请求头
  # DEFAULT_REQUEST_HEADERS = {}

  # 允许的域名
- # 为OffsiteMiddleware配置允许的域名
  # ALLOWED_DOMAINS = []

  # 数据管道
@@ -58,6 +56,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
  LOG_LEVEL = 'INFO'
  LOG_FILE = 'logs/{{project_name}}.log'
  LOG_ENCODING = 'utf-8'  # 明确指定日志文件编码
+ LOG_MAX_BYTES = 20 * 1024 * 1024  # 20MB,推荐值
+ LOG_BACKUP_COUNT = 10  # 10个备份文件,推荐值
+ # 如果不想要日志轮转,可以设置 LOG_MAX_BYTES = 0
+ # 当LOG_MAX_BYTES或LOG_BACKUP_COUNT为0时,日志轮转将被禁用,文件会持续增长
  STATS_DUMP = True

  # 输出配置
@@ -103,10 +105,10 @@ MONGO_USE_BATCH = False  # 是否启用批量插入

  # =================================== 代理配置 ===================================

- # 简化版代理配置(适用于SimpleProxyMiddleware)
- # 只要配置了代理列表,中间件就会自动启用
+ # 简单代理(SimpleProxyMiddleware)
+ # 配置代理列表后中间件自动启用
  # PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]

- # 高级代理配置(适用于ProxyMiddleware)
- # 只要配置了代理API URL,中间件就会自动启用
+ # 动态代理(ProxyMiddleware)
+ # 配置代理API URL后中间件自动启用
  # PROXY_API_URL = "http://your-proxy-api.com/get-proxy"