crawlo 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/pipelines/mongo_pipeline.py CHANGED
@@ -1,16 +1,18 @@
 # -*- coding: utf-8 -*-
 from typing import Optional, List, Dict
-from motor.motor_asyncio import AsyncIOMotorClient
+
 from pymongo.errors import PyMongoError
-from crawlo.utils.log import get_logger
+
 from crawlo.exceptions import ItemDiscard
+from crawlo.logging import get_logger
+from crawlo.utils.mongo_connection_pool import MongoConnectionPoolManager


 class MongoPipeline:
     def __init__(self, crawler):
         self.crawler = crawler
         self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)

         # Initialize connection parameters
         self.client = None
@@ -43,17 +45,21 @@ class MongoPipeline:
     async def _ensure_connection(self):
         """Ensure the connection has been established"""
         if self.client is None:
-            # Create the client with connection pool configuration
-            self.client = AsyncIOMotorClient(
-                self.mongo_uri,
-                maxPoolSize=self.max_pool_size,
-                minPoolSize=self.min_pool_size,
-                connectTimeoutMS=self.connect_timeout_ms,
-                socketTimeoutMS=self.socket_timeout_ms
+            # Use the singleton connection pool manager
+            self.client = await MongoConnectionPoolManager.get_client(
+                mongo_uri=self.mongo_uri,
+                db_name=self.db_name,
+                max_pool_size=self.max_pool_size,
+                min_pool_size=self.min_pool_size,
+                connect_timeout_ms=self.connect_timeout_ms,
+                socket_timeout_ms=self.socket_timeout_ms
             )
             self.db = self.client[self.db_name]
             self.collection = self.db[self.collection_name]
-            self.logger.info(f"MongoDB connection established (collection: {self.collection_name})")
+            self.logger.info(
+                f"MongoDB connection established (collection: {self.collection_name}, "
+                f"using the globally shared connection pool)"
+            )

@@ -126,7 +132,10 @@
         # Flush any remaining batched data before closing
         if self.use_batch and self.batch_buffer:
             await self._flush_batch(self.crawler.spider)
-
+
+        # Note: the client is no longer closed here because it is globally shared
+        # Closing it is handled centrally by MongoConnectionPoolManager.close_all_clients()
         if self.client:
-            self.client.close()
-            self.logger.info("MongoDB connection closed")
+            self.logger.info(
+                f"MongoDB pipeline closed; the globally shared connection pool is kept for other crawlers"
+            )
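
The MongoConnectionPoolManager used above lives in the new crawlo/utils/mongo_connection_pool.py (+157), which is not shown in this diff. As a hedged sketch only, a manager exposing the get_client()/close_all_clients() interface referenced here could cache one client per URI along these lines (internals and default values below are assumptions, not the package's actual code):

# Hedged sketch only -- the real crawlo/utils/mongo_connection_pool.py is not part of this diff.
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient


class MongoConnectionPoolManager:
    """Cache one AsyncIOMotorClient per Mongo URI so all pipelines share a single pool."""

    _clients = {}            # mongo_uri -> AsyncIOMotorClient
    _lock = asyncio.Lock()

    @classmethod
    async def get_client(cls, mongo_uri, db_name=None, max_pool_size=100,
                         min_pool_size=0, connect_timeout_ms=20000,
                         socket_timeout_ms=20000):
        # db_name is accepted to mirror the call site above; the client itself is URI-scoped
        async with cls._lock:
            if mongo_uri not in cls._clients:
                cls._clients[mongo_uri] = AsyncIOMotorClient(
                    mongo_uri,
                    maxPoolSize=max_pool_size,
                    minPoolSize=min_pool_size,
                    connectTimeoutMS=connect_timeout_ms,
                    socketTimeoutMS=socket_timeout_ms,
                )
            return cls._clients[mongo_uri]

    @classmethod
    async def close_all_clients(cls):
        # Central shutdown point referenced in the pipeline comment above
        async with cls._lock:
            for client in cls._clients.values():
                client.close()  # motor's close() is synchronous
            cls._clients.clear()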
crawlo/pipelines/mysql_pipeline.py CHANGED
@@ -1,15 +1,14 @@
 # -*- coding: utf-8 -*-
 import asyncio
-import aiomysql
-from asyncmy import create_pool
-from typing import Optional, List, Dict, Any
-from abc import ABC, abstractmethod
 import async_timeout
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any

-from crawlo.items import Item
 from crawlo.exceptions import ItemDiscard
+from crawlo.items import Item
 from crawlo.utils.db_helper import SQLBuilder
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
+from crawlo.utils.mysql_connection_pool import MySQLConnectionPoolManager
 from . import BasePipeline


@@ -19,7 +18,7 @@ class BaseMySQLPipeline(BasePipeline, ABC):
     def __init__(self, crawler):
         self.crawler = crawler
         self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)

         # Log pipeline initialization
         self.logger.info(f"MySQL pipeline initialized: {self.__class__.__name__}")
@@ -203,20 +202,13 @@ class BaseMySQLPipeline(BasePipeline, ABC):
                 await self._flush_batch(spider_name)
             except Exception as e:
                 self.logger.error(f"Failed to flush batched data while closing the spider: {e}")
-
+
+        # Note: the connection pool is no longer closed here because it is globally shared
+        # Closing it is handled centrally by MySQLConnectionPoolManager.close_all_pools()
         if self.pool:
-            try:
-                pool_stats = {
-                    'size': getattr(self.pool, 'size', 'unknown'),
-                    'minsize': getattr(self.pool, 'minsize', 'unknown'),
-                    'maxsize': getattr(self.pool, 'maxsize', 'unknown')
-                }
-                self.logger.info(f"Closing MySQL connection pool, current state: {pool_stats}")
-                self.pool.close()
-                await self.pool.wait_closed()
-                self.logger.info("MySQL connection pool closed")
-            except Exception as e:
-                self.logger.error(f"Error while closing MySQL connection pool: {e}")
+            self.logger.info(
+                f"MySQL pipeline closed; the globally shared connection pool is kept for other crawlers"
+            )

     async def _make_insert_sql(self, item_dict: Dict, **kwargs) -> str:
         """Generate the INSERT SQL statement; subclasses may override this method"""
@@ -253,9 +245,9 @@ class AsyncmyMySQLPipeline(BaseMySQLPipeline):

     async def _ensure_pool(self):
         """Ensure the connection pool is initialized (thread-safe)"""
-        if self._pool_initialized:
+        if self._pool_initialized and self.pool:
             # Check whether the pool is still valid
-            if self.pool and hasattr(self.pool, 'closed') and not self.pool.closed:
+            if hasattr(self.pool, 'closed') and not self.pool.closed:
                 return
             else:
                 self.logger.warning("Connection pool was initialized but is invalid; reinitializing")
@@ -263,7 +255,9 @@ class AsyncmyMySQLPipeline(BaseMySQLPipeline):
         async with self._pool_lock:
             if not self._pool_initialized:  # double-check to avoid race conditions
                 try:
-                    self.pool = await create_pool(
+                    # Use the singleton connection pool manager
+                    self.pool = await MySQLConnectionPoolManager.get_pool(
+                        pool_type='asyncmy',
                         host=self.settings.get('MYSQL_HOST', 'localhost'),
                         port=self.settings.get_int('MYSQL_PORT', 3306),
                         user=self.settings.get('MYSQL_USER', 'root'),
@@ -274,11 +268,10 @@ class AsyncmyMySQLPipeline(BaseMySQLPipeline):
                         echo=self.settings.get_bool('MYSQL_ECHO', False)
                     )
                     self._pool_initialized = True
-                    pool_stats = {
-                        'minsize': getattr(self.pool, 'minsize', 'unknown'),
-                        'maxsize': getattr(self.pool, 'maxsize', 'unknown')
-                    }
-                    self.logger.info(f"MySQL connection pool initialized (table: {self.table_name}, config: {pool_stats})")
+                    self.logger.info(
+                        f"MySQL connection pool initialized (table: {self.table_name}, "
+                        f"using the globally shared connection pool)"
+                    )
                 except Exception as e:
                     self.logger.error(f"MySQL connection pool initialization failed: {e}")
                     # Reset state so initialization can be retried
@@ -391,9 +384,9 @@ class AiomysqlMySQLPipeline(BaseMySQLPipeline):

     async def _ensure_pool(self):
         """Lazily initialize the connection pool (thread-safe)"""
-        if self._pool_initialized:
+        if self._pool_initialized and self.pool:
             # Check whether the pool is still valid
-            if self.pool and hasattr(self.pool, 'closed') and not self.pool.closed:
+            if hasattr(self.pool, 'closed') and not self.pool.closed:
                 return
             else:
                 self.logger.warning("Connection pool was initialized but is invalid; reinitializing")
@@ -401,23 +394,22 @@ class AiomysqlMySQLPipeline(BaseMySQLPipeline):
         async with self._pool_lock:
             if not self._pool_initialized:
                 try:
-                    self.pool = await aiomysql.create_pool(
+                    # Use the singleton connection pool manager
+                    self.pool = await MySQLConnectionPoolManager.get_pool(
+                        pool_type='aiomysql',
                         host=self.settings.get('MYSQL_HOST', 'localhost'),
                         port=self.settings.get_int('MYSQL_PORT', 3306),
                         user=self.settings.get('MYSQL_USER', 'root'),
                         password=self.settings.get('MYSQL_PASSWORD', ''),
                         db=self.settings.get('MYSQL_DB', 'scrapy_db'),
                         minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
-                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5),
-                        cursorclass=aiomysql.DictCursor,
-                        autocommit=False
+                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5)
                     )
                     self._pool_initialized = True
-                    pool_stats = {
-                        'minsize': getattr(self.pool, 'minsize', 'unknown'),
-                        'maxsize': getattr(self.pool, 'maxsize', 'unknown')
-                    }
-                    self.logger.info(f"aiomysql connection pool initialized (table: {self.table_name}, config: {pool_stats})")
+                    self.logger.info(
+                        f"aiomysql connection pool initialized (table: {self.table_name}, "
+                        f"using the globally shared connection pool)"
+                    )
                 except Exception as e:
                     self.logger.error(f"aiomysql connection pool initialization failed: {e}")
                     # Reset state so initialization can be retried
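
Both MySQL pipelines above now leave their pools open on close, deferring cleanup to MySQLConnectionPoolManager.close_all_pools() (and, on the MongoDB side, MongoConnectionPoolManager.close_all_clients()). Where crawlo actually invokes these helpers is outside this diff; a hedged sketch of a single shutdown hook, assuming both helpers are coroutines, would be:

# Illustrative only -- the framework's real shutdown call site is not shown in this diff.
from crawlo.utils.mongo_connection_pool import MongoConnectionPoolManager
from crawlo.utils.mysql_connection_pool import MySQLConnectionPoolManager


async def shutdown_shared_pools():
    """Close globally shared DB resources once, after every crawler has finished."""
    await MySQLConnectionPoolManager.close_all_pools()
    await MongoConnectionPoolManager.close_all_clients()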
crawlo/pipelines/pipeline_manager.py CHANGED
@@ -4,11 +4,11 @@ from typing import List
 from pprint import pformat
 from asyncio import create_task

-from crawlo.utils.log import get_logger
-from crawlo.event import item_successful, item_discard
+from crawlo.logging import get_logger
+from crawlo.event import CrawlerEvent
 from crawlo.utils.misc import load_object
 from crawlo.project import common_call
-from crawlo.exceptions import PipelineInitError, ItemDiscard, InvalidOutputError, DropItem
+from crawlo.exceptions import PipelineInitError, ItemDiscard, InvalidOutputError


 def get_dedup_pipeline_classes():
@@ -34,7 +34,7 @@ class PipelineManager:
         self.pipelines: List = []
         self.methods: List = []

-        self.logger = get_logger(self.__class__.__name__, self.crawler.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)
         pipelines = self.crawler.settings.get_list('PIPELINES')
         dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')

@@ -88,13 +88,13 @@ class PipelineManager:
                 item = await common_call(method, item, self.crawler.spider)
                 if item is None:
                     raise InvalidOutputError(f"{method.__qualname__} return None is not supported.")
-            except (ItemDiscard, DropItem) as exc:  # catch both exception types
+            except ItemDiscard as exc:
                 self.logger.debug(f"Item discarded by pipeline: {exc}")
-                create_task(self.crawler.subscriber.notify(item_discard, item, exc, self.crawler.spider))
+                create_task(self.crawler.subscriber.notify(CrawlerEvent.ITEM_DISCARD, item, exc, self.crawler.spider))
                 # Re-raise so upstream callers can catch it and the remaining pipelines stop executing
                 raise
-            except (ItemDiscard, DropItem):
+            except ItemDiscard:
                 # The exception has already been handled and notified; just re-raise here
                 raise
             else:
-                create_task(self.crawler.subscriber.notify(item_successful, item, self.crawler.spider))
+                create_task(self.crawler.subscriber.notify(CrawlerEvent.ITEM_SUCCESSFUL, item, self.crawler.spider))
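
The item_successful/item_discard signal objects are replaced by members of a CrawlerEvent type, consistent with the reworked crawlo/event.py (+42 -8) that this diff does not show. Purely as a guess at its shape, the two members used above could be defined as a simple enum:

# Speculative sketch of crawlo/event.py's CrawlerEvent; only the two member names are taken from this diff.
from enum import Enum


class CrawlerEvent(Enum):
    ITEM_SUCCESSFUL = "item_successful"
    ITEM_DISCARD = "item_discard"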
crawlo/pipelines/redis_dedup_pipeline.py CHANGED
@@ -17,9 +17,9 @@ from typing import Optional

 from crawlo import Item
 from crawlo.spider import Spider
-from crawlo.exceptions import DropItem, ItemDiscard
+from crawlo.exceptions import ItemDiscard
 from crawlo.utils.fingerprint import FingerprintGenerator
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger


 class RedisDedupPipeline:
@@ -31,8 +31,7 @@ class RedisDedupPipeline:
             redis_port: int = 6379,
             redis_db: int = 0,
             redis_password: Optional[str] = None,
-            redis_key: str = 'crawlo:item_fingerprints',
-            log_level: str = "INFO"
+            redis_key: str = 'crawlo:item_fingerprints'
     ):
         """
         Initialize the Redis dedup pipeline
@@ -42,9 +41,8 @@
         :param redis_db: Redis database number
         :param redis_password: Redis password
         :param redis_key: Redis key under which fingerprints are stored
-        :param log_level: log level
         """
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)

         # Initialize the Redis connection
         try:
@@ -59,8 +57,6 @@
             )
             # Test the connection
             self.redis_client.ping()
-            # Change INFO level log to DEBUG level to avoid redundant output
-            # self.logger.debug(f"Redis connection successful: {redis_host}:{redis_port}/{redis_db}")  # duplicate log, commented out
         except Exception as e:
             self.logger.error(f"Redis connection failed: {e}")
             raise RuntimeError(f"Redis connection failed: {e}")
@@ -82,8 +78,7 @@
             redis_port=settings.get_int('REDIS_PORT', 6379),
             redis_db=settings.get_int('REDIS_DB', 0),
             redis_password=settings.get('REDIS_PASSWORD') or None,
-            redis_key=redis_key,
-            log_level=settings.get('LOG_LEVEL', 'INFO')
+            redis_key=redis_key
         )

     def process_item(self, item: Item, spider: Spider) -> Item:
@@ -92,7 +87,7 @@

         :param item: the item to process
         :param spider: the spider instance
-        :return: the processed item, or raises a DropItem exception
+        :return: the processed item, or raises an ItemDiscard exception
         """
         try:
             # Generate the item fingerprint
@@ -150,8 +145,12 @@

             # Note: by default, fingerprints in Redis are not cleaned up
             # If cleanup is needed, it can be configured in the settings
-            if spider.crawler.settings.getbool('REDIS_DEDUP_CLEANUP', False):
-                deleted = self.redis_client.delete(self.redis_key)
-                self.logger.info(f" - Cleaned fingerprints: {deleted}")
+            # Safely access the crawler settings
+            crawler = getattr(spider, 'crawler', None)
+            if crawler and hasattr(crawler, 'settings'):
+                settings = crawler.settings
+                if settings.getbool('REDIS_DEDUP_CLEANUP', False):
+                    deleted = self.redis_client.delete(self.redis_key)
+                    self.logger.info(f" - Cleaned fingerprints: {deleted}")
         except Exception as e:
             self.logger.error(f"Error closing spider: {e}")
crawlo/project.py CHANGED
@@ -5,7 +5,7 @@ from inspect import iscoroutinefunction
 from typing import Callable, Optional, Any

 from crawlo.settings.setting_manager import SettingManager
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger

 # Use a global logger to avoid each module creating its own lazy-initialization helper
 # Fetch the logger lazily to ensure it is obtained after the logging system has been configured
crawlo/queue/__init__.py CHANGED
@@ -0,0 +1,10 @@
+"""Queue management module"""
+from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
+from crawlo.queue.pqueue import SpiderPriorityQueue
+
+__all__ = [
+    'QueueManager',
+    'QueueConfig',
+    'QueueType',
+    'SpiderPriorityQueue',
+]
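
With this new package __init__, the queue primitives can be imported directly from crawlo.queue, for example:

# Import style enabled by the new crawlo/queue/__init__.py
from crawlo.queue import QueueConfig, QueueManager, QueueType, SpiderPriorityQueue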
crawlo/queue/queue_manager.py CHANGED
@@ -15,7 +15,7 @@ if TYPE_CHECKING:

 from crawlo.queue.pqueue import SpiderPriorityQueue
 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.utils.request_serializer import RequestSerializer

 try:
@@ -123,9 +123,11 @@ class QueueConfig:
             max_queue_size: int = 1000,
             max_retries: int = 3,
             timeout: int = 300,
+            run_mode: Optional[str] = None,  # new: run mode
             **kwargs
     ):
         self.queue_type = QueueType(queue_type) if isinstance(queue_type, str) else queue_type
+        self.run_mode = run_mode  # store the run mode

         # Redis configuration
         if redis_url:
@@ -166,7 +168,8 @@
             queue_name=queue_name,
             max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
             max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
-            timeout=settings.get_int('QUEUE_TIMEOUT', 300)
+            timeout=settings.get_int('QUEUE_TIMEOUT', 300),
+            run_mode=settings.get('RUN_MODE')  # pass the run mode through
         )


@@ -224,6 +227,17 @@ class QueueManager:

            return False  # no config update needed by default

+        except RuntimeError as e:
+            # A RuntimeError in distributed mode must be re-raised
+            if self.config.run_mode == 'distributed':
+                self.logger.error(f"Queue initialization failed: {e}")
+                self._health_status = "error"
+                raise  # re-raise the exception
+            # Other modes log the error but do not raise
+            self.logger.error(f"Queue initialization failed: {e}")
+            self.logger.debug(f"Detailed error info:\n{traceback.format_exc()}")
+            self._health_status = "error"
+            return False
         except Exception as e:
             # Log detailed error information and the stack trace
             self.logger.error(f"Queue initialization failed: {e}")
@@ -403,23 +417,63 @@
                 return QueueType.MEMORY

         elif self.config.queue_type == QueueType.REDIS:
-            # When QUEUE_TYPE = 'redis', behave the same as 'auto' mode
-            # Prefer Redis if available; fall back to the memory queue if not
-            if REDIS_AVAILABLE and self.config.redis_url:
+            # Distributed mode: Redis is required, no fallback allowed
+            if self.config.run_mode == 'distributed':
+                # Distributed mode must make sure Redis is available
+                if not REDIS_AVAILABLE:
+                    error_msg = (
+                        "Distributed mode requires Redis, but the Redis client library is not installed.\n"
+                        "Please install Redis support: pip install redis"
+                    )
+                    self.logger.error(error_msg)
+                    raise RuntimeError(error_msg)
+
+                if not self.config.redis_url:
+                    error_msg = (
+                        "Distributed mode requires Redis connection settings.\n"
+                        "Please configure REDIS_HOST, REDIS_PORT, etc. in settings.py"
+                    )
+                    self.logger.error(error_msg)
+                    raise RuntimeError(error_msg)
+
                 # Test the Redis connection
                 try:
                     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
                     test_queue = RedisPriorityQueue(self.config.redis_url)
                     await test_queue.connect()
                     await test_queue.close()
-                    self.logger.debug("Redis mode: Redis available, using distributed queue")
+                    self.logger.debug("Distributed mode: Redis connection verified")
                     return QueueType.REDIS
                 except Exception as e:
-                    self.logger.debug(f"Redis mode: Redis unavailable ({e}), falling back to memory queue")
-                    return QueueType.MEMORY
+                    error_msg = (
+                        f"Distributed mode requires Redis, but the Redis server could not be reached.\n"
+                        f"Error: {e}\n"
+                        f"Redis URL: {self.config.redis_url}\n"
+                        f"Please check:\n"
+                        f" 1. whether the Redis service is running\n"
+                        f" 2. whether the Redis connection settings are correct\n"
+                        f" 3. whether the network connection is working"
+                    )
+                    self.logger.error(error_msg)
+                    raise RuntimeError(error_msg) from e
             else:
-                self.logger.debug("Redis mode: Redis not configured, falling back to memory queue")
-                return QueueType.MEMORY
+                # Non-distributed mode: QUEUE_TYPE='redis' is allowed to fall back to memory
+                # This provides backward compatibility and better fault tolerance
+                if REDIS_AVAILABLE and self.config.redis_url:
+                    # Test the Redis connection
+                    try:
+                        from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+                        test_queue = RedisPriorityQueue(self.config.redis_url)
+                        await test_queue.connect()
+                        await test_queue.close()
+                        self.logger.debug("Redis mode: Redis available, using distributed queue")
+                        return QueueType.REDIS
+                    except Exception as e:
+                        self.logger.warning(f"Redis mode: Redis unavailable ({e}), falling back to memory queue")
+                        return QueueType.MEMORY
+                else:
+                    self.logger.warning("Redis mode: Redis not configured, falling back to memory queue")
+                    return QueueType.MEMORY

         elif self.config.queue_type == QueueType.MEMORY:
             return QueueType.MEMORY
@@ -489,9 +543,21 @@
         except Exception as e:
             self.logger.warning(f"Queue health check failed: {e}")
             self._health_status = "unhealthy"
-            # If this is a Redis queue and the health check failed, try switching to the memory queue
-            # Fallback is allowed for both AUTO and REDIS modes
-            if self._queue_type == QueueType.REDIS and self.config.queue_type in [QueueType.AUTO, QueueType.REDIS]:
+
+            # In distributed mode a failed Redis health check must raise an error
+            if self.config.run_mode == 'distributed':
+                error_msg = (
+                    f"Redis health check failed in distributed mode.\n"
+                    f"Error: {e}\n"
+                    f"Redis URL: {self.config.redis_url}\n"
+                    f"Distributed mode does not allow falling back to the memory queue; please fix the Redis connection."
+                )
+                self.logger.error(error_msg)
+                raise RuntimeError(error_msg) from e
+
+            # Non-distributed mode: if this is a Redis queue and the health check failed, try switching to the memory queue
+            # Fallback is allowed for AUTO mode
+            if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
                 self.logger.info("Redis queue unavailable, attempting to switch to memory queue...")
                 try:
                     await self._queue.close()
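
The strict behaviour above only applies when RUN_MODE resolves to 'distributed'; otherwise QUEUE_TYPE='redis' keeps its fallback to the memory queue. A minimal settings.py fragment for the strict path might look like the following (values are placeholders; the option names are taken from this diff's settings lookups and comments):

# Placeholder values for a distributed-mode configuration sketch.
RUN_MODE = "distributed"    # makes QueueManager treat Redis as mandatory (no memory fallback)
QUEUE_TYPE = "redis"

REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PASSWORD = ""

SCHEDULER_MAX_QUEUE_SIZE = 1000
QUEUE_MAX_RETRIES = 3
QUEUE_TIMEOUT = 300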