crawlo 1.3.2__py3-none-any.whl → 1.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo has been flagged as possibly problematic.

Files changed (105)
  1. crawlo/__init__.py +24 -0
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/run.py +58 -32
  4. crawlo/core/__init__.py +44 -0
  5. crawlo/core/engine.py +119 -45
  6. crawlo/core/scheduler.py +4 -3
  7. crawlo/crawler.py +603 -1133
  8. crawlo/downloader/aiohttp_downloader.py +4 -2
  9. crawlo/extension/__init__.py +1 -1
  10. crawlo/extension/logging_extension.py +23 -7
  11. crawlo/factories/__init__.py +28 -0
  12. crawlo/factories/base.py +69 -0
  13. crawlo/factories/crawler.py +104 -0
  14. crawlo/factories/registry.py +85 -0
  15. crawlo/filters/aioredis_filter.py +25 -2
  16. crawlo/framework.py +292 -0
  17. crawlo/initialization/__init__.py +40 -0
  18. crawlo/initialization/built_in.py +426 -0
  19. crawlo/initialization/context.py +142 -0
  20. crawlo/initialization/core.py +194 -0
  21. crawlo/initialization/phases.py +149 -0
  22. crawlo/initialization/registry.py +146 -0
  23. crawlo/items/base.py +2 -1
  24. crawlo/logging/__init__.py +38 -0
  25. crawlo/logging/config.py +97 -0
  26. crawlo/logging/factory.py +129 -0
  27. crawlo/logging/manager.py +112 -0
  28. crawlo/middleware/middleware_manager.py +1 -1
  29. crawlo/middleware/offsite.py +1 -1
  30. crawlo/mode_manager.py +26 -1
  31. crawlo/pipelines/pipeline_manager.py +2 -1
  32. crawlo/project.py +76 -46
  33. crawlo/queue/pqueue.py +11 -5
  34. crawlo/queue/queue_manager.py +143 -19
  35. crawlo/queue/redis_priority_queue.py +69 -49
  36. crawlo/settings/default_settings.py +110 -14
  37. crawlo/settings/setting_manager.py +29 -13
  38. crawlo/spider/__init__.py +34 -16
  39. crawlo/stats_collector.py +17 -3
  40. crawlo/task_manager.py +112 -3
  41. crawlo/templates/project/settings.py.tmpl +103 -202
  42. crawlo/templates/project/settings_distributed.py.tmpl +122 -135
  43. crawlo/templates/project/settings_gentle.py.tmpl +149 -43
  44. crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
  45. crawlo/templates/project/settings_minimal.py.tmpl +46 -15
  46. crawlo/templates/project/settings_simple.py.tmpl +138 -75
  47. crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
  48. crawlo/templates/run.py.tmpl +10 -14
  49. crawlo/templates/spiders_init.py.tmpl +10 -0
  50. crawlo/tools/network_diagnostic.py +365 -0
  51. crawlo/utils/class_loader.py +26 -0
  52. crawlo/utils/error_handler.py +76 -35
  53. crawlo/utils/log.py +41 -144
  54. crawlo/utils/redis_connection_pool.py +43 -6
  55. crawlo/utils/request_serializer.py +8 -1
  56. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
  57. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
  58. tests/authenticated_proxy_example.py +2 -2
  59. tests/baidu_performance_test.py +109 -0
  60. tests/baidu_test.py +60 -0
  61. tests/comprehensive_framework_test.py +213 -0
  62. tests/comprehensive_test.py +82 -0
  63. tests/comprehensive_testing_summary.md +187 -0
  64. tests/debug_configure.py +70 -0
  65. tests/debug_framework_logger.py +85 -0
  66. tests/debug_log_levels.py +64 -0
  67. tests/distributed_test.py +67 -0
  68. tests/distributed_test_debug.py +77 -0
  69. tests/final_command_test_report.md +0 -0
  70. tests/final_comprehensive_test.py +152 -0
  71. tests/final_validation_test.py +183 -0
  72. tests/framework_performance_test.py +203 -0
  73. tests/optimized_performance_test.py +212 -0
  74. tests/performance_comparison.py +246 -0
  75. tests/queue_blocking_test.py +114 -0
  76. tests/queue_test.py +90 -0
  77. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  78. tests/scrapy_comparison/scrapy_test.py +134 -0
  79. tests/simple_command_test.py +120 -0
  80. tests/simple_crawlo_test.py +128 -0
  81. tests/simple_log_test.py +58 -0
  82. tests/simple_optimization_test.py +129 -0
  83. tests/simple_spider_test.py +50 -0
  84. tests/simple_test.py +48 -0
  85. tests/test_all_commands.py +231 -0
  86. tests/test_batch_processor.py +179 -0
  87. tests/test_component_factory.py +175 -0
  88. tests/test_controlled_spider_mixin.py +80 -0
  89. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  90. tests/test_factories.py +253 -0
  91. tests/test_framework_logger.py +67 -0
  92. tests/test_framework_startup.py +65 -0
  93. tests/test_large_scale_config.py +113 -0
  94. tests/test_large_scale_helper.py +236 -0
  95. tests/test_mode_change.py +73 -0
  96. tests/test_mode_consistency.py +1 -1
  97. tests/test_performance_monitor.py +116 -0
  98. tests/test_queue_empty_check.py +42 -0
  99. tests/untested_features_report.md +139 -0
  100. tests/verify_debug.py +52 -0
  101. tests/verify_log_fix.py +112 -0
  102. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  103. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
  104. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
  105. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
crawlo/logging/factory.py ADDED

@@ -0,0 +1,129 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Logger factory - creates and caches Logger instances
+ """
+
+ import logging
+ import os
+ import threading
+ from logging.handlers import RotatingFileHandler
+ from typing import Dict, Optional
+ from weakref import WeakValueDictionary
+
+ from .manager import get_config, is_configured, configure
+ from .config import LogConfig
+
+
+ class LoggerFactory:
+     """
+     Logger factory class - responsible for creating and caching Logger instances
+
+     Features:
+     1. Uses a WeakValueDictionary to avoid memory leaks
+     2. Thread-safe Logger creation
+     3. Automatic configuration management
+     4. Simple caching strategy
+     """
+
+     # Logger cache - weak references avoid memory leaks
+     _logger_cache: WeakValueDictionary = WeakValueDictionary()
+     _cache_lock = threading.RLock()
+
+     @classmethod
+     def get_logger(cls, name: str = 'crawlo') -> logging.Logger:
+         """
+         Get a Logger instance
+
+         Args:
+             name: Logger name
+
+         Returns:
+             logging.Logger: a configured Logger instance
+         """
+         # Make sure the logging system has been configured
+         if not is_configured():
+             configure()  # fall back to the default configuration
+
+         # Check the cache
+         with cls._cache_lock:
+             if name in cls._logger_cache:
+                 return cls._logger_cache[name]
+
+             # Create a new Logger
+             logger = cls._create_logger(name)
+             cls._logger_cache[name] = logger
+             return logger
+
+     @classmethod
+     def _create_logger(cls, name: str) -> logging.Logger:
+         """Create a new Logger instance"""
+         config = get_config()
+         if not config:
+             raise RuntimeError("Log system not configured")
+
+         # Create the Logger
+         logger = logging.getLogger(name)
+         logger.setLevel(logging.DEBUG)  # the Logger itself is set to the lowest level
+
+         # Clear existing handlers (avoid duplicate additions)
+         logger.handlers.clear()
+
+         # Resolve the per-module level
+         module_level = config.get_module_level(name)
+         level = getattr(logging, module_level.upper(), logging.INFO)
+
+         # Create the formatter
+         formatter = logging.Formatter(config.format)
+
+         # Add a console handler
+         if config.console_enabled:
+             console_handler = logging.StreamHandler()
+             console_handler.setFormatter(formatter)
+             console_handler.setLevel(level)
+             logger.addHandler(console_handler)
+
+         # Add a file handler
+         if config.file_enabled and config.file_path:
+             try:
+                 # Make sure the log directory exists
+                 log_dir = os.path.dirname(config.file_path)
+                 if log_dir and not os.path.exists(log_dir):
+                     os.makedirs(log_dir, exist_ok=True)
+
+                 file_handler = RotatingFileHandler(
+                     filename=config.file_path,
+                     maxBytes=config.max_bytes,
+                     backupCount=config.backup_count,
+                     encoding=config.encoding
+                 )
+                 file_handler.setFormatter(formatter)
+                 file_handler.setLevel(level)
+                 logger.addHandler(file_handler)
+             except Exception:
+                 # If the file handler cannot be created, at least keep console output
+                 pass
+
+         # Prevent propagation upwards (avoid duplicate output)
+         logger.propagate = False
+
+         return logger
+
+     @classmethod
+     def clear_cache(cls):
+         """Clear the Logger cache"""
+         with cls._cache_lock:
+             cls._logger_cache.clear()
+
+     @classmethod
+     def refresh_loggers(cls, new_config: LogConfig):
+         """Refresh all cached Loggers (used when the configuration is updated)"""
+         with cls._cache_lock:
+             # Clear the cache to force re-creation
+             cls._logger_cache.clear()
+
+
+ # Convenience function
+ def get_logger(name: str = 'crawlo') -> logging.Logger:
+     """Convenience function for getting a Logger instance"""
+     return LoggerFactory.get_logger(name)
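A minimal usage sketch of the factory (import path taken from the file list above; `crawlo/logging/__init__.py` may also re-export `get_logger`). The first call triggers `configure()` implicitly, so no explicit setup is needed:

```python
from crawlo.logging.factory import get_logger

# First call configures the log system with defaults, creates the logger,
# and caches it; the second call for the same name is a cache hit.
log_a = get_logger("crawlo.engine")
log_b = get_logger("crawlo.engine")
assert log_a is log_b

log_a.info("engine started")  # level and handlers come from the active LogConfig
```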
crawlo/logging/manager.py ADDED

@@ -0,0 +1,112 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Log manager - core component
+ """
+
+ import threading
+ from typing import Optional, Any
+ from .config import LogConfig
+
+
+ class LogManager:
+     """
+     Log manager - singleton
+
+     Responsibilities:
+     1. Global logging configuration management
+     2. Configuration state tracking
+     3. Thread-safe configuration updates
+     """
+
+     _instance: Optional['LogManager'] = None
+     _lock = threading.Lock()
+
+     def __new__(cls) -> 'LogManager':
+         if cls._instance is None:
+             with cls._lock:
+                 if cls._instance is None:
+                     cls._instance = super(LogManager, cls).__new__(cls)
+                     cls._instance._initialized = False
+         return cls._instance
+
+     def __init__(self):
+         if hasattr(self, '_initialized') and self._initialized:
+             return
+
+         self._config: Optional[LogConfig] = None
+         self._configured = False
+         self._config_lock = threading.RLock()
+         self._initialized = True
+
+     @property
+     def config(self) -> Optional[LogConfig]:
+         """Get the current configuration"""
+         with self._config_lock:
+             return self._config
+
+     @property
+     def is_configured(self) -> bool:
+         """Check whether the system has been configured"""
+         return self._configured
+
+     def configure(self, settings=None, **kwargs) -> LogConfig:
+         """
+         Configure the logging system
+
+         Args:
+             settings: a configuration object, or None
+             **kwargs: configuration passed as keyword arguments
+
+         Returns:
+             LogConfig: the configuration object that took effect
+         """
+         with self._config_lock:
+             # Always reconfigure, even if already configured
+             # Build the configuration from the available sources
+             if settings is not None:
+                 # Check whether settings is already a LogConfig object
+                 if isinstance(settings, LogConfig):
+                     config = settings
+                 else:
+                     config = LogConfig.from_settings(settings)
+             elif kwargs:
+                 config = LogConfig.from_dict(kwargs)
+             else:
+                 config = LogConfig()  # use the default configuration
+
+             # Validate the configuration
+             if not config.validate():
+                 raise ValueError("Invalid log configuration")
+
+             self._config = config
+             self._configured = True
+
+             return config
+
+     def reset(self):
+         """Reset the configuration (mainly for tests)"""
+         with self._config_lock:
+             self._config = None
+             self._configured = False
+
+
+ # Global instance
+ _log_manager = LogManager()
+
+ # Module-level convenience functions
+ def configure(settings=None, **kwargs) -> LogConfig:
+     """Configure the logging system"""
+     return _log_manager.configure(settings, **kwargs)
+
+ def is_configured() -> bool:
+     """Check whether the system has been configured"""
+     return _log_manager.is_configured
+
+ def get_config() -> Optional[LogConfig]:
+     """Get the current configuration"""
+     return _log_manager.config
+
+ def reset():
+     """Reset the configuration"""
+     _log_manager.reset()
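Taken together, `configure`, `is_configured`, `get_config`, and `reset` give the factory above a single global source of truth. A minimal sketch of the module-level API (the `level` keyword is an assumption about what `LogConfig.from_dict` accepts, not a documented field):

```python
from crawlo.logging import manager

manager.reset()                    # start from a clean state (test-style usage)
assert not manager.is_configured()

# kwargs are routed through LogConfig.from_dict; the 'level' key here is
# hypothetical and stands in for whatever fields LogConfig actually defines.
cfg = manager.configure(level="DEBUG")
assert manager.is_configured()
assert manager.get_config() is cfg
```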
crawlo/middleware/middleware_manager.py CHANGED

@@ -8,7 +8,7 @@ from typing import List, Dict, Callable, Optional
 
  from crawlo import Request, Response
  from crawlo.utils.log import get_logger
- from crawlo.project import load_class
+ from crawlo.utils.class_loader import load_class
  from crawlo.middleware import BaseMiddleware
  from crawlo.project import common_call
  from crawlo.event import ignore_request, response_received
crawlo/middleware/offsite.py CHANGED

@@ -54,7 +54,7 @@ class OffsiteMiddleware:
          o._compile_domains()
 
          # Use the middleware's own logger rather than crawler.logger
-         o.logger.info(f"OffsiteMiddleware enabled, allowed domains: {allowed_domains}")
+         o.logger.debug(f"OffsiteMiddleware enabled, allowed domains: {allowed_domains}")
          return o
 
      def _compile_domains(self):
crawlo/mode_manager.py CHANGED

@@ -26,7 +26,26 @@ class ModeManager:
      """Run mode manager"""
 
      def __init__(self):
-         pass
+         # Initialize the logger lazily to avoid circular dependencies
+         self._logger = None
+         self._debug("Run mode manager initialized")
+
+     def _get_logger(self):
+         """Lazily fetch the logger instance"""
+         if self._logger is None:
+             try:
+                 from crawlo.utils.log import get_logger
+                 self._logger = get_logger(__name__)
+             except Exception:
+                 # If the logging system is not initialized yet, return None
+                 pass
+         return self._logger
+
+     def _debug(self, message: str):
+         """Debug logging"""
+         logger = self._get_logger()
+         if logger:
+             logger.debug(message)
 
      @staticmethod
      def get_standalone_settings() -> Dict[str, Any]:
@@ -94,12 +113,14 @@
      Returns:
          Dict[str, Any]: the configuration dictionary
      """
+     self._debug(f"Resolving run mode: {mode}")
      mode = RunMode(mode.lower())
      mode_info = None
 
      if mode == RunMode.STANDALONE:
          mode_info = "Standalone mode - simple and fast, suited to development and small-to-medium crawls"
          settings = self.get_standalone_settings()
+         self._debug("Applied standalone-mode configuration")
 
      elif mode == RunMode.DISTRIBUTED:
          mode_info = "Distributed mode - supports multi-node scaling, suited to large crawls"
@@ -110,10 +131,12 @@
              redis_db=kwargs.get('redis_db', 0),  # pass through the redis_db parameter
              project_name=kwargs.get('project_name', 'crawlo')
          )
+         self._debug("Applied distributed-mode configuration")
 
      elif mode == RunMode.AUTO:
          mode_info = "Auto-detect mode - picks the best way to run automatically"
          settings = self.get_auto_settings()
+         self._debug("Applied auto-detect-mode configuration")
 
      else:
          raise ValueError(f"Unsupported run mode: {mode}")
@@ -122,10 +145,12 @@
      user_settings = {k: v for k, v in kwargs.items()
                       if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
      settings.update(user_settings)
+     self._debug(f"Merged user-defined settings: {list(user_settings.keys())}")
 
      # Attach the mode info to the settings for later use
      settings['_mode_info'] = mode_info
 
+     self._debug(f"Run-mode resolution complete: {mode}")
      return settings
 
  def from_environment(self) -> Dict[str, Any]:
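The `_get_logger`/`_debug` pair is the recurring pattern in this release for breaking import cycles with the logging package: the logger is resolved on first use, and debug calls silently become no-ops if the log system is not up yet. A standalone sketch of the pattern (class name hypothetical, stdlib `logging` used in place of `crawlo.utils.log`):

```python
import logging

class LazyLogComponent:
    """Illustration of the deferred-logger pattern used by ModeManager."""

    def __init__(self):
        self._logger = None          # do not touch the logging package at import time
        self._debug("component initialized")

    def _get_logger(self):
        if self._logger is None:
            try:
                self._logger = logging.getLogger(__name__)
            except Exception:
                pass                 # logging system not ready: stay silent
        return self._logger

    def _debug(self, message: str):
        logger = self._get_logger()
        if logger:
            logger.debug(message)
```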
crawlo/pipelines/pipeline_manager.py CHANGED

@@ -6,7 +6,8 @@ from asyncio import create_task
 
  from crawlo.utils.log import get_logger
  from crawlo.event import item_successful, item_discard
- from crawlo.project import load_class, common_call
+ from crawlo.utils.class_loader import load_class
+ from crawlo.project import common_call
  from crawlo.exceptions import PipelineInitError, ItemDiscard, InvalidOutputError
 
 
crawlo/project.py CHANGED

@@ -7,10 +7,28 @@ from inspect import iscoroutinefunction
  from typing import Callable, Optional, Any
 
  from crawlo.settings.setting_manager import SettingManager
- from crawlo.utils.log import get_logger, LoggerManager
+ from crawlo.utils.log import get_logger
 
  # Use a global logger so every module does not create its own lazy-init function
- logger = get_logger(__name__)
+ # Fetch the logger lazily, so it is obtained only after the logging system is configured
+ _logger = None
+
+
+ def logger():
+     """Lazily fetch the logger instance, after the logging system has been configured"""
+     global _logger
+     if _logger is None:
+         _logger = get_logger(__name__)
+     return _logger
+
+
+ # A temporary logging helper, used before the logging system is configured
+ def _temp_debug(message):
+     """Temporary debug helper for use before the logging system is configured"""
+     # Print straight to the console to avoid circular dependencies
+     import os
+     if os.environ.get('CRAWLO_DEBUG'):
+         print(f"[CRAWLO_DEBUG] {message}")
 
 
  def load_class(path: str) -> Any:
@@ -23,12 +41,9 @@ def load_class(path: str) -> Any:
      Returns:
          the loaded class object
      """
-     try:
-         module_path, class_name = path.rsplit('.', 1)
-         module = importlib.import_module(module_path)
-         return getattr(module, class_name)
-     except (ValueError, ImportError, AttributeError) as e:
-         raise ImportError(f"Failed to load class '{path}': {e}")
+     # Delegate to the utility-module implementation to avoid circular dependencies
+     from crawlo.utils.class_loader import load_class as _load_class
+     return _load_class(path)
 
 
  def merge_settings(spider, settings):
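`load_class` keeps its dotted-path contract while the implementation moves to `crawlo.utils.class_loader`, so existing imports from `crawlo.project` keep working. A quick sketch (the stdlib target is just an example):

```python
from crawlo.project import load_class

# "package.module.ClassName" is split on the last dot, the module is
# imported, and the named attribute is returned.
cls = load_class("collections.OrderedDict")
assert cls.__name__ == "OrderedDict"
```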
@@ -42,7 +57,7 @@
      spider_name = getattr(spider, 'name', 'UnknownSpider')
      # Check whether settings is a SettingManager instance
      if not hasattr(settings, 'update_attributes'):
-         logger.error(f"merge_settings received a settings object that is not a SettingManager instance: {type(settings)}")
+         _temp_debug(f"merge_settings received a settings object that is not a SettingManager instance: {type(settings)}")
          # If it is a dict, create a new SettingManager instance
          if isinstance(settings, dict):
              from crawlo.settings.setting_manager import SettingManager
@@ -50,14 +65,14 @@
              new_settings.update_attributes(settings)
              settings = new_settings
          else:
-             _temp_debug("Unhandled settings type")  # was: logger.error(...)
+             _temp_debug("Unhandled settings type")
              return
 
      if hasattr(spider, 'custom_settings'):
          custom_settings = getattr(spider, 'custom_settings')
          settings.update_attributes(custom_settings)
      else:
-         logger.debug(f"Spider '{spider_name}' has no custom_settings, skipping merge")
+         _temp_debug(f"Spider '{spider_name}' has no custom_settings, skipping merge")
 
 
  async def common_call(func: Callable, *args, **kwargs):
@@ -85,7 +100,7 @@ def _get_settings_module_from_cfg(cfg_path: str) -> str:
      config.read(cfg_path, encoding="utf-8")
      if config.has_section("settings") and config.has_option("settings", "default"):
          module_path = config.get("settings", "default")
-         logger.debug(f"📄 Loading settings module from crawlo.cfg: {module_path}")
+         _temp_debug(f"📄 Loading settings module from crawlo.cfg: {module_path}")
          return module_path
      else:
          raise RuntimeError(f"Config file is missing the [settings] section or the default option: {cfg_path}")
@@ -101,41 +116,41 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
      2. '__init__.py' and 'settings.py' are present (i.e. it is a Python package)
      """
      path = os.path.abspath(start_path)
 
      # First check the current directory and its subdirectories
      for root, dirs, files in os.walk(path):
          if "crawlo.cfg" in files:
              cfg_path = os.path.join(root, "crawlo.cfg")
-             logger.debug(f"✅ Found project config file: {cfg_path}")
+             _temp_debug(f"✅ Found project config file: {cfg_path}")
              return root
 
      # Walk upwards until crawlo.cfg, or a directory containing settings.py and __init__.py, is found
      original_path = path
      checked_paths = set()
 
      while True:
          # Avoid infinite loops
          if path in checked_paths:
              break
          checked_paths.add(path)
 
          # Check for crawlo.cfg
          cfg_file = os.path.join(path, "crawlo.cfg")
          if os.path.isfile(cfg_file):
-             logger.debug(f"✅ Found project config file: {cfg_file}")
+             _temp_debug(f"✅ Found project config file: {cfg_file}")
              return path
 
          # Check for settings.py and __init__.py
          settings_file = os.path.join(path, "settings.py")
          init_file = os.path.join(path, "__init__.py")
          if os.path.isfile(settings_file) and os.path.isfile(init_file):
-             logger.debug(f"✅ Found project module: {path}")
+             _temp_debug(f"✅ Found project module: {path}")
              # Even if the project module is found, keep looking upwards for a crawlo.cfg
              parent = os.path.dirname(path)
              if parent != path:
                  parent_cfg = os.path.join(parent, "crawlo.cfg")
                  if os.path.isfile(parent_cfg):
-                     logger.debug(f"✅ Found project config file in a parent directory: {parent_cfg}")
+                     _temp_debug(f"✅ Found project config file in a parent directory: {parent_cfg}")
                      return parent
              return path
@@ -156,22 +171,22 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
          if path in checked_paths:
              break
          checked_paths.add(path)
 
          cfg_file = os.path.join(path, "crawlo.cfg")
          if os.path.isfile(cfg_file):
-             logger.debug(f"✅ Found project config file: {cfg_file}")
+             _temp_debug(f"✅ Found project config file: {cfg_file}")
              return path
 
          settings_file = os.path.join(path, "settings.py")
          init_file = os.path.join(path, "__init__.py")
          if os.path.isfile(settings_file) and os.path.isfile(init_file):
-             logger.debug(f"✅ Found project module: {path}")
+             _temp_debug(f"✅ Found project module: {path}")
              # Even if the project module is found, keep looking upwards for a crawlo.cfg
              parent = os.path.dirname(path)
              if parent != path:
                  parent_cfg = os.path.join(parent, "crawlo.cfg")
                  if os.path.isfile(parent_cfg):
-                     logger.debug(f"✅ Found project config file in a parent directory: {parent_cfg}")
+                     _temp_debug(f"✅ Found project config file in a parent directory: {parent_cfg}")
                      return parent
              return path
@@ -193,22 +208,22 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
          if path in checked_paths:
              break
          checked_paths.add(path)
 
          cfg_file = os.path.join(path, "crawlo.cfg")
          if os.path.isfile(cfg_file):
-             logger.debug(f"Found project config file: {cfg_file}")
+             _temp_debug(f"Found project config file: {cfg_file}")
              return path
 
          settings_file = os.path.join(path, "settings.py")
          init_file = os.path.join(path, "__init__.py")
          if os.path.isfile(settings_file) and os.path.isfile(init_file):
-             logger.debug(f"Found project module: {path}")
+             _temp_debug(f"Found project module: {path}")
              # Even if the project module is found, keep looking upwards for a crawlo.cfg
              parent = os.path.dirname(path)
              if parent != path:
                  parent_cfg = os.path.join(parent, "crawlo.cfg")
                  if os.path.isfile(parent_cfg):
-                     logger.debug(f"Found project config file in a parent directory: {parent_cfg}")
+                     _temp_debug(f"Found project config file in a parent directory: {parent_cfg}")
                      return parent
              return path
@@ -219,13 +234,14 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
      except Exception:
          pass
 
-     logger.warning("Could not find the Crawlo project root. Make sure you run from a directory containing 'crawlo.cfg' or 'settings.py'.")
+     _temp_debug("Could not find the Crawlo project root. Make sure you run from a directory containing 'crawlo.cfg' or 'settings.py'.")
      return None
 
 
- def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
+ def _load_project_settings(custom_settings: Optional[dict] = None) -> SettingManager:
      """
-     Get the settings manager instance (main entry point)
+     Internal helper: load the project configuration (without handling log initialization)
+     This function is responsible only for configuration loading, avoiding a circular dependency with the initialization manager
 
      Args:
          custom_settings: runtime overrides; they take precedence over settings.py
@@ -233,7 +249,7 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
      Returns:
          SettingManager: an instance with the configuration loaded
      """
-     logger.debug("🚀 Initializing the Crawlo project configuration...")
+     _temp_debug("🚀 Loading the Crawlo project configuration...")
 
      # 1. Find the project root
      project_root = _find_project_root()
@@ -250,21 +266,21 @@
          # Infer: <project directory name>.settings
          project_name = os.path.basename(project_root)
          settings_module_path = f"{project_name}.settings"
-         logger.warning(f"⚠️ crawlo.cfg not found, inferring the settings module as: {settings_module_path}")
+         _temp_debug(f"⚠️ crawlo.cfg not found, inferring the settings module as: {settings_module_path}")
 
      # 3. Inject into sys.path
      project_root_str = os.path.abspath(project_root)
      if project_root_str not in sys.path:
          sys.path.insert(0, project_root_str)
-         logger.debug(f"📁 Project root added to sys.path: {project_root_str}")
+         _temp_debug(f"📁 Project root added to sys.path: {project_root_str}")
 
      # 4. Load the SettingManager
-     logger.debug(f"⚙️ Loading the settings module: {settings_module_path}")
+     _temp_debug(f"⚙️ Loading the settings module: {settings_module_path}")
      settings = SettingManager()
 
      try:
          settings.set_settings(settings_module_path)
-         logger.debug("✅ settings module loaded successfully")
+         _temp_debug("✅ settings module loaded successfully")
      except Exception as e:
          raise ImportError(f"Failed to load settings module '{settings_module_path}': {e}")
@@ -279,19 +295,33 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
          # Only apply a mode setting when the user has not set that key
          if key not in settings.attributes:
              settings.set(key, value)
-     logger.debug(f"🔧 Applied {run_mode} mode configuration")
+     _temp_debug(f"🔧 Applied {run_mode} mode configuration")
 
      # 6. Merge runtime overrides
      if custom_settings:
          settings.update_attributes(custom_settings)
-         logger.debug(f"🔧 Applied runtime overrides: {list(custom_settings.keys())}")
+         _temp_debug(f"🔧 Applied runtime overrides: {list(custom_settings.keys())}")
+
+     _temp_debug("🎉 Crawlo project configuration loaded!")
+     return settings
 
-     # 7. Show a summary of the core configuration (INFO level)
-     # _log_settings_summary(settings)
 
-     # Configure the logging system
-     LoggerManager.configure(settings)
-
-     # Demote the "project initialized" message to DEBUG level
-     logger.debug("🎉 Crawlo project configuration initialized!")
-     return settings
+ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
+     """
+     Get the settings manager instance (main entry point)
+
+     Note: this function is kept as a backward-compatible entry point; the actual
+     initialization logic has moved to the crawlo.core.framework_initializer module.
+     The new initialization style is recommended:
+
+     >>> from crawlo.core.framework_initializer import initialize_framework
+     >>> settings = initialize_framework(custom_settings)
+
+     Args:
+         custom_settings: runtime overrides; they take precedence over settings.py
+
+     Returns:
+         SettingManager: an instance with the configuration loaded
+     """
+     # Use the new unified initialization manager
+     from crawlo.core.framework_initializer import initialize_framework
+     return initialize_framework(custom_settings)
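Since `_temp_debug` only prints when the `CRAWLO_DEBUG` environment variable is set, bootstrap stays quiet by default; setting the variable surfaces the project-discovery trace. A sketch, assuming it runs inside a Crawlo project directory (the override key is illustrative):

```python
import os

# Any non-empty value enables the [CRAWLO_DEBUG] console trace emitted by
# _temp_debug during project-root discovery and settings loading.
os.environ["CRAWLO_DEBUG"] = "1"

from crawlo.project import get_settings

# get_settings is now a thin wrapper over initialize_framework.
settings = get_settings({"CONCURRENCY": 8})  # hypothetical override key
```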
crawlo/queue/pqueue.py CHANGED

@@ -3,7 +3,7 @@ import json
  import sys
  import asyncio
  from asyncio import PriorityQueue
- from typing import Optional
+ from typing import Optional, Tuple, Any
 
 
  from crawlo import Request
@@ -16,12 +16,12 @@ class SpiderPriorityQueue(PriorityQueue):
          """Initialize the queue; a maxsize of 0 means no size limit"""
          super().__init__(maxsize)
 
-     async def get(self, timeout: float = 0.1) -> Optional[Request]:
+     async def get(self, timeout: float = 0.01) -> Optional[Any]:
          """
          Asynchronously get an element from the queue, with a timeout
 
          Args:
-             timeout: timeout in seconds, default 0.1
+             timeout: timeout in seconds, default 0.01
 
          Returns:
              a queue element (priority, value), or None on timeout
@@ -30,8 +30,14 @@
              # Pick the timeout implementation based on the Python version
              if sys.version_info >= (3, 11):
                  async with asyncio.timeout(timeout):
-                     return await super().get()
+                     item = await super().get()
+                     return item
              else:
-                 return await asyncio.wait_for(super().get(), timeout=timeout)
+                 item = await asyncio.wait_for(super().get(), timeout=timeout)
+                 return item
          except asyncio.TimeoutError:
              return None
+
+     def qsize(self) -> int:
+         """Get the queue size"""
+         return super().qsize()
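With the default timeout cut from 0.1 s to 0.01 s, `get()` behaves like a fast poll: an empty queue yields `None` almost immediately instead of stalling the caller. A minimal sketch:

```python
import asyncio

from crawlo.queue.pqueue import SpiderPriorityQueue

async def main():
    q = SpiderPriorityQueue()

    # Empty queue: get() returns None after the ~0.01 s default timeout.
    assert await q.get() is None

    # Items are (priority, value) tuples; a lower priority value sorts first.
    await q.put((0, "urgent"))
    await q.put((5, "later"))
    print(await q.get(timeout=0.5))  # -> (0, 'urgent')
    print(q.qsize())                 # -> 1

asyncio.run(main())
```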