crawlo 1.3.2__py3-none-any.whl → 1.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +24 -0
- crawlo/__version__.py +1 -1
- crawlo/commands/run.py +58 -32
- crawlo/core/__init__.py +44 -0
- crawlo/core/engine.py +119 -45
- crawlo/core/scheduler.py +4 -3
- crawlo/crawler.py +603 -1133
- crawlo/downloader/aiohttp_downloader.py +4 -2
- crawlo/extension/__init__.py +1 -1
- crawlo/extension/logging_extension.py +23 -7
- crawlo/factories/__init__.py +28 -0
- crawlo/factories/base.py +69 -0
- crawlo/factories/crawler.py +104 -0
- crawlo/factories/registry.py +85 -0
- crawlo/filters/aioredis_filter.py +25 -2
- crawlo/framework.py +292 -0
- crawlo/initialization/__init__.py +40 -0
- crawlo/initialization/built_in.py +426 -0
- crawlo/initialization/context.py +142 -0
- crawlo/initialization/core.py +194 -0
- crawlo/initialization/phases.py +149 -0
- crawlo/initialization/registry.py +146 -0
- crawlo/items/base.py +2 -1
- crawlo/logging/__init__.py +38 -0
- crawlo/logging/config.py +97 -0
- crawlo/logging/factory.py +129 -0
- crawlo/logging/manager.py +112 -0
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/middleware/offsite.py +1 -1
- crawlo/mode_manager.py +26 -1
- crawlo/pipelines/pipeline_manager.py +2 -1
- crawlo/project.py +76 -46
- crawlo/queue/pqueue.py +11 -5
- crawlo/queue/queue_manager.py +143 -19
- crawlo/queue/redis_priority_queue.py +69 -49
- crawlo/settings/default_settings.py +110 -14
- crawlo/settings/setting_manager.py +29 -13
- crawlo/spider/__init__.py +34 -16
- crawlo/stats_collector.py +17 -3
- crawlo/task_manager.py +112 -3
- crawlo/templates/project/settings.py.tmpl +103 -202
- crawlo/templates/project/settings_distributed.py.tmpl +122 -135
- crawlo/templates/project/settings_gentle.py.tmpl +149 -43
- crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
- crawlo/templates/project/settings_minimal.py.tmpl +46 -15
- crawlo/templates/project/settings_simple.py.tmpl +138 -75
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
- crawlo/templates/run.py.tmpl +10 -14
- crawlo/templates/spiders_init.py.tmpl +10 -0
- crawlo/tools/network_diagnostic.py +365 -0
- crawlo/utils/class_loader.py +26 -0
- crawlo/utils/error_handler.py +76 -35
- crawlo/utils/log.py +41 -144
- crawlo/utils/redis_connection_pool.py +43 -6
- crawlo/utils/request_serializer.py +8 -1
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
- tests/authenticated_proxy_example.py +2 -2
- tests/baidu_performance_test.py +109 -0
- tests/baidu_test.py +60 -0
- tests/comprehensive_framework_test.py +213 -0
- tests/comprehensive_test.py +82 -0
- tests/comprehensive_testing_summary.md +187 -0
- tests/debug_configure.py +70 -0
- tests/debug_framework_logger.py +85 -0
- tests/debug_log_levels.py +64 -0
- tests/distributed_test.py +67 -0
- tests/distributed_test_debug.py +77 -0
- tests/final_command_test_report.md +0 -0
- tests/final_comprehensive_test.py +152 -0
- tests/final_validation_test.py +183 -0
- tests/framework_performance_test.py +203 -0
- tests/optimized_performance_test.py +212 -0
- tests/performance_comparison.py +246 -0
- tests/queue_blocking_test.py +114 -0
- tests/queue_test.py +90 -0
- tests/scrapy_comparison/ofweek_scrapy.py +139 -0
- tests/scrapy_comparison/scrapy_test.py +134 -0
- tests/simple_command_test.py +120 -0
- tests/simple_crawlo_test.py +128 -0
- tests/simple_log_test.py +58 -0
- tests/simple_optimization_test.py +129 -0
- tests/simple_spider_test.py +50 -0
- tests/simple_test.py +48 -0
- tests/test_all_commands.py +231 -0
- tests/test_batch_processor.py +179 -0
- tests/test_component_factory.py +175 -0
- tests/test_controlled_spider_mixin.py +80 -0
- tests/test_enhanced_error_handler_comprehensive.py +246 -0
- tests/test_factories.py +253 -0
- tests/test_framework_logger.py +67 -0
- tests/test_framework_startup.py +65 -0
- tests/test_large_scale_config.py +113 -0
- tests/test_large_scale_helper.py +236 -0
- tests/test_mode_change.py +73 -0
- tests/test_mode_consistency.py +1 -1
- tests/test_performance_monitor.py +116 -0
- tests/test_queue_empty_check.py +42 -0
- tests/untested_features_report.md +139 -0
- tests/verify_debug.py +52 -0
- tests/verify_log_fix.py +112 -0
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
日志器工厂 - 创建和缓存Logger实例
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import threading
|
|
10
|
+
from logging.handlers import RotatingFileHandler
|
|
11
|
+
from typing import Dict, Optional
|
|
12
|
+
from weakref import WeakValueDictionary
|
|
13
|
+
|
|
14
|
+
from .manager import get_config, is_configured, configure
|
|
15
|
+
from .config import LogConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LoggerFactory:
    """
    Logger factory - creates and caches configured Logger instances.

    Characteristics:
    1. Caches loggers in a WeakValueDictionary. NOTE(review): since the
       stdlib ``logging`` module keeps a strong reference to every logger
       it creates, entries here are effectively never collected; the weak
       cache is harmless but does not by itself prevent growth.
    2. Thread-safe logger creation (RLock around cache access).
    3. Auto-configures the log system on first use.
    4. Simple cache invalidation via clear_cache()/refresh_loggers().
    """

    # Logger cache - weak references so cached entries do not extend lifetimes.
    _logger_cache: WeakValueDictionary = WeakValueDictionary()
    _cache_lock = threading.RLock()

    @classmethod
    def get_logger(cls, name: str = 'crawlo') -> logging.Logger:
        """
        Return a configured Logger instance for *name*.

        Args:
            name: Logger name (defaults to 'crawlo').

        Returns:
            logging.Logger: a fully configured Logger instance.
        """
        # Make sure the log system is configured before handing out loggers.
        if not is_configured():
            configure()  # fall back to default configuration

        with cls._cache_lock:
            # Serve from cache when possible.
            if name in cls._logger_cache:
                return cls._logger_cache[name]

            # Cache miss: build, cache, and return a new logger.
            logger = cls._create_logger(name)
            cls._logger_cache[name] = logger
            return logger

    @classmethod
    def _create_logger(cls, name: str) -> logging.Logger:
        """Create and configure a new Logger instance for *name*."""
        config = get_config()
        if not config:
            raise RuntimeError("Log system not configured")

        logger = logging.getLogger(name)
        # The logger itself passes everything through; handlers filter by level.
        logger.setLevel(logging.DEBUG)

        # Drop any pre-existing handlers so repeated creation never duplicates output.
        logger.handlers.clear()

        # Resolve the effective level for this module; unknown names fall back to INFO.
        module_level = config.get_module_level(name)
        level = getattr(logging, module_level.upper(), logging.INFO)

        formatter = logging.Formatter(config.format)

        # Console handler (optional).
        if config.console_enabled:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
            console_handler.setLevel(level)
            logger.addHandler(console_handler)

        # Rotating file handler (optional, best-effort).
        if config.file_enabled and config.file_path:
            try:
                # Ensure the log directory exists before opening the file.
                log_dir = os.path.dirname(config.file_path)
                if log_dir and not os.path.exists(log_dir):
                    os.makedirs(log_dir, exist_ok=True)

                file_handler = RotatingFileHandler(
                    filename=config.file_path,
                    maxBytes=config.max_bytes,
                    backupCount=config.backup_count,
                    encoding=config.encoding
                )
                file_handler.setFormatter(formatter)
                file_handler.setLevel(level)
                logger.addHandler(file_handler)
            except Exception:
                # Deliberate best-effort: if the file handler cannot be
                # created (bad path, permissions, ...), keep at least the
                # console output instead of failing logger creation.
                pass

        # Stop propagation to ancestor loggers to avoid duplicate records.
        logger.propagate = False

        return logger

    @classmethod
    def clear_cache(cls):
        """Drop every cached Logger (they will be rebuilt on next access)."""
        with cls._cache_lock:
            cls._logger_cache.clear()

    @classmethod
    def refresh_loggers(cls, new_config: "LogConfig"):
        """
        Invalidate all cached loggers after a configuration change.

        Note: *new_config* is currently unused; the new configuration is
        picked up from the manager when each logger is recreated lazily.
        """
        with cls._cache_lock:
            # Clearing the cache forces recreation with the fresh config.
            cls._logger_cache.clear()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# 便捷函数
|
|
127
|
+
def get_logger(name: str = 'crawlo') -> logging.Logger:
    """Module-level shortcut that delegates to LoggerFactory.get_logger."""
    factory = LoggerFactory
    return factory.get_logger(name)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
日志管理器 - 核心组件
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import threading
|
|
8
|
+
from typing import Optional, Any
|
|
9
|
+
from .config import LogConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LogManager:
    """
    Log manager - process-wide singleton.

    Responsibilities:
    1. Hold the global logging configuration.
    2. Track whether the logging system has been configured.
    3. Provide thread-safe configuration updates.
    """

    _instance: Optional['LogManager'] = None
    _lock = threading.Lock()  # guards singleton creation only

    def __new__(cls) -> 'LogManager':
        # Double-checked locking: cheap unlocked read first, then a locked
        # re-check so exactly one thread creates the instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super(LogManager, cls).__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every LogManager() call; only initialise once.
        if getattr(self, '_initialized', False):
            return

        self._config: "Optional[LogConfig]" = None  # current effective config
        self._configured = False                    # True once configure() succeeds
        self._config_lock = threading.RLock()       # guards _config/_configured
        self._initialized = True

    @property
    def config(self) -> "Optional[LogConfig]":
        """Return the current configuration, or None if not configured."""
        with self._config_lock:
            return self._config

    @property
    def is_configured(self) -> bool:
        """Whether configure() has completed successfully."""
        # Fix: read under the same lock configure()/reset() use to write
        # _configured, so readers never observe a half-applied update
        # (the original read this flag unlocked).
        with self._config_lock:
            return self._configured

    def configure(self, settings=None, **kwargs) -> "LogConfig":
        """
        Configure the logging system.

        Args:
            settings: a LogConfig instance, a settings object, or None.
            **kwargs: keyword configuration (only used when settings is None).

        Returns:
            LogConfig: the configuration that took effect.

        Raises:
            ValueError: if the resulting configuration fails validation.
        """
        with self._config_lock:
            # Always reconfigure, even if already configured.
            # Build the config from whichever source was provided.
            if settings is not None:
                # settings may already be a ready-made LogConfig.
                if isinstance(settings, LogConfig):
                    config = settings
                else:
                    config = LogConfig.from_settings(settings)
            elif kwargs:
                config = LogConfig.from_dict(kwargs)
            else:
                config = LogConfig()  # fall back to defaults

            # Reject invalid configurations before publishing them.
            if not config.validate():
                raise ValueError("Invalid log configuration")

            self._config = config
            self._configured = True

            return config

    def reset(self):
        """Reset the configuration (mainly for tests)."""
        with self._config_lock:
            self._config = None
            self._configured = False
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Shared singleton backing the module-level helpers below.
_log_manager = LogManager()


# Module-level convenience wrappers around the singleton.
def configure(settings=None, **kwargs) -> LogConfig:
    """Configure the logging system via the global manager."""
    return _log_manager.configure(settings, **kwargs)


def is_configured() -> bool:
    """Report whether the logging system has been configured."""
    return _log_manager.is_configured


def get_config() -> Optional[LogConfig]:
    """Return the currently active configuration, if any."""
    return _log_manager.config


def reset():
    """Clear the global configuration (mainly for tests)."""
    _log_manager.reset()
|
|
@@ -8,7 +8,7 @@ from typing import List, Dict, Callable, Optional
|
|
|
8
8
|
|
|
9
9
|
from crawlo import Request, Response
|
|
10
10
|
from crawlo.utils.log import get_logger
|
|
11
|
-
from crawlo.
|
|
11
|
+
from crawlo.utils.class_loader import load_class
|
|
12
12
|
from crawlo.middleware import BaseMiddleware
|
|
13
13
|
from crawlo.project import common_call
|
|
14
14
|
from crawlo.event import ignore_request, response_received
|
crawlo/middleware/offsite.py
CHANGED
|
@@ -54,7 +54,7 @@ class OffsiteMiddleware:
|
|
|
54
54
|
o._compile_domains()
|
|
55
55
|
|
|
56
56
|
# 使用中间件自己的logger而不是crawler.logger
|
|
57
|
-
o.logger.
|
|
57
|
+
o.logger.debug(f"OffsiteMiddleware 已启用,允许的域名: {allowed_domains}")
|
|
58
58
|
return o
|
|
59
59
|
|
|
60
60
|
def _compile_domains(self):
|
crawlo/mode_manager.py
CHANGED
|
@@ -26,7 +26,26 @@ class ModeManager:
|
|
|
26
26
|
"""运行模式管理器"""
|
|
27
27
|
|
|
28
28
|
def __init__(self):
|
|
29
|
-
|
|
29
|
+
# 延迟初始化logger,避免循环依赖
|
|
30
|
+
self._logger = None
|
|
31
|
+
self._debug("运行模式管理器初始化完成")
|
|
32
|
+
|
|
33
|
+
def _get_logger(self):
|
|
34
|
+
"""延迟获取logger实例"""
|
|
35
|
+
if self._logger is None:
|
|
36
|
+
try:
|
|
37
|
+
from crawlo.utils.log import get_logger
|
|
38
|
+
self._logger = get_logger(__name__)
|
|
39
|
+
except Exception:
|
|
40
|
+
# 如果日志系统尚未初始化,返回None
|
|
41
|
+
pass
|
|
42
|
+
return self._logger
|
|
43
|
+
|
|
44
|
+
def _debug(self, message: str):
|
|
45
|
+
"""调试日志"""
|
|
46
|
+
logger = self._get_logger()
|
|
47
|
+
if logger:
|
|
48
|
+
logger.debug(message)
|
|
30
49
|
|
|
31
50
|
@staticmethod
|
|
32
51
|
def get_standalone_settings() -> Dict[str, Any]:
|
|
@@ -94,12 +113,14 @@ class ModeManager:
|
|
|
94
113
|
Returns:
|
|
95
114
|
Dict[str, Any]: 配置字典
|
|
96
115
|
"""
|
|
116
|
+
self._debug(f"解析运行模式: {mode}")
|
|
97
117
|
mode = RunMode(mode.lower())
|
|
98
118
|
mode_info = None
|
|
99
119
|
|
|
100
120
|
if mode == RunMode.STANDALONE:
|
|
101
121
|
mode_info = "使用单机模式 - 简单快速,适合开发和中小规模爬取"
|
|
102
122
|
settings = self.get_standalone_settings()
|
|
123
|
+
self._debug("应用单机模式配置")
|
|
103
124
|
|
|
104
125
|
elif mode == RunMode.DISTRIBUTED:
|
|
105
126
|
mode_info = "使用分布式模式 - 支持多节点扩展,适合大规模爬取"
|
|
@@ -110,10 +131,12 @@ class ModeManager:
|
|
|
110
131
|
redis_db=kwargs.get('redis_db', 0), # 添加 redis_db 参数
|
|
111
132
|
project_name=kwargs.get('project_name', 'crawlo')
|
|
112
133
|
)
|
|
134
|
+
self._debug("应用分布式模式配置")
|
|
113
135
|
|
|
114
136
|
elif mode == RunMode.AUTO:
|
|
115
137
|
mode_info = "使用自动检测模式 - 智能选择最佳运行方式"
|
|
116
138
|
settings = self.get_auto_settings()
|
|
139
|
+
self._debug("应用自动检测模式配置")
|
|
117
140
|
|
|
118
141
|
else:
|
|
119
142
|
raise ValueError(f"不支持的运行模式: {mode}")
|
|
@@ -122,10 +145,12 @@ class ModeManager:
|
|
|
122
145
|
user_settings = {k: v for k, v in kwargs.items()
|
|
123
146
|
if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
|
|
124
147
|
settings.update(user_settings)
|
|
148
|
+
self._debug(f"合并用户自定义配置: {list(user_settings.keys())}")
|
|
125
149
|
|
|
126
150
|
# 将模式信息添加到配置中,供后续使用
|
|
127
151
|
settings['_mode_info'] = mode_info
|
|
128
152
|
|
|
153
|
+
self._debug(f"运行模式解析完成: {mode}")
|
|
129
154
|
return settings
|
|
130
155
|
|
|
131
156
|
def from_environment(self) -> Dict[str, Any]:
|
|
@@ -6,7 +6,8 @@ from asyncio import create_task
|
|
|
6
6
|
|
|
7
7
|
from crawlo.utils.log import get_logger
|
|
8
8
|
from crawlo.event import item_successful, item_discard
|
|
9
|
-
from crawlo.
|
|
9
|
+
from crawlo.utils.class_loader import load_class
|
|
10
|
+
from crawlo.project import common_call
|
|
10
11
|
from crawlo.exceptions import PipelineInitError, ItemDiscard, InvalidOutputError
|
|
11
12
|
|
|
12
13
|
|
crawlo/project.py
CHANGED
|
@@ -7,10 +7,28 @@ from inspect import iscoroutinefunction
|
|
|
7
7
|
from typing import Callable, Optional, Any
|
|
8
8
|
|
|
9
9
|
from crawlo.settings.setting_manager import SettingManager
|
|
10
|
-
from crawlo.utils.log import get_logger
|
|
10
|
+
from crawlo.utils.log import get_logger
|
|
11
11
|
|
|
12
12
|
# 使用全局logger,避免每个模块都创建自己的延迟初始化函数
|
|
13
|
-
logger
|
|
13
|
+
# 延迟获取logger,确保在日志系统配置之后获取
|
|
14
|
+
_logger = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def logger():
|
|
18
|
+
"""延迟获取logger实例,确保在日志系统配置之后获取"""
|
|
19
|
+
global _logger
|
|
20
|
+
if _logger is None:
|
|
21
|
+
_logger = get_logger(__name__)
|
|
22
|
+
return _logger
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# 添加一个临时的日志函数,用于在日志系统配置之前输出信息
|
|
26
|
+
def _temp_debug(message):
|
|
27
|
+
"""临时调试函数,在日志系统配置之前使用"""
|
|
28
|
+
# 直接输出到控制台,避免循环依赖
|
|
29
|
+
import os
|
|
30
|
+
if os.environ.get('CRAWLO_DEBUG'):
|
|
31
|
+
print(f"[CRAWLO_DEBUG] {message}")
|
|
14
32
|
|
|
15
33
|
|
|
16
34
|
def load_class(path: str) -> Any:
|
|
@@ -23,12 +41,9 @@ def load_class(path: str) -> Any:
|
|
|
23
41
|
Returns:
|
|
24
42
|
加载的类对象
|
|
25
43
|
"""
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
return getattr(module, class_name)
|
|
30
|
-
except (ValueError, ImportError, AttributeError) as e:
|
|
31
|
-
raise ImportError(f"无法加载类 '{path}': {e}")
|
|
44
|
+
# 使用工具模块的实现,避免循环依赖
|
|
45
|
+
from crawlo.utils.class_loader import load_class as _load_class
|
|
46
|
+
return _load_class(path)
|
|
32
47
|
|
|
33
48
|
|
|
34
49
|
def merge_settings(spider, settings):
|
|
@@ -42,7 +57,7 @@ def merge_settings(spider, settings):
|
|
|
42
57
|
spider_name = getattr(spider, 'name', 'UnknownSpider')
|
|
43
58
|
# 检查 settings 是否为 SettingManager 实例
|
|
44
59
|
if not hasattr(settings, 'update_attributes'):
|
|
45
|
-
|
|
60
|
+
_temp_debug(f"merge_settings 接收到的 settings 不是 SettingManager 实例: {type(settings)}")
|
|
46
61
|
# 如果是字典,创建一个新的 SettingManager 实例
|
|
47
62
|
if isinstance(settings, dict):
|
|
48
63
|
from crawlo.settings.setting_manager import SettingManager
|
|
@@ -50,14 +65,14 @@ def merge_settings(spider, settings):
|
|
|
50
65
|
new_settings.update_attributes(settings)
|
|
51
66
|
settings = new_settings
|
|
52
67
|
else:
|
|
53
|
-
|
|
68
|
+
_temp_debug("无法处理的 settings 类型")
|
|
54
69
|
return
|
|
55
|
-
|
|
70
|
+
|
|
56
71
|
if hasattr(spider, 'custom_settings'):
|
|
57
72
|
custom_settings = getattr(spider, 'custom_settings')
|
|
58
73
|
settings.update_attributes(custom_settings)
|
|
59
74
|
else:
|
|
60
|
-
|
|
75
|
+
_temp_debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")
|
|
61
76
|
|
|
62
77
|
|
|
63
78
|
async def common_call(func: Callable, *args, **kwargs):
|
|
@@ -85,7 +100,7 @@ def _get_settings_module_from_cfg(cfg_path: str) -> str:
|
|
|
85
100
|
config.read(cfg_path, encoding="utf-8")
|
|
86
101
|
if config.has_section("settings") and config.has_option("settings", "default"):
|
|
87
102
|
module_path = config.get("settings", "default")
|
|
88
|
-
|
|
103
|
+
_temp_debug(f"📄 从 crawlo.cfg 加载 settings 模块: {module_path}")
|
|
89
104
|
return module_path
|
|
90
105
|
else:
|
|
91
106
|
raise RuntimeError(f"配置文件缺少 [settings] 或 default 选项: {cfg_path}")
|
|
@@ -101,41 +116,41 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
101
116
|
2. 存在 '__init__.py' 和 'settings.py'(即 Python 包)
|
|
102
117
|
"""
|
|
103
118
|
path = os.path.abspath(start_path)
|
|
104
|
-
|
|
119
|
+
|
|
105
120
|
# 首先检查当前目录及其子目录
|
|
106
121
|
for root, dirs, files in os.walk(path):
|
|
107
122
|
if "crawlo.cfg" in files:
|
|
108
123
|
cfg_path = os.path.join(root, "crawlo.cfg")
|
|
109
|
-
|
|
124
|
+
_temp_debug(f"✅ 找到项目配置文件: {cfg_path}")
|
|
110
125
|
return root
|
|
111
|
-
|
|
126
|
+
|
|
112
127
|
# 向上查找直到找到 crawlo.cfg 或包含 settings.py 和 __init__.py 的目录
|
|
113
128
|
original_path = path
|
|
114
129
|
checked_paths = set()
|
|
115
|
-
|
|
130
|
+
|
|
116
131
|
while True:
|
|
117
132
|
# 避免无限循环
|
|
118
133
|
if path in checked_paths:
|
|
119
134
|
break
|
|
120
135
|
checked_paths.add(path)
|
|
121
|
-
|
|
136
|
+
|
|
122
137
|
# 检查 crawlo.cfg
|
|
123
138
|
cfg_file = os.path.join(path, "crawlo.cfg")
|
|
124
139
|
if os.path.isfile(cfg_file):
|
|
125
|
-
|
|
140
|
+
_temp_debug(f"✅ 找到项目配置文件: {cfg_file}")
|
|
126
141
|
return path
|
|
127
142
|
|
|
128
143
|
# 检查 settings.py 和 __init__.py
|
|
129
144
|
settings_file = os.path.join(path, "settings.py")
|
|
130
145
|
init_file = os.path.join(path, "__init__.py")
|
|
131
146
|
if os.path.isfile(settings_file) and os.path.isfile(init_file):
|
|
132
|
-
|
|
147
|
+
_temp_debug(f"✅ 找到项目模块: {path}")
|
|
133
148
|
# 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
|
|
134
149
|
parent = os.path.dirname(path)
|
|
135
150
|
if parent != path:
|
|
136
151
|
parent_cfg = os.path.join(parent, "crawlo.cfg")
|
|
137
152
|
if os.path.isfile(parent_cfg):
|
|
138
|
-
|
|
153
|
+
_temp_debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
|
|
139
154
|
return parent
|
|
140
155
|
return path
|
|
141
156
|
|
|
@@ -156,22 +171,22 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
156
171
|
if path in checked_paths:
|
|
157
172
|
break
|
|
158
173
|
checked_paths.add(path)
|
|
159
|
-
|
|
174
|
+
|
|
160
175
|
cfg_file = os.path.join(path, "crawlo.cfg")
|
|
161
176
|
if os.path.isfile(cfg_file):
|
|
162
|
-
|
|
177
|
+
_temp_debug(f"✅ 找到项目配置文件: {cfg_file}")
|
|
163
178
|
return path
|
|
164
179
|
|
|
165
180
|
settings_file = os.path.join(path, "settings.py")
|
|
166
181
|
init_file = os.path.join(path, "__init__.py")
|
|
167
182
|
if os.path.isfile(settings_file) and os.path.isfile(init_file):
|
|
168
|
-
|
|
183
|
+
_temp_debug(f"✅ 找到项目模块: {path}")
|
|
169
184
|
# 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
|
|
170
185
|
parent = os.path.dirname(path)
|
|
171
186
|
if parent != path:
|
|
172
187
|
parent_cfg = os.path.join(parent, "crawlo.cfg")
|
|
173
188
|
if os.path.isfile(parent_cfg):
|
|
174
|
-
|
|
189
|
+
_temp_debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
|
|
175
190
|
return parent
|
|
176
191
|
return path
|
|
177
192
|
|
|
@@ -193,22 +208,22 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
193
208
|
if path in checked_paths:
|
|
194
209
|
break
|
|
195
210
|
checked_paths.add(path)
|
|
196
|
-
|
|
211
|
+
|
|
197
212
|
cfg_file = os.path.join(path, "crawlo.cfg")
|
|
198
213
|
if os.path.isfile(cfg_file):
|
|
199
|
-
|
|
214
|
+
_temp_debug(f"找到项目配置文件: {cfg_file}")
|
|
200
215
|
return path
|
|
201
216
|
|
|
202
217
|
settings_file = os.path.join(path, "settings.py")
|
|
203
218
|
init_file = os.path.join(path, "__init__.py")
|
|
204
219
|
if os.path.isfile(settings_file) and os.path.isfile(init_file):
|
|
205
|
-
|
|
220
|
+
_temp_debug(f"找到项目模块: {path}")
|
|
206
221
|
# 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
|
|
207
222
|
parent = os.path.dirname(path)
|
|
208
223
|
if parent != path:
|
|
209
224
|
parent_cfg = os.path.join(parent, "crawlo.cfg")
|
|
210
225
|
if os.path.isfile(parent_cfg):
|
|
211
|
-
|
|
226
|
+
_temp_debug(f"在上层目录找到项目配置文件: {parent_cfg}")
|
|
212
227
|
return parent
|
|
213
228
|
return path
|
|
214
229
|
|
|
@@ -219,13 +234,14 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
219
234
|
except Exception:
|
|
220
235
|
pass
|
|
221
236
|
|
|
222
|
-
|
|
237
|
+
_temp_debug("未找到 Crawlo 项目根目录。请确保在包含 'crawlo.cfg' 或 'settings.py' 的目录运行。")
|
|
223
238
|
return None
|
|
224
239
|
|
|
225
240
|
|
|
226
|
-
def
|
|
241
|
+
def _load_project_settings(custom_settings: Optional[dict] = None) -> SettingManager:
|
|
227
242
|
"""
|
|
228
|
-
|
|
243
|
+
内部函数:加载项目配置(不处理日志初始化)
|
|
244
|
+
这个函数专门负责配置加载逻辑,避免与初始化管理器产生循环依赖
|
|
229
245
|
|
|
230
246
|
Args:
|
|
231
247
|
custom_settings: 运行时自定义配置,会覆盖 settings.py
|
|
@@ -233,7 +249,7 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
|
|
|
233
249
|
Returns:
|
|
234
250
|
SettingManager: 已加载配置的实例
|
|
235
251
|
"""
|
|
236
|
-
|
|
252
|
+
_temp_debug("🚀 正在加载 Crawlo 项目配置...")
|
|
237
253
|
|
|
238
254
|
# 1. 查找项目根
|
|
239
255
|
project_root = _find_project_root()
|
|
@@ -250,21 +266,21 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
|
|
|
250
266
|
# 推断:项目目录名.settings
|
|
251
267
|
project_name = os.path.basename(project_root)
|
|
252
268
|
settings_module_path = f"{project_name}.settings"
|
|
253
|
-
|
|
269
|
+
_temp_debug(f"⚠️ 未找到 crawlo.cfg,推断 settings 模块为: {settings_module_path}")
|
|
254
270
|
|
|
255
271
|
# 3. 注入 sys.path
|
|
256
272
|
project_root_str = os.path.abspath(project_root)
|
|
257
273
|
if project_root_str not in sys.path:
|
|
258
274
|
sys.path.insert(0, project_root_str)
|
|
259
|
-
|
|
275
|
+
_temp_debug(f"📁 项目根目录已加入 sys.path: {project_root_str}")
|
|
260
276
|
|
|
261
277
|
# 4. 加载 SettingManager
|
|
262
|
-
|
|
278
|
+
_temp_debug(f"⚙️ 正在加载配置模块: {settings_module_path}")
|
|
263
279
|
settings = SettingManager()
|
|
264
280
|
|
|
265
281
|
try:
|
|
266
282
|
settings.set_settings(settings_module_path)
|
|
267
|
-
|
|
283
|
+
_temp_debug("✅ settings 模块加载成功")
|
|
268
284
|
except Exception as e:
|
|
269
285
|
raise ImportError(f"加载 settings 模块失败 '{settings_module_path}': {e}")
|
|
270
286
|
|
|
@@ -279,19 +295,33 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
|
|
|
279
295
|
# 只有当用户没有设置该配置项时才应用模式配置
|
|
280
296
|
if key not in settings.attributes:
|
|
281
297
|
settings.set(key, value)
|
|
282
|
-
|
|
298
|
+
_temp_debug(f"🔧 已应用 {run_mode} 模式配置")
|
|
283
299
|
|
|
284
300
|
# 6. 合并运行时配置
|
|
285
301
|
if custom_settings:
|
|
286
302
|
settings.update_attributes(custom_settings)
|
|
287
|
-
|
|
303
|
+
_temp_debug(f"🔧 已应用运行时自定义配置: {list(custom_settings.keys())}")
|
|
304
|
+
|
|
305
|
+
_temp_debug("🎉 Crawlo 项目配置加载完成!")
|
|
306
|
+
return settings
|
|
288
307
|
|
|
289
|
-
# 7. 显示核心配置摘要(INFO级别)
|
|
290
|
-
# _log_settings_summary(settings)
|
|
291
308
|
|
|
292
|
-
|
|
293
|
-
|
|
309
|
+
def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
|
|
310
|
+
"""
|
|
311
|
+
获取配置管理器实例(主入口函数)
|
|
294
312
|
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
313
|
+
注意:这个函数现在作为向后兼容的入口,实际的初始化逻辑已经移到
|
|
314
|
+
crawlo.core.framework_initializer 模块中。建议使用新的初始化方式:
|
|
315
|
+
|
|
316
|
+
>>> from crawlo.core.framework_initializer import initialize_framework
|
|
317
|
+
>>> settings = initialize_framework(custom_settings)
|
|
318
|
+
|
|
319
|
+
Args:
|
|
320
|
+
custom_settings: 运行时自定义配置,会覆盖 settings.py
|
|
321
|
+
|
|
322
|
+
Returns:
|
|
323
|
+
SettingManager: 已加载配置的实例
|
|
324
|
+
"""
|
|
325
|
+
# 使用新的统一初始化管理器
|
|
326
|
+
from crawlo.core.framework_initializer import initialize_framework
|
|
327
|
+
return initialize_framework(custom_settings)
|
crawlo/queue/pqueue.py
CHANGED
|
@@ -3,7 +3,7 @@ import json
|
|
|
3
3
|
import sys
|
|
4
4
|
import asyncio
|
|
5
5
|
from asyncio import PriorityQueue
|
|
6
|
-
from typing import Optional
|
|
6
|
+
from typing import Optional, Tuple, Any
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
from crawlo import Request
|
|
@@ -16,12 +16,12 @@ class SpiderPriorityQueue(PriorityQueue):
|
|
|
16
16
|
"""初始化队列,maxsize为0表示无大小限制"""
|
|
17
17
|
super().__init__(maxsize)
|
|
18
18
|
|
|
19
|
-
async def get(self, timeout: float = 0.
|
|
19
|
+
async def get(self, timeout: float = 0.01) -> Optional[Any]:
|
|
20
20
|
"""
|
|
21
21
|
异步获取队列元素,带超时功能
|
|
22
22
|
|
|
23
23
|
Args:
|
|
24
|
-
timeout: 超时时间(秒),默认0.
|
|
24
|
+
timeout: 超时时间(秒),默认0.01秒
|
|
25
25
|
|
|
26
26
|
Returns:
|
|
27
27
|
队列元素(优先级, 值)或None(超时)
|
|
@@ -30,8 +30,14 @@ class SpiderPriorityQueue(PriorityQueue):
|
|
|
30
30
|
# 根据Python版本选择超时实现方式
|
|
31
31
|
if sys.version_info >= (3, 11):
|
|
32
32
|
async with asyncio.timeout(timeout):
|
|
33
|
-
|
|
33
|
+
item = await super().get()
|
|
34
|
+
return item
|
|
34
35
|
else:
|
|
35
|
-
|
|
36
|
+
item = await asyncio.wait_for(super().get(), timeout=timeout)
|
|
37
|
+
return item
|
|
36
38
|
except asyncio.TimeoutError:
|
|
37
39
|
return None
|
|
40
|
+
|
|
41
|
+
def qsize(self) -> int:
|
|
42
|
+
"""获取队列大小"""
|
|
43
|
+
return super().qsize()
|