crawlo 1.2.9__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__version__.py +1 -1
- crawlo/commands/run.py +26 -35
- crawlo/commands/utils.py +12 -2
- crawlo/core/engine.py +1 -2
- crawlo/crawler.py +135 -69
- crawlo/extension/logging_extension.py +4 -2
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/middleware/offsite.py +2 -1
- crawlo/mode_manager.py +37 -100
- crawlo/pipelines/mysql_pipeline.py +5 -4
- crawlo/pipelines/pipeline_manager.py +15 -2
- crawlo/project.py +44 -37
- crawlo/settings/default_settings.py +13 -4
- crawlo/settings/setting_manager.py +55 -20
- crawlo/utils/log.py +21 -62
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/METADATA +13 -4
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/RECORD +20 -20
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/WHEEL +0 -0
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/top_level.txt +0 -0
crawlo/middleware/offsite.py
CHANGED
|
@@ -45,7 +45,8 @@ class OffsiteMiddleware:
|
|
|
45
45
|
# 编译域名正则表达式以提高性能
|
|
46
46
|
o._compile_domains()
|
|
47
47
|
|
|
48
|
-
crawler.logger
|
|
48
|
+
# 使用中间件自己的logger而不是crawler.logger
|
|
49
|
+
o.logger.info(f"OffsiteMiddleware已启用,允许的域名: {allowed_domains}")
|
|
49
50
|
return o
|
|
50
51
|
|
|
51
52
|
def _compile_domains(self):
|
crawlo/mode_manager.py
CHANGED
|
@@ -14,102 +14,56 @@ import os
|
|
|
14
14
|
from enum import Enum
|
|
15
15
|
from typing import Dict, Any, Optional
|
|
16
16
|
|
|
17
|
-
from crawlo.utils.log import get_logger
|
|
18
|
-
|
|
19
17
|
|
|
20
18
|
class RunMode(Enum):
|
|
21
19
|
"""运行模式枚举"""
|
|
22
|
-
STANDALONE = "standalone"
|
|
20
|
+
STANDALONE = "standalone" # 单机模式
|
|
23
21
|
DISTRIBUTED = "distributed" # 分布式模式
|
|
24
|
-
AUTO = "auto"
|
|
22
|
+
AUTO = "auto" # 自动检测模式
|
|
25
23
|
|
|
26
24
|
|
|
27
25
|
class ModeManager:
|
|
28
26
|
"""运行模式管理器"""
|
|
29
|
-
|
|
27
|
+
|
|
30
28
|
def __init__(self):
|
|
31
|
-
|
|
32
|
-
|
|
29
|
+
pass
|
|
30
|
+
|
|
33
31
|
@staticmethod
|
|
34
32
|
def get_standalone_settings() -> Dict[str, Any]:
|
|
35
33
|
"""获取单机模式配置"""
|
|
36
34
|
return {
|
|
37
35
|
'QUEUE_TYPE': 'memory',
|
|
38
36
|
'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
|
|
37
|
+
'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
|
|
39
38
|
'CONCURRENCY': 8,
|
|
40
39
|
'MAX_RUNNING_SPIDERS': 1,
|
|
41
40
|
'DOWNLOAD_DELAY': 1.0,
|
|
42
|
-
'LOG_LEVEL': 'INFO',
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
@staticmethod
|
|
46
|
-
def get_distributed_settings(
|
|
47
|
-
redis_host: str = '127.0.0.1',
|
|
48
|
-
redis_port: int = 6379,
|
|
49
|
-
redis_password: Optional[str] = None,
|
|
50
|
-
redis_db: int = 0, # 添加 redis_db 参数
|
|
51
|
-
project_name: str = 'crawlo'
|
|
52
|
-
) -> Dict[str, Any]:
|
|
53
|
-
"""获取分布式模式配置"""
|
|
54
|
-
# 构建 Redis URL,使用传入的 redis_db 参数
|
|
55
|
-
if redis_password:
|
|
56
|
-
redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
|
|
57
|
-
else:
|
|
58
|
-
redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
|
|
59
|
-
|
|
60
|
-
return {
|
|
61
|
-
'PROJECT_NAME': project_name, # 添加项目名称到配置中
|
|
62
|
-
'QUEUE_TYPE': 'redis',
|
|
63
|
-
'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
|
|
64
|
-
'REDIS_HOST': redis_host,
|
|
65
|
-
'REDIS_PORT': redis_port,
|
|
66
|
-
'REDIS_PASSWORD': redis_password,
|
|
67
|
-
'REDIS_DB': redis_db, # 添加 Redis 数据库编号到配置中
|
|
68
|
-
'REDIS_URL': redis_url,
|
|
69
|
-
'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests', # 使用统一命名规范
|
|
70
|
-
# Redis key配置已移至各组件中,使用统一的命名规范
|
|
71
|
-
# crawlo:{project_name}:filter:fingerprint (请求去重)
|
|
72
|
-
'CONCURRENCY': 16,
|
|
73
|
-
'MAX_RUNNING_SPIDERS': 1,
|
|
74
|
-
'DOWNLOAD_DELAY': 1.0,
|
|
75
|
-
'LOG_LEVEL': 'INFO',
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
@staticmethod
|
|
79
|
-
def get_auto_settings() -> Dict[str, Any]:
|
|
80
|
-
"""获取自动检测模式配置"""
|
|
81
|
-
return {
|
|
82
|
-
'QUEUE_TYPE': 'auto',
|
|
83
|
-
'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter', # 默认内存过滤器
|
|
84
|
-
'CONCURRENCY': 12,
|
|
85
|
-
'MAX_RUNNING_SPIDERS': 1,
|
|
86
|
-
'DOWNLOAD_DELAY': 1.0,
|
|
87
|
-
'LOG_LEVEL': 'INFO',
|
|
88
41
|
}
|
|
89
|
-
|
|
42
|
+
|
|
90
43
|
def resolve_mode_settings(
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
44
|
+
self,
|
|
45
|
+
mode: str = 'standalone',
|
|
46
|
+
**kwargs
|
|
94
47
|
) -> Dict[str, Any]:
|
|
95
48
|
"""
|
|
96
49
|
解析运行模式并返回对应配置
|
|
97
|
-
|
|
50
|
+
|
|
98
51
|
Args:
|
|
99
52
|
mode: 运行模式 ('standalone', 'distributed', 'auto')
|
|
100
53
|
**kwargs: 额外配置参数
|
|
101
|
-
|
|
54
|
+
|
|
102
55
|
Returns:
|
|
103
56
|
Dict[str, Any]: 配置字典
|
|
104
57
|
"""
|
|
105
58
|
mode = RunMode(mode.lower())
|
|
106
|
-
|
|
59
|
+
mode_info = None
|
|
60
|
+
|
|
107
61
|
if mode == RunMode.STANDALONE:
|
|
108
|
-
|
|
62
|
+
mode_info = "使用单机模式 - 简单快速,适合开发和中小规模爬取"
|
|
109
63
|
settings = self.get_standalone_settings()
|
|
110
|
-
|
|
64
|
+
|
|
111
65
|
elif mode == RunMode.DISTRIBUTED:
|
|
112
|
-
|
|
66
|
+
mode_info = "使用分布式模式 - 支持多节点扩展,适合大规模爬取"
|
|
113
67
|
settings = self.get_distributed_settings(
|
|
114
68
|
redis_host=kwargs.get('redis_host', '127.0.0.1'),
|
|
115
69
|
redis_port=kwargs.get('redis_port', 6379),
|
|
@@ -117,25 +71,28 @@ class ModeManager:
|
|
|
117
71
|
redis_db=kwargs.get('redis_db', 0), # 添加 redis_db 参数
|
|
118
72
|
project_name=kwargs.get('project_name', 'crawlo')
|
|
119
73
|
)
|
|
120
|
-
|
|
74
|
+
|
|
121
75
|
elif mode == RunMode.AUTO:
|
|
122
|
-
|
|
76
|
+
mode_info = "使用自动检测模式 - 智能选择最佳运行方式"
|
|
123
77
|
settings = self.get_auto_settings()
|
|
124
|
-
|
|
78
|
+
|
|
125
79
|
else:
|
|
126
80
|
raise ValueError(f"不支持的运行模式: {mode}")
|
|
127
|
-
|
|
81
|
+
|
|
128
82
|
# 合并用户自定义配置
|
|
129
|
-
user_settings = {k: v for k, v in kwargs.items()
|
|
130
|
-
|
|
83
|
+
user_settings = {k: v for k, v in kwargs.items()
|
|
84
|
+
if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
|
|
131
85
|
settings.update(user_settings)
|
|
132
|
-
|
|
86
|
+
|
|
87
|
+
# 将模式信息添加到配置中,供后续使用
|
|
88
|
+
settings['_mode_info'] = mode_info
|
|
89
|
+
|
|
133
90
|
return settings
|
|
134
|
-
|
|
91
|
+
|
|
135
92
|
def from_environment(self) -> Dict[str, Any]:
|
|
136
93
|
"""从环境变量构建配置"""
|
|
137
94
|
config = {}
|
|
138
|
-
|
|
95
|
+
|
|
139
96
|
# 扫描 CRAWLO_ 前缀的环境变量
|
|
140
97
|
for key, value in os.environ.items():
|
|
141
98
|
if key.startswith('CRAWLO_'):
|
|
@@ -150,7 +107,7 @@ class ModeManager:
|
|
|
150
107
|
config[config_key] = float(value)
|
|
151
108
|
except ValueError:
|
|
152
109
|
config[config_key] = value
|
|
153
|
-
|
|
110
|
+
|
|
154
111
|
return config
|
|
155
112
|
|
|
156
113
|
|
|
@@ -161,12 +118,12 @@ def standalone_mode(**kwargs) -> Dict[str, Any]:
|
|
|
161
118
|
|
|
162
119
|
|
|
163
120
|
def distributed_mode(
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
121
|
+
redis_host: str = '127.0.0.1',
|
|
122
|
+
redis_port: int = 6379,
|
|
123
|
+
redis_password: Optional[str] = None,
|
|
124
|
+
redis_db: int = 0, # 添加 redis_db 参数
|
|
125
|
+
project_name: str = 'crawlo',
|
|
126
|
+
**kwargs
|
|
170
127
|
) -> Dict[str, Any]:
|
|
171
128
|
"""快速创建分布式模式配置"""
|
|
172
129
|
return ModeManager().resolve_mode_settings(
|
|
@@ -189,24 +146,4 @@ def auto_mode(**kwargs) -> Dict[str, Any]:
|
|
|
189
146
|
def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
|
|
190
147
|
"""从环境变量创建配置"""
|
|
191
148
|
# 移除直接使用 os.getenv(),要求通过 settings 配置
|
|
192
|
-
raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
|
|
193
|
-
|
|
194
|
-
# 保留原有代码作为参考
|
|
195
|
-
# mode = os.getenv('CRAWLO_MODE', default_mode).lower()
|
|
196
|
-
#
|
|
197
|
-
# if mode == 'distributed':
|
|
198
|
-
# return distributed_mode(
|
|
199
|
-
# redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
|
|
200
|
-
# redis_port=int(os.getenv('REDIS_PORT', 6379)),
|
|
201
|
-
# redis_password=os.getenv('REDIS_PASSWORD'),
|
|
202
|
-
# project_name=os.getenv('PROJECT_NAME', 'crawlo'),
|
|
203
|
-
# CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
|
|
204
|
-
# )
|
|
205
|
-
# elif mode == 'auto':
|
|
206
|
-
# return auto_mode(
|
|
207
|
-
# CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
|
|
208
|
-
# )
|
|
209
|
-
# else: # standalone
|
|
210
|
-
# return standalone_mode(
|
|
211
|
-
# CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
|
|
212
|
-
# )
|
|
149
|
+
raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
|
crawlo/pipelines/mysql_pipeline.py
CHANGED
|
@@ -7,6 +7,7 @@ from typing import Optional, List, Dict
|
|
|
7
7
|
from crawlo.exceptions import ItemDiscard
|
|
8
8
|
from crawlo.utils.db_helper import make_insert_sql, make_batch_sql
|
|
9
9
|
from crawlo.utils.log import get_logger
|
|
10
|
+
from . import BasePipeline
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
class AsyncmyMySQLPipeline:
|
|
@@ -200,7 +201,7 @@ class AiomysqlMySQLPipeline:
|
|
|
200
201
|
crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
|
|
201
202
|
|
|
202
203
|
@classmethod
|
|
203
|
-
def
|
|
204
|
+
def from_crawler(cls, crawler):
|
|
204
205
|
return cls(crawler)
|
|
205
206
|
|
|
206
207
|
async def _init_pool(self):
|
|
@@ -213,12 +214,12 @@ class AiomysqlMySQLPipeline:
|
|
|
213
214
|
try:
|
|
214
215
|
self.pool = await aiomysql.create_pool(
|
|
215
216
|
host=self.settings.get('MYSQL_HOST', 'localhost'),
|
|
216
|
-
port=self.settings.
|
|
217
|
+
port=self.settings.get_int('MYSQL_PORT', 3306),
|
|
217
218
|
user=self.settings.get('MYSQL_USER', 'root'),
|
|
218
219
|
password=self.settings.get('MYSQL_PASSWORD', ''),
|
|
219
220
|
db=self.settings.get('MYSQL_DB', 'scrapy_db'),
|
|
220
|
-
minsize=self.settings.
|
|
221
|
-
maxsize=self.settings.
|
|
221
|
+
minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
|
|
222
|
+
maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5),
|
|
222
223
|
cursorclass=aiomysql.DictCursor,
|
|
223
224
|
autocommit=False
|
|
224
225
|
)
|
crawlo/pipelines/pipeline_manager.py
CHANGED
|
@@ -4,7 +4,6 @@ from typing import List
|
|
|
4
4
|
from pprint import pformat
|
|
5
5
|
from asyncio import create_task
|
|
6
6
|
|
|
7
|
-
|
|
8
7
|
from crawlo.utils.log import get_logger
|
|
9
8
|
from crawlo.event import item_successful, item_discard
|
|
10
9
|
from crawlo.project import load_class, common_call
|
|
@@ -20,6 +19,20 @@ class PipelineManager:
|
|
|
20
19
|
|
|
21
20
|
self.logger = get_logger(self.__class__.__name__, self.crawler.settings.get('LOG_LEVEL'))
|
|
22
21
|
pipelines = self.crawler.settings.get_list('PIPELINES')
|
|
22
|
+
dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
|
|
23
|
+
|
|
24
|
+
# 添加调试信息
|
|
25
|
+
self.logger.debug(f"PIPELINES from settings: {pipelines}")
|
|
26
|
+
self.logger.debug(f"DEFAULT_DEDUP_PIPELINE from settings: {dedup_pipeline}")
|
|
27
|
+
|
|
28
|
+
# 确保DEFAULT_DEDUP_PIPELINE被添加到管道列表开头
|
|
29
|
+
if dedup_pipeline:
|
|
30
|
+
# 移除所有去重管道实例(如果存在)
|
|
31
|
+
pipelines = [item for item in pipelines if item != dedup_pipeline]
|
|
32
|
+
# 在开头插入去重管道
|
|
33
|
+
self.logger.debug(f"{dedup_pipeline} insert successful")
|
|
34
|
+
pipelines.insert(0, dedup_pipeline)
|
|
35
|
+
|
|
23
36
|
self._add_pipelines(pipelines)
|
|
24
37
|
self._add_methods()
|
|
25
38
|
|
|
@@ -34,7 +47,7 @@ class PipelineManager:
|
|
|
34
47
|
pipeline_cls = load_class(pipeline)
|
|
35
48
|
if not hasattr(pipeline_cls, 'from_crawler'):
|
|
36
49
|
raise PipelineInitError(
|
|
37
|
-
f"Pipeline init failed, must inherit from `BasePipeline` or have a `
|
|
50
|
+
f"Pipeline init failed, must inherit from `BasePipeline` or have a `from_crawler` method"
|
|
38
51
|
)
|
|
39
52
|
self.pipelines.append(pipeline_cls.from_crawler(self.crawler))
|
|
40
53
|
except Exception as e:
|
crawlo/project.py
CHANGED
|
@@ -7,18 +7,10 @@ from inspect import iscoroutinefunction
|
|
|
7
7
|
from typing import Callable, Optional, Any
|
|
8
8
|
|
|
9
9
|
from crawlo.settings.setting_manager import SettingManager
|
|
10
|
-
from crawlo.utils.log import get_logger
|
|
10
|
+
from crawlo.utils.log import get_logger, LoggerManager
|
|
11
11
|
|
|
12
|
-
#
|
|
13
|
-
logger =
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def _get_logger():
|
|
17
|
-
"""延迟获取logger实例,确保在配置加载后创建"""
|
|
18
|
-
global logger
|
|
19
|
-
if logger is None:
|
|
20
|
-
logger = get_logger(__name__)
|
|
21
|
-
return logger
|
|
12
|
+
# 使用全局logger,避免每个模块都创建自己的延迟初始化函数
|
|
13
|
+
logger = get_logger(__name__)
|
|
22
14
|
|
|
23
15
|
|
|
24
16
|
def load_class(path: str) -> Any:
|
|
@@ -50,7 +42,7 @@ def merge_settings(spider, settings):
|
|
|
50
42
|
spider_name = getattr(spider, 'name', 'UnknownSpider')
|
|
51
43
|
# 检查 settings 是否为 SettingManager 实例
|
|
52
44
|
if not hasattr(settings, 'update_attributes'):
|
|
53
|
-
|
|
45
|
+
logger.error(f"merge_settings 接收到的 settings 不是 SettingManager 实例: {type(settings)}")
|
|
54
46
|
# 如果是字典,创建一个新的 SettingManager 实例
|
|
55
47
|
if isinstance(settings, dict):
|
|
56
48
|
from crawlo.settings.setting_manager import SettingManager
|
|
@@ -58,14 +50,14 @@ def merge_settings(spider, settings):
|
|
|
58
50
|
new_settings.update_attributes(settings)
|
|
59
51
|
settings = new_settings
|
|
60
52
|
else:
|
|
61
|
-
|
|
53
|
+
logger.error("无法处理的 settings 类型")
|
|
62
54
|
return
|
|
63
55
|
|
|
64
56
|
if hasattr(spider, 'custom_settings'):
|
|
65
57
|
custom_settings = getattr(spider, 'custom_settings')
|
|
66
58
|
settings.update_attributes(custom_settings)
|
|
67
59
|
else:
|
|
68
|
-
|
|
60
|
+
logger.debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")
|
|
69
61
|
|
|
70
62
|
|
|
71
63
|
async def common_call(func: Callable, *args, **kwargs):
|
|
@@ -93,7 +85,7 @@ def _get_settings_module_from_cfg(cfg_path: str) -> str:
|
|
|
93
85
|
config.read(cfg_path, encoding="utf-8")
|
|
94
86
|
if config.has_section("settings") and config.has_option("settings", "default"):
|
|
95
87
|
module_path = config.get("settings", "default")
|
|
96
|
-
|
|
88
|
+
logger.debug(f"📄 从 crawlo.cfg 加载 settings 模块: {module_path}")
|
|
97
89
|
return module_path
|
|
98
90
|
else:
|
|
99
91
|
raise RuntimeError(f"配置文件缺少 [settings] 或 default 选项: {cfg_path}")
|
|
@@ -114,7 +106,7 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
114
106
|
for root, dirs, files in os.walk(path):
|
|
115
107
|
if "crawlo.cfg" in files:
|
|
116
108
|
cfg_path = os.path.join(root, "crawlo.cfg")
|
|
117
|
-
|
|
109
|
+
logger.debug(f"✅ 找到项目配置文件: {cfg_path}")
|
|
118
110
|
return root
|
|
119
111
|
|
|
120
112
|
# 向上查找直到找到 crawlo.cfg 或包含 settings.py 和 __init__.py 的目录
|
|
@@ -130,20 +122,20 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
130
122
|
# 检查 crawlo.cfg
|
|
131
123
|
cfg_file = os.path.join(path, "crawlo.cfg")
|
|
132
124
|
if os.path.isfile(cfg_file):
|
|
133
|
-
|
|
125
|
+
logger.debug(f"✅ 找到项目配置文件: {cfg_file}")
|
|
134
126
|
return path
|
|
135
127
|
|
|
136
128
|
# 检查 settings.py 和 __init__.py
|
|
137
129
|
settings_file = os.path.join(path, "settings.py")
|
|
138
130
|
init_file = os.path.join(path, "__init__.py")
|
|
139
131
|
if os.path.isfile(settings_file) and os.path.isfile(init_file):
|
|
140
|
-
|
|
132
|
+
logger.debug(f"✅ 找到项目模块: {path}")
|
|
141
133
|
# 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
|
|
142
134
|
parent = os.path.dirname(path)
|
|
143
135
|
if parent != path:
|
|
144
136
|
parent_cfg = os.path.join(parent, "crawlo.cfg")
|
|
145
137
|
if os.path.isfile(parent_cfg):
|
|
146
|
-
|
|
138
|
+
logger.debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
|
|
147
139
|
return parent
|
|
148
140
|
return path
|
|
149
141
|
|
|
@@ -167,19 +159,19 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
167
159
|
|
|
168
160
|
cfg_file = os.path.join(path, "crawlo.cfg")
|
|
169
161
|
if os.path.isfile(cfg_file):
|
|
170
|
-
|
|
162
|
+
logger.debug(f"✅ 找到项目配置文件: {cfg_file}")
|
|
171
163
|
return path
|
|
172
164
|
|
|
173
165
|
settings_file = os.path.join(path, "settings.py")
|
|
174
166
|
init_file = os.path.join(path, "__init__.py")
|
|
175
167
|
if os.path.isfile(settings_file) and os.path.isfile(init_file):
|
|
176
|
-
|
|
168
|
+
logger.debug(f"✅ 找到项目模块: {path}")
|
|
177
169
|
# 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
|
|
178
170
|
parent = os.path.dirname(path)
|
|
179
171
|
if parent != path:
|
|
180
172
|
parent_cfg = os.path.join(parent, "crawlo.cfg")
|
|
181
173
|
if os.path.isfile(parent_cfg):
|
|
182
|
-
|
|
174
|
+
logger.debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
|
|
183
175
|
return parent
|
|
184
176
|
return path
|
|
185
177
|
|
|
@@ -204,19 +196,19 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
204
196
|
|
|
205
197
|
cfg_file = os.path.join(path, "crawlo.cfg")
|
|
206
198
|
if os.path.isfile(cfg_file):
|
|
207
|
-
|
|
199
|
+
logger.debug(f"找到项目配置文件: {cfg_file}")
|
|
208
200
|
return path
|
|
209
201
|
|
|
210
202
|
settings_file = os.path.join(path, "settings.py")
|
|
211
203
|
init_file = os.path.join(path, "__init__.py")
|
|
212
204
|
if os.path.isfile(settings_file) and os.path.isfile(init_file):
|
|
213
|
-
|
|
205
|
+
logger.debug(f"找到项目模块: {path}")
|
|
214
206
|
# 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
|
|
215
207
|
parent = os.path.dirname(path)
|
|
216
208
|
if parent != path:
|
|
217
209
|
parent_cfg = os.path.join(parent, "crawlo.cfg")
|
|
218
210
|
if os.path.isfile(parent_cfg):
|
|
219
|
-
|
|
211
|
+
logger.debug(f"在上层目录找到项目配置文件: {parent_cfg}")
|
|
220
212
|
return parent
|
|
221
213
|
return path
|
|
222
214
|
|
|
@@ -227,7 +219,7 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
|
|
|
227
219
|
except Exception:
|
|
228
220
|
pass
|
|
229
221
|
|
|
230
|
-
|
|
222
|
+
logger.warning("未找到 Crawlo 项目根目录。请确保在包含 'crawlo.cfg' 或 'settings.py' 的目录运行。")
|
|
231
223
|
return None
|
|
232
224
|
|
|
233
225
|
|
|
@@ -241,8 +233,7 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
|
|
|
241
233
|
Returns:
|
|
242
234
|
SettingManager: 已加载配置的实例
|
|
243
235
|
"""
|
|
244
|
-
|
|
245
|
-
_get_logger().debug("🚀 正在初始化 Crawlo 项目配置...")
|
|
236
|
+
logger.debug("🚀 正在初始化 Crawlo 项目配置...")
|
|
246
237
|
|
|
247
238
|
# 1. 查找项目根
|
|
248
239
|
project_root = _find_project_root()
|
|
@@ -259,32 +250,48 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
|
|
|
259
250
|
# 推断:项目目录名.settings
|
|
260
251
|
project_name = os.path.basename(project_root)
|
|
261
252
|
settings_module_path = f"{project_name}.settings"
|
|
262
|
-
|
|
253
|
+
logger.warning(f"⚠️ 未找到 crawlo.cfg,推断 settings 模块为: {settings_module_path}")
|
|
263
254
|
|
|
264
255
|
# 3. 注入 sys.path
|
|
265
256
|
project_root_str = os.path.abspath(project_root)
|
|
266
257
|
if project_root_str not in sys.path:
|
|
267
258
|
sys.path.insert(0, project_root_str)
|
|
268
|
-
|
|
259
|
+
logger.debug(f"📁 项目根目录已加入 sys.path: {project_root_str}")
|
|
269
260
|
|
|
270
261
|
# 4. 加载 SettingManager
|
|
271
|
-
|
|
262
|
+
logger.debug(f"⚙️ 正在加载配置模块: {settings_module_path}")
|
|
272
263
|
settings = SettingManager()
|
|
273
264
|
|
|
274
265
|
try:
|
|
275
266
|
settings.set_settings(settings_module_path)
|
|
276
|
-
|
|
267
|
+
logger.debug("✅ settings 模块加载成功")
|
|
277
268
|
except Exception as e:
|
|
278
269
|
raise ImportError(f"加载 settings 模块失败 '{settings_module_path}': {e}")
|
|
279
270
|
|
|
280
|
-
# 5.
|
|
271
|
+
# 5. 根据 RUN_MODE 获取相应配置
|
|
272
|
+
run_mode = settings.get('RUN_MODE', 'standalone')
|
|
273
|
+
if run_mode:
|
|
274
|
+
from crawlo.mode_manager import ModeManager
|
|
275
|
+
mode_manager = ModeManager()
|
|
276
|
+
mode_settings = mode_manager.resolve_mode_settings(run_mode)
|
|
277
|
+
# 合并模式配置,但不覆盖用户已设置的配置
|
|
278
|
+
for key, value in mode_settings.items():
|
|
279
|
+
# 只有当用户没有设置该配置项时才应用模式配置
|
|
280
|
+
if key not in settings.attributes:
|
|
281
|
+
settings.set(key, value)
|
|
282
|
+
logger.debug(f"🔧 已应用 {run_mode} 模式配置")
|
|
283
|
+
|
|
284
|
+
# 6. 合并运行时配置
|
|
281
285
|
if custom_settings:
|
|
282
286
|
settings.update_attributes(custom_settings)
|
|
283
|
-
|
|
287
|
+
logger.debug(f"🔧 已应用运行时自定义配置: {list(custom_settings.keys())}")
|
|
284
288
|
|
|
285
|
-
#
|
|
289
|
+
# 7. 显示核心配置摘要(INFO级别)
|
|
286
290
|
# _log_settings_summary(settings)
|
|
287
291
|
|
|
292
|
+
# 配置日志系统
|
|
293
|
+
LoggerManager.configure(settings)
|
|
294
|
+
|
|
288
295
|
# 将项目初始化完成的消息改为DEBUG级别
|
|
289
|
-
|
|
290
|
-
return settings
|
|
296
|
+
logger.debug("🎉 Crawlo 项目配置初始化完成!")
|
|
297
|
+
return settings
|
crawlo/settings/default_settings.py
CHANGED
|
@@ -48,7 +48,18 @@ QUEUE_TYPE = 'auto'
|
|
|
48
48
|
# 默认使用内存过滤器和去重管道,确保在无Redis环境下也能正常运行
|
|
49
49
|
# 在auto模式下,如果Redis可用,框架会自动更新为Redis实现以提供更好的去重能力
|
|
50
50
|
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
|
|
51
|
-
FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
|
|
51
|
+
FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
MYSQL_HOST = '127.0.0.1'
|
|
55
|
+
MYSQL_PORT = 3306
|
|
56
|
+
MYSQL_USER = 'root'
|
|
57
|
+
MYSQL_PASSWORD = '123456'
|
|
58
|
+
MYSQL_DB = 'crawl_pro'
|
|
59
|
+
MYSQL_TABLE = 'crawlo'
|
|
60
|
+
MYSQL_BATCH_SIZE = 100
|
|
61
|
+
MYSQL_USE_BATCH = False # 是否启用批量插入
|
|
62
|
+
|
|
52
63
|
|
|
53
64
|
# --- Redis 过滤器配置 ---
|
|
54
65
|
# 使用环境变量配置工具获取 Redis 配置
|
|
@@ -85,7 +96,6 @@ MIDDLEWARES = [
|
|
|
85
96
|
'crawlo.middleware.request_ignore.RequestIgnoreMiddleware', # 1. 忽略无效请求
|
|
86
97
|
'crawlo.middleware.download_delay.DownloadDelayMiddleware', # 2. 控制请求频率
|
|
87
98
|
'crawlo.middleware.default_header.DefaultHeaderMiddleware', # 3. 添加默认请求头
|
|
88
|
-
# 'crawlo.middleware.proxy.ProxyMiddleware', # 4. 设置代理(默认不启用)
|
|
89
99
|
'crawlo.middleware.offsite.OffsiteMiddleware', # 5. 站外请求过滤
|
|
90
100
|
|
|
91
101
|
# === 响应处理阶段 ===
|
|
@@ -98,8 +108,7 @@ MIDDLEWARES = [
|
|
|
98
108
|
|
|
99
109
|
# 框架数据处理管道列表(框架默认管道 + 用户自定义管道)
|
|
100
110
|
PIPELINES = [
|
|
101
|
-
'crawlo.pipelines.console_pipeline.ConsolePipeline',
|
|
102
|
-
# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline', # MySQL 存储(可选)
|
|
111
|
+
'crawlo.pipelines.console_pipeline.ConsolePipeline',
|
|
103
112
|
]
|
|
104
113
|
|
|
105
114
|
# 明确添加默认去重管道到管道列表开头
|
crawlo/settings/setting_manager.py
CHANGED
|
@@ -15,38 +15,67 @@ class SettingManager(MutableMapping):
|
|
|
15
15
|
self.set_settings(default_settings)
|
|
16
16
|
# 在初始化时合并配置
|
|
17
17
|
self._merge_config(values)
|
|
18
|
-
|
|
18
|
+
|
|
19
19
|
def _merge_config(self, user_config):
|
|
20
20
|
"""合并默认配置和用户配置"""
|
|
21
21
|
if not user_config:
|
|
22
22
|
return
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
# 合并中间件配置
|
|
25
25
|
if 'MIDDLEWARES' in user_config:
|
|
26
26
|
default_middlewares = self.attributes.get('MIDDLEWARES', [])
|
|
27
27
|
user_middlewares = user_config['MIDDLEWARES']
|
|
28
|
-
|
|
29
|
-
|
|
28
|
+
# 如果用户配置了空列表,则仍然使用默认配置
|
|
29
|
+
if user_middlewares:
|
|
30
|
+
# 过滤掉空值和注释
|
|
31
|
+
user_middlewares = [middleware for middleware in user_middlewares if middleware and not middleware.strip().startswith('#')]
|
|
32
|
+
# 合并默认中间件和用户中间件,去重但保持顺序
|
|
33
|
+
merged_middlewares = default_middlewares[:]
|
|
34
|
+
for middleware in user_middlewares:
|
|
35
|
+
if middleware not in merged_middlewares:
|
|
36
|
+
merged_middlewares.append(middleware)
|
|
37
|
+
self.attributes['MIDDLEWARES'] = merged_middlewares
|
|
38
|
+
|
|
30
39
|
# 合并管道配置
|
|
31
40
|
if 'PIPELINES' in user_config:
|
|
32
41
|
default_pipelines = self.attributes.get('PIPELINES', [])
|
|
33
42
|
user_pipelines = user_config['PIPELINES']
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
#
|
|
39
|
-
merged_pipelines = [
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
43
|
+
# 如果用户配置了空列表,则仍然使用默认配置
|
|
44
|
+
if user_pipelines:
|
|
45
|
+
# 过滤掉空值和注释
|
|
46
|
+
user_pipelines = [pipeline for pipeline in user_pipelines if pipeline and not pipeline.strip().startswith('#')]
|
|
47
|
+
# 合并默认管道和用户管道,去重但保持顺序
|
|
48
|
+
merged_pipelines = default_pipelines[:]
|
|
49
|
+
for pipeline in user_pipelines:
|
|
50
|
+
if pipeline not in merged_pipelines:
|
|
51
|
+
merged_pipelines.append(pipeline)
|
|
52
|
+
self.attributes['PIPELINES'] = merged_pipelines
|
|
53
|
+
|
|
54
|
+
# 特殊处理PIPELINES,确保去重管道在最前面
|
|
55
|
+
dedup_pipeline = self.attributes.get('DEFAULT_DEDUP_PIPELINE')
|
|
56
|
+
if dedup_pipeline:
|
|
57
|
+
pipelines = self.attributes.get('PIPELINES', [])
|
|
58
|
+
# 移除所有去重管道实例(如果存在)
|
|
59
|
+
pipelines = [item for item in pipelines if item != dedup_pipeline]
|
|
60
|
+
# 在开头插入去重管道
|
|
61
|
+
pipelines.insert(0, dedup_pipeline)
|
|
62
|
+
self.attributes['PIPELINES'] = pipelines
|
|
63
|
+
|
|
44
64
|
# 合并扩展配置
|
|
45
65
|
if 'EXTENSIONS' in user_config:
|
|
46
66
|
default_extensions = self.attributes.get('EXTENSIONS', [])
|
|
47
67
|
user_extensions = user_config['EXTENSIONS']
|
|
48
|
-
|
|
49
|
-
|
|
68
|
+
# 如果用户配置了空列表,则仍然使用默认配置
|
|
69
|
+
if user_extensions:
|
|
70
|
+
# 过滤掉空值和注释
|
|
71
|
+
user_extensions = [extension for extension in user_extensions if extension and not extension.strip().startswith('#')]
|
|
72
|
+
# 合并默认扩展和用户扩展,去重但保持顺序
|
|
73
|
+
merged_extensions = default_extensions[:]
|
|
74
|
+
for extension in user_extensions:
|
|
75
|
+
if extension not in merged_extensions:
|
|
76
|
+
merged_extensions.append(extension)
|
|
77
|
+
self.attributes['EXTENSIONS'] = merged_extensions
|
|
78
|
+
|
|
50
79
|
# 更新其他用户配置
|
|
51
80
|
for key, value in user_config.items():
|
|
52
81
|
if key not in ['MIDDLEWARES', 'PIPELINES', 'EXTENSIONS']:
|
|
@@ -107,9 +136,15 @@ class SettingManager(MutableMapping):
|
|
|
107
136
|
def set_settings(self, module):
|
|
108
137
|
if isinstance(module, str):
|
|
109
138
|
module = import_module(module)
|
|
139
|
+
|
|
140
|
+
# 收集模块中的所有配置项
|
|
141
|
+
module_settings = {}
|
|
110
142
|
for key in dir(module):
|
|
111
143
|
if key.isupper():
|
|
112
|
-
|
|
144
|
+
module_settings[key] = getattr(module, key)
|
|
145
|
+
|
|
146
|
+
# 使用合并逻辑而不是直接设置
|
|
147
|
+
self._merge_config(module_settings)
|
|
113
148
|
|
|
114
149
|
# 实现 MutableMapping 必须的方法
|
|
115
150
|
def __getitem__(self, item):
|
|
@@ -147,7 +182,7 @@ class SettingManager(MutableMapping):
|
|
|
147
182
|
# 创建一个新的实例
|
|
148
183
|
cls = self.__class__
|
|
149
184
|
new_instance = cls.__new__(cls)
|
|
150
|
-
|
|
185
|
+
|
|
151
186
|
# 复制attributes字典,但排除不可pickle的对象
|
|
152
187
|
new_attributes = {}
|
|
153
188
|
for key, value in self.attributes.items():
|
|
@@ -157,8 +192,8 @@ class SettingManager(MutableMapping):
|
|
|
157
192
|
except Exception:
|
|
158
193
|
# 如果复制失败,保留原始引用(对于logger等对象)
|
|
159
194
|
new_attributes[key] = value
|
|
160
|
-
|
|
195
|
+
|
|
161
196
|
# 设置新实例的attributes
|
|
162
197
|
new_instance.attributes = new_attributes
|
|
163
|
-
|
|
198
|
+
|
|
164
199
|
return new_instance
|