crawlo 1.2.9-py3-none-any.whl → 1.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

crawlo/middleware/offsite.py CHANGED
@@ -45,7 +45,8 @@ class OffsiteMiddleware:
         # Compile the domain regexes to improve performance
         o._compile_domains()
 
-        crawler.logger.info(f"OffsiteMiddleware enabled, allowed domains: {allowed_domains}")
+        # Use the middleware's own logger instead of crawler.logger
+        o.logger.info(f"OffsiteMiddleware enabled, allowed domains: {allowed_domains}")
         return o
 
     def _compile_domains(self):
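
The fix swaps the borrowed crawler.logger for a logger the middleware owns. A minimal standalone sketch of that pattern, using the stdlib logging module in place of crawlo's get_logger; the from_crawler wiring and the _compile_domains body are illustrative assumptions, not the package's actual code:

import logging
import re


class OffsiteMiddlewareSketch:
    def __init__(self, allowed_domains):
        self.allowed_domains = allowed_domains
        # The middleware owns its logger, independent of the crawler object
        self.logger = logging.getLogger(self.__class__.__name__)

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(getattr(crawler, 'allowed_domains', []))  # attribute name assumed
        o._compile_domains()
        o.logger.info("OffsiteMiddleware enabled, allowed domains: %s", o.allowed_domains)
        return o

    def _compile_domains(self):
        # Hypothetical body: one anchored pattern per allowed domain
        self._patterns = [re.compile(rf"(^|\.){re.escape(d)}$") for d in self.allowed_domains]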
crawlo/mode_manager.py CHANGED
@@ -14,102 +14,56 @@ import os
 from enum import Enum
 from typing import Dict, Any, Optional
 
-from crawlo.utils.log import get_logger
-
 
 class RunMode(Enum):
     """Run mode enumeration."""
-    STANDALONE = "standalone" # standalone mode
+    STANDALONE = "standalone"  # standalone mode
     DISTRIBUTED = "distributed"  # distributed mode
-    AUTO = "auto" # auto-detect mode
+    AUTO = "auto"  # auto-detect mode
 
 
 class ModeManager:
     """Run mode manager."""
-
+
     def __init__(self):
-        self.logger = get_logger(self.__class__.__name__)
-
+        pass
+
     @staticmethod
     def get_standalone_settings() -> Dict[str, Any]:
         """Get the standalone-mode settings."""
         return {
             'QUEUE_TYPE': 'memory',
             'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
+            'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
             'CONCURRENCY': 8,
             'MAX_RUNNING_SPIDERS': 1,
             'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
-        }
-
-    @staticmethod
-    def get_distributed_settings(
-            redis_host: str = '127.0.0.1',
-            redis_port: int = 6379,
-            redis_password: Optional[str] = None,
-            redis_db: int = 0,  # redis_db parameter
-            project_name: str = 'crawlo'
-    ) -> Dict[str, Any]:
-        """Get the distributed-mode settings."""
-        # Build the Redis URL from the redis_db argument
-        if redis_password:
-            redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
-        else:
-            redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
-
-        return {
-            'PROJECT_NAME': project_name,  # include the project name in the settings
-            'QUEUE_TYPE': 'redis',
-            'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
-            'REDIS_HOST': redis_host,
-            'REDIS_PORT': redis_port,
-            'REDIS_PASSWORD': redis_password,
-            'REDIS_DB': redis_db,  # include the Redis database number in the settings
-            'REDIS_URL': redis_url,
-            'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',  # unified naming convention
-            # Redis key settings have moved into the individual components; unified naming convention:
-            # crawlo:{project_name}:filter:fingerprint (request dedup)
-            'CONCURRENCY': 16,
-            'MAX_RUNNING_SPIDERS': 1,
-            'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
-        }
-
-    @staticmethod
-    def get_auto_settings() -> Dict[str, Any]:
-        """Get the auto-detect-mode settings."""
-        return {
-            'QUEUE_TYPE': 'auto',
-            'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',  # default in-memory filter
-            'CONCURRENCY': 12,
-            'MAX_RUNNING_SPIDERS': 1,
-            'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
         }
-
+
     def resolve_mode_settings(
-            self,
-            mode: str = 'standalone',
-            **kwargs
+        self,
+        mode: str = 'standalone',
+        **kwargs
     ) -> Dict[str, Any]:
         """
         Resolve the run mode and return the matching settings.
-
+
         Args:
             mode: run mode ('standalone', 'distributed', 'auto')
             **kwargs: extra configuration parameters
-
+
         Returns:
             Dict[str, Any]: settings dict
         """
         mode = RunMode(mode.lower())
-
+        mode_info = None
+
         if mode == RunMode.STANDALONE:
-            self.logger.info("Using standalone mode - simple and fast, suited to development and small-to-medium crawls")
+            mode_info = "Using standalone mode - simple and fast, suited to development and small-to-medium crawls"
             settings = self.get_standalone_settings()
-
+
         elif mode == RunMode.DISTRIBUTED:
-            self.logger.info("Using distributed mode - multi-node scaling, suited to large crawls")
+            mode_info = "Using distributed mode - multi-node scaling, suited to large crawls"
             settings = self.get_distributed_settings(
                 redis_host=kwargs.get('redis_host', '127.0.0.1'),
                 redis_port=kwargs.get('redis_port', 6379),
@@ -117,25 +71,28 @@ class ModeManager:
                 redis_db=kwargs.get('redis_db', 0),  # redis_db parameter
                 project_name=kwargs.get('project_name', 'crawlo')
             )
-
+
         elif mode == RunMode.AUTO:
-            self.logger.info("Using auto-detect mode - intelligently picks the best way to run")
+            mode_info = "Using auto-detect mode - intelligently picks the best way to run"
             settings = self.get_auto_settings()
-
+
         else:
             raise ValueError(f"Unsupported run mode: {mode}")
-
+
         # Merge user-defined settings
-        user_settings = {k: v for k, v in kwargs.items()
-                        if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
+        user_settings = {k: v for k, v in kwargs.items()
+                         if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
         settings.update(user_settings)
-
+
+        # Add the mode description to the settings for later use
+        settings['_mode_info'] = mode_info
+
         return settings
-
+
     def from_environment(self) -> Dict[str, Any]:
         """Build configuration from environment variables."""
         config = {}
-
+
         # Scan environment variables with the CRAWLO_ prefix
         for key, value in os.environ.items():
             if key.startswith('CRAWLO_'):
@@ -150,7 +107,7 @@ class ModeManager:
                     config[config_key] = float(value)
                 except ValueError:
                     config[config_key] = value
-
+
         return config
 
 
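The tail of from_environment above shows only the float-to-string fallback. A sketch of the full scan loop it implies; the prefix stripping and the leading integer attempt are assumptions, since they fall outside the visible hunk:

import os

config = {}
for key, value in os.environ.items():
    if key.startswith('CRAWLO_'):
        config_key = key[len('CRAWLO_'):]  # assumed: strip the prefix to form the settings key
        try:
            config[config_key] = int(value)  # assumed first attempt
        except ValueError:
            try:
                config[config_key] = float(value)  # visible in the hunk above
            except ValueError:
                config[config_key] = value  # keep the raw string

# e.g. CRAWLO_CONCURRENCY=16 would yield config == {'CONCURRENCY': 16}
print(config)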
@@ -161,12 +118,12 @@ def standalone_mode(**kwargs) -> Dict[str, Any]:
 
 
 def distributed_mode(
-        redis_host: str = '127.0.0.1',
-        redis_port: int = 6379,
-        redis_password: Optional[str] = None,
-        redis_db: int = 0,  # redis_db parameter
-        project_name: str = 'crawlo',
-        **kwargs
+    redis_host: str = '127.0.0.1',
+    redis_port: int = 6379,
+    redis_password: Optional[str] = None,
+    redis_db: int = 0,  # redis_db parameter
+    project_name: str = 'crawlo',
+    **kwargs
 ) -> Dict[str, Any]:
     """Quickly build distributed-mode settings."""
     return ModeManager().resolve_mode_settings(
@@ -189,24 +146,4 @@ def auto_mode(**kwargs) -> Dict[str, Any]:
 def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
     """Build configuration from environment variables."""
     # Direct os.getenv() use removed; these options must be configured via settings
-    raise RuntimeError("Environment-variable configuration has been removed; configure these options in settings")
-
-    # Previous code kept for reference
-    # mode = os.getenv('CRAWLO_MODE', default_mode).lower()
-    #
-    # if mode == 'distributed':
-    #     return distributed_mode(
-    #         redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
-    #         redis_port=int(os.getenv('REDIS_PORT', 6379)),
-    #         redis_password=os.getenv('REDIS_PASSWORD'),
-    #         project_name=os.getenv('PROJECT_NAME', 'crawlo'),
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
-    #     )
-    # elif mode == 'auto':
-    #     return auto_mode(
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
-    #     )
-    # else:  # standalone
-    #     return standalone_mode(
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
-    #     )
+    raise RuntimeError("Environment-variable configuration has been removed; configure these options in settings")
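
Net effect of the mode_manager changes: resolve_mode_settings no longer logs the mode description itself but returns it under the _mode_info key for the caller to log. A usage sketch, with one override merged in through **kwargs:

from crawlo.mode_manager import ModeManager

manager = ModeManager()
settings = manager.resolve_mode_settings('standalone', CONCURRENCY=4)

print(settings['QUEUE_TYPE'])   # 'memory', from the standalone defaults
print(settings['CONCURRENCY'])  # 4: the override wins over the mode default of 8
print(settings['_mode_info'])   # the description string that 1.2.9 logged directly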
crawlo/pipelines/mysql_pipeline.py CHANGED
@@ -7,6 +7,7 @@ from typing import Optional, List, Dict
 from crawlo.exceptions import ItemDiscard
 from crawlo.utils.db_helper import make_insert_sql, make_batch_sql
 from crawlo.utils.log import get_logger
+from . import BasePipeline
 
 
 class AsyncmyMySQLPipeline:
@@ -200,7 +201,7 @@ class AiomysqlMySQLPipeline:
         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
 
     @classmethod
-    def create_instance(cls, crawler):
+    def from_crawler(cls, crawler):
         return cls(crawler)
 
     async def _init_pool(self):
@@ -213,12 +214,12 @@ class AiomysqlMySQLPipeline:
         try:
             self.pool = await aiomysql.create_pool(
                 host=self.settings.get('MYSQL_HOST', 'localhost'),
-                port=self.settings.getint('MYSQL_PORT', 3306),
+                port=self.settings.get_int('MYSQL_PORT', 3306),
                 user=self.settings.get('MYSQL_USER', 'root'),
                 password=self.settings.get('MYSQL_PASSWORD', ''),
                 db=self.settings.get('MYSQL_DB', 'scrapy_db'),
-                minsize=self.settings.getint('MYSQL_POOL_MIN', 2),
-                maxsize=self.settings.getint('MYSQL_POOL_MAX', 5),
+                minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
+                maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5),
                 cursorclass=aiomysql.DictCursor,
                 autocommit=False
             )
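
The create_instance → from_crawler rename lines up with the hook that PipelineManager checks for (next hunks). A minimal custom pipeline following the 1.3.1 contract might look like this sketch; the async process_item signature is an assumption, while the settings accessors mirror the get/get_int calls above:

class ItemCountPipeline:
    """Hypothetical pipeline that counts items, showing the required hook."""

    def __init__(self, crawler):
        self.crawler = crawler
        self.table = crawler.settings.get('MYSQL_TABLE', 'crawlo')
        self.batch_size = crawler.settings.get_int('MYSQL_BATCH_SIZE', 100)
        self.seen = 0

    @classmethod
    def from_crawler(cls, crawler):
        # The hook PipelineManager now requires on every pipeline class
        return cls(crawler)

    async def process_item(self, item, spider):  # signature assumed
        self.seen += 1
        return item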
@@ -4,7 +4,6 @@ from typing import List
 from pprint import pformat
 from asyncio import create_task
 
-
 from crawlo.utils.log import get_logger
 from crawlo.event import item_successful, item_discard
 from crawlo.project import load_class, common_call
@@ -20,6 +19,20 @@ class PipelineManager:
 
         self.logger = get_logger(self.__class__.__name__, self.crawler.settings.get('LOG_LEVEL'))
         pipelines = self.crawler.settings.get_list('PIPELINES')
+        dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
+
+        # Debug information
+        self.logger.debug(f"PIPELINES from settings: {pipelines}")
+        self.logger.debug(f"DEFAULT_DEDUP_PIPELINE from settings: {dedup_pipeline}")
+
+        # Ensure DEFAULT_DEDUP_PIPELINE is placed at the head of the pipeline list
+        if dedup_pipeline:
+            # Remove any existing instances of the dedup pipeline
+            pipelines = [item for item in pipelines if item != dedup_pipeline]
+            # Insert the dedup pipeline at the front
+            self.logger.debug(f"{dedup_pipeline} insert successful")
+            pipelines.insert(0, dedup_pipeline)
+
         self._add_pipelines(pipelines)
         self._add_methods()
@@ -34,7 +47,7 @@ class PipelineManager:
             pipeline_cls = load_class(pipeline)
             if not hasattr(pipeline_cls, 'from_crawler'):
                 raise PipelineInitError(
-                    f"Pipeline init failed, must inherit from `BasePipeline` or have a `create_instance` method"
+                    f"Pipeline init failed, must inherit from `BasePipeline` or have a `from_crawler` method"
                 )
             self.pipelines.append(pipeline_cls.from_crawler(self.crawler))
         except Exception as e:
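
The same head-insertion logic now lives both here and in SettingManager._merge_config (later in this diff), so the dedup pipeline stays first regardless of which layer assembled the list. The list manipulation in isolation:

# Standalone sketch of the head-insertion logic PipelineManager applies
pipelines = [
    'crawlo.pipelines.console_pipeline.ConsolePipeline',
    'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',  # wrong position
]
dedup_pipeline = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'

if dedup_pipeline:
    pipelines = [item for item in pipelines if item != dedup_pipeline]
    pipelines.insert(0, dedup_pipeline)

print(pipelines)
# ['crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
#  'crawlo.pipelines.console_pipeline.ConsolePipeline']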
crawlo/project.py CHANGED
@@ -7,18 +7,10 @@ from inspect import iscoroutinefunction
 from typing import Callable, Optional, Any
 
 from crawlo.settings.setting_manager import SettingManager
-from crawlo.utils.log import get_logger
+from crawlo.utils.log import get_logger, LoggerManager
 
-# Initialise the logger lazily; fetch it via get_logger when needed
-logger = None
-
-
-def _get_logger():
-    """Fetch the logger lazily, ensuring it is created after the configuration is loaded."""
-    global logger
-    if logger is None:
-        logger = get_logger(__name__)
-    return logger
+# Use a module-level logger instead of a per-module lazy-initialisation helper
+logger = get_logger(__name__)
 
 
 def load_class(path: str) -> Any:
@@ -50,7 +42,7 @@ def merge_settings(spider, settings):
     spider_name = getattr(spider, 'name', 'UnknownSpider')
     # Check whether settings is a SettingManager instance
     if not hasattr(settings, 'update_attributes'):
-        _get_logger().error(f"merge_settings received a settings object that is not a SettingManager instance: {type(settings)}")
+        logger.error(f"merge_settings received a settings object that is not a SettingManager instance: {type(settings)}")
         # If it is a dict, build a new SettingManager instance from it
         if isinstance(settings, dict):
             from crawlo.settings.setting_manager import SettingManager
@@ -58,14 +50,14 @@ def merge_settings(spider, settings):
             new_settings.update_attributes(settings)
             settings = new_settings
         else:
-            _get_logger().error("Unhandled settings type")
+            logger.error("Unhandled settings type")
             return
 
     if hasattr(spider, 'custom_settings'):
         custom_settings = getattr(spider, 'custom_settings')
         settings.update_attributes(custom_settings)
     else:
-        _get_logger().debug(f"Spider '{spider_name}' has no custom_settings, skipping merge")
+        logger.debug(f"Spider '{spider_name}' has no custom_settings, skipping merge")
 
 
 async def common_call(func: Callable, *args, **kwargs):
@@ -93,7 +85,7 @@ def _get_settings_module_from_cfg(cfg_path: str) -> str:
     config.read(cfg_path, encoding="utf-8")
     if config.has_section("settings") and config.has_option("settings", "default"):
         module_path = config.get("settings", "default")
-        _get_logger().debug(f"📄 Loaded settings module from crawlo.cfg: {module_path}")
+        logger.debug(f"📄 Loaded settings module from crawlo.cfg: {module_path}")
         return module_path
     else:
         raise RuntimeError(f"Config file is missing the [settings] section or the default option: {cfg_path}")
@@ -114,7 +106,7 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
     for root, dirs, files in os.walk(path):
         if "crawlo.cfg" in files:
             cfg_path = os.path.join(root, "crawlo.cfg")
-            _get_logger().debug(f"✅ Found project config file: {cfg_path}")
+            logger.debug(f"✅ Found project config file: {cfg_path}")
             return root
 
     # Walk upwards until crawlo.cfg, or a directory containing settings.py and __init__.py, is found
@@ -130,20 +122,20 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
     # Check for crawlo.cfg
     cfg_file = os.path.join(path, "crawlo.cfg")
     if os.path.isfile(cfg_file):
-        _get_logger().debug(f"✅ Found project config file: {cfg_file}")
+        logger.debug(f"✅ Found project config file: {cfg_file}")
         return path
 
     # Check for settings.py and __init__.py
     settings_file = os.path.join(path, "settings.py")
     init_file = os.path.join(path, "__init__.py")
     if os.path.isfile(settings_file) and os.path.isfile(init_file):
-        _get_logger().debug(f"✅ Found project module: {path}")
+        logger.debug(f"✅ Found project module: {path}")
         # Even when the project module is found, keep looking upwards for a crawlo.cfg
         parent = os.path.dirname(path)
         if parent != path:
             parent_cfg = os.path.join(parent, "crawlo.cfg")
             if os.path.isfile(parent_cfg):
-                _get_logger().debug(f"✅ Found project config file in a parent directory: {parent_cfg}")
+                logger.debug(f"✅ Found project config file in a parent directory: {parent_cfg}")
                 return parent
         return path
 
@@ -167,19 +159,19 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
 
     cfg_file = os.path.join(path, "crawlo.cfg")
     if os.path.isfile(cfg_file):
-        _get_logger().debug(f"✅ Found project config file: {cfg_file}")
+        logger.debug(f"✅ Found project config file: {cfg_file}")
         return path
 
     settings_file = os.path.join(path, "settings.py")
     init_file = os.path.join(path, "__init__.py")
     if os.path.isfile(settings_file) and os.path.isfile(init_file):
-        _get_logger().debug(f"✅ Found project module: {path}")
+        logger.debug(f"✅ Found project module: {path}")
         # Even when the project module is found, keep looking upwards for a crawlo.cfg
         parent = os.path.dirname(path)
         if parent != path:
             parent_cfg = os.path.join(parent, "crawlo.cfg")
             if os.path.isfile(parent_cfg):
-                _get_logger().debug(f"✅ Found project config file in a parent directory: {parent_cfg}")
+                logger.debug(f"✅ Found project config file in a parent directory: {parent_cfg}")
                 return parent
         return path
 
@@ -204,19 +196,19 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
 
     cfg_file = os.path.join(path, "crawlo.cfg")
     if os.path.isfile(cfg_file):
-        _get_logger().debug(f"Found project config file: {cfg_file}")
+        logger.debug(f"Found project config file: {cfg_file}")
         return path
 
     settings_file = os.path.join(path, "settings.py")
     init_file = os.path.join(path, "__init__.py")
     if os.path.isfile(settings_file) and os.path.isfile(init_file):
-        _get_logger().debug(f"Found project module: {path}")
+        logger.debug(f"Found project module: {path}")
         # Even when the project module is found, keep looking upwards for a crawlo.cfg
         parent = os.path.dirname(path)
         if parent != path:
             parent_cfg = os.path.join(parent, "crawlo.cfg")
             if os.path.isfile(parent_cfg):
-                _get_logger().debug(f"Found project config file in a parent directory: {parent_cfg}")
+                logger.debug(f"Found project config file in a parent directory: {parent_cfg}")
                 return parent
         return path
 
@@ -227,7 +219,7 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
     except Exception:
         pass
 
-    _get_logger().warning("Crawlo project root not found. Make sure you run from a directory containing 'crawlo.cfg' or 'settings.py'.")
+    logger.warning("Crawlo project root not found. Make sure you run from a directory containing 'crawlo.cfg' or 'settings.py'.")
     return None
 
 
@@ -241,8 +233,7 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
     Returns:
         SettingManager: instance with the configuration loaded
     """
-    # Change INFO level log to DEBUG level to avoid redundant output
-    _get_logger().debug("🚀 Initialising Crawlo project configuration...")
+    logger.debug("🚀 Initialising Crawlo project configuration...")
 
     # 1. Find the project root
     project_root = _find_project_root()
@@ -259,32 +250,48 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
         # Infer: <project directory name>.settings
         project_name = os.path.basename(project_root)
         settings_module_path = f"{project_name}.settings"
-        _get_logger().warning(f"⚠️ crawlo.cfg not found, inferring settings module: {settings_module_path}")
+        logger.warning(f"⚠️ crawlo.cfg not found, inferring settings module: {settings_module_path}")
 
     # 3. Add the project root to sys.path
     project_root_str = os.path.abspath(project_root)
     if project_root_str not in sys.path:
         sys.path.insert(0, project_root_str)
-        _get_logger().debug(f"📁 Project root added to sys.path: {project_root_str}")
+        logger.debug(f"📁 Project root added to sys.path: {project_root_str}")
 
     # 4. Load the SettingManager
-    _get_logger().debug(f"⚙️ Loading settings module: {settings_module_path}")
+    logger.debug(f"⚙️ Loading settings module: {settings_module_path}")
     settings = SettingManager()
 
     try:
         settings.set_settings(settings_module_path)
-        _get_logger().debug("✅ settings module loaded successfully")
+        logger.debug("✅ settings module loaded successfully")
     except Exception as e:
         raise ImportError(f"Failed to load settings module '{settings_module_path}': {e}")
 
-    # 5. Merge runtime settings
+    # 5. Apply the settings for the configured RUN_MODE
+    run_mode = settings.get('RUN_MODE', 'standalone')
+    if run_mode:
+        from crawlo.mode_manager import ModeManager
+        mode_manager = ModeManager()
+        mode_settings = mode_manager.resolve_mode_settings(run_mode)
+        # Merge the mode settings without overriding user-set values
+        for key, value in mode_settings.items():
+            # Apply a mode setting only when the user has not set that key
+            if key not in settings.attributes:
+                settings.set(key, value)
+        logger.debug(f"🔧 Applied {run_mode} mode settings")
+
+    # 6. Merge runtime settings
     if custom_settings:
         settings.update_attributes(custom_settings)
-        _get_logger().debug(f"🔧 Applied runtime custom settings: {list(custom_settings.keys())}")
+        logger.debug(f"🔧 Applied runtime custom settings: {list(custom_settings.keys())}")
 
-    # 6. Log a summary of the core settings (INFO level)
+    # 7. Log a summary of the core settings (INFO level)
     # _log_settings_summary(settings)
 
+    # Configure the logging system
+    LoggerManager.configure(settings)
+
     # Downgrade the project-initialised message to DEBUG level
-    _get_logger().debug("🎉 Crawlo project configuration initialised!")
-    return settings
+    logger.debug("🎉 Crawlo project configuration initialised!")
+    return settings
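
With the RUN_MODE step, a project selects its mode declaratively and user-set keys always win over the mode defaults. A sketch of the resulting flow, assuming a project settings module like the one in the comment (names illustrative):

# myproject/settings.py (sketch)
#   RUN_MODE = 'standalone'
#   DOWNLOAD_DELAY = 0.5   # user value; the mode default will not override it

from crawlo.project import get_settings

settings = get_settings()
print(settings.get('QUEUE_TYPE'))      # 'memory', filled in from the standalone defaults
print(settings.get('DOWNLOAD_DELAY'))  # 0.5, the user's value is preserved
# LoggerManager.configure(settings) has already run by the time get_settings returns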
@@ -48,7 +48,18 @@ QUEUE_TYPE = 'auto'
 # Default to the in-memory filter and dedup pipeline so the framework also runs without Redis
 # In auto mode, when Redis is available, the framework automatically switches to the Redis implementations for stronger deduplication
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
-FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+
+
+MYSQL_HOST = '127.0.0.1'
+MYSQL_PORT = 3306
+MYSQL_USER = 'root'
+MYSQL_PASSWORD = '123456'
+MYSQL_DB = 'crawl_pro'
+MYSQL_TABLE = 'crawlo'
+MYSQL_BATCH_SIZE = 100
+MYSQL_USE_BATCH = False  # whether to enable batch inserts
+
 
 # --- Redis filter configuration ---
 # Redis settings come from the environment-variable config helper
@@ -85,7 +96,6 @@ MIDDLEWARES = [
     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',  # 1. drop invalid requests
     'crawlo.middleware.download_delay.DownloadDelayMiddleware',  # 2. throttle request rate
     'crawlo.middleware.default_header.DefaultHeaderMiddleware',  # 3. add default request headers
-    # 'crawlo.middleware.proxy.ProxyMiddleware',  # 4. set a proxy (disabled by default)
     'crawlo.middleware.offsite.OffsiteMiddleware',  # 5. filter offsite requests
 
     # === Response-processing stage ===
@@ -98,8 +108,7 @@ MIDDLEWARES = [
 
 # Framework item-pipeline list (framework defaults + user-defined pipelines)
 PIPELINES = [
-    'crawlo.pipelines.console_pipeline.ConsolePipeline',  # console output
-    # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage (optional)
+    'crawlo.pipelines.console_pipeline.ConsolePipeline',
 ]
 
 # Explicitly add the default dedup pipeline to the head of the pipeline list
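
The new MYSQL_* defaults are exactly the keys the MySQL pipelines read through settings.get/get_int, so a project only overrides what differs. A sketch of a project settings.py, with illustrative values:

# myproject/settings.py (sketch; values are illustrative)
MYSQL_HOST = 'db.internal'
MYSQL_DB = 'news'
MYSQL_TABLE = 'articles'
MYSQL_USE_BATCH = True  # switch on batch inserts

# Enable the MySQL pipeline; per the SettingManager merge logic below,
# this list is merged with the defaults rather than replacing them.
PIPELINES = [
    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
]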
crawlo/settings/setting_manager.py CHANGED
@@ -15,38 +15,67 @@ class SettingManager(MutableMapping):
         self.set_settings(default_settings)
         # Merge configuration at initialisation time
         self._merge_config(values)
-
+
     def _merge_config(self, user_config):
         """Merge the default configuration with the user configuration."""
         if not user_config:
             return
-
+
         # Merge the middleware configuration
         if 'MIDDLEWARES' in user_config:
             default_middlewares = self.attributes.get('MIDDLEWARES', [])
             user_middlewares = user_config['MIDDLEWARES']
-            self.attributes['MIDDLEWARES'] = default_middlewares + user_middlewares
-
+            # An empty user list still falls back to the defaults
+            if user_middlewares:
+                # Filter out empty entries and comments
+                user_middlewares = [middleware for middleware in user_middlewares if middleware and not middleware.strip().startswith('#')]
+                # Merge default and user middlewares, deduplicating while keeping order
+                merged_middlewares = default_middlewares[:]
+                for middleware in user_middlewares:
+                    if middleware not in merged_middlewares:
+                        merged_middlewares.append(middleware)
+                self.attributes['MIDDLEWARES'] = merged_middlewares
+
         # Merge the pipeline configuration
         if 'PIPELINES' in user_config:
             default_pipelines = self.attributes.get('PIPELINES', [])
             user_pipelines = user_config['PIPELINES']
-            merged_pipelines = default_pipelines + user_pipelines
-            # Special-case PIPELINES: keep the dedup pipeline at the front
-            dedup_pipeline = self.attributes.get('DEFAULT_DEDUP_PIPELINE')
-            if dedup_pipeline:
-                # Remove any existing instances of the dedup pipeline
-                merged_pipelines = [item for item in merged_pipelines if item != dedup_pipeline]
-                # Insert the dedup pipeline at the front
-                merged_pipelines.insert(0, dedup_pipeline)
-            self.attributes['PIPELINES'] = merged_pipelines
-
+            # An empty user list still falls back to the defaults
+            if user_pipelines:
+                # Filter out empty entries and comments
+                user_pipelines = [pipeline for pipeline in user_pipelines if pipeline and not pipeline.strip().startswith('#')]
+                # Merge default and user pipelines, deduplicating while keeping order
+                merged_pipelines = default_pipelines[:]
+                for pipeline in user_pipelines:
+                    if pipeline not in merged_pipelines:
+                        merged_pipelines.append(pipeline)
+                self.attributes['PIPELINES'] = merged_pipelines
+
+        # Special-case PIPELINES: keep the dedup pipeline at the front
+        dedup_pipeline = self.attributes.get('DEFAULT_DEDUP_PIPELINE')
+        if dedup_pipeline:
+            pipelines = self.attributes.get('PIPELINES', [])
+            # Remove any existing instances of the dedup pipeline
+            pipelines = [item for item in pipelines if item != dedup_pipeline]
+            # Insert the dedup pipeline at the front
+            pipelines.insert(0, dedup_pipeline)
+            self.attributes['PIPELINES'] = pipelines
+
         # Merge the extension configuration
         if 'EXTENSIONS' in user_config:
             default_extensions = self.attributes.get('EXTENSIONS', [])
             user_extensions = user_config['EXTENSIONS']
-            self.attributes['EXTENSIONS'] = default_extensions + user_extensions
-
+            # An empty user list still falls back to the defaults
+            if user_extensions:
+                # Filter out empty entries and comments
+                user_extensions = [extension for extension in user_extensions if extension and not extension.strip().startswith('#')]
+                # Merge default and user extensions, deduplicating while keeping order
+                merged_extensions = default_extensions[:]
+                for extension in user_extensions:
+                    if extension not in merged_extensions:
+                        merged_extensions.append(extension)
+                self.attributes['EXTENSIONS'] = merged_extensions
+
         # Apply the remaining user configuration
         for key, value in user_config.items():
             if key not in ['MIDDLEWARES', 'PIPELINES', 'EXTENSIONS']:
@@ -107,9 +136,15 @@ class SettingManager(MutableMapping):
     def set_settings(self, module):
         if isinstance(module, str):
             module = import_module(module)
+
+        # Collect every setting defined in the module
+        module_settings = {}
         for key in dir(module):
             if key.isupper():
-                self.set(key, getattr(module, key))
+                module_settings[key] = getattr(module, key)
+
+        # Use the merge logic instead of setting values directly
+        self._merge_config(module_settings)
 
     # Methods required by MutableMapping
     def __getitem__(self, item):
@@ -147,7 +182,7 @@ class SettingManager(MutableMapping):
         # Create a new instance
         cls = self.__class__
         new_instance = cls.__new__(cls)
-
+
         # Copy the attributes dict, excluding objects that cannot be pickled
         new_attributes = {}
         for key, value in self.attributes.items():
@@ -157,8 +192,8 @@ class SettingManager(MutableMapping):
             except Exception:
                 # If the copy fails, keep the original reference (for loggers and similar objects)
                 new_attributes[key] = value
-
+
         # Set the new instance's attributes
         new_instance.attributes = new_attributes
-
+
         return new_instance
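
Taken together, the merge changes mean user lists now extend the defaults instead of replacing them, and set_settings routes module settings through the same logic. A sketch of the observable behaviour; it calls the private _merge_config directly for illustration and assumes the default PIPELINES and DEFAULT_DEDUP_PIPELINE shown earlier in this diff:

from crawlo.settings.setting_manager import SettingManager

mgr = SettingManager()
mgr._merge_config({'PIPELINES': ['myproject.pipelines.MyPipeline']})  # hypothetical user pipeline

# Expected order: the dedup pipeline first, then the default console pipeline,
# then the user pipeline appended without duplicates.
print(mgr.get('PIPELINES'))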