crawlo-1.1.5-py3-none-any.whl → crawlo-1.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.1.5"
+ __version__ = "1.1.8"
crawlo/cli.py CHANGED
@@ -10,24 +10,49 @@ def main():
      # Get all available commands
      commands = get_commands()
 
+     # Create the main parser
      parser = argparse.ArgumentParser(
          description="Crawlo: A lightweight web crawler framework.",
-         usage="crawlo <command> [options]"
+         usage="crawlo <command> [options]",
+         add_help=False  # disable the default help; we handle it ourselves
      )
-     parser.add_argument('command', help='Available commands: ' + ', '.join(commands.keys()))
-     # Note: no concrete arguments are added here; sub-commands parse their own
-
-     # Only parse the command
+
+     # Add help arguments
+     parser.add_argument('-h', '--help', action='store_true', help='Show help information')
+     parser.add_argument('command', nargs='?', help='Available commands: ' + ', '.join(commands.keys()))
+
+     # Parse known arguments
      args, unknown = parser.parse_known_args()
 
+     # Handle the help flag
+     if args.help or (args.command is None and not unknown):
+         # Import and run the help command
+         try:
+             module = __import__(commands['help'], fromlist=['main'])
+             sys.exit(module.main([]))
+         except ImportError as e:
+             print(f"Failed to load help command: {e}")
+             sys.exit(1)
+         except Exception as e:
+             print(f"Help command failed: {e}")
+             sys.exit(1)
+
+     # Check whether the command exists
      if args.command not in commands:
          print(f"Unknown command: {args.command}")
          print(f"Available commands: {', '.join(commands.keys())}")
+         # Show help information
+         try:
+             module = __import__(commands['help'], fromlist=['main'])
+             module.main([])
+         except:
+             pass
          sys.exit(1)
 
      # Dynamically import and execute the command
      try:
          module = __import__(commands[args.command], fromlist=['main'])
+         # Pass unknown arguments through to the sub-command
          sys.exit(module.main(unknown))
      except ImportError as e:
          print(f"Failed to load command '{args.command}': {e}")
@@ -7,7 +7,8 @@ _commands = {
      'run': 'crawlo.commands.run',
      'check': 'crawlo.commands.check',
      'list': 'crawlo.commands.list',
-     'stats': 'crawlo.commands.stats'
+     'stats': 'crawlo.commands.stats',
+     'help': 'crawlo.commands.help'
  }
 
  def get_commands():
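For context, the CLI resolves each command name through this registry and calls the target module's main(args). A minimal sketch of that dispatch pattern, using a hypothetical mypkg.commands.hello module purely for illustration:

    # Sketch of the dispatch pattern used by crawlo/cli.py (hypothetical registry entry).
    commands = {'hello': 'mypkg.commands.hello'}

    def dispatch(name, argv):
        module = __import__(commands[name], fromlist=['main'])  # import the registered command module
        return module.main(argv)                                # each command exposes main(args) -> exit code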
@@ -0,0 +1,133 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ # @Time   : 2025-09-12
+ # @Author : crawl-coder
+ # @Desc   : Command-line entry point: crawlo -h|--help, shows help information.
+ """
+ import sys
+ from rich.console import Console
+ from rich.table import Table
+ from rich.panel import Panel
+ from rich.text import Text
+ from rich import box
+
+ console = Console()
+
+
+ def main(args):
+     """
+     Main function: show help information
+     Usage:
+         crawlo -h|--help
+     """
+     # Check for invalid arguments
+     if args and args[0] not in ['-h', '--help', 'help']:
+         console.print("[bold red]❌ Invalid argument:[/bold red] [yellow]{}[/yellow]".format(args[0]))
+         console.print("[bold blue]💡 Tip:[/bold blue] use [green]crawlo -h[/green] or [green]crawlo --help[/green] to view the help")
+         return 1
+
+     # Show help information
+     show_help()
+     return 0
+
+
+ def show_help():
+     """Show the full help information"""
+     # Show the framework title and version
+     console.print(Panel(
+         Text.from_markup(":spider_web: [bold blue]Crawlo[/bold blue] [bold white]v1.1.7[/bold white] - asynchronous crawler framework"),
+         expand=False,
+         border_style="blue"
+     ))
+
+     # Show basic usage
+     console.print("[bold green]Basic usage:[/bold green]")
+     console.print("  [blue]crawlo[/blue] [cyan]<command>[/cyan] [options]")
+     console.print()
+
+     # Show the available commands
+     console.print("[bold green]Available commands:[/bold green]")
+     table = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta")
+     table.add_column("Command", style="cyan", width=15)
+     table.add_column("Description", style="white")
+     table.add_column("Usage", style="yellow")
+
+     table.add_row("startproject", "Create a new project", "crawlo startproject <project_name>")
+     table.add_row("genspider", "Generate a spider template", "crawlo genspider <spider_name> [domain]")
+     table.add_row("run", "Run spiders", "crawlo run <spider_name>|all [options]")
+     table.add_row("check", "Check spider code", "crawlo check [options]")
+     table.add_row("list", "List all spiders", "crawlo list")
+     table.add_row("stats", "Show statistics", "crawlo stats [spider_name]")
+     table.add_row("help", "Show help information", "crawlo -h|--help")
+
+     console.print(table)
+     console.print()
+
+     # Show global options
+     console.print("[bold green]Global options:[/bold green]")
+     table = Table(box=box.SIMPLE, show_header=False)
+     table.add_column("Option", style="cyan", width=15)
+     table.add_column("Description", style="white")
+
+     table.add_row("-h, --help", "Show help information")
+     table.add_row("-v, --version", "Show version information")
+
+     console.print(table)
+     console.print()
+
+     # Show detailed usage for each command
+     console.print("[bold green]Detailed command usage:[/bold green]")
+
+     # run command
+     console.print("[bold cyan]run[/bold cyan] - Run spiders")
+     console.print("  Usage: crawlo run <spider_name>|all [--json] [--no-stats]")
+     console.print("  Examples:")
+     console.print("    crawlo run myspider")
+     console.print("    crawlo run all")
+     console.print("    crawlo run all --json --no-stats")
+     console.print()
+
+     # check command
+     console.print("[bold cyan]check[/bold cyan] - Check spider code")
+     console.print("  Usage: crawlo check [--fix] [--ci] [--json] [--watch]")
+     console.print("  Examples:")
+     console.print("    crawlo check")
+     console.print("    crawlo check --fix")
+     console.print("    crawlo check --ci --json")
+     console.print()
+
+     # startproject command
+     console.print("[bold cyan]startproject[/bold cyan] - Create a new project")
+     console.print("  Usage: crawlo startproject <project_name>")
+     console.print("  Examples:")
+     console.print("    crawlo startproject myproject")
+     console.print()
+
+     # genspider command
+     console.print("[bold cyan]genspider[/bold cyan] - Generate a spider template")
+     console.print("  Usage: crawlo genspider <spider_name> [domain]")
+     console.print("  Examples:")
+     console.print("    crawlo genspider myspider example.com")
+     console.print()
+
+     # list command
+     console.print("[bold cyan]list[/bold cyan] - List all spiders")
+     console.print("  Usage: crawlo list")
+     console.print("  Examples:")
+     console.print("    crawlo list")
+     console.print()
+
+     # stats command
+     console.print("[bold cyan]stats[/bold cyan] - Show statistics")
+     console.print("  Usage: crawlo stats [spider_name]")
+     console.print("  Examples:")
+     console.print("    crawlo stats")
+     console.print("    crawlo stats myspider")
+     console.print()
+
+     # More information
+     console.print("[bold green]More information:[/bold green]")
+     console.print("  Docs:   https://crawlo.readthedocs.io/")
+     console.print("  Source: https://github.com/crawl-coder/Crawlo")
+     console.print("  Issues: https://github.com/crawl-coder/Crawlo/issues")
crawlo/commands/run.py CHANGED
@@ -8,6 +8,7 @@
  import sys
  import asyncio
  import configparser
+ import os
  from pathlib import Path
  from importlib import import_module
 
@@ -32,6 +33,12 @@ def get_project_root():
      Search upward for crawlo.cfg to determine the project root
      """
      current = Path.cwd()
+     # First check the current directory and its subdirectories
+     for root, dirs, files in os.walk(current):
+         if "crawlo.cfg" in files:
+             return Path(root)
+
+     # If not found in a subdirectory, search upward
      for _ in range(10):
          cfg = current / "crawlo.cfg"
          if cfg.exists():
@@ -10,7 +10,7 @@ import re
  import sys
  import os
  from pathlib import Path
- from typing import Optional
+ from typing import Optional, List
 
  # Add the project root to the path so the utils module can be imported
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
@@ -56,6 +56,19 @@ TEMPLATE_TYPES = {
      'gentle': 'Gentle template - low-load configuration, friendly to the target site'
  }
 
+ # Optional module components
+ OPTIONAL_MODULES = {
+     'mysql': 'MySQL database support',
+     'mongodb': 'MongoDB database support',
+     'redis': 'Redis support (distributed queue and deduplication)',
+     'proxy': 'Proxy support',
+     'monitoring': 'Monitoring and performance profiling',
+     'dedup': 'Deduplication',
+     'httpx': 'HttpX downloader',
+     'aiohttp': 'AioHttp downloader',
+     'curl': 'CurlCffi downloader'
+ }
+
 
 
  def show_error_panel(title, content):
@@ -84,9 +97,10 @@ def _render_template(tmpl_path, context):
      return content
 
 
- def _copytree_with_templates(src, dst, context, template_type='default'):
+ def _copytree_with_templates(src, dst, context, template_type='default', modules: List[str] = None):
      """
      Recursively copy a directory, rendering .tmpl files (dropping the .tmpl suffix) and copying other files directly.
+     Supports selective module copying.
      """
      src_path = Path(src)
      dst_path = Path(dst)
@@ -96,24 +110,43 @@ def _copytree_with_templates(src, dst, context, template_type='default'):
          rel_path = item.relative_to(src_path)
          dst_item = dst_path / rel_path
 
+         # Check whether this file should be included (based on module selection)
+         if not _should_include_file(rel_path, modules):
+             continue
+
          if item.is_dir():
              dst_item.mkdir(parents=True, exist_ok=True)
          else:
              if item.suffix == '.tmpl':
+                 rendered_content = None
                  # Handle settings files for specific template types
-                 if item.name == 'settings.py.tmpl' and template_type != 'default':
-                     # Use the settings file for the specific template type
-                     template_file_name = f'settings_{template_type}.py.tmpl'
-                     template_file_path = src_path / template_file_name
-                     if template_file_path.exists():
-                         rendered_content = _render_template(template_file_path, context)
+                 if item.name == 'settings.py.tmpl':
+                     # For the settings file, pick the content template matching the template type
+                     if template_type != 'default':
+                         # Use the settings file for the specific template type
+                         template_file_name = f'settings_{template_type}.py.tmpl'
+                         template_file_path = src_path / template_file_name
+                         if template_file_path.exists():
+                             rendered_content = _render_template(template_file_path, context)
+                         else:
+                             # If the specific template does not exist, use the default template
+                             rendered_content = _render_template(item, context)
                      else:
-                         # If the specific template does not exist, use the default template
+                         # Use the default template
                          rendered_content = _render_template(item, context)
+                 # Skip the other settings_*.py.tmpl templates to avoid processing them twice
+                 elif item.name.startswith('settings_') and item.name.endswith('.py.tmpl'):
+                     continue
                  else:
                      rendered_content = _render_template(item, context)
 
-                 final_dst = dst_item.with_suffix('')
+                 # Make sure the settings file is always named settings.py
+                 if item.name == 'settings.py.tmpl':
+                     # Special-case the settings template and always emit settings.py
+                     final_dst = dst_item.parent / 'settings.py'
+                 else:
+                     final_dst = dst_item.with_suffix('')
+
                  final_dst.parent.mkdir(parents=True, exist_ok=True)
                  with open(final_dst, 'w', encoding='utf-8') as f:
                      f.write(rendered_content)
@@ -121,6 +154,54 @@ def _copytree_with_templates(src, dst, context, template_type='default'):
                  shutil.copy2(item, dst_item)
 
 
+ def _should_include_file(rel_path, modules: List[str]) -> bool:
+     """
+     Decide whether to include a file based on the selected modules
+     """
+     if modules is None:
+         # If no modules were specified, include everything
+         return True
+
+     # Basic files are always included
+     basic_files = [
+         '__init__.py.tmpl',
+         'settings.py.tmpl',
+         'spiders/__init__.py.tmpl',
+         'items.py.tmpl',
+         'middlewares.py.tmpl',
+         'run.py.tmpl'
+     ]
+
+     path_str = str(rel_path).replace('\\', '/')
+
+     # Always include the basic files
+     if path_str in basic_files:
+         return True
+
+     # Include module-specific files according to the selection
+     if 'mysql' in modules and 'mysql' in path_str:
+         return True
+     if 'mongodb' in modules and 'mongo' in path_str:
+         return True
+     if 'redis' in modules and 'redis' in path_str:
+         return True
+     if 'proxy' in modules and 'proxy' in path_str:
+         return True
+     if 'monitoring' in modules and ('monitor' in path_str or 'stats' in path_str):
+         return True
+     if 'dedup' in modules and 'dedup' in path_str:
+         return True
+     if 'httpx' in modules and 'httpx' in path_str:
+         return True
+     if 'aiohttp' in modules and 'aiohttp' in path_str:
+         return True
+     if 'curl' in modules and 'cffi' in path_str:
+         return True
+
+     # By default, module-specific files are not included
+     return False
+
+
  def validate_project_name(project_name: str) -> tuple[bool, str]:
      """
      Validate that the project name is valid
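The selection rule above is substring-based: a template path is copied only if it is a basic project file or matches a keyword of one of the chosen modules. A condensed, self-contained illustration of that rule (the paths are hypothetical examples, not actual template names from the package):

    # Condensed sketch of the substring-based selection rule in _should_include_file.
    def included(path_str, modules):
        basic = {'__init__.py.tmpl', 'settings.py.tmpl', 'spiders/__init__.py.tmpl',
                 'items.py.tmpl', 'middlewares.py.tmpl', 'run.py.tmpl'}
        if modules is None or path_str in basic:
            return True
        keyword = {'mongodb': 'mongo', 'curl': 'cffi'}   # a few modules match a different substring
        return any(keyword.get(m, m) in path_str for m in modules)

    print(included('settings.py.tmpl', ['mysql']))                   # True  - basic file
    print(included('pipelines/mysql_pipeline.py.tmpl', ['mysql']))   # True  - selected module
    print(included('pipelines/mongo_pipeline.py.tmpl', ['mysql']))   # False - module not selected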
@@ -184,18 +265,52 @@ def show_template_options():
          print(f"  {template_type}: {description}")
 
 
+ def show_module_options():
+     """Show the available module options"""
+     if RICH_AVAILABLE:
+         table = Table(title="Optional module components", show_header=True, header_style="bold magenta")
+         table.add_column("Module", style="cyan", no_wrap=True)
+         table.add_column("Description", style="green")
+
+         for module, description in OPTIONAL_MODULES.items():
+             table.add_row(module, description)
+
+         console.print(table)
+     else:
+         print("Optional module components:")
+         for module, description in OPTIONAL_MODULES.items():
+             print(f"  {module}: {description}")
+
+
  def main(args):
-     if len(args) < 1 or len(args) > 2:
-         console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name> [template_type]")
+     if len(args) < 1:
+         console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name> [template_type] [--modules module1,module2]")
          console.print("💡 Examples:")
          console.print("   [blue]crawlo startproject[/blue] my_spider_project")
          console.print("   [blue]crawlo startproject[/blue] news_crawler simple")
-         console.print("   [blue]crawlo startproject[/blue] ecommerce_spider distributed")
+         console.print("   [blue]crawlo startproject[/blue] ecommerce_spider distributed --modules mysql,proxy")
          show_template_options()
+         show_module_options()
          return 1
 
+     # Parse arguments
      project_name = args[0]
-     template_type = args[1] if len(args) > 1 else 'default'
+     template_type = 'default'
+     modules = None
+
+     # Parse optional arguments
+     if len(args) > 1:
+         for i, arg in enumerate(args[1:], 1):
+             if arg.startswith('--modules='):
+                 modules_str = arg.split('=', 1)[1]
+                 modules = [m.strip() for m in modules_str.split(',') if m.strip()]
+             elif arg.startswith('--modules'):
+                 # Handle the "--modules module1,module2" form
+                 if i + 1 < len(args):
+                     modules_str = args[i + 1]
+                     modules = [m.strip() for m in modules_str.split(',') if m.strip()]
+             elif not arg.startswith('--') and arg in TEMPLATE_TYPES:
+                 template_type = arg
 
      # Validate the template type
      if template_type not in TEMPLATE_TYPES:
@@ -249,7 +364,7 @@ def main(args):
 
      # 3. Copy and render the project package contents
      package_dir = project_dir / project_name
-     _copytree_with_templates(template_dir, package_dir, context, template_type)
+     _copytree_with_templates(template_dir, package_dir, context, template_type, modules)
      console.print(f":white_check_mark: Created project package: [green]{package_dir}[/green]")
 
      # 4. Create the logs directory
@@ -267,6 +382,10 @@ def main(args):
      # Show the template type in use
      if template_type != 'default':
          console.print(f":information: Template type: [bold blue]{template_type}[/bold blue] - {TEMPLATE_TYPES[template_type]}")
+
+     # Show the selected modules
+     if modules:
+         console.print(f":information: Selected modules: [bold blue]{', '.join(modules)}[/bold blue]")
 
      # Next-step hints (nicely aligned + syntax highlighting)
      next_steps = f"""
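Based on the argument parsing shown above, both --modules forms should produce the same module list; the comma-separated value is split and stripped the same way in either case. A short walk-through with example values:

    # Example argument vectors for the new startproject parsing (values are illustrative).
    # ['ecommerce_spider', 'distributed', '--modules=mysql,proxy']
    # ['ecommerce_spider', 'distributed', '--modules', 'mysql,proxy']
    # Both yield template_type='distributed' and the module list below:
    modules_str = '--modules=mysql,proxy'.split('=', 1)[1]
    modules = [m.strip() for m in modules_str.split(',') if m.strip()]
    print(modules)  # ['mysql', 'proxy']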
crawlo/config.py CHANGED
@@ -158,6 +158,7 @@ class CrawloConfig:
          redis_host: str = '127.0.0.1',
          redis_port: int = 6379,
          redis_password: Optional[str] = None,
+         redis_db: int = 0,  # add the redis_db parameter
          project_name: str = 'crawlo',
          concurrency: int = 16,
          download_delay: float = 1.0,
@@ -170,6 +171,7 @@ class CrawloConfig:
              redis_host: Redis server address
              redis_port: Redis port
              redis_password: Redis password
+             redis_db: Redis database number
              project_name: project name (used for namespacing)
              concurrency: concurrency level
              download_delay: download delay
@@ -179,6 +181,7 @@ class CrawloConfig:
              redis_host=redis_host,
              redis_port=redis_port,
              redis_password=redis_password,
+             redis_db=redis_db,  # pass the redis_db parameter through
              project_name=project_name,
              CONCURRENCY=concurrency,
              DOWNLOAD_DELAY=download_delay,
crawlo/core/engine.py CHANGED
@@ -70,7 +70,7 @@ class Engine(object):
      def engine_start(self):
          self.running = True
          self.logger.info(
-             f"Crawlo (version {self.settings.get_float('VERSION')}) started. "
+             f"Crawlo (version {self.settings.get('VERSION')}) started. "
              f"(project name : {self.settings.get('PROJECT_NAME')})"
          )
 
@@ -75,7 +75,8 @@ class AioRedisFilter(BaseFilter):
      def create_instance(cls, crawler) -> 'BaseFilter':
          """Create a filter instance from the crawler configuration"""
          redis_url = crawler.settings.get('REDIS_URL', 'redis://localhost:6379')
-         decode_responses = crawler.settings.get_bool('DECODE_RESPONSES', False)
+         # Make sure decode_responses=False to avoid encoding problems
+         decode_responses = False  # crawler.settings.get_bool('DECODE_RESPONSES', False)
          ttl_setting = crawler.settings.get_int('REDIS_TTL')
 
          # Handle the TTL setting
@@ -84,7 +85,7 @@ class AioRedisFilter(BaseFilter):
          ttl = max(0, int(ttl_setting)) if ttl_setting > 0 else None
 
          try:
-             # Use the optimized connection pool
+             # Use the optimized connection pool, making sure decode_responses=False
              redis_pool = get_redis_pool(
                  redis_url,
                  max_connections=20,
@@ -92,7 +93,7 @@ class AioRedisFilter(BaseFilter):
                  socket_timeout=30,
                  health_check_interval=30,
                  retry_on_timeout=True,
-                 decode_responses=decode_responses,
+                 decode_responses=decode_responses,  # make sure responses are not auto-decoded
                  encoding='utf-8'
              )
 
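The filter now pins decode_responses to False, presumably because the deduplication fingerprints are stored as raw bytes that are not valid UTF-8. A small illustration of the difference (assumes a local Redis on the default port; the key name is an example):

    import redis

    r = redis.Redis(decode_responses=False)   # returns bytes, safe for binary fingerprints
    r.set('fp', b'\x9f\x80\xfe')              # raw digest bytes stored as-is
    print(r.get('fp'))                        # b'\x9f\x80\xfe'
    # With decode_responses=True the same GET would try to UTF-8 decode the value
    # and raise UnicodeDecodeError for bytes like these.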
crawlo/mode_manager.py CHANGED
@@ -47,21 +47,24 @@ class ModeManager:
          redis_host: str = '127.0.0.1',
          redis_port: int = 6379,
          redis_password: Optional[str] = None,
+         redis_db: int = 0,  # add the redis_db parameter
          project_name: str = 'crawlo'
      ) -> Dict[str, Any]:
          """Get the distributed-mode configuration"""
-         # Build the Redis URL
+         # Build the Redis URL using the redis_db argument passed in
          if redis_password:
-             redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/0'
+             redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
          else:
-             redis_url = f'redis://{redis_host}:{redis_port}/0'
+             redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
 
          return {
+             'PROJECT_NAME': project_name,  # add the project name to the configuration
              'QUEUE_TYPE': 'redis',
              'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
              'REDIS_HOST': redis_host,
              'REDIS_PORT': redis_port,
              'REDIS_PASSWORD': redis_password,
+             'REDIS_DB': redis_db,  # add the Redis database number to the configuration
              'REDIS_URL': redis_url,
              'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',  # unified naming convention
              # Redis key configuration has been moved into the individual components, using the unified naming convention
@@ -111,6 +114,7 @@ class ModeManager:
              redis_host=kwargs.get('redis_host', '127.0.0.1'),
              redis_port=kwargs.get('redis_port', 6379),
              redis_password=kwargs.get('redis_password'),
+             redis_db=kwargs.get('redis_db', 0),  # add the redis_db parameter
              project_name=kwargs.get('project_name', 'crawlo')
          )
 
@@ -160,6 +164,7 @@ def distributed_mode(
      redis_host: str = '127.0.0.1',
      redis_port: int = 6379,
      redis_password: Optional[str] = None,
+     redis_db: int = 0,  # add the redis_db parameter
      project_name: str = 'crawlo',
      **kwargs
  ) -> Dict[str, Any]:
@@ -169,6 +174,7 @@ def distributed_mode(
          redis_host=redis_host,
          redis_port=redis_port,
          redis_password=redis_password,
+         redis_db=redis_db,  # pass the redis_db parameter through
          project_name=project_name,
          **kwargs
      )
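With the new parameter, the database index is carried into both REDIS_DB and the generated REDIS_URL. A hedged usage sketch of the module-level helper shown above (values are examples; the printed URL is the expected result, not captured output):

    from crawlo.mode_manager import distributed_mode

    cfg = distributed_mode(redis_host='127.0.0.1', redis_port=6379,
                           redis_db=3, project_name='news_crawler')
    print(cfg['REDIS_DB'])    # 3
    print(cfg['REDIS_URL'])   # expected: redis://127.0.0.1:6379/3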
@@ -30,12 +30,17 @@ class PipelineManager:
 
      def _add_pipelines(self, pipelines):
          for pipeline in pipelines:
-             pipeline_cls = load_class(pipeline)
-             if not hasattr(pipeline_cls, 'from_crawler'):
-                 raise PipelineInitError(
-                     f"Pipeline init failed, must inherit from `BasePipeline` or have a `create_instance` method"
-                 )
-             self.pipelines.append(pipeline_cls.from_crawler(self.crawler))
+             try:
+                 pipeline_cls = load_class(pipeline)
+                 if not hasattr(pipeline_cls, 'from_crawler'):
+                     raise PipelineInitError(
+                         f"Pipeline init failed, must inherit from `BasePipeline` or have a `create_instance` method"
+                     )
+                 self.pipelines.append(pipeline_cls.from_crawler(self.crawler))
+             except Exception as e:
+                 self.logger.error(f"Failed to load pipeline {pipeline}: {e}")
+                 # Could either continue loading the remaining pipelines or re-raise
+                 raise
          if pipelines:
              self.logger.info(f"enabled pipelines: \n {pformat(pipelines)}")
 
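The manager only checks that each configured pipeline class exposes from_crawler(). A hedged sketch of a minimal custom class that would pass that check (the item-processing hook is assumed here by analogy and is not shown in this diff):

    # Minimal pipeline shape accepted by _add_pipelines (illustrative only).
    class MyPipeline:
        def __init__(self, settings):
            self.settings = settings

        @classmethod
        def from_crawler(cls, crawler):
            # Build the pipeline from the crawler's settings.
            return cls(crawler.settings)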
@@ -13,13 +13,14 @@
  """
 
  import hashlib
- from typing import Dict, Any, Optional
+ from typing import Optional
+
  import redis
 
  from crawlo import Item
+ from crawlo.exceptions import DropItem
  from crawlo.spider import Spider
  from crawlo.utils.log import get_logger
- from crawlo.exceptions import DropItem
 
 
  class RedisDedupPipeline:
crawlo/project.py CHANGED
@@ -30,6 +30,15 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
      2. '__init__.py' and 'settings.py' exist (i.e. a Python package)
      """
      path = os.path.abspath(start_path)
+
+     # First check the current directory and its subdirectories
+     for root, dirs, files in os.walk(path):
+         if "crawlo.cfg" in files:
+             cfg_path = os.path.join(root, "crawlo.cfg")
+             logger.info(f"✅ Found project configuration file: {cfg_path}")
+             return root
+
+     # If not found in a subdirectory, search upward
      while True:
          cfg_file = os.path.join(path, "crawlo.cfg")
          if os.path.isfile(cfg_file):
@@ -128,17 +137,43 @@ def load_class(_path):
          raise TypeError(f"args expect str or object, got {_path}")
 
      module_name, class_name = _path.rsplit('.', 1)
-     module = import_module(module_name)
+
+     try:
+         module = import_module(module_name)
+     except ImportError as e:
+         # Try a different import approach
+         try:
+             # Try importing the full path directly
+             module = import_module(_path)
+             return module
+         except ImportError:
+             pass
+         raise ImportError(f"Cannot import module {module_name}: {e}")
 
      try:
          cls = getattr(module, class_name)
      except AttributeError:
-         raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
+         # Provide a more detailed error message
+         available_attrs = [attr for attr in dir(module) if not attr.startswith('_')]
+         raise NameError(f"Module {module_name!r} has no class named {class_name!r}. Available attributes: {available_attrs}")
      return cls
 
 
  def merge_settings(spider, settings):
      spider_name = getattr(spider, 'name', 'UnknownSpider')
+     # Check whether settings is a SettingManager instance
+     if not hasattr(settings, 'update_attributes'):
+         logger.error(f"merge_settings received a settings object that is not a SettingManager instance: {type(settings)}")
+         # If it is a dict, create a new SettingManager instance
+         if isinstance(settings, dict):
+             from crawlo.settings.setting_manager import SettingManager
+             new_settings = SettingManager()
+             new_settings.update_attributes(settings)
+             settings = new_settings
+         else:
+             logger.error("Unhandled settings type")
+             return
+
      if hasattr(spider, 'custom_settings'):
          custom_settings = getattr(spider, 'custom_settings')
          settings.update_attributes(custom_settings)
@@ -150,4 +185,4 @@ async def common_call(func: Callable, *args, **kwargs):
      if iscoroutinefunction(func):
          return await func(*args, **kwargs)
      else:
-         return func(*args, **kwargs)
+         return func(*args, **kwargs)
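load_class resolves a dotted path to a class object and, after this change, reports a richer error and tries a fallback import. A short usage sketch, reusing a dotted path that appears elsewhere in this diff:

    from crawlo.project import load_class

    # Resolves the dotted path to the class object.
    cls = load_class('crawlo.filters.aioredis_filter.AioRedisFilter')

    # A wrong class name now raises NameError listing the module's public attributes.
    try:
        load_class('crawlo.filters.aioredis_filter.NoSuchFilter')
    except NameError as e:
        print(e)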