crawlo 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.0.6"
+ __version__ = "1.0.8"
crawlo/commands/check.py ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:35
+ # @Author : crawl-coder
+ # @Desc : Command-line entry point: crawlo check; verifies that all spider definitions are compliant.
+ """
+ import sys
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     if args:
+         print("Usage: crawlo check")
+         return 1
+
+     try:
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1
+
+         if str(project_root) not in sys.path:
+             sys.path.insert(0, str(project_root))
+
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # Create a CrawlerProcess and discover spiders
+         process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+         spider_names = process.get_spider_names()
+
+         if not spider_names:
+             print("📭 No spiders found.")
+             return 1
+
+         print(f"🔍 Checking {len(spider_names)} spider(s)...")
+         print("-" * 60)
+
+         issues_found = False
+         for name in sorted(spider_names):
+             cls = process.get_spider_class(name)
+             issues = []
+
+             if not hasattr(cls, 'name') or not cls.name:
+                 issues.append("missing or empty 'name' attribute")
+             elif not isinstance(cls.name, str):
+                 issues.append("'name' is not a string")
+
+             if not callable(getattr(cls, 'start_requests', None)):
+                 issues.append("missing or non-callable 'start_requests' method")
+
+             if hasattr(cls, 'start_urls') and isinstance(cls.start_urls, str):
+                 issues.append("'start_urls' is a string, should be list/tuple")
+
+             # Instantiation check (lightweight)
+             try:
+                 spider = cls.create_instance(None)
+                 if not callable(getattr(spider, 'parse', None)):
+                     issues.append("no 'parse' method defined (optional but recommended)")
+             except Exception as e:
+                 issues.append(f"failed to create instance: {e}")
+
+             if issues:
+                 print(f"❌ {name:<20} {cls.__name__}")
+                 for issue in issues:
+                     print(f" • {issue}")
+                 issues_found = True
+             else:
+                 print(f"✅ {name:<20} {cls.__name__} (OK)")
+
+         print("-" * 60)
+         if issues_found:
+             print("⚠️ Some spiders have issues. Please fix them.")
+             return 1
+         else:
+             print("🎉 All spiders are compliant!")
+             return 0
+
+     except Exception as e:
+         print(f"❌ Error during check: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main(sys.argv[1:]))
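
For reference, a minimal spider that passes every rule the new check command enforces might look like the sketch below. The class name, spider name, and URL are placeholders, and it assumes the base crawlo Spider supplies a default start_requests built from start_urls (as the callable check implies) and the create_instance constructor used above.

    from crawlo.spider import Spider

    class ExampleSpider(Spider):
        # non-empty string -> passes the 'name' checks
        name = "example"
        # list/tuple rather than a bare string -> passes the 'start_urls' check
        start_urls = ["https://example.com"]

        def parse(self, response):
            # defining parse avoids the "optional but recommended" warning
            ...
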
crawlo/commands/list.py ADDED
@@ -0,0 +1,92 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:33
+ # @Author : crawl-coder
+ # @Desc : Command-line entry point: crawlo list; lists all registered spiders.
+ """
+ import sys
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     """
+     List all available spiders.
+     Usage: crawlo list
+     """
+     if args:
+         print("Usage: crawlo list")
+         return 1
+
+     try:
+         # 1. Get the project root directory
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1
+
+         # Add the project root to sys.path
+         project_root_str = str(project_root)
+         if project_root_str not in sys.path:
+             sys.path.insert(0, project_root_str)
+
+         # 2. Read crawlo.cfg to get the project package name
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # 3. Create a CrawlerProcess and auto-discover spiders
+         spider_modules = [f"{project_package}.spiders"]
+         process = CrawlerProcess(spider_modules=spider_modules)
+
+         # 4. Collect all spider names
+         spider_names = process.get_spider_names()
+         if not spider_names:
+             print("📭 No spiders found.")
+             print("💡 Make sure:")
+             print(" - Your spider classes inherit from `Spider`")
+             print(" - They define a `name` attribute")
+             print(" - The modules are imported (e.g. via __init__.py)")
+             return 1
+
+         # 5. Print the spider list
+         print(f"📋 Found {len(spider_names)} spider(s):")
+         print("-" * 50)
+         for name in sorted(spider_names):
+             cls = process.get_spider_class(name)
+             module = cls.__module__.replace(project_package + ".", "")  # shorten the module name
+             print(f"🕷️ {name:<20} {cls.__name__:<25} ({module})")
+         print("-" * 50)
+         return 0
+
+     except Exception as e:
+         print(f"❌ Error listing spiders: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ if __name__ == '__main__':
+     """
+     Allows running directly:
+         python -m crawlo.commands.list
+     """
+     sys.exit(main(sys.argv[1:]))
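
Both new commands discover spiders the same way: they read crawlo.cfg for a [settings] section with a default = <package>.settings entry, then hand <package>.spiders to CrawlerProcess. A minimal programmatic equivalent of the list command, using the placeholder package name "myproject", could look like this sketch:

    from crawlo.crawler import CrawlerProcess

    # "myproject" stands in for the package name normally derived from crawlo.cfg
    process = CrawlerProcess(spider_modules=["myproject.spiders"])
    for name in sorted(process.get_spider_names()):
        cls = process.get_spider_class(name)
        print(f"{name:<20} {cls.__name__:<25} ({cls.__module__})")
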
crawlo/commands/run.py CHANGED
@@ -1,14 +1,15 @@
- # crawlo/commands/run.py
+ """
+ Command-line entry point: crawlo run <spider_name>
+ Runs the spider with the given name.
+ """
+
  import asyncio
- import importlib
- import sys
  from pathlib import Path
  import configparser

  from crawlo.crawler import CrawlerProcess
  from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
- from crawlo.utils.spider_loader import SpiderLoader


  logger = get_logger(__name__)

@@ -16,24 +17,30 @@ logger = get_logger(__name__)
  def main(args):
      """
      Main function for running the specified spider
-     Usage: crawlo run <spider_name>
+     Usage:
+         crawlo run <spider_name>
+         crawlo run all
      """
      if len(args) < 1:
-         print("Usage: crawlo run <spider_name>")
-         print("Example: crawlo run baidu")
+         print("Usage: crawlo run <spider_name>|all")
+         print("Examples:")
+         print(" crawlo run baidu")
+         print(" crawlo run all")
          return 1

-     spider_name = args[0]
+     spider_arg = args[0]

      try:
          # 1. Get the project root directory
-         project_root = get_settings()
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1

-         # Add the project root to the Python path
          if str(project_root) not in sys.path:
              sys.path.insert(0, str(project_root))

-         # 2. Read the config file to get the project package name
+         # 2. Read crawlo.cfg to get the project package name
          cfg_file = project_root / 'crawlo.cfg'
          if not cfg_file.exists():
              print(f"❌ Error: crawlo.cfg not found in {project_root}")
@@ -49,27 +56,60 @@ def main(args):
          settings_module = config.get('settings', 'default')
          project_package = settings_module.split('.')[0]

-         # 3. Find and load the Spider with the given name
-         spider_class = find_spider_by_name(project_package, spider_name)
-         if spider_class is None:
+         # 3. Create a CrawlerProcess and auto-discover spider modules
+         spider_modules = [f"{project_package}.spiders"]
+         settings = get_settings()
+         process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+         # === New: support 'all' ===
+         if spider_arg.lower() == "all":
+             spider_names = process.get_spider_names()
+             if not spider_names:
+                 print("❌ No spiders found. Make sure spiders are defined and imported.")
+                 return 1
+
+             print(f"🚀 Starting ALL {len(spider_names)} spiders:")
+             for name in sorted(spider_names):
+                 cls = process.get_spider_class(name)
+                 print(f" 🕷️ {name} ({cls.__name__})")
+             print("-" * 50)
+
+             # Start all spiders
+             asyncio.run(process.crawl(spider_names))
+             return 0
+
+         # === Existing: start a single spider ===
+         spider_name = spider_arg
+         if not process.is_spider_registered(spider_name):
+             print(f"❌ Error: Spider with name '{spider_name}' not found.")
+             available_names = process.get_spider_names()
+             if available_names:
+                 print("💡 Available spiders:")
+                 for name in sorted(available_names):
+                     cls = process.get_spider_class(name)
+                     print(f" - {name} (class: {cls.__name__})")
+             else:
+                 print("💡 No spiders found. Make sure your spider classes are defined and imported.")
              return 1

-         # 4. Create a CrawlerProcess and run a single spider
-         settings = get_settings()
-         process = CrawlerProcess(settings)
+         spider_class = process.get_spider_class(spider_name)

-         print(f"🚀 Starting spider: {spider_class.name}")
+         # Print startup info
+         print(f"🚀 Starting spider: {spider_name}")
          print(f"📁 Project: {project_package}")
          print(f"🕷️ Class: {spider_class.__name__}")
          print("-" * 50)

-         # Run a single spider
-         asyncio.run(process.crawl(spider_class))
+         # Start the spider
+         asyncio.run(process.crawl(spider_name))

          print("-" * 50)
          print("✅ Spider completed successfully!")
          return 0

+     except KeyboardInterrupt:
+         print("\n⚠️ Spider interrupted by user.")
+         return 1
      except Exception as e:
          print(f"❌ Error running spider: {e}")
          import traceback
@@ -77,73 +117,65 @@ def main(args):
          return 1


- def find_spider_by_name(project_package: str, target_spider_name: str):
-     """Find a spider using SpiderLoader"""
-     loader = SpiderLoader(project_package)
-     spider_class = loader.load(target_spider_name)
-
-     if spider_class is None:
-         print(f"❌ Error: Spider with name '{target_spider_name}' not found")
-         print("💡 Available spiders:")
-         available_spiders = loader.list()
-         for spider_name in available_spiders:
-             print(f" - {spider_name}")
-         return None
-
-     return spider_class
-
-
  def list_available_spiders(project_package: str):
      """
-     List all available spiders
+     List all available spiders in the given project package (for debugging or CLI extensions)
      """
-     spiders_dir = Path.cwd() / project_package / 'spiders'
-     if not spiders_dir.exists():
-         print(" No spiders directory found")
-         return
-
-     spider_count = 0
-     for py_file in spiders_dir.glob("*.py"):
-         if py_file.name.startswith('_'):
-             continue
-
-         module_name = py_file.stem
-         spider_module_path = f"{project_package}.spiders.{module_name}"
-
-         try:
-             module = importlib.import_module(spider_module_path)
-         except ImportError:
-             continue
-
-         # Find all Spider subclasses in the module
-         from crawlo.spider import Spider
-         for attr_name in dir(module):
-             attr_value = getattr(module, attr_name)
-             if (isinstance(attr_value, type) and
-                     issubclass(attr_value, Spider) and
-                     attr_value != Spider and
-                     hasattr(attr_value, 'name')):
-                 print(f" - {attr_value.name} (class: {attr_value.__name__}, module: {module_name})")
-                 spider_count += 1
-
-     if spider_count == 0:
-         print(" No spiders found")
-
-
- def run_spider_by_name(spider_name: str, project_root: Path = None):
+     try:
+         # Temporarily create a CrawlerProcess to discover spiders
+         process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+         available_names = process.get_spider_names()
+
+         if not available_names:
+             print(" No spiders found. Make sure:")
+             print(" - the spiders/ directory exists")
+             print(" - spider classes inherit from Spider and define a name")
+             print(" - the modules are imported (can be triggered via __init__.py)")
+             return
+
+         print(f"Found {len(available_names)} spider(s):")
+         for name in sorted(available_names):
+             cls = process.get_spider_class(name)
+             module = cls.__module__.replace(project_package + ".", "")
+             print(f" - {name} ({cls.__name__} @ {module})")
+     except Exception as e:
+         print(f"❌ Failed to list spiders: {e}")
+         import traceback
+         traceback.print_exc()
+
+
+ def run_spider_by_name(spider_name: str, project_package: str = None):
      """
-     Run a spider by name directly from code
+     Run a spider directly from code (project_package must be provided)
      """
-     if project_root:
-         if str(project_root) not in sys.path:
-             sys.path.insert(0, str(project_root))
+     if project_package is None:
+         # Try to read it from the config file
+         cfg_file = Path('crawlo.cfg')
+         if cfg_file.exists():
+             config = configparser.ConfigParser()
+             config.read(cfg_file, encoding='utf-8')
+             if config.has_option('settings', 'default'):
+                 project_package = config.get('settings', 'default').split('.')[0]
+
+     if not project_package:
+         print("❌ Error: project_package is required.")
+         return 1
+
+     # Add the project path
+     project_root = get_settings().get('PROJECT_ROOT')
+     if project_root and str(project_root) not in sys.path:
+         sys.path.insert(0, str(project_root))

+     # Reuse the logic in main()
      args = [spider_name]
      return main(args)


  if __name__ == '__main__':
-     # Allows running directly: python -m crawlo.commands.run <spider_name>
+     """
+     Allows running directly:
+         python -m crawlo.commands.run <spider_name>
+     """
      import sys

-     sys.exit(main(sys.argv[1:]))
+     sys.exit(main(sys.argv[1:]))
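
The reworked run command now passes spider names, not classes, to process.crawl() inside asyncio.run(). A hedged sketch of the same flow from user code, assuming the project root is already on sys.path and using the placeholder names "myproject" and "baidu":

    import asyncio

    from crawlo.crawler import CrawlerProcess
    from crawlo.utils.project import get_settings

    process = CrawlerProcess(settings=get_settings(),
                             spider_modules=["myproject.spiders"])

    # single spider, as in `crawlo run baidu`
    asyncio.run(process.crawl("baidu"))
    # or every registered spider, as in `crawlo run all`:
    # asyncio.run(process.crawl(process.get_spider_names()))
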
crawlo/commands/stats.py ADDED
@@ -0,0 +1,59 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : Command-line entry point: crawlo stats; shows statistics from recent spider runs.
+ """
+ import sys
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+ # Stats recorded for recently run spiders (example)
+ _LAST_RUN_STATS = {}
+
+
+ def record_stats(crawler):
+     """Record stats after the spider closes (must be called from close)"""
+     if crawler.stats and crawler.spider:
+         _LAST_RUN_STATS[crawler.spider.name] = crawler.stats.get_stats()
+
+
+ def main(args):
+     if len(args) == 0:
+         # Show all recorded stats
+         if not _LAST_RUN_STATS:
+             print("📊 No stats available. Run a spider first.")
+             return 0
+
+         print("📊 Recent Spider Statistics:")
+         print("-" * 60)
+         for spider_name, stats in _LAST_RUN_STATS.items():
+             print(f"🕷️ {spider_name}")
+             for k, v in stats.items():
+                 print(f" {k:<30} {v}")
+             print()
+         return 0
+
+     elif len(args) == 1:
+         spider_name = args[0]
+         if spider_name not in _LAST_RUN_STATS:
+             print(f"📊 No stats found for spider '{spider_name}'")
+             return 1
+
+         stats = _LAST_RUN_STATS[spider_name]
+         print(f"📊 Stats for '{spider_name}':")
+         print("-" * 60)
+         for k, v in stats.items():
+             print(f" {k:<30} {v}")
+         return 0
+
+     else:
+         print("Usage: crawlo stats [spider_name]")
+         return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main(sys.argv[1:]))
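
Note that _LAST_RUN_STATS lives only in this module's memory, so stats are visible only to the process that called record_stats. A self-contained sketch of that flow is below; the stand-in objects merely mimic the .spider.name and .stats.get_stats() attributes the command reads, and the crawlo.commands.stats module path is assumed from the sibling commands:

    from types import SimpleNamespace

    from crawlo.commands import stats as stats_cmd

    fake_crawler = SimpleNamespace(
        spider=SimpleNamespace(name="example"),
        stats=SimpleNamespace(get_stats=lambda: {"requests": 42, "items_scraped": 10}),
    )
    stats_cmd.record_stats(fake_crawler)  # normally called when the crawler closes
    stats_cmd.main(["example"])           # prints the recorded stats for 'example'
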