crawlo 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.0.8"
+ __version__ = "1.1.0"
crawlo/commands/__init__.py CHANGED
@@ -1,9 +1,13 @@
- # crawlo/commands/__init__.py
- # Define the available commands
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+
  _commands = {
  'startproject': 'crawlo.commands.startproject',
  'genspider': 'crawlo.commands.genspider',
  'run': 'crawlo.commands.run',
+ 'check': 'crawlo.commands.check',
+ 'list': 'crawlo.commands.list',
+ 'stats': 'crawlo.commands.stats'
  }
 
  def get_commands():
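The new check, list, and stats entries extend this registry, which maps sub-command names to module paths. A minimal sketch of how such a registry is typically resolved is shown below; the dispatch() helper is a hypothetical illustration, not crawlo's actual CLI wiring:

from importlib import import_module

_commands = {
    'startproject': 'crawlo.commands.startproject',
    'genspider': 'crawlo.commands.genspider',
    'run': 'crawlo.commands.run',
    'check': 'crawlo.commands.check',
    'list': 'crawlo.commands.list',
    'stats': 'crawlo.commands.stats',
}

def dispatch(cmd_name, argv):
    # Resolve the registered module path and call its main(args) entry point.
    module_path = _commands.get(cmd_name)
    if module_path is None:
        raise SystemExit(f"Unknown command: {cmd_name!r}. Known: {', '.join(sorted(_commands))}")
    return import_module(module_path).main(argv)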
crawlo/commands/check.py CHANGED
@@ -1,85 +1,130 @@
  #!/usr/bin/python
- # -*- coding:UTF-8 -*-
+ # -*- coding: UTF-8 -*-
  """
- # @Time : 2025-08-31 22:35
- # @Author : crawl-coder
- # @Desc : CLI entry point: crawlo check, verifies that all spider definitions are compliant.
+ # @Time : 2025-08-31 22:35
+ # @Author : crawl-coder
+ # @Desc : CLI entry point: crawlo check, verifies that all spider definitions are compliant.
  """
+
  import sys
  import configparser
+ from pathlib import Path
+ from importlib import import_module
 
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
 
 
  logger = get_logger(__name__)
 
 
+ def get_project_root():
+ """
+ Walk upwards from the current directory looking for crawlo.cfg to determine the project root.
+ """
+ current = Path.cwd()
+
+ for _ in range(10):
+ cfg = current / "crawlo.cfg"
+ if cfg.exists():
+ return current
+
+ if current == current.parent:
+ break
+ current = current.parent
+
+ return None
+
+
  def main(args):
+ """
+ Main entry point: check all spider definitions for compliance.
+ Usage: crawlo check
+ """
  if args:
- print("Usage: crawlo check")
+ print("Usage: crawlo check")
  return 1
 
  try:
- project_root = get_settings().get('PROJECT_ROOT')
+ # 1. Locate the project root
+ project_root = get_project_root()
  if not project_root:
- print("❌ Error: Cannot determine project root.")
+ print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+ print("💡 Tip: Run this command inside your project directory.")
  return 1
 
- if str(project_root) not in sys.path:
- sys.path.insert(0, str(project_root))
+ project_root_str = str(project_root)
+ if project_root_str not in sys.path:
+ sys.path.insert(0, project_root_str)
 
- cfg_file = project_root / 'crawlo.cfg'
+ # 2. Read crawlo.cfg
+ cfg_file = project_root / "crawlo.cfg"
  if not cfg_file.exists():
- print(f"❌ Error: crawlo.cfg not found in {project_root}")
+ print(f"❌ Error: Expected config file not found: {cfg_file}")
  return 1
 
  config = configparser.ConfigParser()
- config.read(cfg_file, encoding='utf-8')
+ config.read(cfg_file, encoding="utf-8")
 
- if not config.has_section('settings') or not config.has_option('settings', 'default'):
+ if not config.has_section("settings") or not config.has_option("settings", "default"):
  print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
  return 1
 
- settings_module = config.get('settings', 'default')
- project_package = settings_module.split('.')[0]
+ settings_module = config.get("settings", "default")
+ project_package = settings_module.split(".")[0]
+
+ # 3. Make sure the project package is importable
+ try:
+ import_module(project_package)
+ except ImportError as e:
+ print(f"❌ Failed to import project package '{project_package}': {e}")
+ return 1
 
- # Create a CrawlerProcess and discover spiders
- process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+ # 4. Load the spiders
+ spider_modules = [f"{project_package}.spiders"]
+ process = CrawlerProcess(spider_modules=spider_modules)
  spider_names = process.get_spider_names()
 
  if not spider_names:
  print("📭 No spiders found.")
+ print("💡 Make sure:")
+ print(" • Spiders are defined in the 'spiders' module")
+ print(" • They have a `name` attribute")
+ print(" • Modules are properly imported")
  return 1
 
  print(f"🔍 Checking {len(spider_names)} spider(s)...")
  print("-" * 60)
 
  issues_found = False
+
  for name in sorted(spider_names):
  cls = process.get_spider_class(name)
  issues = []
 
- if not hasattr(cls, 'name') or not cls.name:
+ # Check the name attribute
+ if not getattr(cls, "name", None):
  issues.append("missing or empty 'name' attribute")
  elif not isinstance(cls.name, str):
  issues.append("'name' is not a string")
 
- if not callable(getattr(cls, 'start_requests', None)):
+ # Check that start_requests is callable
+ if not callable(getattr(cls, "start_requests", None)):
  issues.append("missing or non-callable 'start_requests' method")
 
- if hasattr(cls, 'start_urls') and isinstance(cls.start_urls, str):
- issues.append("'start_urls' is a string, should be list/tuple")
+ # Check the type of start_urls (must not be a string)
+ if hasattr(cls, "start_urls") and isinstance(cls.start_urls, str):
+ issues.append("'start_urls' is a string; should be list or tuple")
 
- # Lightweight instantiation check
+ # Instantiate and check the parse method (optional but recommended)
  try:
  spider = cls.create_instance(None)
- if not callable(getattr(spider, 'parse', None)):
- issues.append("no 'parse' method defined (optional but recommended)")
+ if not callable(getattr(spider, "parse", None)):
+ issues.append("no 'parse' method defined (recommended)")
  except Exception as e:
- issues.append(f"failed to create instance: {e}")
+ issues.append(f"failed to instantiate spider: {e}")
 
+ # Report the results
  if issues:
  print(f"❌ {name:<20} {cls.__name__}")
  for issue in issues:
@@ -89,19 +134,23 @@ def main(args):
  print(f"✅ {name:<20} {cls.__name__} (OK)")
 
  print("-" * 60)
+
  if issues_found:
  print("⚠️ Some spiders have issues. Please fix them.")
  return 1
  else:
- print("🎉 All spiders are compliant!")
+ print("🎉 All spiders are compliant and well-defined!")
  return 0
 
  except Exception as e:
- print(f"❌ Error during check: {e}")
- import traceback
- traceback.print_exc()
+ print(f"❌ Unexpected error during check: {e}")
+ logger.exception("Exception in 'crawlo check'")
  return 1
 
 
- if __name__ == '__main__':
+ if __name__ == "__main__":
+ """
+ Supports direct execution:
+ python -m crawlo.commands.check
+ """
  sys.exit(main(sys.argv[1:]))
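For reference, a spider shaped like the sketch below would satisfy every check performed above. It is a hypothetical example: the Spider base-class import path is assumed from the hint printed by the new 'crawlo list' command, and how requests are actually built is framework specific, so the sketch defers to the base implementation:

from crawlo.spider import Spider  # base-class path assumed from the hint printed by 'crawlo list'


class ExampleSpider(Spider):
    """A spider shaped to satisfy every check performed by 'crawlo check'."""

    name = "example"                      # required: non-empty string
    start_urls = ["https://example.com"]  # must be a list/tuple, never a bare string

    def start_requests(self):
        # must exist and be callable; request construction is framework specific,
        # so this sketch simply defers to the base-class behaviour
        return super().start_requests()

    def parse(self, response):
        # optional, but 'crawlo check' flags its absence as a recommendation
        yield {"url": response.url}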
crawlo/commands/list.py CHANGED
@@ -1,92 +1,119 @@
  #!/usr/bin/python
- # -*- coding:UTF-8 -*-
+ # -*- coding: UTF-8 -*-
  """
- # @Time : 2025-08-31 22:33
- # @Author : crawl-coder
- # @Desc : CLI entry point: crawlo list, lists all registered spiders
+ # @Time : 2025-08-31 22:33
+ # @Author : crawl-coder
+ # @Desc : CLI entry point: crawlo list, lists all registered spiders
  """
+
  import sys
  import configparser
+ from pathlib import Path
+ from importlib import import_module
 
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
 
 
  logger = get_logger(__name__)
 
 
+ def get_project_root():
+ """
+ Auto-detect the project root: walk upwards from the current directory looking for crawlo.cfg.
+ Returns the directory path (as a string) once found; searches at most 10 levels up.
+ """
+ current = Path.cwd()
+
+ for _ in range(10):
+ cfg = current / "crawlo.cfg"
+ if cfg.exists():
+ return str(current)
+
+ # Reached the filesystem root
+ if current == current.parent:
+ break
+ current = current.parent
+
+ return None # not found
+
+
  def main(args):
  """
- List all available spiders
+ Main entry point: list all available spiders
  Usage: crawlo list
  """
  if args:
- print("Usage: crawlo list")
+ print("Usage: crawlo list")
  return 1
 
  try:
- # 1. Get the project root
- project_root = get_settings().get('PROJECT_ROOT')
+ # 1. Locate the project root
+ project_root = get_project_root()
  if not project_root:
- print("❌ Error: Cannot determine project root.")
+ print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+ print("💡 Tip: Run this command inside your project directory, or create a project with 'crawlo startproject'.")
  return 1
 
- # Add the project root to sys.path
- project_root_str = str(project_root)
+ project_root_path = Path(project_root)
+ project_root_str = str(project_root_path)
+
+ # 2. Add the project root to the Python path so project modules can be imported
  if project_root_str not in sys.path:
  sys.path.insert(0, project_root_str)
 
- # 2. Read crawlo.cfg to get the project package name
- cfg_file = project_root / 'crawlo.cfg'
- if not cfg_file.exists():
- print(f"❌ Error: crawlo.cfg not found in {project_root}")
- return 1
-
+ # 3. Read crawlo.cfg to get the settings module
+ cfg_file = project_root_path / "crawlo.cfg"
  config = configparser.ConfigParser()
- config.read(cfg_file, encoding='utf-8')
+ config.read(cfg_file, encoding="utf-8")
 
- if not config.has_section('settings') or not config.has_option('settings', 'default'):
- print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+ if not config.has_section("settings") or not config.has_option("settings", "default"):
+ print("❌ Error: Invalid crawlo.cfg — missing [settings] or 'default' option.")
  return 1
 
- settings_module = config.get('settings', 'default')
- project_package = settings_module.split('.')[0]
+ settings_module = config.get("settings", "default")
+ project_package = settings_module.split(".")[0]
+
+ # 4. Make sure the project package is importable (optional: import it to surface errors early)
+ try:
+ import_module(project_package)
+ except ImportError as e:
+ print(f"❌ Failed to import project package '{project_package}': {e}")
+ return 1
 
- # 3. Create a CrawlerProcess and auto-discover spiders
+ # 5. Initialize CrawlerProcess and load the spider modules
  spider_modules = [f"{project_package}.spiders"]
  process = CrawlerProcess(spider_modules=spider_modules)
 
- # 4. Collect information about all spiders
+ # 6. Get all spider names
  spider_names = process.get_spider_names()
  if not spider_names:
- print("📭 No spiders found.")
+ print("📭 No spiders found in 'spiders/' directory.")
  print("💡 Make sure:")
- print(" - Your spider classes inherit from `Spider`")
- print(" - They define a `name` attribute")
- print(" - The modules are imported (e.g. via __init__.py)")
+ print(" Spider classes inherit from `crawlo.spider.Spider`")
+ print(" Each spider has a `name` attribute")
+ print(" Spiders are imported in `spiders/__init__.py` (if using package)")
  return 1
 
- # 5. Print the spider list
+ # 7. Print the spider list
  print(f"📋 Found {len(spider_names)} spider(s):")
- print("-" * 50)
+ print("-" * 60)
  for name in sorted(spider_names):
- cls = process.get_spider_class(name)
- module = cls.__module__.replace(project_package + ".", "") # shorten the module name
- print(f"🕷️ {name:<20} {cls.__name__:<25} ({module})")
- print("-" * 50)
+ spider_cls = process.get_spider_class(name)
+ module_name = spider_cls.__module__.replace(f"{project_package}.", "")
+ print(f"🕷️ {name:<20} {spider_cls.__name__:<25} ({module_name})")
+ print("-" * 60)
  return 0
 
  except Exception as e:
- print(f"❌ Error listing spiders: {e}")
- import traceback
- traceback.print_exc()
+ print(f"❌ Unexpected error: {e}")
+ logger.exception("Exception during 'crawlo list'")
  return 1
 
 
- if __name__ == '__main__':
+ if __name__ == "__main__":
  """
- Can be run directly:
+ Supports direct execution:
  python -m crawlo.commands.list
  """
  sys.exit(main(sys.argv[1:]))
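The new commands all rely on the same project discovery: walk up the directory tree until crawlo.cfg is found, then read its [settings] section and import the package named there. A project that satisfies this lookup would be laid out roughly as follows (the package name myproject is a placeholder, not something defined in this diff):

# crawlo.cfg, located at the project root
[settings]
default = myproject.settings

# matching package layout assumed by the commands above:
# myproject/
#     __init__.py
#     settings.py
#     spiders/
#         __init__.py
#         example.py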
crawlo/commands/run.py CHANGED
@@ -1,106 +1,153 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
  """
- CLI entry point: crawlo run <spider_name>
- Runs the spider with the given name.
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : CLI entry point: crawlo run <spider_name>|all, runs the specified spider(s).
  """
-
+ import sys
  import asyncio
- from pathlib import Path
  import configparser
+ from pathlib import Path
+ from importlib import import_module
 
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
+ from crawlo.utils.project import get_settings
+ from crawlo.commands.stats import record_stats # automatically record stats
 
  logger = get_logger(__name__)
 
 
+ def get_project_root():
+ """
+ Walk upwards looking for crawlo.cfg to determine the project root.
+ """
+ current = Path.cwd()
+
+ for _ in range(10):
+ cfg = current / "crawlo.cfg"
+ if cfg.exists():
+ return current
+
+ if current == current.parent:
+ break
+ current = current.parent
+
+ return None
+
+
  def main(args):
  """
- Main entry point for running the specified spider
+ Main entry point: run the specified spider
  Usage:
  crawlo run <spider_name>
  crawlo run all
  """
  if len(args) < 1:
- print("Usage: crawlo run <spider_name>|all")
- print("Examples:")
- print(" crawlo run baidu")
- print(" crawlo run all")
+ print("Usage: crawlo run <spider_name>|all")
+ print("💡 Examples:")
+ print(" crawlo run baidu")
+ print(" crawlo run all")
  return 1
 
  spider_arg = args[0]
 
  try:
- # 1. Get the project root
- project_root = get_settings().get('PROJECT_ROOT')
+ # 1. Locate the project root
+ project_root = get_project_root()
  if not project_root:
- print("❌ Error: Cannot determine project root.")
+ print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+ print("💡 Tip: Run this command inside your project directory.")
  return 1
 
- if str(project_root) not in sys.path:
- sys.path.insert(0, str(project_root))
+ project_root_str = str(project_root)
+ if project_root_str not in sys.path:
+ sys.path.insert(0, project_root_str)
 
- # 2. Read crawlo.cfg to get the project package name
- cfg_file = project_root / 'crawlo.cfg'
+ # 2. Read crawlo.cfg to get the settings module
+ cfg_file = project_root / "crawlo.cfg"
  if not cfg_file.exists():
  print(f"❌ Error: crawlo.cfg not found in {project_root}")
  return 1
 
  config = configparser.ConfigParser()
- config.read(cfg_file, encoding='utf-8')
+ config.read(cfg_file, encoding="utf-8")
 
- if not config.has_section('settings') or not config.has_option('settings', 'default'):
+ if not config.has_section("settings") or not config.has_option("settings", "default"):
  print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
  return 1
 
- settings_module = config.get('settings', 'default')
- project_package = settings_module.split('.')[0]
+ settings_module = config.get("settings", "default")
+ project_package = settings_module.split(".")[0]
 
- # 3. Create a CrawlerProcess and auto-discover the spider modules
+ # 3. Make sure the project package is importable
+ try:
+ import_module(project_package)
+ except ImportError as e:
+ print(f"❌ Failed to import project package '{project_package}': {e}")
+ return 1
+
+ # 4. Load settings and the spider modules
+ settings = get_settings() # safe at this point
  spider_modules = [f"{project_package}.spiders"]
- settings = get_settings()
  process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
 
- # === New: support 'all' ===
+ # === Case 1: run all spiders ===
  if spider_arg.lower() == "all":
  spider_names = process.get_spider_names()
  if not spider_names:
- print("❌ No spiders found. Make sure spiders are defined and imported.")
+ print("❌ No spiders found.")
+ print("💡 Make sure:")
+ print(" • Spiders are defined in 'spiders/'")
+ print(" • They have a `name` attribute")
+ print(" • Modules are imported (e.g. via __init__.py)")
  return 1
 
- print(f"🚀 Starting ALL {len(spider_names)} spiders:")
+ print(f"🚀 Starting ALL {len(spider_names)} spider(s):")
+ print("-" * 60)
  for name in sorted(spider_names):
  cls = process.get_spider_class(name)
- print(f" 🕷️ {name} ({cls.__name__})")
- print("-" * 50)
+ print(f"🕷️ {name:<20} {cls.__name__}")
+ print("-" * 60)
+
+ # Register stats recording (saved when each spider finishes)
+ for crawler in process.crawlers:
+ crawler.signals.connect(record_stats, signal="spider_closed")
 
- # Start all spiders
+ # Run all spiders in parallel (switch to sequential with: for name in ... await process.crawl(name))
  asyncio.run(process.crawl(spider_names))
+ print("✅ All spiders completed.")
  return 0
 
- # === Existing: start a single spider ===
+ # === Case 2: run a single spider ===
  spider_name = spider_arg
  if not process.is_spider_registered(spider_name):
- print(f"❌ Error: Spider with name '{spider_name}' not found.")
- available_names = process.get_spider_names()
- if available_names:
+ print(f"❌ Spider '{spider_name}' not found.")
+ available = process.get_spider_names()
+ if available:
  print("💡 Available spiders:")
- for name in sorted(available_names):
+ for name in sorted(available):
  cls = process.get_spider_class(name)
- print(f" - {name} (class: {cls.__name__})")
+ print(f" {name} ({cls.__name__})")
  else:
- print("💡 No spiders found. Make sure your spider classes are defined and imported.")
+ print("💡 No spiders found. Check your spiders module.")
  return 1
 
  spider_class = process.get_spider_class(spider_name)
 
  # Print startup info
  print(f"🚀 Starting spider: {spider_name}")
- print(f"📁 Project: {project_package}")
- print(f"🕷️ Class: {spider_class.__name__}")
+ print(f"📦 Project: {project_package}")
+ print(f"CppClass: {spider_class.__name__}")
+ print(f"📄 Module: {spider_class.__module__}")
  print("-" * 50)
 
- # Start the spider
+ # Register stats recording
+ for crawler in process.crawlers:
+ crawler.signals.connect(record_stats, signal="spider_closed")
+
+ # Run the spider
  asyncio.run(process.crawl(spider_name))
 
  print("-" * 50)
@@ -111,71 +158,14 @@ def main(args):
  print("\n⚠️ Spider interrupted by user.")
  return 1
  except Exception as e:
- print(f"❌ Error running spider: {e}")
- import traceback
- traceback.print_exc()
+ print(f"❌ Unexpected error: {e}")
+ logger.exception("Exception during 'crawlo run'")
  return 1
 
 
- def list_available_spiders(project_package: str):
- """
- List all available spiders in the given project package (for debugging or CLI extensions)
- """
- try:
- # Temporarily create a CrawlerProcess to discover spiders
- process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
- available_names = process.get_spider_names()
-
- if not available_names:
- print(" No spiders found. Make sure:")
- print(" - the spiders/ directory exists")
- print(" - spider classes inherit from Spider and define a name")
- print(" - modules are imported (can be triggered via __init__.py)")
- return
-
- print(f"Found {len(available_names)} spider(s):")
- for name in sorted(available_names):
- cls = process.get_spider_class(name)
- module = cls.__module__.replace(project_package + ".", "")
- print(f" - {name} ({cls.__name__} @ {module})")
- except Exception as e:
- print(f"❌ Failed to list spiders: {e}")
- import traceback
- traceback.print_exc()
-
-
- def run_spider_by_name(spider_name: str, project_package: str = None):
+ if __name__ == "__main__":
  """
- Run a spider directly from code (project_package must be provided)
+ Supports direct execution:
+ python -m crawlo.commands.run spider_name
  """
- if project_package is None:
- # Try to read it from the config
- cfg_file = Path('crawlo.cfg')
- if cfg_file.exists():
- config = configparser.ConfigParser()
- config.read(cfg_file, encoding='utf-8')
- if config.has_option('settings', 'default'):
- project_package = config.get('settings', 'default').split('.')[0]
-
- if not project_package:
- print("❌ Error: project_package is required.")
- return 1
-
- # Add the project path
- project_root = get_settings().get('PROJECT_ROOT')
- if project_root and str(project_root) not in sys.path:
- sys.path.insert(0, str(project_root))
-
- # Reuse the main() logic
- args = [spider_name]
- return main(args)
-
-
- if __name__ == '__main__':
- """
- Can be run directly:
- python -m crawlo.commands.run <spider_name>
- """
- import sys
-
- sys.exit(main(sys.argv[1:]))
+ sys.exit(main(sys.argv[1:]))
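check.py, list.py, and run.py now each carry their own copy of get_project_root() (the list command returns a string, the other two return a Path). If that duplication were ever factored out, a shared helper could look like the sketch below; the module location, name, and signature are assumptions for illustration, not something this release ships:

from pathlib import Path
from typing import Optional


def find_project_root(start: Optional[Path] = None, max_levels: int = 10) -> Optional[Path]:
    """Walk upwards from 'start' (default: cwd) looking for crawlo.cfg; return its directory or None."""
    current = (start or Path.cwd()).resolve()
    for _ in range(max_levels):
        if (current / "crawlo.cfg").exists():
            return current
        if current == current.parent:  # reached the filesystem root
            return None
        current = current.parent
    return None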
crawlo/commands/stats.py CHANGED
@@ -1,59 +1,167 @@
  #!/usr/bin/python
- # -*- coding:UTF-8 -*-
+ # -*- coding: UTF-8 -*-
  """
- # @Time : 2025-08-31 22:36
- # @Author : crawl-coder
- # @Desc : CLI entry point: crawlo stats, view statistics from recent spider runs.
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : CLI entry point: crawlo stats, view statistics from recent spider runs.
  """
+
  import sys
+ import json
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Dict, Any
+
  from crawlo.utils.log import get_logger
 
 
  logger = get_logger(__name__)
 
- # Stats from the most recently run spiders (example)
- _LAST_RUN_STATS = {}
+ # Default storage directory (relative to the project root)
+ STATS_DIR = "logs/stats"
+
+
+ def get_stats_dir() -> Path:
+ """
+ Get the directory where stats files are stored, preferring logs/stats/ under the project root.
+ Falls back to the current directory when not inside a project.
+ """
+ # Try to locate the project root (via crawlo.cfg)
+ current = Path.cwd()
+ for _ in range(10):
+ if (current / "crawlo.cfg").exists():
+ return current / STATS_DIR
+ if current == current.parent:
+ break
+ current = current.parent
+
+ # Fallback: use logs/stats under the current directory
+ return Path.cwd() / STATS_DIR
 
 
  def record_stats(crawler):
- """Record stats after the spider closes (must be called from close)."""
- if crawler.stats and crawler.spider:
- _LAST_RUN_STATS[crawler.spider.name] = crawler.stats.get_stats()
+ """
+ [Called from the running crawler] Write the spider's final stats to a JSON file.
+ Must be called from the Crawler's closed callback.
+ """
+ spider_name = getattr(crawler.spider, "name", "unknown")
+ stats = crawler.stats.get_stats() if crawler.stats else {}
+
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ stats_dir = Path(get_stats_dir())
+ stats_dir.mkdir(parents=True, exist_ok=True)
+
+ filename = stats_dir / f"{spider_name}_{timestamp}.json"
+ try:
+ with open(filename, "w", encoding="utf-8") as f:
+ json.dump({
+ "spider": spider_name,
+ "timestamp": datetime.now().isoformat(),
+ "stats": stats
+ }, f, ensure_ascii=False, indent=2, default=str)
+ logger.info(f"📊 Stats saved for spider '{spider_name}' → {filename}")
+ except Exception as e:
+ logger.error(f"Failed to save stats for '{spider_name}': {e}")
+
+
+ def load_all_stats() -> Dict[str, list]:
+ """
+ Load all saved stats files, grouped by spider name.
+ Returns: {spider_name: [stats_record, ...]}
+ """
+ stats_dir = get_stats_dir()
+ if not stats_dir.exists():
+ return {}
+
+ result = {}
+ json_files = sorted(stats_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True)
+
+ for file in json_files:
+ try:
+ with open(file, "r", encoding="utf-8") as f:
+ data = json.load(f)
+ spider_name = data.get("spider", "unknown")
+ result.setdefault(spider_name, []).append(data)
+ except Exception as e:
+ logger.warning(f"Failed to load stats file {file}: {e}")
+ return result
+
+
+ def format_value(v: Any) -> str:
+ """Format a value so it is not overly long or unprintable."""
+ if isinstance(v, float):
+ return f"{v:.4f}"
+ return str(v)
 
 
  def main(args):
- if len(args) == 0:
- # Show all historical stats
- if not _LAST_RUN_STATS:
- print("📊 No stats available. Run a spider first.")
- return 0
+ """
+ Main entry point: view statistics
+ Usage:
+ crawlo stats                  → show the last run of every spider
+ crawlo stats myspider         → show the last run of the given spider
+ crawlo stats myspider --all   → show every recorded run of the given spider
+ """
+ if len(args) > 2:
+ print("Usage: crawlo stats [spider_name] [--all]")
+ return 1
+
+ spider_name = None
+ show_all = False
+
+ if args:
+ spider_name = args[0]
+ show_all = "--all" in args or "-a" in args
+
+ # Load all stats
+ all_stats = load_all_stats()
+ if not all_stats:
+ print("📊 No stats found. Run a spider first.")
+ print(f"💡 Stats are saved in: {get_stats_dir()}")
+ return 0
 
- print("📊 Recent Spider Statistics:")
+ if not spider_name:
+ # Show the last run of each spider
+ print("📊 Recent Spider Statistics (last run):")
  print("-" * 60)
- for spider_name, stats in _LAST_RUN_STATS.items():
- print(f"🕷️ {spider_name}")
- for k, v in stats.items():
- print(f" {k:<30} {v}")
+ for name, runs in all_stats.items():
+ latest = runs[0]
+ print(f"🕷️ {name} ({latest['timestamp'][:19]})")
+ stats = latest["stats"]
+ for k in sorted(stats.keys()):
+ print(f" {k:<30} {format_value(stats[k])}")
  print()
  return 0
 
- elif len(args) == 1:
- spider_name = args[0]
- if spider_name not in _LAST_RUN_STATS:
+ else:
+ # Show the specified spider
+ if spider_name not in all_stats:
  print(f"📊 No stats found for spider '{spider_name}'")
+ available = ', '.join(all_stats.keys())
+ if available:
+ print(f"💡 Available spiders: {available}")
  return 1
 
- stats = _LAST_RUN_STATS[spider_name]
- print(f"📊 Stats for '{spider_name}':")
+ runs = all_stats[spider_name]
+ if show_all:
+ print(f"📊 All runs for '{spider_name}' ({len(runs)} runs):")
+ else:
+ runs = runs[:1]
+ print(f"📊 Last run for '{spider_name}':")
+
  print("-" * 60)
- for k, v in stats.items():
- print(f" {k:<30} {v}")
+ for run in runs:
+ print(f"⏱️ Timestamp: {run['timestamp']}")
+ stats = run["stats"]
+ for k in sorted(stats.keys()):
+ print(f" {k:<30} {format_value(stats[k])}")
+ print("─" * 60)
  return 0
 
- else:
- print("Usage: crawlo stats [spider_name]")
- return 1
-
 
- if __name__ == '__main__':
+ if __name__ == "__main__":
+ """
+ Supports direct execution:
+ python -m crawlo.commands.stats
+ """
  sys.exit(main(sys.argv[1:]))
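Based on the json.dump call in record_stats(), each run produces one file named <spider>_<YYYYmmdd_HHMMSS>.json under logs/stats/ with the shape below. The individual stat keys depend on crawlo's stats collector and are only illustrative here:

{
  "spider": "example",
  "timestamp": "2025-08-31T22:40:12.345678",
  "stats": {
    "downloader/request_count": 120,
    "item_scraped_count": 118,
    "elapsed_time_seconds": 42.7
  }
}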
crawlo/core/engine.py CHANGED
@@ -42,7 +42,7 @@ class Engine(object):
  def engine_start(self):
  self.running = True
  self.logger.info(
- f"Crawlo (version {self.settings.get_int('VERSION')}) started. "
+ f"Crawlo (version {self.settings.get_float('VERSION')}) started. "
  f"(project name : {self.settings.get('PROJECT_NAME')})"
  )
 
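The only engine change is that the startup banner now reads the VERSION setting with get_float() instead of get_int(). A minimal sketch of the difference, assuming these settings getters simply wrap int() and float() (their real implementations are not part of this diff):

class SettingsLike(dict):
    """Stand-in for a settings object, assuming plain int()/float() coercion."""

    def get_int(self, key, default=0):
        value = self.get(key, default)
        return int(value) if value is not None else default

    def get_float(self, key, default=0.0):
        value = self.get(key, default)
        return float(value) if value is not None else default


settings = SettingsLike(VERSION=1.1, PROJECT_NAME="demo")
print(settings.get_int("VERSION"))    # 1   (truncates the minor part)
print(settings.get_float("VERSION"))  # 1.1 (keeps it, matching the new log line)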
crawlo-1.0.8.dist-info/METADATA → crawlo-1.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.0.8
+ Version: 1.1.0
  Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
crawlo-1.0.8.dist-info/RECORD → crawlo-1.1.0.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  crawlo/__init__.py,sha256=xpiIAZbSG3CzneJuDLPCbwfRcvw2wyHYl2kJjaNfNGY,584
- crawlo/__version__.py,sha256=uyL3a6o1xccXPZ2OS65zqIN_lbEMT7PcCxErq7cuWwA,23
+ crawlo/__version__.py,sha256=Zrv57EzpjdsuSPqsYvFkVsQKKRUOHFG7yURCf7qN-Tk,23
  crawlo/cli.py,sha256=hjAJKx9pba375sATvvcy-dtZyBIgXj8fRBq9RFIZHA4,1206
  crawlo/crawler.py,sha256=AyKxUyJvCwb1u4d3Zn3vFmjH28ExWKIygfTICps-3yY,20026
  crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
@@ -7,15 +7,15 @@ crawlo/exceptions.py,sha256=xdyZkvVcLEJ-19sWMHvn9IJsu30-hAY2jJhA2kYIims,1207
  crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
  crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
  crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
- crawlo/commands/__init__.py,sha256=dRu3ipuhDM7M1eTb6zJtQZ_u7N_tZumGfH5_I92xno8,252
- crawlo/commands/check.py,sha256=Q8wFjIo43XW0wP93TTlM7HSShgytJsbSWHIlmkcNxz0,3585
+ crawlo/commands/__init__.py,sha256=kZ3qATqDPmMUCNUQSFfBfIA8fp_1dgBwIAWbmFN3_To,355
+ crawlo/commands/check.py,sha256=He5Dmpn8M0gYEfiXRW801I6ULypWKMvT5Iwjg_4cUYE,5070
  crawlo/commands/genspider.py,sha256=kSHYsAGHRoxU6Qf_MGpR_VS-Ua5NUGY2KGm_Wapn0sw,3529
- crawlo/commands/list.py,sha256=itR05muZlZs8FbRh88kOhcRbZc77OXiR6A86UnVhSMY,2974
- crawlo/commands/run.py,sha256=s6JJC8HNa-tBgPDB2BPUmj26D7PMckhlx4AOEz57ESY,6197
+ crawlo/commands/list.py,sha256=iwd1piFYa7cr4WkRTD0ndCZEN0xoZX0vvlWTU1FbSYE,3972
+ crawlo/commands/run.py,sha256=ppgEUNVNuhpQFiBkgB6ZFAKeOJiLybd68gGHcAJgF4w,5813
  crawlo/commands/startproject.py,sha256=1KOq_CALy01oklr0dAUYhGFzu4f7w45q2H0O3qafLX4,3494
- crawlo/commands/stats.py,sha256=rH0TlD0o-xUr9RxtvNYgnSjHHoRyma3rvx9Q9nIGDNg,1659
+ crawlo/commands/stats.py,sha256=siuCv2PGhr0_eqAaER2YYwI_IHmOlFbgIHWmX3-EWs4,5246
  crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
- crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
+ crawlo/core/engine.py,sha256=SoTVS3F2EI1G_zQVe9UbeUz8cBhyVFlxJ-HuhPD3ock,6032
  crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
  crawlo/core/scheduler.py,sha256=ZMPs4LSs69FsFfDTvaOMJKqpSQQGvIEE9pMyYVVAA64,1948
  crawlo/downloader/__init__.py,sha256=72u2Hef4HaMfs9VCqEjbMtiaRXbaXmgNiJn6qy09LHs,2384
@@ -81,7 +81,7 @@ examples/gxb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  examples/gxb/items.py,sha256=3-1Lxpi7EqMzheDJoO0MPyHky5nHG_nqQGgKlm8y6mQ,989
  examples/gxb/run.py,sha256=9kJlR8f-tZ3BqP5PW7sCLTw6PAFWo3x4cG5lc-6GWqI,333
  examples/gxb/settings.py,sha256=_nbXj9HV2e0F6liUzK0ueygLcaMM_IUlkuwL6mJqUfc,2345
- examples/gxb/spider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ examples/gxb/spider/__init__.py,sha256=E5bYTAuqcy2KBgnZnZ7OoW7mE6YUIy2w748zCrE68nI,92
  examples/gxb/spider/miit_spider.py,sha256=tcQnuyUHfu-Re1QbKKSI9DXW3Sp1vyBW8qBzKLf_RC4,6666
  examples/gxb/spider/telecom_device.py,sha256=58iG6BQtQjjDHOF7-DXH0u5_XnppP5AJTQwaVJVyBEo,4929
  tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
@@ -90,8 +90,8 @@ tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX6149
  tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
  tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
  tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
- crawlo-1.0.8.dist-info/METADATA,sha256=ia-nA0g0Rl76iHFIlvaRbvUnjd88KEKoxIrJKcjtCyw,1825
- crawlo-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.0.8.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
- crawlo-1.0.8.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
- crawlo-1.0.8.dist-info/RECORD,,
+ crawlo-1.1.0.dist-info/METADATA,sha256=WTcM-8FqMpTLvIAPGvruLzPCBZnY3ODYklhnv7eVS70,1825
+ crawlo-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.1.0.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+ crawlo-1.1.0.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+ crawlo-1.1.0.dist-info/RECORD,,
examples/gxb/spider/__init__.py CHANGED
@@ -0,0 +1,2 @@
+ from .miit_spider import MiitSpider
+ from .telecom_device import TelecomDeviceLicensesSpider