crawlo 1.0.6__tar.gz → 1.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109)
  1. {crawlo-1.0.6/crawlo.egg-info → crawlo-1.0.8}/PKG-INFO +1 -1
  2. crawlo-1.0.8/crawlo/__version__.py +1 -0
  3. crawlo-1.0.8/crawlo/commands/check.py +107 -0
  4. crawlo-1.0.8/crawlo/commands/list.py +92 -0
  5. crawlo-1.0.8/crawlo/commands/run.py +181 -0
  6. crawlo-1.0.8/crawlo/commands/stats.py +59 -0
  7. crawlo-1.0.8/crawlo/crawler.py +493 -0
  8. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/items/__init__.py +2 -1
  9. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/items/base.py +1 -10
  10. crawlo-1.0.8/crawlo/spider/__init__.py +129 -0
  11. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/project.py +14 -16
  12. {crawlo-1.0.6 → crawlo-1.0.8/crawlo.egg-info}/PKG-INFO +1 -1
  13. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo.egg-info/SOURCES.txt +3 -0
  14. {crawlo-1.0.6 → crawlo-1.0.8}/requirements.txt +1 -1
  15. crawlo-1.0.6/crawlo/__version__.py +0 -1
  16. crawlo-1.0.6/crawlo/commands/run.py +0 -149
  17. crawlo-1.0.6/crawlo/crawler.py +0 -219
  18. crawlo-1.0.6/crawlo/spider/__init__.py +0 -41
  19. {crawlo-1.0.6 → crawlo-1.0.8}/LICENSE +0 -0
  20. {crawlo-1.0.6 → crawlo-1.0.8}/MANIFEST.in +0 -0
  21. {crawlo-1.0.6 → crawlo-1.0.8}/README.md +0 -0
  22. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/__init__.py +0 -0
  23. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/cli.py +0 -0
  24. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/commands/__init__.py +0 -0
  25. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/commands/genspider.py +0 -0
  26. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/commands/startproject.py +0 -0
  27. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/core/__init__.py +0 -0
  28. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/core/engine.py +0 -0
  29. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/core/processor.py +0 -0
  30. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/core/scheduler.py +0 -0
  31. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/downloader/__init__.py +0 -0
  32. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/downloader/aiohttp_downloader.py +0 -0
  33. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/downloader/cffi_downloader.py +0 -0
  34. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/downloader/httpx_downloader.py +0 -0
  35. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/event.py +0 -0
  36. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/exceptions.py +0 -0
  37. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/extension/__init__.py +0 -0
  38. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/extension/log_interval.py +0 -0
  39. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/extension/log_stats.py +0 -0
  40. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/extension/logging_extension.py +0 -0
  41. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/filters/__init__.py +0 -0
  42. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/filters/aioredis_filter.py +0 -0
  43. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/filters/memory_filter.py +0 -0
  44. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/items/fields.py +0 -0
  45. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/items/items.py +0 -0
  46. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/__init__.py +0 -0
  47. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/default_header.py +0 -0
  48. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/download_delay.py +0 -0
  49. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/middleware_manager.py +0 -0
  50. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/proxy.py +0 -0
  51. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/request_ignore.py +0 -0
  52. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/response_code.py +0 -0
  53. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/response_filter.py +0 -0
  54. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/middleware/retry.py +0 -0
  55. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/network/__init__.py +0 -0
  56. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/network/request.py +0 -0
  57. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/network/response.py +0 -0
  58. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/pipelines/__init__.py +0 -0
  59. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/pipelines/console_pipeline.py +0 -0
  60. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/pipelines/mongo_pipeline.py +0 -0
  61. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/pipelines/mysql_batch_pipline.py +0 -0
  62. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/pipelines/mysql_pipeline.py +0 -0
  63. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/pipelines/pipeline_manager.py +0 -0
  64. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/settings/__init__.py +0 -0
  65. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/settings/default_settings.py +0 -0
  66. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/settings/setting_manager.py +0 -0
  67. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/stats_collector.py +0 -0
  68. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/subscriber.py +0 -0
  69. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/task_manager.py +0 -0
  70. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  71. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/templates/project/__init__.py.tmpl +0 -0
  72. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/templates/project/items.py.tmpl +0 -0
  73. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  74. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  75. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/templates/project/settings.py.tmpl +0 -0
  76. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  77. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/templates/spider/spider.py.tmpl +0 -0
  78. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/__init__.py +0 -0
  79. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/concurrency_manager.py +0 -0
  80. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/date_tools.py +0 -0
  81. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/db_helper.py +0 -0
  82. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/func_tools.py +0 -0
  83. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/log.py +0 -0
  84. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/pqueue.py +0 -0
  85. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/request.py +0 -0
  86. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/spider_loader.py +0 -0
  87. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/system.py +0 -0
  88. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/tools.py +0 -0
  89. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo/utils/url.py +0 -0
  90. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo.egg-info/dependency_links.txt +0 -0
  91. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo.egg-info/entry_points.txt +0 -0
  92. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo.egg-info/requires.txt +0 -0
  93. {crawlo-1.0.6 → crawlo-1.0.8}/crawlo.egg-info/top_level.txt +0 -0
  94. {crawlo-1.0.6 → crawlo-1.0.8}/examples/__init__.py +0 -0
  95. {crawlo-1.0.6 → crawlo-1.0.8}/examples/gxb/__init__.py +0 -0
  96. {crawlo-1.0.6 → crawlo-1.0.8}/examples/gxb/items.py +0 -0
  97. {crawlo-1.0.6 → crawlo-1.0.8}/examples/gxb/run.py +0 -0
  98. {crawlo-1.0.6 → crawlo-1.0.8}/examples/gxb/settings.py +0 -0
  99. {crawlo-1.0.6 → crawlo-1.0.8}/examples/gxb/spider/__init__.py +0 -0
  100. {crawlo-1.0.6 → crawlo-1.0.8}/examples/gxb/spider/miit_spider.py +0 -0
  101. {crawlo-1.0.6 → crawlo-1.0.8}/examples/gxb/spider/telecom_device.py +0 -0
  102. {crawlo-1.0.6 → crawlo-1.0.8}/pyproject.toml +0 -0
  103. {crawlo-1.0.6 → crawlo-1.0.8}/setup.cfg +0 -0
  104. {crawlo-1.0.6 → crawlo-1.0.8}/tests/__init__.py +0 -0
  105. {crawlo-1.0.6 → crawlo-1.0.8}/tests/test_proxy_health_check.py +0 -0
  106. {crawlo-1.0.6 → crawlo-1.0.8}/tests/test_proxy_middleware_integration.py +0 -0
  107. {crawlo-1.0.6 → crawlo-1.0.8}/tests/test_proxy_providers.py +0 -0
  108. {crawlo-1.0.6 → crawlo-1.0.8}/tests/test_proxy_stats.py +0 -0
  109. {crawlo-1.0.6 → crawlo-1.0.8}/tests/test_proxy_strategies.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.0.6
+ Version: 1.0.8
  Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
@@ -0,0 +1 @@
+ __version__ = "1.0.8"
@@ -0,0 +1,107 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time   : 2025-08-31 22:35
+ # @Author : crawl-coder
+ # @Desc   : Command-line entry point: crawlo check, verifies that all spider definitions are compliant.
+ """
+ import sys
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     if args:
+         print("Usage: crawlo check")
+         return 1
+
+     try:
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1
+
+         if str(project_root) not in sys.path:
+             sys.path.insert(0, str(project_root))
+
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # Create a CrawlerProcess and discover spiders
+         process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+         spider_names = process.get_spider_names()
+
+         if not spider_names:
+             print("📭 No spiders found.")
+             return 1
+
+         print(f"🔍 Checking {len(spider_names)} spider(s)...")
+         print("-" * 60)
+
+         issues_found = False
+         for name in sorted(spider_names):
+             cls = process.get_spider_class(name)
+             issues = []
+
+             if not hasattr(cls, 'name') or not cls.name:
+                 issues.append("missing or empty 'name' attribute")
+             elif not isinstance(cls.name, str):
+                 issues.append("'name' is not a string")
+
+             if not callable(getattr(cls, 'start_requests', None)):
+                 issues.append("missing or non-callable 'start_requests' method")
+
+             if hasattr(cls, 'start_urls') and isinstance(cls.start_urls, str):
+                 issues.append("'start_urls' is a string, should be list/tuple")
+
+             # Instantiation check (lightweight)
+             try:
+                 spider = cls.create_instance(None)
+                 if not callable(getattr(spider, 'parse', None)):
+                     issues.append("no 'parse' method defined (optional but recommended)")
+             except Exception as e:
+                 issues.append(f"failed to create instance: {e}")
+
+             if issues:
+                 print(f"❌ {name:<20} {cls.__name__}")
+                 for issue in issues:
+                     print(f" • {issue}")
+                 issues_found = True
+             else:
+                 print(f"✅ {name:<20} {cls.__name__} (OK)")
+
+         print("-" * 60)
+         if issues_found:
+             print("⚠️ Some spiders have issues. Please fix them.")
+             return 1
+         else:
+             print("🎉 All spiders are compliant!")
+             return 0
+
+     except Exception as e:
+         print(f"❌ Error during check: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main(sys.argv[1:]))
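
A minimal sketch of a spider that would pass these checks (the module layout, class name, and the `from crawlo.spider import Spider` import path are illustrative assumptions, not part of this release; it also assumes the Spider base class supplies a default start_requests built from start_urls):

# my_project/spiders/example.py (hypothetical)
from crawlo.spider import Spider  # assumed import path for the Spider base class


class ExampleSpider(Spider):
    name = "example"                      # non-empty string: required by `crawlo check`
    start_urls = ["https://example.com"]  # must be a list/tuple, not a plain str

    # start_requests is assumed to be inherited from Spider; define your own otherwise.

    def parse(self, response):
        # optional, but `crawlo check` flags spiders without a parse method
        yield {"url": response.url}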
@@ -0,0 +1,92 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time   : 2025-08-31 22:33
+ # @Author : crawl-coder
+ # @Desc   : Command-line entry point: crawlo list, lists all registered spiders.
+ """
+ import sys
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     """
+     List all available spiders.
+     Usage: crawlo list
+     """
+     if args:
+         print("Usage: crawlo list")
+         return 1
+
+     try:
+         # 1. Resolve the project root directory
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1
+
+         # Add the project root to sys.path
+         project_root_str = str(project_root)
+         if project_root_str not in sys.path:
+             sys.path.insert(0, project_root_str)
+
+         # 2. Read crawlo.cfg to get the project package name
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # 3. Create a CrawlerProcess and auto-discover spiders
+         spider_modules = [f"{project_package}.spiders"]
+         process = CrawlerProcess(spider_modules=spider_modules)
+
+         # 4. Collect all spider information
+         spider_names = process.get_spider_names()
+         if not spider_names:
+             print("📭 No spiders found.")
+             print("💡 Make sure:")
+             print(" - Your spider classes inherit from `Spider`")
+             print(" - They define a `name` attribute")
+             print(" - The modules are imported (e.g. via __init__.py)")
+             return 1
+
+         # 5. Print the spider list
+         print(f"📋 Found {len(spider_names)} spider(s):")
+         print("-" * 50)
+         for name in sorted(spider_names):
+             cls = process.get_spider_class(name)
+             module = cls.__module__.replace(project_package + ".", "")  # simplify the module name
+             print(f"🕷️ {name:<20} {cls.__name__:<25} ({module})")
+         print("-" * 50)
+         return 0
+
+     except Exception as e:
+         print(f"❌ Error listing spiders: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ if __name__ == '__main__':
+     """
+     Allows running directly:
+     python -m crawlo.commands.list
+     """
+     sys.exit(main(sys.argv[1:]))
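
All three new commands (check, list, run) resolve the project package from crawlo.cfg at the project root via config.get('settings', 'default'), so they expect a file along these lines (the package name my_project is a placeholder):

[settings]
default = my_project.settings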
@@ -0,0 +1,181 @@
1
+ """
2
+ 命令行入口:crawlo run <spider_name>
3
+ 用于运行指定名称的爬虫。
4
+ """
5
+
6
+ import asyncio
7
+ from pathlib import Path
8
+ import configparser
9
+
10
+ from crawlo.crawler import CrawlerProcess
11
+ from crawlo.utils.project import get_settings
12
+ from crawlo.utils.log import get_logger
13
+
14
+ logger = get_logger(__name__)
15
+
16
+
17
+ def main(args):
18
+ """
19
+ 运行指定爬虫的主函数
20
+ 用法:
21
+ crawlo run <spider_name>
22
+ crawlo run all
23
+ """
24
+ if len(args) < 1:
25
+ print("Usage: crawlo run <spider_name>|all")
26
+ print("Examples:")
27
+ print(" crawlo run baidu")
28
+ print(" crawlo run all")
29
+ return 1
30
+
31
+ spider_arg = args[0]
32
+
33
+ try:
34
+ # 1. 获取项目根目录
35
+ project_root = get_settings().get('PROJECT_ROOT')
36
+ if not project_root:
37
+ print("❌ Error: Cannot determine project root.")
38
+ return 1
39
+
40
+ if str(project_root) not in sys.path:
41
+ sys.path.insert(0, str(project_root))
42
+
43
+ # 2. 读取 crawlo.cfg 获取项目包名
44
+ cfg_file = project_root / 'crawlo.cfg'
45
+ if not cfg_file.exists():
46
+ print(f"❌ Error: crawlo.cfg not found in {project_root}")
47
+ return 1
48
+
49
+ config = configparser.ConfigParser()
50
+ config.read(cfg_file, encoding='utf-8')
51
+
52
+ if not config.has_section('settings') or not config.has_option('settings', 'default'):
53
+ print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
54
+ return 1
55
+
56
+ settings_module = config.get('settings', 'default')
57
+ project_package = settings_module.split('.')[0]
58
+
59
+ # 3. 创建 CrawlerProcess 并自动发现爬虫模块
60
+ spider_modules = [f"{project_package}.spiders"]
61
+ settings = get_settings()
62
+ process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
63
+
64
+ # === 新增:支持 'all' ===
65
+ if spider_arg.lower() == "all":
66
+ spider_names = process.get_spider_names()
67
+ if not spider_names:
68
+ print("❌ No spiders found. Make sure spiders are defined and imported.")
69
+ return 1
70
+
71
+ print(f"🚀 Starting ALL {len(spider_names)} spiders:")
72
+ for name in sorted(spider_names):
73
+ cls = process.get_spider_class(name)
74
+ print(f" 🕷️ {name} ({cls.__name__})")
75
+ print("-" * 50)
76
+
77
+ # 启动所有爬虫
78
+ asyncio.run(process.crawl(spider_names))
79
+ return 0
80
+
81
+ # === 原有:启动单个爬虫 ===
82
+ spider_name = spider_arg
83
+ if not process.is_spider_registered(spider_name):
84
+ print(f"❌ Error: Spider with name '{spider_name}' not found.")
85
+ available_names = process.get_spider_names()
86
+ if available_names:
87
+ print("💡 Available spiders:")
88
+ for name in sorted(available_names):
89
+ cls = process.get_spider_class(name)
90
+ print(f" - {name} (class: {cls.__name__})")
91
+ else:
92
+ print("💡 No spiders found. Make sure your spider classes are defined and imported.")
93
+ return 1
94
+
95
+ spider_class = process.get_spider_class(spider_name)
96
+
97
+ # 打印启动信息
98
+ print(f"🚀 Starting spider: {spider_name}")
99
+ print(f"📁 Project: {project_package}")
100
+ print(f"🕷️ Class: {spider_class.__name__}")
101
+ print("-" * 50)
102
+
103
+ # 启动爬虫
104
+ asyncio.run(process.crawl(spider_name))
105
+
106
+ print("-" * 50)
107
+ print("✅ Spider completed successfully!")
108
+ return 0
109
+
110
+ except KeyboardInterrupt:
111
+ print("\n⚠️ Spider interrupted by user.")
112
+ return 1
113
+ except Exception as e:
114
+ print(f"❌ Error running spider: {e}")
115
+ import traceback
116
+ traceback.print_exc()
117
+ return 1
118
+
119
+
120
+ def list_available_spiders(project_package: str):
121
+ """
122
+ 列出指定项目包中所有可用的爬虫(用于调试或命令行扩展)
123
+ """
124
+ try:
125
+ # 临时创建一个 CrawlerProcess 来发现爬虫
126
+ process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
127
+ available_names = process.get_spider_names()
128
+
129
+ if not available_names:
130
+ print(" No spiders found. Make sure:")
131
+ print(" - spiders/ 目录存在")
132
+ print(" - 爬虫类继承 Spider 且定义了 name")
133
+ print(" - 模块被导入(可通过 __init__.py 触发)")
134
+ return
135
+
136
+ print(f"Found {len(available_names)} spider(s):")
137
+ for name in sorted(available_names):
138
+ cls = process.get_spider_class(name)
139
+ module = cls.__module__.replace(project_package + ".", "")
140
+ print(f" - {name} ({cls.__name__} @ {module})")
141
+ except Exception as e:
142
+ print(f"❌ Failed to list spiders: {e}")
143
+ import traceback
144
+ traceback.print_exc()
145
+
146
+
147
+ def run_spider_by_name(spider_name: str, project_package: str = None):
148
+ """
149
+ 在代码中直接运行某个爬虫(需提供 project_package)
150
+ """
151
+ if project_package is None:
152
+ # 尝试从配置读取
153
+ cfg_file = Path('crawlo.cfg')
154
+ if cfg_file.exists():
155
+ config = configparser.ConfigParser()
156
+ config.read(cfg_file, encoding='utf-8')
157
+ if config.has_option('settings', 'default'):
158
+ project_package = config.get('settings', 'default').split('.')[0]
159
+
160
+ if not project_package:
161
+ print("❌ Error: project_package is required.")
162
+ return 1
163
+
164
+ # 添加项目路径
165
+ project_root = get_settings().get('PROJECT_ROOT')
166
+ if project_root and str(project_root) not in sys.path:
167
+ sys.path.insert(0, str(project_root))
168
+
169
+ # 复用 main 函数逻辑
170
+ args = [spider_name]
171
+ return main(args)
172
+
173
+
174
+ if __name__ == '__main__':
175
+ """
176
+ 允许直接运行:
177
+ python -m crawlo.commands.run <spider_name>
178
+ """
179
+ import sys
180
+
181
+ sys.exit(main(sys.argv[1:]))
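
The new run_spider_by_name helper makes the same logic callable from a script instead of the CLI; a minimal usage sketch follows (spider and package names are placeholders), run from the project root so crawlo.cfg and PROJECT_ROOT resolve:

# run_example.py (hypothetical driver script at the project root)
from crawlo.commands.run import run_spider_by_name

if __name__ == "__main__":
    # Equivalent to `crawlo run example`; project_package can be omitted
    # when crawlo.cfg is present in the current working directory.
    raise SystemExit(run_spider_by_name("example", project_package="my_project"))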
@@ -0,0 +1,59 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time   : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc   : Command-line entry point: crawlo stats, shows statistics for recently run spiders.
+ """
+ import sys
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+ # In-memory stats from recently run spiders (example)
+ _LAST_RUN_STATS = {}
+
+
+ def record_stats(crawler):
+     """Record statistics after a spider has closed (must be called from close)."""
+     if crawler.stats and crawler.spider:
+         _LAST_RUN_STATS[crawler.spider.name] = crawler.stats.get_stats()
+
+
+ def main(args):
+     if len(args) == 0:
+         # Show all recorded stats
+         if not _LAST_RUN_STATS:
+             print("📊 No stats available. Run a spider first.")
+             return 0
+
+         print("📊 Recent Spider Statistics:")
+         print("-" * 60)
+         for spider_name, stats in _LAST_RUN_STATS.items():
+             print(f"🕷️ {spider_name}")
+             for k, v in stats.items():
+                 print(f" {k:<30} {v}")
+             print()
+         return 0
+
+     elif len(args) == 1:
+         spider_name = args[0]
+         if spider_name not in _LAST_RUN_STATS:
+             print(f"📊 No stats found for spider '{spider_name}'")
+             return 1
+
+         stats = _LAST_RUN_STATS[spider_name]
+         print(f"📊 Stats for '{spider_name}':")
+         print("-" * 60)
+         for k, v in stats.items():
+             print(f" {k:<30} {v}")
+         return 0
+
+     else:
+         print("Usage: crawlo stats [spider_name]")
+         return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main(sys.argv[1:]))
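
Since _LAST_RUN_STATS is an in-process dict, `crawlo stats` only shows data recorded by record_stats in the same process; a sketch of wiring it up from code (the crawler object is assumed to expose .spider and .stats, as the snippet above requires):

# Hypothetical in-process usage, e.g. from a spider close hook or a driver script
from crawlo.commands import stats

def on_spider_closed(crawler):
    # Stash the crawler's stats so stats.main() can display them later
    stats.record_stats(crawler)

# later, in the same process:
# stats.main([])           # print stats for every recorded spider
# stats.main(["example"])  # print stats for one spider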