crawlo 1.0.7__tar.gz → 1.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- {crawlo-1.0.7/crawlo.egg-info → crawlo-1.0.9}/PKG-INFO +1 -1
- crawlo-1.0.9/crawlo/__version__.py +1 -0
- crawlo-1.0.9/crawlo/commands/__init__.py +14 -0
- crawlo-1.0.9/crawlo/commands/check.py +107 -0
- crawlo-1.0.9/crawlo/commands/list.py +92 -0
- crawlo-1.0.9/crawlo/commands/run.py +181 -0
- crawlo-1.0.9/crawlo/commands/stats.py +59 -0
- crawlo-1.0.9/crawlo/crawler.py +493 -0
- crawlo-1.0.9/crawlo/spider/__init__.py +129 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/project.py +14 -16
- {crawlo-1.0.7 → crawlo-1.0.9/crawlo.egg-info}/PKG-INFO +1 -1
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo.egg-info/SOURCES.txt +3 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/requirements.txt +1 -1
- crawlo-1.0.7/crawlo/__version__.py +0 -1
- crawlo-1.0.7/crawlo/commands/__init__.py +0 -10
- crawlo-1.0.7/crawlo/commands/run.py +0 -149
- crawlo-1.0.7/crawlo/crawler.py +0 -219
- crawlo-1.0.7/crawlo/spider/__init__.py +0 -41
- {crawlo-1.0.7 → crawlo-1.0.9}/LICENSE +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/MANIFEST.in +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/README.md +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/cli.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/core/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/core/engine.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/core/processor.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/core/scheduler.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/event.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/exceptions.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/extension/logging_extension.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/filters/aioredis_filter.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/items/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/items/base.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/items/fields.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/items/items.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/middleware_manager.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/network/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/network/request.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/network/response.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/pipelines/mysql_batch_pipline.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/pipelines/mysql_pipeline.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/pipelines/pipeline_manager.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/settings/default_settings.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/settings/setting_manager.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/stats_collector.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/subscriber.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/task_manager.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/templates/project/settings.py.tmpl +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/concurrency_manager.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/date_tools.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/log.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/pqueue.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/request.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/system.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/tools.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo/utils/url.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/examples/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/examples/gxb/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/examples/gxb/items.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/examples/gxb/run.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/examples/gxb/settings.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/examples/gxb/spider/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/examples/gxb/spider/miit_spider.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/examples/gxb/spider/telecom_device.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/pyproject.toml +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/setup.cfg +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/tests/__init__.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.0.7 → crawlo-1.0.9}/tests/test_proxy_strategies.py +0 -0

crawlo-1.0.9/crawlo/__version__.py
@@ -0,0 +1 @@
+__version__ = "1.0.9"

crawlo-1.0.9/crawlo/commands/__init__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+
+_commands = {
+    'startproject': 'crawlo.commands.startproject',
+    'genspider': 'crawlo.commands.genspider',
+    'run': 'crawlo.commands.run',
+    'check': 'crawlo.commands.check',
+    'list': 'crawlo.commands.list',
+    'stats': 'crawlo.commands.stats'
+}
+
+def get_commands():
+    return _commands
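
Note: the _commands mapping is consumed by the crawlo CLI, but cli.py is unchanged in this release and not shown here. The sketch below only illustrates how get_commands() could drive a dispatcher, assuming each command module exposes a main(args) function returning an exit code, as the modules added in this diff do.

import importlib
import sys

from crawlo.commands import get_commands


def dispatch(argv):
    # Resolve the sub-command name to its dotted module path, e.g. 'run' -> 'crawlo.commands.run'.
    commands = get_commands()
    if not argv or argv[0] not in commands:
        print("Usage: crawlo <%s> [args]" % "|".join(sorted(commands)))
        return 1
    module = importlib.import_module(commands[argv[0]])
    return module.main(argv[1:])  # each command module exposes main(args)


if __name__ == '__main__':
    sys.exit(dispatch(sys.argv[1:]))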

crawlo-1.0.9/crawlo/commands/check.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:35
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo check, verifies that all spider definitions are compliant.
+"""
+import sys
+import configparser
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.project import get_settings
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def main(args):
+    if args:
+        print("Usage: crawlo check")
+        return 1
+
+    try:
+        project_root = get_settings().get('PROJECT_ROOT')
+        if not project_root:
+            print("❌ Error: Cannot determine project root.")
+            return 1
+
+        if str(project_root) not in sys.path:
+            sys.path.insert(0, str(project_root))
+
+        cfg_file = project_root / 'crawlo.cfg'
+        if not cfg_file.exists():
+            print(f"❌ Error: crawlo.cfg not found in {project_root}")
+            return 1
+
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding='utf-8')
+
+        if not config.has_section('settings') or not config.has_option('settings', 'default'):
+            print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+            return 1
+
+        settings_module = config.get('settings', 'default')
+        project_package = settings_module.split('.')[0]
+
+        # Create a CrawlerProcess and discover spiders
+        process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+        spider_names = process.get_spider_names()
+
+        if not spider_names:
+            print("📭 No spiders found.")
+            return 1
+
+        print(f"🔍 Checking {len(spider_names)} spider(s)...")
+        print("-" * 60)
+
+        issues_found = False
+        for name in sorted(spider_names):
+            cls = process.get_spider_class(name)
+            issues = []
+
+            if not hasattr(cls, 'name') or not cls.name:
+                issues.append("missing or empty 'name' attribute")
+            elif not isinstance(cls.name, str):
+                issues.append("'name' is not a string")
+
+            if not callable(getattr(cls, 'start_requests', None)):
+                issues.append("missing or non-callable 'start_requests' method")
+
+            if hasattr(cls, 'start_urls') and isinstance(cls.start_urls, str):
+                issues.append("'start_urls' is a string, should be list/tuple")
+
+            # Instantiation check (lightweight)
+            try:
+                spider = cls.create_instance(None)
+                if not callable(getattr(spider, 'parse', None)):
+                    issues.append("no 'parse' method defined (optional but recommended)")
+            except Exception as e:
+                issues.append(f"failed to create instance: {e}")
+
+            if issues:
+                print(f"❌ {name:<20} {cls.__name__}")
+                for issue in issues:
+                    print(f" • {issue}")
+                issues_found = True
+            else:
+                print(f"✅ {name:<20} {cls.__name__} (OK)")
+
+        print("-" * 60)
+        if issues_found:
+            print("⚠️ Some spiders have issues. Please fix them.")
+            return 1
+        else:
+            print("🎉 All spiders are compliant!")
+            return 0
+
+    except Exception as e:
+        print(f"❌ Error during check: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv[1:]))
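
For reference, a minimal spider that would pass every rule enforced above might look like the following sketch. The import paths and the Request(url=..., callback=...) signature are assumptions based on the module layout listed in this release (crawlo/spider/__init__.py, crawlo/network/request.py); they are not confirmed by the hunks shown here.

# Hypothetical, minimal spider that satisfies the checks in check.py.
from crawlo.spider import Spider                 # assumed import path
from crawlo.network.request import Request      # assumed import path and signature


class QuotesSpider(Spider):
    name = 'quotes'                                    # non-empty str -> passes the name checks
    start_urls = ['https://quotes.toscrape.com/']      # a list, not a bare string

    def start_requests(self):                          # callable, as the checker requires
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):                         # optional, but recommended by the checker
        yield {'url': response.url}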

crawlo-1.0.9/crawlo/commands/list.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:33
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo list, lists all registered spiders.
+"""
+import sys
+import configparser
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.project import get_settings
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def main(args):
+    """
+    List all available spiders.
+    Usage: crawlo list
+    """
+    if args:
+        print("Usage: crawlo list")
+        return 1
+
+    try:
+        # 1. Get the project root directory
+        project_root = get_settings().get('PROJECT_ROOT')
+        if not project_root:
+            print("❌ Error: Cannot determine project root.")
+            return 1
+
+        # Add the project root to sys.path
+        project_root_str = str(project_root)
+        if project_root_str not in sys.path:
+            sys.path.insert(0, project_root_str)
+
+        # 2. Read crawlo.cfg to get the project package name
+        cfg_file = project_root / 'crawlo.cfg'
+        if not cfg_file.exists():
+            print(f"❌ Error: crawlo.cfg not found in {project_root}")
+            return 1
+
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding='utf-8')
+
+        if not config.has_section('settings') or not config.has_option('settings', 'default'):
+            print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+            return 1
+
+        settings_module = config.get('settings', 'default')
+        project_package = settings_module.split('.')[0]
+
+        # 3. Create a CrawlerProcess and auto-discover spiders
+        spider_modules = [f"{project_package}.spiders"]
+        process = CrawlerProcess(spider_modules=spider_modules)
+
+        # 4. Collect information about all spiders
+        spider_names = process.get_spider_names()
+        if not spider_names:
+            print("📭 No spiders found.")
+            print("💡 Make sure:")
+            print(" - Your spider classes inherit from `Spider`")
+            print(" - They define a `name` attribute")
+            print(" - The modules are imported (e.g. via __init__.py)")
+            return 1
+
+        # 5. Print the spider list
+        print(f"📋 Found {len(spider_names)} spider(s):")
+        print("-" * 50)
+        for name in sorted(spider_names):
+            cls = process.get_spider_class(name)
+            module = cls.__module__.replace(project_package + ".", "")  # shorten the module name
+            print(f"🕷️ {name:<20} {cls.__name__:<25} ({module})")
+        print("-" * 50)
+        return 0
+
+    except Exception as e:
+        print(f"❌ Error listing spiders: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+if __name__ == '__main__':
+    """
+    Allow running this module directly:
+    python -m crawlo.commands.list
+    """
+    sys.exit(main(sys.argv[1:]))
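
Both check and list (and run, below) locate the project package by reading the [settings] section of crawlo.cfg and taking the first segment of the dotted settings path. A minimal file satisfying those configparser lookups would look like this (package name hypothetical):

[settings]
default = myproject.settings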

crawlo-1.0.9/crawlo/commands/run.py
@@ -0,0 +1,181 @@
+"""
+Command-line entry point: crawlo run <spider_name>
+Runs the spider with the given name.
+"""
+
+import asyncio
+from pathlib import Path
+import configparser
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.project import get_settings
+from crawlo.utils.log import get_logger
+
+logger = get_logger(__name__)
+
+
+def main(args):
+    """
+    Main function for running the specified spider.
+    Usage:
+        crawlo run <spider_name>
+        crawlo run all
+    """
+    if len(args) < 1:
+        print("Usage: crawlo run <spider_name>|all")
+        print("Examples:")
+        print(" crawlo run baidu")
+        print(" crawlo run all")
+        return 1
+
+    spider_arg = args[0]
+
+    try:
+        # 1. Get the project root directory
+        project_root = get_settings().get('PROJECT_ROOT')
+        if not project_root:
+            print("❌ Error: Cannot determine project root.")
+            return 1
+
+        if str(project_root) not in sys.path:
+            sys.path.insert(0, str(project_root))
+
+        # 2. Read crawlo.cfg to get the project package name
+        cfg_file = project_root / 'crawlo.cfg'
+        if not cfg_file.exists():
+            print(f"❌ Error: crawlo.cfg not found in {project_root}")
+            return 1
+
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding='utf-8')
+
+        if not config.has_section('settings') or not config.has_option('settings', 'default'):
+            print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+            return 1
+
+        settings_module = config.get('settings', 'default')
+        project_package = settings_module.split('.')[0]
+
+        # 3. Create a CrawlerProcess and auto-discover spider modules
+        spider_modules = [f"{project_package}.spiders"]
+        settings = get_settings()
+        process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+        # === New: support 'all' ===
+        if spider_arg.lower() == "all":
+            spider_names = process.get_spider_names()
+            if not spider_names:
+                print("❌ No spiders found. Make sure spiders are defined and imported.")
+                return 1
+
+            print(f"🚀 Starting ALL {len(spider_names)} spiders:")
+            for name in sorted(spider_names):
+                cls = process.get_spider_class(name)
+                print(f" 🕷️ {name} ({cls.__name__})")
+            print("-" * 50)
+
+            # Start all spiders
+            asyncio.run(process.crawl(spider_names))
+            return 0
+
+        # === Original: start a single spider ===
+        spider_name = spider_arg
+        if not process.is_spider_registered(spider_name):
+            print(f"❌ Error: Spider with name '{spider_name}' not found.")
+            available_names = process.get_spider_names()
+            if available_names:
+                print("💡 Available spiders:")
+                for name in sorted(available_names):
+                    cls = process.get_spider_class(name)
+                    print(f" - {name} (class: {cls.__name__})")
+            else:
+                print("💡 No spiders found. Make sure your spider classes are defined and imported.")
+            return 1
+
+        spider_class = process.get_spider_class(spider_name)
+
+        # Print startup info
+        print(f"🚀 Starting spider: {spider_name}")
+        print(f"📁 Project: {project_package}")
+        print(f"🕷️ Class: {spider_class.__name__}")
+        print("-" * 50)
+
+        # Start the spider
+        asyncio.run(process.crawl(spider_name))
+
+        print("-" * 50)
+        print("✅ Spider completed successfully!")
+        return 0
+
+    except KeyboardInterrupt:
+        print("\n⚠️ Spider interrupted by user.")
+        return 1
+    except Exception as e:
+        print(f"❌ Error running spider: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+def list_available_spiders(project_package: str):
+    """
+    List all available spiders in the given project package (for debugging or CLI extensions).
+    """
+    try:
+        # Create a temporary CrawlerProcess to discover spiders
+        process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+        available_names = process.get_spider_names()
+
+        if not available_names:
+            print(" No spiders found. Make sure:")
+            print(" - the spiders/ directory exists")
+            print(" - spider classes inherit from Spider and define a name")
+            print(" - the modules are imported (e.g. triggered via __init__.py)")
+            return
+
+        print(f"Found {len(available_names)} spider(s):")
+        for name in sorted(available_names):
+            cls = process.get_spider_class(name)
+            module = cls.__module__.replace(project_package + ".", "")
+            print(f" - {name} ({cls.__name__} @ {module})")
+    except Exception as e:
+        print(f"❌ Failed to list spiders: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+def run_spider_by_name(spider_name: str, project_package: str = None):
+    """
+    Run a spider directly from code (project_package must be provided).
+    """
+    if project_package is None:
+        # Try to read it from the config file
+        cfg_file = Path('crawlo.cfg')
+        if cfg_file.exists():
+            config = configparser.ConfigParser()
+            config.read(cfg_file, encoding='utf-8')
+            if config.has_option('settings', 'default'):
+                project_package = config.get('settings', 'default').split('.')[0]
+
+    if not project_package:
+        print("❌ Error: project_package is required.")
+        return 1
+
+    # Add the project path
+    project_root = get_settings().get('PROJECT_ROOT')
+    if project_root and str(project_root) not in sys.path:
+        sys.path.insert(0, str(project_root))
+
+    # Reuse the main() logic
+    args = [spider_name]
+    return main(args)
+
+
+if __name__ == '__main__':
+    """
+    Allow running this module directly:
+    python -m crawlo.commands.run <spider_name>
+    """
+    import sys
+
+    sys.exit(main(sys.argv[1:]))
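
run.py also exposes run_spider_by_name for programmatic use. A hedged sketch follows (spider and package names hypothetical). Note that main() and run_spider_by_name() reference sys.path, while import sys only appears under the __main__ guard, so a caller importing this module may need to supply that binding.

import sys

from crawlo.commands import run as run_cmd

# Hypothetical invocation; the sys attribute is injected because run.py itself
# only imports sys under the __main__ guard.
run_cmd.sys = sys
exit_code = run_cmd.run_spider_by_name('baidu', project_package='myproject')
print('run finished with exit code', exit_code)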

crawlo-1.0.9/crawlo/commands/stats.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:36
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo stats, shows statistics for recently run spiders.
+"""
+import sys
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+# Stores stats for recently run spiders (example)
+_LAST_RUN_STATS = {}
+
+
+def record_stats(crawler):
+    """Record stats after the spider closes (must be called from close)."""
+    if crawler.stats and crawler.spider:
+        _LAST_RUN_STATS[crawler.spider.name] = crawler.stats.get_stats()
+
+
+def main(args):
+    if len(args) == 0:
+        # Show all recorded stats
+        if not _LAST_RUN_STATS:
+            print("📊 No stats available. Run a spider first.")
+            return 0
+
+        print("📊 Recent Spider Statistics:")
+        print("-" * 60)
+        for spider_name, stats in _LAST_RUN_STATS.items():
+            print(f"🕷️ {spider_name}")
+            for k, v in stats.items():
+                print(f" {k:<30} {v}")
+            print()
+        return 0
+
+    elif len(args) == 1:
+        spider_name = args[0]
+        if spider_name not in _LAST_RUN_STATS:
+            print(f"📊 No stats found for spider '{spider_name}'")
+            return 1
+
+        stats = _LAST_RUN_STATS[spider_name]
+        print(f"📊 Stats for '{spider_name}':")
+        print("-" * 60)
+        for k, v in stats.items():
+            print(f" {k:<30} {v}")
+        return 0
+
+    else:
+        print("Usage: crawlo stats [spider_name]")
+        return 1
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv[1:]))
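
Because _LAST_RUN_STATS is a plain in-process dict, crawlo stats can only show data recorded in the same interpreter session. The sketch below shows what record_stats() stores, using stand-in crawler/stats objects; the real classes live in crawlo/crawler.py and crawlo/stats_collector.py and are not shown in this diff.

from types import SimpleNamespace

from crawlo.commands.stats import record_stats, main

# Stand-in objects; the stats keys are illustrative only.
stats = SimpleNamespace(get_stats=lambda: {'request_count': 12, 'item_scraped_count': 10})
crawler = SimpleNamespace(stats=stats, spider=SimpleNamespace(name='quotes'))

record_stats(crawler)   # stores crawler.stats.get_stats() under the spider's name
main(['quotes'])        # prints the recorded stats for 'quotes'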