crawlo 1.1.3-py3-none-any.whl → 1.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +34 -34
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +285 -285
- crawlo/commands/startproject.py +196 -196
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +279 -279
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +171 -171
- crawlo/core/enhanced_engine.py +189 -189
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +165 -165
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +242 -242
- crawlo/downloader/aiohttp_downloader.py +212 -212
- crawlo/downloader/cffi_downloader.py +251 -251
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +38 -31
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +58 -49
- crawlo/extension/log_stats.py +82 -44
- crawlo/extension/logging_extension.py +44 -35
- crawlo/extension/memory_monitor.py +89 -0
- crawlo/extension/performance_profiler.py +118 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +241 -241
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +248 -248
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +200 -200
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +311 -311
- crawlo/network/response.py +271 -271
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +224 -224
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +132 -117
- crawlo/pipelines/mysql_pipeline.py +317 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/pipelines/redis_dedup_pipeline.py +162 -162
- crawlo/project.py +153 -153
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +307 -307
- crawlo/queue/redis_priority_queue.py +208 -208
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +278 -244
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +131 -106
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +111 -87
- crawlo/templates/project/pipelines.py.tmpl +97 -341
- crawlo/templates/project/run.py.tmpl +251 -251
- crawlo/templates/project/settings.py.tmpl +279 -250
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +142 -178
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.1.4.dist-info/METADATA +403 -0
- crawlo-1.1.4.dist-info/RECORD +117 -0
- examples/__init__.py +7 -7
- examples/controlled_spider_example.py +205 -205
- tests/__init__.py +7 -7
- tests/test_final_validation.py +153 -153
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_redis_config.py +28 -28
- tests/test_redis_queue.py +224 -224
- tests/test_request_serialization.py +70 -70
- tests/test_scheduler.py +241 -241
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/templates/project/run.py.tmpl
@@ -1,252 +1,252 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-"""
-{{project_name}} project run script
-============================
-An intelligent spider launcher built on the Crawlo framework.
-Supports standalone/distributed modes, flexible configuration, ready to use out of the box.
-
-🎯 Quick start:
-    python run.py spider_name                    # run in standalone mode
-    python run.py spider_name --distributed      # run in distributed mode
-    python run.py spider_name --env production   # use a preset configuration
-    python run.py all                            # run all spiders
-
-🔧 Advanced options:
-    python run.py spider_name --dry-run          # dry run (no actual crawling)
-    python run.py spider_name --concurrency 16   # custom concurrency
-    python run.py spider_name --mode gentle      # gentle mode (low load)
-    python run.py spider1 spider2 --distributed  # run multiple spiders in distributed mode
-
-📦 Configuration modes:
-    --standalone   standalone mode (default) - in-memory queue, no external dependencies
-    --distributed  distributed mode - Redis queue, multi-node support
-    --auto         auto mode - detects Redis availability automatically
-
-🎛️ Preset configurations:
-    --env development   development environment (debug-friendly)
-    --env production    production environment (high performance)
-    --env large-scale   large-scale crawling (memory-optimized)
-    --env gentle        gentle mode (low load)
-"""
-
-import os
-import sys
-import asyncio
-import argparse
-from pathlib import Path
-from crawlo.crawler import CrawlerProcess
-from crawlo.config import CrawloConfig
-from crawlo.mode_manager import standalone_mode, distributed_mode, auto_mode
-
-
-def create_parser():
-    """Create the command-line argument parser"""
-    parser = argparse.ArgumentParser(
-        description='{{project_name}} spider launcher - built on the Crawlo framework',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Example usage:
-  python run.py my_spider                  # standalone mode (default)
-  python run.py my_spider --distributed    # distributed mode
-  python run.py my_spider --env production # production configuration
-  python run.py spider1 spider2            # run multiple spiders
-  python run.py all                        # run all spiders
-  python run.py my_spider --dry-run        # test mode
-        """
-    )
-
-    # Spider names (positional argument)
-    parser.add_argument(
-        'spiders',
-        nargs='*',
-        help='Name(s) of the spider(s) to run (multiple allowed; "all" runs all spiders)'
-    )
-
-    # Run mode selection
-    mode_group = parser.add_mutually_exclusive_group()
-    mode_group.add_argument(
-        '--standalone',
-        action='store_true',
-        help='Standalone mode (default) - in-memory queue, no external dependencies'
-    )
-    mode_group.add_argument(
-        '--distributed',
-        action='store_true',
-        help='Distributed mode - Redis queue, multi-node crawling'
-    )
-    mode_group.add_argument(
-        '--auto',
-        action='store_true',
-        help='Auto mode - detect Redis availability and choose the queue type'
-    )
-
-    # Preset environment configurations
-    parser.add_argument(
-        '--env',
-        choices=['development', 'production', 'large-scale', 'gentle'],
-        help='Preset environment configuration (takes precedence over mode selection)'
-    )
-
-    # Performance tuning options
-    parser.add_argument(
-        '--concurrency',
-        type=int,
-        help='Number of concurrent requests (overrides the default setting)'
-    )
-
-    parser.add_argument(
-        '--delay',
-        type=float,
-        help='Request delay in seconds'
-    )
-
-    # Feature options
-    parser.add_argument(
-        '--dry-run',
-        action='store_true',
-        help='Dry-run mode - parse pages but do not perform actual crawling'
-    )
-
-    parser.add_argument(
-        '--debug',
-        action='store_true',
-        help='Enable debug mode - verbose log output'
-    )
-
-    parser.add_argument(
-        '--config-file',
-        type=str,
-        help='Path to a custom configuration file'
-    )
-
-    # Environment variable support
-    parser.add_argument(
-        '--from-env',
-        action='store_true',
-        help='Load configuration from environment variables (CRAWLO_*)'
-    )
-
-    return parser
-
-
-def build_config(args):
-    """Build the configuration from command-line arguments"""
-    config = None
-
-    # 1. Environment variable configuration takes precedence
-    if args.from_env:
-        config = CrawloConfig.from_env()
-        print("📋 Using configuration from environment variables")
-
-    # 2. Preset environment configuration
-    elif args.env:
-        presets = {
-            'development': CrawloConfig.presets().development(),
-            'production': CrawloConfig.presets().production(),
-            'large-scale': CrawloConfig.presets().large_scale(),
-            'gentle': CrawloConfig.presets().gentle()
-        }
-        config = presets[args.env]
-        print(f"🎛️ Using preset configuration: {args.env}")
-
-    # 3. Mode configuration
-    elif args.distributed:
-        config = CrawloConfig.distributed()
-        print("🌐 Distributed mode enabled")
-    elif args.auto:
-        config = CrawloConfig.auto()
-        print("🤖 Auto-detection mode enabled")
-    else:
-        # Standalone mode by default
-        config = CrawloConfig.standalone()
-        print("💻 Using standalone mode (default)")
-
-    # 4. Apply command-line overrides
-    if args.concurrency:
-        config.set('CONCURRENCY', args.concurrency)
-        print(f"⚡ Concurrency set to: {args.concurrency}")
-
-    if args.delay:
-        config.set('DOWNLOAD_DELAY', args.delay)
-        print(f"⏱️ Request delay set to: {args.delay} seconds")
-
-    if args.debug:
-        config.set('LOG_LEVEL', 'DEBUG')
-        print("🐛 Debug mode enabled")
-
-    if args.dry_run:
-        # Dry-run configuration (adjust as needed)
-        config.set('DOWNLOAD_DELAY', 0.1)  # speed things up
-        config.set('CONCURRENCY', 1)  # reduce concurrency
-        print("🧪 Dry-run mode enabled")
-
-    return config
-
-
-async def main():
-    """Main entry point: parse arguments, build the configuration, start the spiders"""
-
-    # Parse command-line arguments
-    parser = create_parser()
-    args = parser.parse_args()
-
-    # Check that a spider was specified
-    if not args.spiders:
-        print("❌ Please specify the name of the spider to run")
-        print("\nAvailable spiders:")
-        print("  # TODO: list your spiders here")
-        print("  # from {{project_name}}.spiders import MySpider")
-        print("\nUsage: python run.py <spider_name>")
-        parser.print_help()
-        return
-
-    # Build the configuration
-    config = build_config(args)
-
-    # Create the crawler process
-    print(f"\n🚀 Starting spiders: {', '.join(args.spiders)}")
-
-    if args.dry_run:
-        print("  🧪 [dry-run mode] pages will be parsed but not actually crawled")
-
-    try:
-        # Apply the configuration and start
-        process = CrawlerProcess(settings=config.to_dict())
-
-        # Check whether all spiders should be run
-        if 'all' in [s.lower() for s in args.spiders]:
-            # Get the names of all registered spiders
-            spider_names = process.get_spider_names()
-            if not spider_names:
-                print("❌ No spiders found")
-                print("💡 Please make sure that:")
-                print("   • spiders are defined in the 'spiders/' directory")
-                print("   • spider classes have a 'name' attribute")
-                return 1
-
-            print(f"📋 Found {len(spider_names)} spiders: {', '.join(spider_names)}")
-            # Run all spiders
-            await process.crawl(spider_names)
-        else:
-            # Run the specified spiders
-            await process.crawl(args.spiders)
-
-        print("\n✅ All spiders finished")
-
-    except ImportError as e:
-        print(f"❌ Failed to import spider: {e}")
-        print("   Please check that the spider file exists and update the import statements in run.py")
-    except Exception as e:
-        print(f"❌ Runtime error: {e}")
-        raise
-
-
-if __name__ == '__main__':
-    try:
-        asyncio.run(main())
-    except KeyboardInterrupt:
-        print("\n⏹️ Spider execution interrupted by user")
-    except Exception as e:
-        print(f"❌ Runtime error: {e}")
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+"""
+{{project_name}} project run script
+============================
+An intelligent spider launcher built on the Crawlo framework.
+Supports standalone/distributed modes, flexible configuration, ready to use out of the box.
+
+🎯 Quick start:
+    python run.py spider_name                    # run in standalone mode
+    python run.py spider_name --distributed      # run in distributed mode
+    python run.py spider_name --env production   # use a preset configuration
+    python run.py all                            # run all spiders
+
+🔧 Advanced options:
+    python run.py spider_name --dry-run          # dry run (no actual crawling)
+    python run.py spider_name --concurrency 16   # custom concurrency
+    python run.py spider_name --mode gentle      # gentle mode (low load)
+    python run.py spider1 spider2 --distributed  # run multiple spiders in distributed mode
+
+📦 Configuration modes:
+    --standalone   standalone mode (default) - in-memory queue, no external dependencies
+    --distributed  distributed mode - Redis queue, multi-node support
+    --auto         auto mode - detects Redis availability automatically
+
+🎛️ Preset configurations:
+    --env development   development environment (debug-friendly)
+    --env production    production environment (high performance)
+    --env large-scale   large-scale crawling (memory-optimized)
+    --env gentle        gentle mode (low load)
+"""
+
+import os
+import sys
+import asyncio
+import argparse
+from pathlib import Path
+from crawlo.crawler import CrawlerProcess
+from crawlo.config import CrawloConfig
+from crawlo.mode_manager import standalone_mode, distributed_mode, auto_mode
+
+
+def create_parser():
+    """Create the command-line argument parser"""
+    parser = argparse.ArgumentParser(
+        description='{{project_name}} spider launcher - built on the Crawlo framework',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Example usage:
+  python run.py my_spider                  # standalone mode (default)
+  python run.py my_spider --distributed    # distributed mode
+  python run.py my_spider --env production # production configuration
+  python run.py spider1 spider2            # run multiple spiders
+  python run.py all                        # run all spiders
+  python run.py my_spider --dry-run        # test mode
+        """
+    )
+
+    # Spider names (positional argument)
+    parser.add_argument(
+        'spiders',
+        nargs='*',
+        help='Name(s) of the spider(s) to run (multiple allowed; "all" runs all spiders)'
+    )
+
+    # Run mode selection
+    mode_group = parser.add_mutually_exclusive_group()
+    mode_group.add_argument(
+        '--standalone',
+        action='store_true',
+        help='Standalone mode (default) - in-memory queue, no external dependencies'
+    )
+    mode_group.add_argument(
+        '--distributed',
+        action='store_true',
+        help='Distributed mode - Redis queue, multi-node crawling'
+    )
+    mode_group.add_argument(
+        '--auto',
+        action='store_true',
+        help='Auto mode - detect Redis availability and choose the queue type'
+    )
+
+    # Preset environment configurations
+    parser.add_argument(
+        '--env',
+        choices=['development', 'production', 'large-scale', 'gentle'],
+        help='Preset environment configuration (takes precedence over mode selection)'
+    )
+
+    # Performance tuning options
+    parser.add_argument(
+        '--concurrency',
+        type=int,
+        help='Number of concurrent requests (overrides the default setting)'
+    )
+
+    parser.add_argument(
+        '--delay',
+        type=float,
+        help='Request delay in seconds'
+    )
+
+    # Feature options
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='Dry-run mode - parse pages but do not perform actual crawling'
+    )
+
+    parser.add_argument(
+        '--debug',
+        action='store_true',
+        help='Enable debug mode - verbose log output'
+    )
+
+    parser.add_argument(
+        '--config-file',
+        type=str,
+        help='Path to a custom configuration file'
+    )
+
+    # Environment variable support
+    parser.add_argument(
+        '--from-env',
+        action='store_true',
+        help='Load configuration from environment variables (CRAWLO_*)'
+    )
+
+    return parser
+
+
+def build_config(args):
+    """Build the configuration from command-line arguments"""
+    config = None
+
+    # 1. Environment variable configuration takes precedence
+    if args.from_env:
+        config = CrawloConfig.from_env()
+        print("📋 Using configuration from environment variables")
+
+    # 2. Preset environment configuration
+    elif args.env:
+        presets = {
+            'development': CrawloConfig.presets().development(),
+            'production': CrawloConfig.presets().production(),
+            'large-scale': CrawloConfig.presets().large_scale(),
+            'gentle': CrawloConfig.presets().gentle()
+        }
+        config = presets[args.env]
+        print(f"🎛️ Using preset configuration: {args.env}")
+
+    # 3. Mode configuration
+    elif args.distributed:
+        config = CrawloConfig.distributed()
+        print("🌐 Distributed mode enabled")
+    elif args.auto:
+        config = CrawloConfig.auto()
+        print("🤖 Auto-detection mode enabled")
+    else:
+        # Standalone mode by default
+        config = CrawloConfig.standalone()
+        print("💻 Using standalone mode (default)")
+
+    # 4. Apply command-line overrides
+    if args.concurrency:
+        config.set('CONCURRENCY', args.concurrency)
+        print(f"⚡ Concurrency set to: {args.concurrency}")
+
+    if args.delay:
+        config.set('DOWNLOAD_DELAY', args.delay)
+        print(f"⏱️ Request delay set to: {args.delay} seconds")
+
+    if args.debug:
+        config.set('LOG_LEVEL', 'DEBUG')
+        print("🐛 Debug mode enabled")
+
+    if args.dry_run:
+        # Dry-run configuration (adjust as needed)
+        config.set('DOWNLOAD_DELAY', 0.1)  # speed things up
+        config.set('CONCURRENCY', 1)  # reduce concurrency
+        print("🧪 Dry-run mode enabled")
+
+    return config
+
+
+async def main():
+    """Main entry point: parse arguments, build the configuration, start the spiders"""
+
+    # Parse command-line arguments
+    parser = create_parser()
+    args = parser.parse_args()
+
+    # Check that a spider was specified
+    if not args.spiders:
+        print("❌ Please specify the name of the spider to run")
+        print("\nAvailable spiders:")
+        print("  # TODO: list your spiders here")
+        print("  # from {{project_name}}.spiders import MySpider")
+        print("\nUsage: python run.py <spider_name>")
+        parser.print_help()
+        return
+
+    # Build the configuration
+    config = build_config(args)
+
+    # Create the crawler process
+    print(f"\n🚀 Starting spiders: {', '.join(args.spiders)}")
+
+    if args.dry_run:
+        print("  🧪 [dry-run mode] pages will be parsed but not actually crawled")
+
+    try:
+        # Apply the configuration and start
+        process = CrawlerProcess(settings=config.to_dict())
+
+        # Check whether all spiders should be run
+        if 'all' in [s.lower() for s in args.spiders]:
+            # Get the names of all registered spiders
+            spider_names = process.get_spider_names()
+            if not spider_names:
+                print("❌ No spiders found")
+                print("💡 Please make sure that:")
+                print("   • spiders are defined in the 'spiders/' directory")
+                print("   • spider classes have a 'name' attribute")
+                return 1
+
+            print(f"📋 Found {len(spider_names)} spiders: {', '.join(spider_names)}")
+            # Run all spiders
+            await process.crawl(spider_names)
+        else:
+            # Run the specified spiders
+            await process.crawl(args.spiders)
+
+        print("\n✅ All spiders finished")
+
+    except ImportError as e:
+        print(f"❌ Failed to import spider: {e}")
+        print("   Please check that the spider file exists and update the import statements in run.py")
+    except Exception as e:
+        print(f"❌ Runtime error: {e}")
+        raise
+
+
+if __name__ == '__main__':
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        print("\n⏹️ Spider execution interrupted by user")
+    except Exception as e:
+        print(f"❌ Runtime error: {e}")
         sys.exit(1)