crawlo-1.2.2-py3-none-any.whl → crawlo-1.2.3-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crawlo has been flagged as possibly problematic.
- crawlo/__version__.py +1 -1
- crawlo/commands/check.py +69 -69
- crawlo/commands/genspider.py +25 -25
- crawlo/commands/list.py +23 -23
- crawlo/commands/run.py +34 -34
- crawlo/commands/startproject.py +35 -36
- crawlo/commands/stats.py +21 -21
- crawlo/commands/utils.py +4 -4
- crawlo/config.py +4 -4
- crawlo/config_validator.py +1 -2
- crawlo/data/__init__.py +6 -0
- crawlo/data/user_agents.py +108 -0
- crawlo/downloader/hybrid_downloader.py +0 -1
- crawlo/downloader/selenium_downloader.py +1 -1
- crawlo/extension/health_check.py +2 -2
- crawlo/items/fields.py +0 -1
- crawlo/middleware/offsite.py +0 -1
- crawlo/middleware/proxy.py +3 -2
- crawlo/middleware/request_ignore.py +0 -1
- crawlo/middleware/response_code.py +0 -1
- crawlo/middleware/response_filter.py +0 -1
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +3 -3
- crawlo/pipelines/database_dedup_pipeline.py +1 -3
- crawlo/pipelines/memory_dedup_pipeline.py +2 -2
- crawlo/pipelines/mysql_pipeline.py +4 -3
- crawlo/pipelines/redis_dedup_pipeline.py +2 -4
- crawlo/project.py +2 -2
- crawlo/subscriber.py +1 -2
- crawlo/templates/project/settings.py.tmpl +1 -3
- crawlo/templates/project/settings_distributed.py.tmpl +2 -0
- crawlo/utils/batch_processor.py +2 -3
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/enhanced_error_handler.py +3 -6
- crawlo/utils/env_config.py +1 -1
- crawlo/utils/error_handler.py +2 -4
- crawlo/utils/large_scale_helper.py +2 -1
- crawlo/utils/performance_monitor.py +5 -4
- crawlo/utils/redis_connection_pool.py +4 -4
- crawlo/utils/redis_key_validator.py +1 -2
- crawlo/utils/request_serializer.py +1 -2
- crawlo/utils/spider_loader.py +0 -1
- {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/METADATA +1 -1
- {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/RECORD +47 -45
- {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/WHEEL +0 -0
- {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/top_level.txt +0 -0
crawlo/commands/run.py
CHANGED
@@ -56,8 +56,8 @@ def main(args):
         crawlo run <spider_name>|all [--json] [--no-stats]
     """
     if len(args) < 1:
-        console.print("[bold red]❌
-        console.print("💡
+        console.print("[bold red]❌ 用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
+        console.print("💡 示例:")
         console.print(" [blue]crawlo run baidu[/blue]")
         console.print(" [blue]crawlo run all[/blue]")
         console.print(" [blue]crawlo run all --json --no-stats[/blue]")
@@ -72,14 +72,14 @@ def main(args):
     # 1. 查找项目根目录
     project_root = get_project_root()
     if not project_root:
-        msg = ":cross_mark: [bold red]
+        msg = ":cross_mark: [bold red]找不到 'crawlo.cfg'[/bold red]\n💡 请在项目目录中运行此命令。"
         if show_json:
-            console.print_json(data={"success": False, "error": "
+            console.print_json(data={"success": False, "error": "未找到项目根目录"})
             return 1
         else:
             console.print(Panel(
                 Text.from_markup(msg),
-                title="❌
+                title="❌ 非Crawlo项目",
                 border_style="red",
                 padding=(1, 2)
             ))
@@ -92,24 +92,24 @@ def main(args):
     # 2. 读取 crawlo.cfg 获取 settings 模块
     cfg_file = project_root / "crawlo.cfg"
     if not cfg_file.exists():
-        msg = f"
+        msg = f"在 {project_root} 中未找到 crawlo.cfg"
         if show_json:
             console.print_json(data={"success": False, "error": msg})
             return 1
         else:
-            console.print(Panel(msg, title="❌
+            console.print(Panel(msg, title="❌ 缺少配置文件", border_style="red"))
             return 1
 
     config = configparser.ConfigParser()
     config.read(cfg_file, encoding="utf-8")
 
     if not config.has_section("settings") or not config.has_option("settings", "default"):
-        msg = "
+        msg = "crawlo.cfg 中缺少 [settings] 部分或 'default' 选项"
         if show_json:
             console.print_json(data={"success": False, "error": msg})
             return 1
         else:
-            console.print(Panel(msg, title="❌
+            console.print(Panel(msg, title="❌ 无效配置", border_style="red"))
             return 1
 
     settings_module = config.get("settings", "default")
@@ -119,12 +119,12 @@ def main(args):
     try:
         import_module(project_package)
     except ImportError as e:
-        msg = f"
+        msg = f"导入项目包 '{project_package}' 失败: {e}"
         if show_json:
             console.print_json(data={"success": False, "error": msg})
             return 1
         else:
-            console.print(Panel(msg, title="❌
+            console.print(Panel(msg, title="❌ 导入错误", border_style="red"))
             return 1
 
     # 4. 加载 settings 和爬虫模块
@@ -136,20 +136,20 @@ def main(args):
         if spider_arg.lower() == "all":
             spider_names = process.get_spider_names()
             if not spider_names:
-                msg = "
+                msg = "未找到爬虫。"
                 if show_json:
                     console.print_json(data={"success": False, "error": msg})
                     return 1
                 else:
                     console.print(Panel(
                         Text.from_markup(
-                            ":cross_mark: [bold red]
-                            "[bold]💡
-                            " •
-                            " •
-                            " •
+                            ":cross_mark: [bold red]未找到爬虫。[/bold red]\n\n"
+                            "[bold]💡 确保:[/bold]\n"
+                            " • 爬虫定义于 '[cyan]spiders/[/cyan]' 目录\n"
+                            " • 具有 [green]`name`[/green] 属性\n"
+                            " • 模块已导入 (例如通过 [cyan]__init__.py[/cyan])"
                         ),
-                        title="❌
+                        title="❌ 未找到爬虫",
                         border_style="red",
                         padding=(1, 2)
                     ))
@@ -157,13 +157,13 @@ def main(args):
 
             # 显示即将运行的爬虫列表
             table = Table(
-                title=f"🚀
+                title=f"🚀 启动全部 {len(spider_names)} 个爬虫",
                 box=box.ROUNDED,
                 show_header=True,
                 header_style="bold magenta"
             )
-            table.add_column("
-            table.add_column("
+            table.add_column("名称", style="cyan")
+            table.add_column("类名", style="green")
 
             for name in sorted(spider_names):
                 cls = process.get_spider_class(name)
@@ -183,15 +183,15 @@ def main(args):
                 TextColumn("[progress.description]{task.description}"),
                 transient=True,
             ) as progress:
-                task = progress.add_task("
+                task = progress.add_task("正在运行所有爬虫...", total=None)
                 asyncio.run(process.crawl(spider_names))
 
             if show_json:
                 console.print_json(data={"success": True, "spiders": spider_names})
             else:
                 console.print(Panel(
-                    ":tada: [bold green]
-                    title="✅
+                    ":tada: [bold green]所有爬虫运行完成![/bold green]",
+                    title="✅ 全部完成",
                     border_style="green"
                 ))
             return 0
@@ -200,7 +200,7 @@ def main(args):
         spider_name = spider_arg
         if not process.is_spider_registered(spider_name):
             available = process.get_spider_names()
-            msg = f"
+            msg = f"爬虫 '[cyan]{spider_name}[/cyan]' 未找到。"
             if show_json:
                 console.print_json(data={
                     "success": False,
@@ -211,16 +211,16 @@ def main(args):
             else:
                 panel_content = Text.from_markup(msg + "\n")
                 if available:
-                    panel_content.append("\n💡
+                    panel_content.append("\n💡 可用爬虫:\n")
                     for name in sorted(available):
                         cls = process.get_spider_class(name)
                         panel_content.append(f" • [cyan]{name}[/cyan] ([green]{cls.__name__}[/green])\n")
                 else:
-                    panel_content.append("\n💡
+                    panel_content.append("\n💡 未找到爬虫。请检查爬虫模块。")
 
                 console.print(Panel(
                     panel_content,
-                    title="❌
+                    title="❌ 爬虫未找到",
                     border_style="red",
                     padding=(1, 2)
                 ))
@@ -231,7 +231,7 @@ def main(args):
         # 显示启动信息
         if not show_json:
             info_table = Table(
-                title=f"🚀
+                title=f"🚀 启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
                 box=box.SIMPLE,
                 show_header=False,
                 title_style="bold green"
@@ -255,21 +255,21 @@ def main(args):
             TextColumn("[progress.description]{task.description}"),
             transient=True,
         ) as progress:
-            task = progress.add_task(f"
+            task = progress.add_task(f"正在运行 {spider_name}...", total=None)
             asyncio.run(process.crawl(spider_name))
 
         if show_json:
             console.print_json(data={"success": True, "spider": spider_name})
         else:
             console.print(Panel(
-                f":tada: [bold green]
-                title="✅
+                f":tada: [bold green]爬虫 '[cyan]{spider_name}[/cyan]' 运行完成![/bold green]",
+                title="✅ 完成",
                 border_style="green"
             ))
         return 0
 
     except KeyboardInterrupt:
-        msg = "⚠️
+        msg = "⚠️ 爬虫被用户中断。"
         if show_json:
             console.print_json(data={"success": False, "error": msg})
         else:
@@ -277,7 +277,7 @@ def main(args):
             return 1
     except Exception as e:
         logger.exception("Exception during 'crawlo run'")
-        msg = f"
+        msg = f"意外错误: {e}"
        if show_json:
            console.print_json(data={"success": False, "error": msg})
        else:
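The hunks above leave the bootstrap of `crawlo run` unchanged: it locates the project root by searching for crawlo.cfg, requires a [settings] section whose 'default' option names the settings module, and imports the project package before any spider starts. A minimal sketch of that lookup, assuming only what the diff shows (the helper name and the English error strings here are illustrative, not part of the package):

import configparser
from pathlib import Path

def read_settings_module(project_root: Path) -> str:
    # Mirrors the checks in the hunks above: the file must exist and
    # must carry a [settings] section with a 'default' option.
    cfg_file = project_root / "crawlo.cfg"
    if not cfg_file.exists():
        raise FileNotFoundError(f"crawlo.cfg not found in {project_root}")
    config = configparser.ConfigParser()
    config.read(cfg_file, encoding="utf-8")
    if not config.has_section("settings") or not config.has_option("settings", "default"):
        raise ValueError("crawlo.cfg is missing [settings] or its 'default' option")
    return config.get("settings", "default")  # e.g. "myproject.settings"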
crawlo/commands/startproject.py
CHANGED
@@ -211,13 +211,13 @@ def validate_project_name(project_name: str) -> tuple[bool, str]:
     """
     # 检查是否为空
     if not project_name or not project_name.strip():
-        return False, "
+        return False, "项目名称不能为空"
 
     project_name = project_name.strip()
 
     # 检查长度
     if len(project_name) > 50:
-        return False, "
+        return False, "项目名称太长(最多50个字符)"
 
     # 检查是否为Python关键字
     python_keywords = {
@@ -228,22 +228,21 @@ def validate_project_name(project_name: str) -> tuple[bool, str]:
         'while', 'with', 'yield'
     }
     if project_name in python_keywords:
-        return False, f"'{project_name}'
+        return False, f"'{project_name}' 是Python关键字,不能用作项目名称"
 
     # 检查是否为有效的Python标识符
     if not project_name.isidentifier():
-        return False, "
+        return False, "项目名称必须是有效的Python标识符"
 
     # 检查格式(建议使用snake_case)
     if not re.match(r'^[a-z][a-z0-9_]*$', project_name):
         return False, (
-            "
-            "contain only lowercase letters, numbers, and underscores"
+            "项目名称应以小写字母开头,只能包含小写字母、数字和下划线"
         )
 
     # 检查是否以数字结尾(不推荐)
     if project_name[-1].isdigit():
-        return False, "
+        return False, "项目名称不应以数字结尾"
 
     return True, ""
 
@@ -284,8 +283,8 @@ def show_module_options():
 
 def main(args):
     if len(args) < 1:
-        console.print("[bold red]
-        console.print("💡
+        console.print("[bold red]错误:[/bold red] 用法: [blue]crawlo startproject[/blue] <项目名称> [模板类型] [--modules 模块1,模块2]")
+        console.print("💡 示例:")
         console.print(" [blue]crawlo startproject[/blue] my_spider_project")
         console.print(" [blue]crawlo startproject[/blue] news_crawler simple")
         console.print(" [blue]crawlo startproject[/blue] ecommerce_spider distributed --modules mysql,proxy")
@@ -315,8 +314,8 @@ def main(args):
     # 验证模板类型
     if template_type not in TEMPLATE_TYPES:
         show_error_panel(
-            "
-            f"
+            "无效的模板类型",
+            f"不支持模板类型 '[cyan]{template_type}[/cyan]'。\n"
         )
         show_template_options()
         return 1
@@ -325,14 +324,14 @@ def main(args):
     is_valid, error_msg = validate_project_name(project_name)
     if not is_valid:
         show_error_panel(
-            "
-            f"[cyan]{project_name}[/cyan]
+            "无效的项目名称",
+            f"[cyan]{project_name}[/cyan] 不是有效的项目名称。\n"
             f"❌ {error_msg}\n\n"
-            "💡
-            " •
-            " •
-            " •
-            " •
+            "💡 项目名称应:\n"
+            " • 以小写字母开头\n"
+            " • 只能包含小写字母、数字和下划线\n"
+            " • 是有效的Python标识符\n"
+            " • 不能是Python关键字"
         )
         return 1
 
@@ -340,9 +339,9 @@ def main(args):
 
     if project_dir.exists():
         show_error_panel(
-            "
-            f"
-            "💡
+            "目录已存在",
+            f"目录 '[cyan]{project_dir}[/cyan]' 已存在。\n"
+            "💡 请选择不同的项目名称或删除现有目录。"
         )
         return 1
 
@@ -358,26 +357,26 @@ def main(args):
         if cfg_template.exists():
             cfg_content = _render_template(cfg_template, context)
             (project_dir / 'crawlo.cfg').write_text(cfg_content, encoding='utf-8')
-            console.print(f":white_check_mark:
+            console.print(f":white_check_mark: 已创建 [green]{project_dir / 'crawlo.cfg'}[/green]")
         else:
-            console.print("[yellow]⚠
+            console.print("[yellow]⚠ 警告:[/yellow] 找不到模板 'crawlo.cfg.tmpl'。")
 
         # 3. 复制并渲染项目包内容
         package_dir = project_dir / project_name
         _copytree_with_templates(template_dir, package_dir, context, template_type, modules)
-        console.print(f":white_check_mark:
+        console.print(f":white_check_mark: 已创建项目包: [green]{package_dir}[/green]")
 
         # 4. 创建 logs 目录
         (project_dir / 'logs').mkdir(exist_ok=True)
-        console.print(":white_check_mark:
+        console.print(":white_check_mark: 已创建 logs 目录")
 
         # 5. 创建 output 目录(用于数据输出)
         (project_dir / 'output').mkdir(exist_ok=True)
-        console.print(":white_check_mark:
+        console.print(":white_check_mark: 已创建 output 目录")
 
         # 成功面板
-        success_text = Text.from_markup(f"
-        console.print(Panel(success_text, title=":rocket:
+        success_text = Text.from_markup(f"项目 '[bold cyan]{project_name}[/bold cyan]' 创建成功!")
+        console.print(Panel(success_text, title=":rocket: 成功", border_style="green", padding=(1, 2)))
 
         # 显示使用的模板类型
         if template_type != 'default':
@@ -389,15 +388,15 @@ def main(args):
 
         # 下一步操作提示(对齐美观 + 语法高亮)
         next_steps = f"""
-[bold]🚀
+[bold]🚀 下一步操作:[/bold]
 [blue]cd[/blue] {project_name}
 [blue]crawlo genspider[/blue] example example.com
 [blue]crawlo run[/blue] example
 
-[bold]📚
-[blue]crawlo list[/blue] #
-[blue]crawlo check[/blue] example #
-[blue]crawlo stats[/blue] #
+[bold]📚 了解更多:[/bold]
+[blue]crawlo list[/blue] # 列出所有爬虫
+[blue]crawlo check[/blue] example # 检查爬虫有效性
+[blue]crawlo stats[/blue] # 查看统计信息
 """.strip()
         console.print(next_steps)
 
@@ -405,12 +404,12 @@ def main(args):
 
     except Exception as e:
         show_error_panel(
-            "
-            f"
+            "创建失败",
+            f"创建项目失败: {e}"
         )
         if project_dir.exists():
             shutil.rmtree(project_dir, ignore_errors=True)
-            console.print("[red]:cross_mark:
+            console.print("[red]:cross_mark: 已清理部分创建的项目。[/red]")
         return 1
 
 if __name__ == "__main__":
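Condensed, the naming rules that validate_project_name enforces above are: non-empty, at most 50 characters, not a Python keyword, a valid identifier, snake_case matching ^[a-z][a-z0-9_]*$, and not ending in a digit. A self-contained sketch of the same checks with English messages; it substitutes keyword.iskeyword() for the hard-coded keyword set visible in the diff:

import keyword
import re

def validate_project_name(project_name: str) -> tuple[bool, str]:
    name = project_name.strip() if project_name else ""
    if not name:
        return False, "project name must not be empty"
    if len(name) > 50:
        return False, "project name too long (max 50 characters)"
    if keyword.iskeyword(name):  # the diff checks a literal keyword set instead
        return False, f"'{name}' is a Python keyword"
    if not name.isidentifier():
        return False, "project name must be a valid Python identifier"
    if not re.match(r'^[a-z][a-z0-9_]*$', name):
        return False, "start with a lowercase letter; use only lowercase letters, digits, underscores"
    if name[-1].isdigit():
        return False, "project name should not end with a digit"
    return True, ""

assert validate_project_name("my_spider_project") == (True, "")
assert not validate_project_name("2fast")[0]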
crawlo/commands/stats.py
CHANGED
@@ -61,9 +61,9 @@ def record_stats(crawler):
             "timestamp": datetime.now().isoformat(),
             "stats": stats
         }, f, ensure_ascii=False, indent=2, default=str)
-        logger.info(f"📊
+        logger.info(f"📊 爬虫 '{spider_name}' 的统计信息已保存 → {filename}")
     except Exception as e:
-        logger.error(f"
+        logger.error(f"保存 '{spider_name}' 的统计信息失败: {e}")
 
 
 def load_all_stats() -> Dict[str, list]:
@@ -85,7 +85,7 @@ def load_all_stats() -> Dict[str, list]:
             spider_name = data.get("spider", "unknown")
             result.setdefault(spider_name, []).append(data)
         except Exception as e:
-            logger.warning(f"
+            logger.warning(f"加载统计文件 {file} 失败: {e}")
     return result
 
 
@@ -99,11 +99,11 @@ def format_value(v: Any) -> str:
     return s
 
 
-def display_stats_table(stats_data: dict, title: str = "
+def display_stats_table(stats_data: dict, title: str = "统计信息"):
     """通用函数:用 rich.table 展示统计数据"""
     table = Table(title=title, box=box.ROUNDED, show_header=True, header_style="bold magenta")
-    table.add_column("
-    table.add_column("
+    table.add_column("键", style="cyan", no_wrap=True)
+    table.add_column("值", style="green")
 
     for k in sorted(stats_data.keys()):
         table.add_row(k, format_value(stats_data[k]))
@@ -120,7 +120,7 @@ def main(args):
     crawlo stats myspider --all → 显示所有历史(同上)
     """
     if len(args) > 2:
-        console.print("[bold red]
+        console.print("[bold red]错误:[/bold red] 用法: [blue]crawlo stats[/blue] [爬虫名称] [--all]")
         return 1
 
     spider_name = None
@@ -135,11 +135,11 @@ def main(args):
     if not all_stats:
         console.print(Panel(
             Text.from_markup(
-                ":chart_with_upwards_trend: [bold]
-                "💡
-                f"📁
+                ":chart_with_upwards_trend: [bold]未找到统计信息。[/bold]\n"
+                "💡 先运行一个爬虫以生成统计信息。\n"
+                f"📁 统计目录: [cyan]{get_stats_dir()}[/cyan]"
             ),
-            title="📊
+            title="📊 统计信息",
             border_style="yellow",
             padding=(1, 2)
         ))
@@ -148,8 +148,8 @@ def main(args):
     # 显示所有爬虫最近一次运行
     if not spider_name:
         console.print(Panel(
-            "[bold]
-            title="📊
+            "[bold]最近的爬虫统计信息(上次运行)[/bold]",
+            title="📊 爬虫统计概览",
             border_style="green",
             padding=(0, 1)
         ))
@@ -158,31 +158,31 @@ def main(args):
             latest = runs[0]
             ts = latest['timestamp'][:19]
             console.print(f"🕷️ [bold cyan]{name}[/bold cyan] ([green]{ts}[/green])")
-            display_stats_table(latest["stats"], title=f"
+            display_stats_table(latest["stats"], title=f"{name} 的统计信息")
             console.print()  # 空行分隔
 
         return 0
 
     # 显示指定爬虫的历史
     if spider_name not in all_stats:
-        console.print(f"[bold red]:cross_mark:
+        console.print(f"[bold red]:cross_mark: 未找到爬虫 '[cyan]{spider_name}[/cyan]' 的统计信息[/bold red]")
         available = ', '.join(all_stats.keys())
         if available:
-            console.print(f":bulb:
+            console.print(f":bulb: 可用爬虫: [green]{available}[/green]")
         return 1
 
     runs = all_stats[spider_name]
     if show_all:
-        console.print(f":bar_chart: [bold]
+        console.print(f":bar_chart: [bold]'[cyan]{spider_name}[/cyan]' 的所有运行记录 ({len(runs)} 次):[/bold]")
     else:
         runs = runs[:1]
-        console.print(f":bar_chart: [bold]
+        console.print(f":bar_chart: [bold]'[cyan]{spider_name}[/cyan]' 的上次运行:[/bold]")
 
     for i, run in enumerate(runs, 1):
         ts = run['timestamp']
-        subtitle = f"
-        display_stats_table(run["stats"], title=f"
+        subtitle = f"运行 #{i} · {ts}" if show_all else f"上次运行 · {ts}"
+        display_stats_table(run["stats"], title=f"{spider_name} 的统计信息 — {subtitle}")
         if i < len(runs):
             console.print("─" * 60)
 
-    return 0
+    return 0
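record_stats above writes one JSON document per run with the keys "spider", "timestamp", and "stats", and load_all_stats regroups those files by spider name. A rough re-reading sketch under those assumptions (the explicit stats_dir parameter stands in for get_stats_dir(), which the diff references but does not show):

import json
from pathlib import Path
from typing import Dict, List

def load_all_stats(stats_dir: Path) -> Dict[str, List[dict]]:
    result: Dict[str, List[dict]] = {}
    for file in sorted(stats_dir.glob("*.json")):
        try:
            data = json.loads(file.read_text(encoding="utf-8"))
        except Exception:
            continue  # the real command logs a warning and skips the file
        result.setdefault(data.get("spider", "unknown"), []).append(data)
    return result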
crawlo/commands/utils.py
CHANGED
@@ -46,7 +46,7 @@ def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
     # 1. 查找项目根目录
     project_root = get_project_root()
     if not project_root:
-        return False, None, "
+        return False, None, "找不到 'crawlo.cfg'。请在项目目录中运行此命令。"
 
     # 2. 将项目根加入 Python 路径
     project_root_str = str(project_root)
@@ -60,10 +60,10 @@ def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
     try:
         config.read(cfg_file, encoding="utf-8")
     except Exception as e:
-        return False, None, f"
+        return False, None, f"读取 crawlo.cfg 失败: {e}"
 
     if not config.has_section("settings") or not config.has_option("settings", "default"):
-        return False, None, "
+        return False, None, "无效的 crawlo.cfg:缺少 [settings] 部分或 'default' 选项"
 
     # 4. 获取项目包名
     settings_module = config.get("settings", "default")
@@ -73,7 +73,7 @@ def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
     try:
         import_module(project_package)
     except ImportError as e:
-        return False, None, f"
+        return False, None, f"导入项目包 '{project_package}' 失败: {e}"
 
     return True, project_package, None
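validate_project_environment packs the same crawlo.cfg checks that run.py performs inline into a single (success, package_name, error_message) tuple. A hypothetical caller, relying only on the signature shown above:

from crawlo.commands.utils import validate_project_environment

ok, project_package, error = validate_project_environment()
if not ok:
    # error carries the human-readable reason produced in the hunks above
    raise SystemExit(f"environment check failed: {error}")
print(f"project package ready: {project_package}")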
crawlo/config.py
CHANGED
@@ -19,11 +19,11 @@ Crawlo 配置工厂
     config = CrawloConfig.from_env()
 """
 
-from typing import Dict, Any, Optional
-
-from crawlo.mode_manager import ModeManager, standalone_mode, distributed_mode, auto_mode, from_env
-from crawlo.utils.log import get_logger
+from typing import Dict, Any, Optional
+
 from crawlo.config_validator import validate_config
+from crawlo.mode_manager import standalone_mode, distributed_mode, auto_mode, from_env
+from crawlo.utils.log import get_logger
 
 
 class CrawloConfig:
crawlo/config_validator.py
CHANGED