crawlo-1.2.2-py3-none-any.whl → crawlo-1.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (47)
  1. crawlo/__version__.py +1 -1
  2. crawlo/commands/check.py +69 -69
  3. crawlo/commands/genspider.py +25 -25
  4. crawlo/commands/list.py +23 -23
  5. crawlo/commands/run.py +34 -34
  6. crawlo/commands/startproject.py +35 -36
  7. crawlo/commands/stats.py +21 -21
  8. crawlo/commands/utils.py +4 -4
  9. crawlo/config.py +4 -4
  10. crawlo/config_validator.py +1 -2
  11. crawlo/data/__init__.py +6 -0
  12. crawlo/data/user_agents.py +108 -0
  13. crawlo/downloader/hybrid_downloader.py +0 -1
  14. crawlo/downloader/selenium_downloader.py +1 -1
  15. crawlo/extension/health_check.py +2 -2
  16. crawlo/items/fields.py +0 -1
  17. crawlo/middleware/offsite.py +0 -1
  18. crawlo/middleware/proxy.py +3 -2
  19. crawlo/middleware/request_ignore.py +0 -1
  20. crawlo/middleware/response_code.py +0 -1
  21. crawlo/middleware/response_filter.py +0 -1
  22. crawlo/middleware/retry.py +1 -1
  23. crawlo/mode_manager.py +3 -3
  24. crawlo/pipelines/database_dedup_pipeline.py +1 -3
  25. crawlo/pipelines/memory_dedup_pipeline.py +2 -2
  26. crawlo/pipelines/mysql_pipeline.py +4 -3
  27. crawlo/pipelines/redis_dedup_pipeline.py +2 -4
  28. crawlo/project.py +2 -2
  29. crawlo/subscriber.py +1 -2
  30. crawlo/templates/project/settings.py.tmpl +1 -3
  31. crawlo/templates/project/settings_distributed.py.tmpl +2 -0
  32. crawlo/utils/batch_processor.py +2 -3
  33. crawlo/utils/controlled_spider_mixin.py +1 -1
  34. crawlo/utils/enhanced_error_handler.py +3 -6
  35. crawlo/utils/env_config.py +1 -1
  36. crawlo/utils/error_handler.py +2 -4
  37. crawlo/utils/large_scale_helper.py +2 -1
  38. crawlo/utils/performance_monitor.py +5 -4
  39. crawlo/utils/redis_connection_pool.py +4 -4
  40. crawlo/utils/redis_key_validator.py +1 -2
  41. crawlo/utils/request_serializer.py +1 -2
  42. crawlo/utils/spider_loader.py +0 -1
  43. {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/METADATA +1 -1
  44. {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/RECORD +47 -45
  45. {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/WHEEL +0 -0
  46. {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/entry_points.txt +0 -0
  47. {crawlo-1.2.2.dist-info → crawlo-1.2.3.dist-info}/top_level.txt +0 -0
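Item 1 is the routine version bump; a hedged reconstruction of that one-line change (the variable name follows packaging convention and is not shown in this diff):

    # Hypothetical reconstruction of the +1/-1 change in crawlo/__version__.py;
    # the assignment form is the usual convention, not confirmed here.
    __version__ = "1.2.3"  # previously "1.2.2"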
crawlo/commands/run.py CHANGED
@@ -56,8 +56,8 @@ def main(args):
      crawlo run <spider_name>|all [--json] [--no-stats]
      """
      if len(args) < 1:
-         console.print("[bold red]❌ Usage:[/bold red] [blue]crawlo run[/blue] <spider_name>|all [bold yellow][--json] [--no-stats][/bold yellow]")
-         console.print("💡 Examples:")
+         console.print("[bold red]❌ 用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
+         console.print("💡 示例:")
          console.print(" [blue]crawlo run baidu[/blue]")
          console.print(" [blue]crawlo run all[/blue]")
          console.print(" [blue]crawlo run all --json --no-stats[/blue]")
@@ -72,14 +72,14 @@ def main(args):
      # 1. 查找项目根目录
      project_root = get_project_root()
      if not project_root:
-         msg = ":cross_mark: [bold red]Cannot find 'crawlo.cfg'[/bold red]\n💡 Run this command inside your project directory."
+         msg = ":cross_mark: [bold red]找不到 'crawlo.cfg'[/bold red]\n💡 请在项目目录中运行此命令。"
          if show_json:
-             console.print_json(data={"success": False, "error": "Project root not found"})
+             console.print_json(data={"success": False, "error": "未找到项目根目录"})
              return 1
          else:
              console.print(Panel(
                  Text.from_markup(msg),
-                 title="❌ Not in a Crawlo Project",
+                 title="❌ Crawlo项目",
                  border_style="red",
                  padding=(1, 2)
              ))
@@ -92,24 +92,24 @@ def main(args):
      # 2. 读取 crawlo.cfg 获取 settings 模块
      cfg_file = project_root / "crawlo.cfg"
      if not cfg_file.exists():
-         msg = f"crawlo.cfg not found in {project_root}"
+         msg = f" {project_root} 中未找到 crawlo.cfg"
          if show_json:
              console.print_json(data={"success": False, "error": msg})
              return 1
          else:
-             console.print(Panel(msg, title="❌ Missing Config", border_style="red"))
+             console.print(Panel(msg, title="❌ 缺少配置文件", border_style="red"))
              return 1

      config = configparser.ConfigParser()
      config.read(cfg_file, encoding="utf-8")

      if not config.has_section("settings") or not config.has_option("settings", "default"):
-         msg = "Missing [settings] section or 'default' option in crawlo.cfg"
+         msg = "crawlo.cfg 中缺少 [settings] 部分或 'default' 选项"
          if show_json:
              console.print_json(data={"success": False, "error": msg})
              return 1
          else:
-             console.print(Panel(msg, title="❌ Invalid Config", border_style="red"))
+             console.print(Panel(msg, title="❌ 无效配置", border_style="red"))
              return 1

      settings_module = config.get("settings", "default")
@@ -119,12 +119,12 @@ def main(args):
      try:
          import_module(project_package)
      except ImportError as e:
-         msg = f"Failed to import project package '{project_package}': {e}"
+         msg = f"导入项目包 '{project_package}' 失败: {e}"
          if show_json:
              console.print_json(data={"success": False, "error": msg})
              return 1
          else:
-             console.print(Panel(msg, title="❌ Import Error", border_style="red"))
+             console.print(Panel(msg, title="❌ 导入错误", border_style="red"))
              return 1

      # 4. 加载 settings 和爬虫模块
@@ -136,20 +136,20 @@ def main(args):
      if spider_arg.lower() == "all":
          spider_names = process.get_spider_names()
          if not spider_names:
-             msg = "No spiders found."
+             msg = "未找到爬虫。"
              if show_json:
                  console.print_json(data={"success": False, "error": msg})
                  return 1
              else:
                  console.print(Panel(
                      Text.from_markup(
-                         ":cross_mark: [bold red]No spiders found.[/bold red]\n\n"
-                         "[bold]💡 Make sure:[/bold]\n"
-                         " • Spiders are defined in '[cyan]spiders/[/cyan]'\n"
-                         " • They have a [green]`name`[/green] attribute\n"
-                         " • Modules are imported (e.g. via [cyan]__init__.py[/cyan])"
+                         ":cross_mark: [bold red]未找到爬虫。[/bold red]\n\n"
+                         "[bold]💡 确保:[/bold]\n"
+                         " • 爬虫定义于 '[cyan]spiders/[/cyan]' 目录\n"
+                         " • 具有 [green]`name`[/green] 属性\n"
+                         " • 模块已导入 (例如通过 [cyan]__init__.py[/cyan])"
                      ),
-                     title="❌ No Spiders",
+                     title="❌ 未找到爬虫",
                      border_style="red",
                      padding=(1, 2)
                  ))
@@ -157,13 +157,13 @@ def main(args):

      # 显示即将运行的爬虫列表
      table = Table(
-         title=f"🚀 Starting ALL {len(spider_names)} spider(s)",
+         title=f"🚀 启动全部 {len(spider_names)} 个爬虫",
          box=box.ROUNDED,
          show_header=True,
          header_style="bold magenta"
      )
-     table.add_column("Name", style="cyan")
-     table.add_column("Class", style="green")
+     table.add_column("名称", style="cyan")
+     table.add_column("类名", style="green")

      for name in sorted(spider_names):
          cls = process.get_spider_class(name)
@@ -183,15 +183,15 @@ def main(args):
          TextColumn("[progress.description]{task.description}"),
          transient=True,
      ) as progress:
-         task = progress.add_task("Running all spiders...", total=None)
+         task = progress.add_task("正在运行所有爬虫...", total=None)
          asyncio.run(process.crawl(spider_names))

      if show_json:
          console.print_json(data={"success": True, "spiders": spider_names})
      else:
          console.print(Panel(
-             ":tada: [bold green]All spiders completed successfully![/bold green]",
-             title="✅ All Done",
+             ":tada: [bold green]所有爬虫运行完成![/bold green]",
+             title="✅ 全部完成",
              border_style="green"
          ))
      return 0
@@ -200,7 +200,7 @@ def main(args):
      spider_name = spider_arg
      if not process.is_spider_registered(spider_name):
          available = process.get_spider_names()
-         msg = f"Spider '[cyan]{spider_name}[/cyan]' not found."
+         msg = f"爬虫 '[cyan]{spider_name}[/cyan]' 未找到。"
          if show_json:
              console.print_json(data={
                  "success": False,
@@ -211,16 +211,16 @@ def main(args):
          else:
              panel_content = Text.from_markup(msg + "\n")
              if available:
-                 panel_content.append("\n💡 Available spiders:\n")
+                 panel_content.append("\n💡 可用爬虫:\n")
                  for name in sorted(available):
                      cls = process.get_spider_class(name)
                      panel_content.append(f" • [cyan]{name}[/cyan] ([green]{cls.__name__}[/green])\n")
              else:
-                 panel_content.append("\n💡 No spiders found. Check your spiders module.")
+                 panel_content.append("\n💡 未找到爬虫。请检查爬虫模块。")

              console.print(Panel(
                  panel_content,
-                 title="❌ Spider Not Found",
+                 title="❌ 爬虫未找到",
                  border_style="red",
                  padding=(1, 2)
              ))
@@ -231,7 +231,7 @@ def main(args):
      # 显示启动信息
      if not show_json:
          info_table = Table(
-             title=f"🚀 Starting Spider: [bold cyan]{spider_name}[/bold cyan]",
+             title=f"🚀 启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
              box=box.SIMPLE,
              show_header=False,
              title_style="bold green"
@@ -255,21 +255,21 @@ def main(args):
          TextColumn("[progress.description]{task.description}"),
          transient=True,
      ) as progress:
-         task = progress.add_task(f"Running {spider_name}...", total=None)
+         task = progress.add_task(f"正在运行 {spider_name}...", total=None)
          asyncio.run(process.crawl(spider_name))

      if show_json:
          console.print_json(data={"success": True, "spider": spider_name})
      else:
          console.print(Panel(
-             f":tada: [bold green]Spider '[cyan]{spider_name}[/cyan]' completed successfully![/bold green]",
-             title="✅ Done",
+             f":tada: [bold green]爬虫 '[cyan]{spider_name}[/cyan]' 运行完成![/bold green]",
+             title="✅ 完成",
              border_style="green"
          ))
      return 0

  except KeyboardInterrupt:
-     msg = "⚠️ Spider interrupted by user."
+     msg = "⚠️ 爬虫被用户中断。"
      if show_json:
          console.print_json(data={"success": False, "error": msg})
      else:
@@ -277,7 +277,7 @@ def main(args):
      return 1
  except Exception as e:
      logger.exception("Exception during 'crawlo run'")
-     msg = f"Unexpected error: {e}"
+     msg = f"意外错误: {e}"
      if show_json:
          console.print_json(data={"success": False, "error": msg})
      else:
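The run.py hunks above repeatedly apply one pattern: emit machine-readable JSON under --json, otherwise render a rich Panel carrying the newly localized text. A minimal runnable sketch of that branch, assembled from the diff; the helper name report_missing_project is ours, not the package's:

    from rich.console import Console
    from rich.panel import Panel
    from rich.text import Text

    console = Console()

    def report_missing_project(show_json: bool) -> int:
        # Mirrors the localized "cannot find crawlo.cfg" branch in the hunk above
        msg = ":cross_mark: [bold red]找不到 'crawlo.cfg'[/bold red]\n💡 请在项目目录中运行此命令。"
        if show_json:
            # Machine-readable output for --json mode
            console.print_json(data={"success": False, "error": "未找到项目根目录"})
        else:
            console.print(Panel(
                Text.from_markup(msg),
                title="❌ Crawlo项目",
                border_style="red",
                padding=(1, 2),
            ))
        return 1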
crawlo/commands/startproject.py CHANGED
@@ -211,13 +211,13 @@ def validate_project_name(project_name: str) -> tuple[bool, str]:
      """
      # 检查是否为空
      if not project_name or not project_name.strip():
-         return False, "Project name cannot be empty"
+         return False, "项目名称不能为空"

      project_name = project_name.strip()

      # 检查长度
      if len(project_name) > 50:
-         return False, "Project name too long (max 50 characters)"
+         return False, "项目名称太长(最多50个字符)"

      # 检查是否为Python关键字
      python_keywords = {
@@ -228,22 +228,21 @@ def validate_project_name(project_name: str) -> tuple[bool, str]:
          'while', 'with', 'yield'
      }
      if project_name in python_keywords:
-         return False, f"'{project_name}' is a Python keyword and cannot be used as project name"
+         return False, f"'{project_name}' Python关键字,不能用作项目名称"

      # 检查是否为有效的Python标识符
      if not project_name.isidentifier():
-         return False, "Project name must be a valid Python identifier"
+         return False, "项目名称必须是有效的Python标识符"

      # 检查格式(建议使用snake_case)
      if not re.match(r'^[a-z][a-z0-9_]*$', project_name):
          return False, (
-             "Project name should start with lowercase letter and "
-             "contain only lowercase letters, numbers, and underscores"
+             "项目名称应以小写字母开头,只能包含小写字母、数字和下划线"
          )

      # 检查是否以数字结尾(不推荐)
      if project_name[-1].isdigit():
-         return False, "Project name should not end with a number"
+         return False, "项目名称不应以数字结尾"

      return True, ""

@@ -284,8 +283,8 @@ def show_module_options():

  def main(args):
      if len(args) < 1:
-         console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name> [template_type] [--modules module1,module2]")
-         console.print("💡 Examples:")
+         console.print("[bold red]错误:[/bold red] 用法: [blue]crawlo startproject[/blue] <项目名称> [模板类型] [--modules 模块1,模块2]")
+         console.print("💡 示例:")
          console.print(" [blue]crawlo startproject[/blue] my_spider_project")
          console.print(" [blue]crawlo startproject[/blue] news_crawler simple")
          console.print(" [blue]crawlo startproject[/blue] ecommerce_spider distributed --modules mysql,proxy")
@@ -315,8 +314,8 @@ def main(args):
      # 验证模板类型
      if template_type not in TEMPLATE_TYPES:
          show_error_panel(
-             "Invalid Template Type",
-             f"Template type '[cyan]{template_type}[/cyan]' is not supported.\n"
+             "无效的模板类型",
+             f"不支持模板类型 '[cyan]{template_type}[/cyan]'。\n"
          )
          show_template_options()
          return 1
@@ -325,14 +324,14 @@ def main(args):
      is_valid, error_msg = validate_project_name(project_name)
      if not is_valid:
          show_error_panel(
-             "Invalid Project Name",
-             f"[cyan]{project_name}[/cyan] is not a valid project name.\n"
+             "无效的项目名称",
+             f"[cyan]{project_name}[/cyan] 不是有效的项目名称。\n"
              f"❌ {error_msg}\n\n"
-             "💡 Project name should:\n"
-             " • Start with lowercase letter\n"
-             " • Contain only lowercase letters, numbers, and underscores\n"
-             " • Be a valid Python identifier\n"
-             " • Not be a Python keyword"
+             "💡 项目名称应:\n"
+             " • 以小写字母开头\n"
+             " • 只能包含小写字母、数字和下划线\n"
+             " • 是有效的Python标识符\n"
+             " • 不能是Python关键字"
          )
          return 1

@@ -340,9 +339,9 @@ def main(args):

      if project_dir.exists():
          show_error_panel(
-             "Directory Exists",
-             f"Directory '[cyan]{project_dir}[/cyan]' already exists.\n"
-             "💡 Choose a different project name or remove the existing directory."
+             "目录已存在",
+             f"目录 '[cyan]{project_dir}[/cyan]' 已存在。\n"
+             "💡 请选择不同的项目名称或删除现有目录。"
          )
          return 1

@@ -358,26 +357,26 @@ def main(args):
      if cfg_template.exists():
          cfg_content = _render_template(cfg_template, context)
          (project_dir / 'crawlo.cfg').write_text(cfg_content, encoding='utf-8')
-         console.print(f":white_check_mark: Created [green]{project_dir / 'crawlo.cfg'}[/green]")
+         console.print(f":white_check_mark: 已创建 [green]{project_dir / 'crawlo.cfg'}[/green]")
      else:
-         console.print("[yellow]⚠ Warning:[/yellow] Template 'crawlo.cfg.tmpl' not found.")
+         console.print("[yellow]⚠ 警告:[/yellow] 找不到模板 'crawlo.cfg.tmpl'")

      # 3. 复制并渲染项目包内容
      package_dir = project_dir / project_name
      _copytree_with_templates(template_dir, package_dir, context, template_type, modules)
-     console.print(f":white_check_mark: Created project package: [green]{package_dir}[/green]")
+     console.print(f":white_check_mark: 已创建项目包: [green]{package_dir}[/green]")

      # 4. 创建 logs 目录
      (project_dir / 'logs').mkdir(exist_ok=True)
-     console.print(":white_check_mark: Created logs directory")
+     console.print(":white_check_mark: 已创建 logs 目录")

      # 5. 创建 output 目录(用于数据输出)
      (project_dir / 'output').mkdir(exist_ok=True)
-     console.print(":white_check_mark: Created output directory")
+     console.print(":white_check_mark: 已创建 output 目录")

      # 成功面板
-     success_text = Text.from_markup(f"Project '[bold cyan]{project_name}[/bold cyan]' created successfully!")
-     console.print(Panel(success_text, title=":rocket: Success", border_style="green", padding=(1, 2)))
+     success_text = Text.from_markup(f"项目 '[bold cyan]{project_name}[/bold cyan]' 创建成功!")
+     console.print(Panel(success_text, title=":rocket: 成功", border_style="green", padding=(1, 2)))

      # 显示使用的模板类型
      if template_type != 'default':
@@ -389,15 +388,15 @@ def main(args):

      # 下一步操作提示(对齐美观 + 语法高亮)
      next_steps = f"""
-     [bold]🚀 Next steps:[/bold]
+     [bold]🚀 下一步操作:[/bold]
      [blue]cd[/blue] {project_name}
      [blue]crawlo genspider[/blue] example example.com
      [blue]crawlo run[/blue] example

-     [bold]📚 Learn more:[/bold]
-     [blue]crawlo list[/blue] # List all spiders
-     [blue]crawlo check[/blue] example # Check spider validity
-     [blue]crawlo stats[/blue] # View statistics
+     [bold]📚 了解更多:[/bold]
+     [blue]crawlo list[/blue] # 列出所有爬虫
+     [blue]crawlo check[/blue] example # 检查爬虫有效性
+     [blue]crawlo stats[/blue] # 查看统计信息
      """.strip()
      console.print(next_steps)

@@ -405,12 +404,12 @@ def main(args):

      except Exception as e:
          show_error_panel(
-             "Creation Failed",
-             f"Failed to create project: {e}"
+             "创建失败",
+             f"创建项目失败: {e}"
          )
          if project_dir.exists():
              shutil.rmtree(project_dir, ignore_errors=True)
-             console.print("[red]:cross_mark: Cleaned up partially created project.[/red]")
+             console.print("[red]:cross_mark: 已清理部分创建的项目。[/red]")
          return 1

  if __name__ == "__main__":
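Pulling the validate_project_name hunks together, the whole validation chain reads as below; a self-contained sketch, with keyword.iskeyword standing in for the hard-coded keyword set the real code carries:

    import keyword
    import re

    def validate_project_name(project_name: str) -> tuple[bool, str]:
        if not project_name or not project_name.strip():
            return False, "项目名称不能为空"
        project_name = project_name.strip()
        if len(project_name) > 50:
            return False, "项目名称太长(最多50个字符)"
        if keyword.iskeyword(project_name):  # the package hard-codes this set instead
            return False, f"'{project_name}' Python关键字,不能用作项目名称"
        if not project_name.isidentifier():
            return False, "项目名称必须是有效的Python标识符"
        if not re.match(r'^[a-z][a-z0-9_]*$', project_name):
            return False, "项目名称应以小写字母开头,只能包含小写字母、数字和下划线"
        if project_name[-1].isdigit():
            return False, "项目名称不应以数字结尾"
        return True, ""

    # e.g. validate_project_name("my_spider") → (True, "")
    #      validate_project_name("MySpider")  → (False, "项目名称应以小写字母开头,...")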
crawlo/commands/stats.py CHANGED
@@ -61,9 +61,9 @@ def record_stats(crawler):
              "timestamp": datetime.now().isoformat(),
              "stats": stats
          }, f, ensure_ascii=False, indent=2, default=str)
-         logger.info(f"📊 Stats saved for spider '{spider_name}' → {filename}")
+         logger.info(f"📊 爬虫 '{spider_name}' 的统计信息已保存 → {filename}")
      except Exception as e:
-         logger.error(f"Failed to save stats for '{spider_name}': {e}")
+         logger.error(f"保存 '{spider_name}' 的统计信息失败: {e}")


  def load_all_stats() -> Dict[str, list]:
@@ -85,7 +85,7 @@ def load_all_stats() -> Dict[str, list]:
                  spider_name = data.get("spider", "unknown")
                  result.setdefault(spider_name, []).append(data)
              except Exception as e:
-                 logger.warning(f"Failed to load stats file {file}: {e}")
+                 logger.warning(f"加载统计文件 {file} 失败: {e}")
      return result


@@ -99,11 +99,11 @@ def format_value(v: Any) -> str:
      return s


- def display_stats_table(stats_data: dict, title: str = "Statistics"):
+ def display_stats_table(stats_data: dict, title: str = "统计信息"):
      """通用函数:用 rich.table 展示统计数据"""
      table = Table(title=title, box=box.ROUNDED, show_header=True, header_style="bold magenta")
-     table.add_column("Key", style="cyan", no_wrap=True)
-     table.add_column("Value", style="green")
+     table.add_column("", style="cyan", no_wrap=True)
+     table.add_column("", style="green")

      for k in sorted(stats_data.keys()):
          table.add_row(k, format_value(stats_data[k]))
@@ -120,7 +120,7 @@ def main(args):
          crawlo stats myspider --all → 显示所有历史(同上)
      """
      if len(args) > 2:
-         console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo stats[/blue] [spider_name] [--all]")
+         console.print("[bold red]错误:[/bold red] 用法: [blue]crawlo stats[/blue] [爬虫名称] [--all]")
          return 1

      spider_name = None
@@ -135,11 +135,11 @@ def main(args):
      if not all_stats:
          console.print(Panel(
              Text.from_markup(
-                 ":chart_with_upwards_trend: [bold]No stats found.[/bold]\n"
-                 "💡 Run a spider first to generate statistics.\n"
-                 f"📁 Stats directory: [cyan]{get_stats_dir()}[/cyan]"
+                 ":chart_with_upwards_trend: [bold]未找到统计信息。[/bold]\n"
+                 "💡 先运行一个爬虫以生成统计信息。\n"
+                 f"📁 统计目录: [cyan]{get_stats_dir()}[/cyan]"
              ),
-             title="📊 Statistics",
+             title="📊 统计信息",
              border_style="yellow",
              padding=(1, 2)
          ))
@@ -148,8 +148,8 @@ def main(args):
      # 显示所有爬虫最近一次运行
      if not spider_name:
          console.print(Panel(
-             "[bold]Recent Spider Statistics (last run)[/bold]",
-             title="📊 Spider Stats Overview",
+             "[bold]最近的爬虫统计信息(上次运行)[/bold]",
+             title="📊 爬虫统计概览",
              border_style="green",
              padding=(0, 1)
          ))
@@ -158,31 +158,31 @@ def main(args):
              latest = runs[0]
              ts = latest['timestamp'][:19]
              console.print(f"🕷️ [bold cyan]{name}[/bold cyan] ([green]{ts}[/green])")
-             display_stats_table(latest["stats"], title=f"Stats for {name}")
+             display_stats_table(latest["stats"], title=f"{name} 的统计信息")
              console.print() # 空行分隔

          return 0

      # 显示指定爬虫的历史
      if spider_name not in all_stats:
-         console.print(f"[bold red]:cross_mark: No stats found for spider '[cyan]{spider_name}[/cyan]'[/bold red]")
+         console.print(f"[bold red]:cross_mark: 未找到爬虫 '[cyan]{spider_name}[/cyan]' 的统计信息[/bold red]")
          available = ', '.join(all_stats.keys())
          if available:
-             console.print(f":bulb: Available spiders: [green]{available}[/green]")
+             console.print(f":bulb: 可用爬虫: [green]{available}[/green]")
          return 1

      runs = all_stats[spider_name]
      if show_all:
-         console.print(f":bar_chart: [bold]All runs for '[cyan]{spider_name}[/cyan]' ({len(runs)} runs):[/bold]")
+         console.print(f":bar_chart: [bold]'[cyan]{spider_name}[/cyan]' 的所有运行记录 ({len(runs)} ):[/bold]")
      else:
          runs = runs[:1]
-         console.print(f":bar_chart: [bold]Last run for '[cyan]{spider_name}[/cyan]':[/bold]")
+         console.print(f":bar_chart: [bold]'[cyan]{spider_name}[/cyan]' 的上次运行:[/bold]")

      for i, run in enumerate(runs, 1):
          ts = run['timestamp']
-         subtitle = f"Run #{i} · {ts}" if show_all else f"Last Run · {ts}"
-         display_stats_table(run["stats"], title=f"Stats for {spider_name} — {subtitle}")
+         subtitle = f"运行 #{i} · {ts}" if show_all else f"上次运行 · {ts}"
+         display_stats_table(run["stats"], title=f"{spider_name} 的统计信息 — {subtitle}")
          if i < len(runs):
              console.print("─" * 60)

-     return 0
+     return 0
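The display_stats_table change above is a thin wrapper over rich.table. A runnable sketch of the function as it reads after this hunk (the new column headers appear as empty strings in this diff; reproduced as-is):

    from rich import box
    from rich.console import Console
    from rich.table import Table

    console = Console()

    def display_stats_table(stats_data: dict, title: str = "统计信息") -> None:
        table = Table(title=title, box=box.ROUNDED, show_header=True, header_style="bold magenta")
        table.add_column("", style="cyan", no_wrap=True)  # header text is empty in the published hunk
        table.add_column("", style="green")
        for k in sorted(stats_data.keys()):
            table.add_row(k, str(stats_data[k]))  # the real code formats via format_value()
        console.print(table)

    display_stats_table({"items_scraped": 120, "elapsed_seconds": 3.4})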
crawlo/commands/utils.py CHANGED
@@ -46,7 +46,7 @@ def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
      # 1. 查找项目根目录
      project_root = get_project_root()
      if not project_root:
-         return False, None, "Cannot find 'crawlo.cfg'. Run this command inside your project directory."
+         return False, None, "找不到 'crawlo.cfg'。请在项目目录中运行此命令。"

      # 2. 将项目根加入 Python 路径
      project_root_str = str(project_root)
@@ -60,10 +60,10 @@ def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
      try:
          config.read(cfg_file, encoding="utf-8")
      except Exception as e:
-         return False, None, f"Failed to read crawlo.cfg: {e}"
+         return False, None, f"读取 crawlo.cfg 失败: {e}"

      if not config.has_section("settings") or not config.has_option("settings", "default"):
-         return False, None, "Invalid crawlo.cfg: missing [settings] section or 'default' option"
+         return False, None, "无效的 crawlo.cfg:缺少 [settings] 部分或 'default' 选项"

      # 4. 获取项目包名
      settings_module = config.get("settings", "default")
@@ -73,7 +73,7 @@ def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
      try:
          import_module(project_package)
      except ImportError as e:
-         return False, None, f"Failed to import project package '{project_package}': {e}"
+         return False, None, f"导入项目包 '{project_package}' 失败: {e}"

      return True, project_package, None
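These utils.py hunks only relocalize the error strings of validate_project_environment; the configparser step they wrap is, in outline (a sketch under the assumptions visible in the hunks, with the helper name ours):

    import configparser
    from pathlib import Path

    def read_settings_module(project_root: Path) -> str:
        # Parse crawlo.cfg the same way the command helpers above do
        config = configparser.ConfigParser()
        config.read(project_root / "crawlo.cfg", encoding="utf-8")
        if not config.has_section("settings") or not config.has_option("settings", "default"):
            raise ValueError("无效的 crawlo.cfg:缺少 [settings] 部分或 'default' 选项")
        # e.g. "myproject.settings"; the leading package name is what gets imported
        return config.get("settings", "default")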
 
crawlo/config.py CHANGED
@@ -19,11 +19,11 @@ Crawlo 配置工厂
      config = CrawloConfig.from_env()
  """

- from typing import Dict, Any, Optional, Union
- import os
- from crawlo.mode_manager import ModeManager, standalone_mode, distributed_mode, auto_mode, from_env
- from crawlo.utils.log import get_logger
+ from typing import Dict, Any, Optional
+
  from crawlo.config_validator import validate_config
+ from crawlo.mode_manager import standalone_mode, distributed_mode, auto_mode, from_env
+ from crawlo.utils.log import get_logger


  class CrawloConfig:

crawlo/config_validator.py CHANGED
@@ -5,9 +5,8 @@
  ==========
  提供配置项的验证和默认值设置功能,确保配置的合理性和一致性。
  """
+ from typing import Dict, Any, List, Tuple

- from typing import Dict, Any, List, Tuple, Optional
- import re
  from crawlo.utils.log import get_logger


crawlo/data/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ Crawlo 数据模块
+ 包含用户代理、字典等数据文件
+ """