crawlo 1.1.0-py3-none-any.whl → 1.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (111)
  1. crawlo/__init__.py +33 -24
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -155
  6. crawlo/commands/genspider.py +125 -110
  7. crawlo/commands/list.py +147 -119
  8. crawlo/commands/run.py +285 -170
  9. crawlo/commands/startproject.py +111 -101
  10. crawlo/commands/stats.py +188 -167
  11. crawlo/core/__init__.py +2 -2
  12. crawlo/core/engine.py +158 -158
  13. crawlo/core/processor.py +40 -40
  14. crawlo/core/scheduler.py +57 -57
  15. crawlo/crawler.py +494 -492
  16. crawlo/downloader/__init__.py +78 -78
  17. crawlo/downloader/aiohttp_downloader.py +199 -199
  18. crawlo/downloader/cffi_downloader.py +242 -277
  19. crawlo/downloader/httpx_downloader.py +246 -246
  20. crawlo/event.py +11 -11
  21. crawlo/exceptions.py +78 -78
  22. crawlo/extension/__init__.py +31 -31
  23. crawlo/extension/log_interval.py +49 -49
  24. crawlo/extension/log_stats.py +44 -44
  25. crawlo/extension/logging_extension.py +34 -34
  26. crawlo/filters/__init__.py +37 -37
  27. crawlo/filters/aioredis_filter.py +150 -150
  28. crawlo/filters/memory_filter.py +202 -202
  29. crawlo/items/__init__.py +23 -23
  30. crawlo/items/base.py +21 -21
  31. crawlo/items/fields.py +53 -53
  32. crawlo/items/items.py +104 -104
  33. crawlo/middleware/__init__.py +21 -21
  34. crawlo/middleware/default_header.py +32 -32
  35. crawlo/middleware/download_delay.py +28 -28
  36. crawlo/middleware/middleware_manager.py +135 -135
  37. crawlo/middleware/proxy.py +245 -245
  38. crawlo/middleware/request_ignore.py +30 -30
  39. crawlo/middleware/response_code.py +18 -18
  40. crawlo/middleware/response_filter.py +26 -26
  41. crawlo/middleware/retry.py +90 -90
  42. crawlo/network/__init__.py +7 -7
  43. crawlo/network/request.py +203 -203
  44. crawlo/network/response.py +166 -166
  45. crawlo/pipelines/__init__.py +13 -13
  46. crawlo/pipelines/console_pipeline.py +39 -39
  47. crawlo/pipelines/mongo_pipeline.py +116 -116
  48. crawlo/pipelines/mysql_batch_pipline.py +272 -272
  49. crawlo/pipelines/mysql_pipeline.py +195 -195
  50. crawlo/pipelines/pipeline_manager.py +56 -56
  51. crawlo/project.py +153 -0
  52. crawlo/settings/__init__.py +7 -7
  53. crawlo/settings/default_settings.py +166 -168
  54. crawlo/settings/setting_manager.py +99 -99
  55. crawlo/spider/__init__.py +129 -129
  56. crawlo/stats_collector.py +59 -59
  57. crawlo/subscriber.py +106 -106
  58. crawlo/task_manager.py +27 -27
  59. crawlo/templates/crawlo.cfg.tmpl +10 -10
  60. crawlo/templates/project/__init__.py.tmpl +3 -3
  61. crawlo/templates/project/items.py.tmpl +17 -17
  62. crawlo/templates/project/middlewares.py.tmpl +75 -75
  63. crawlo/templates/project/pipelines.py.tmpl +63 -63
  64. crawlo/templates/project/settings.py.tmpl +54 -54
  65. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  66. crawlo/templates/spider/spider.py.tmpl +31 -31
  67. crawlo/utils/__init__.py +7 -7
  68. crawlo/utils/date_tools.py +233 -233
  69. crawlo/utils/db_helper.py +343 -343
  70. crawlo/utils/func_tools.py +82 -82
  71. crawlo/utils/log.py +128 -128
  72. crawlo/utils/pqueue.py +173 -173
  73. crawlo/utils/request.py +267 -267
  74. crawlo/utils/spider_loader.py +62 -62
  75. crawlo/utils/system.py +11 -11
  76. crawlo/utils/tools.py +4 -4
  77. crawlo/utils/url.py +39 -39
  78. crawlo-1.1.1.dist-info/METADATA +220 -0
  79. crawlo-1.1.1.dist-info/RECORD +100 -0
  80. examples/__init__.py +7 -0
  81. examples/baidu_spider/__init__.py +7 -0
  82. examples/baidu_spider/demo.py +94 -0
  83. examples/baidu_spider/items.py +46 -0
  84. examples/baidu_spider/middleware.py +49 -0
  85. examples/baidu_spider/pipeline.py +55 -0
  86. examples/baidu_spider/run.py +27 -0
  87. examples/baidu_spider/settings.py +121 -0
  88. examples/baidu_spider/spiders/__init__.py +7 -0
  89. examples/baidu_spider/spiders/bai_du.py +61 -0
  90. examples/baidu_spider/spiders/miit.py +159 -0
  91. examples/baidu_spider/spiders/sina.py +79 -0
  92. tests/__init__.py +7 -7
  93. tests/test_proxy_health_check.py +32 -32
  94. tests/test_proxy_middleware_integration.py +136 -136
  95. tests/test_proxy_providers.py +56 -56
  96. tests/test_proxy_stats.py +19 -19
  97. tests/test_proxy_strategies.py +59 -59
  98. crawlo/utils/concurrency_manager.py +0 -125
  99. crawlo/utils/project.py +0 -197
  100. crawlo-1.1.0.dist-info/METADATA +0 -49
  101. crawlo-1.1.0.dist-info/RECORD +0 -97
  102. examples/gxb/__init__.py +0 -0
  103. examples/gxb/items.py +0 -36
  104. examples/gxb/run.py +0 -16
  105. examples/gxb/settings.py +0 -72
  106. examples/gxb/spider/__init__.py +0 -2
  107. examples/gxb/spider/miit_spider.py +0 -180
  108. examples/gxb/spider/telecom_device.py +0 -129
  109. {crawlo-1.1.0.dist-info → crawlo-1.1.1.dist-info}/WHEEL +0 -0
  110. {crawlo-1.1.0.dist-info → crawlo-1.1.1.dist-info}/entry_points.txt +0 -0
  111. {crawlo-1.1.0.dist-info → crawlo-1.1.1.dist-info}/top_level.txt +0 -0
crawlo/commands/startproject.py CHANGED
@@ -1,101 +1,111 @@
- # crawlo/commands/startproject.py
- import os
- import shutil
- from pathlib import Path
-
- TEMPLATES_DIR = Path(__file__).parent.parent / 'templates'
-
-
- def _render_template(tmpl_path, context):
-     """Read a template file and replace {{key}} placeholders with values from context."""
-     with open(tmpl_path, 'r', encoding='utf-8') as f:
-         content = f.read()
-     for key, value in context.items():
-         content = content.replace(f'{{{{{key}}}}}', str(value))
-     return content
-
-
- def _copytree_with_templates(src, dst, context):
-     """
-     Recursively copy a directory: render .tmpl files (dropping the .tmpl suffix), copy all other files as-is.
-     """
-     src_path = Path(src)
-     dst_path = Path(dst)
-     dst_path.mkdir(parents=True, exist_ok=True)
-
-     for item in src_path.rglob('*'):
-         rel_path = item.relative_to(src_path)
-         dst_item = dst_path / rel_path
-
-         if item.is_dir():
-             # Create the target directory
-             dst_item.mkdir(parents=True, exist_ok=True)
-         else:
-             if item.suffix == '.tmpl':
-                 # Render the template file and drop the .tmpl suffix
-                 rendered_content = _render_template(item, context)
-                 final_dst = dst_item.with_suffix('')  # drop .tmpl
-                 final_dst.parent.mkdir(parents=True, exist_ok=True)  # make sure the parent directory exists
-                 with open(final_dst, 'w', encoding='utf-8') as f:
-                     f.write(rendered_content)
-             else:
-                 # Plain file: copy directly
-                 shutil.copy2(item, dst_item)
-
-
- def main(args):
-     if len(args) != 1:
-         print("Usage: crawlo startproject <project_name>")
-         return 1
-
-     project_name = args[0]
-     project_dir = Path(project_name)
-
-     if project_dir.exists():
-         print(f"Error: Directory '{project_dir}' already exists.")
-         return 1
-
-     context = {'project_name': project_name}
-     template_dir = TEMPLATES_DIR / 'project'
-
-     try:
-         # 1. Create the project root directory
-         project_dir.mkdir()
-
-         # 2. Handle crawlo.cfg.tmpl: render it separately and write it to the project root
-         cfg_template = TEMPLATES_DIR / 'crawlo.cfg.tmpl'  # use the template under templates/
-         if cfg_template.exists():
-             cfg_content = _render_template(cfg_template, context)
-             (project_dir / 'crawlo.cfg').write_text(cfg_content, encoding='utf-8')
-         else:
-             print("Warning: crawlo.cfg.tmpl not found in templates.")
-
-         # 3. Copy all other template files into the project package (project_dir / project_name)
-         package_dir = project_dir / project_name
-         # This copies __init__.py.tmpl, items.py.tmpl, settings.py.tmpl, spiders/, etc.,
-         # rendering them into .py files
-         _copytree_with_templates(template_dir, package_dir, context)
-
-         # 4. Create the logs directory
-         (project_dir / 'logs').mkdir(exist_ok=True)
-
-         print(f"""
- Project '{project_name}' created successfully!
-
- Enter the project directory:
-     cd {project_name}
-
- Create a spider:
-     crawlo genspider example example.com
-
- Run the spider:
-     crawlo run example
- """)
-         return 0
-
-     except Exception as e:
-         print(f"Error creating project: {e}")
-         # On error, try to clean up any directories already created
-         if project_dir.exists():
-             shutil.rmtree(project_dir, ignore_errors=True)
-         return 1
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : Command-line entry point: crawlo startproject baidu, creates a new project.
+ """
+ import shutil
+ from pathlib import Path
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.text import Text
+
+ # Initialize the rich console
+ console = Console()
+
+ TEMPLATES_DIR = Path(__file__).parent.parent / 'templates'
+
+
+ def _render_template(tmpl_path, context):
+     """Read a template file and replace {{key}} placeholders with values from context."""
+     with open(tmpl_path, 'r', encoding='utf-8') as f:
+         content = f.read()
+     for key, value in context.items():
+         content = content.replace(f'{{{{{key}}}}}', str(value))
+     return content
+
+
+ def _copytree_with_templates(src, dst, context):
+     """
+     Recursively copy a directory: render .tmpl files (dropping the .tmpl suffix), copy all other files as-is.
+     """
+     src_path = Path(src)
+     dst_path = Path(dst)
+     dst_path.mkdir(parents=True, exist_ok=True)
+
+     for item in src_path.rglob('*'):
+         rel_path = item.relative_to(src_path)
+         dst_item = dst_path / rel_path
+
+         if item.is_dir():
+             dst_item.mkdir(parents=True, exist_ok=True)
+         else:
+             if item.suffix == '.tmpl':
+                 rendered_content = _render_template(item, context)
+                 final_dst = dst_item.with_suffix('')
+                 final_dst.parent.mkdir(parents=True, exist_ok=True)
+                 with open(final_dst, 'w', encoding='utf-8') as f:
+                     f.write(rendered_content)
+             else:
+                 shutil.copy2(item, dst_item)
+
+
+ def main(args):
+     if len(args) != 1:
+         console.print("[bold red]Error:[/bold red] Usage: crawlo startproject <project_name>")
+         return 1
+
+     project_name = args[0]
+     project_dir = Path(project_name)
+
+     if project_dir.exists():
+         console.print(f"[bold red]Error:[/bold red] Directory '[cyan]{project_dir}[/cyan]' already exists.")
+         return 1
+
+     context = {'project_name': project_name}
+     template_dir = TEMPLATES_DIR / 'project'
+
+     try:
+         # 1. Create the project root directory
+         project_dir.mkdir()
+
+         # 2. Render crawlo.cfg.tmpl
+         cfg_template = TEMPLATES_DIR / 'crawlo.cfg.tmpl'
+         if cfg_template.exists():
+             cfg_content = _render_template(cfg_template, context)
+             (project_dir / 'crawlo.cfg').write_text(cfg_content, encoding='utf-8')
+             console.print(f":white_check_mark: Created [green]{project_dir / 'crawlo.cfg'}[/green]")
+         else:
+             console.print("[yellow]⚠ Warning:[/yellow] Template 'crawlo.cfg.tmpl' not found.")
+
+         # 3. Copy and render the project package contents
+         package_dir = project_dir / project_name
+         _copytree_with_templates(template_dir, package_dir, context)
+         console.print(f":white_check_mark: Created project package: [green]{package_dir}[/green]")
+
+         # 4. Create the logs directory
+         (project_dir / 'logs').mkdir(exist_ok=True)
+         console.print(":white_check_mark: Created logs directory")
+
+         # Success panel
+         success_text = Text.from_markup(f"Project '[bold cyan]{project_name}[/bold cyan]' created successfully!")
+         console.print(Panel(success_text, title=":rocket: Success", border_style="green", padding=(1, 2)))
+
+         # Next-step hints (aligned layout + syntax highlighting)
+         next_steps = f"""
+ [bold]Next steps:[/bold]
+   [blue]cd[/blue] {project_name}
+   [blue]crawlo genspider[/blue] example example.com
+   [blue]crawlo run[/blue] example
+ """.strip()
+         console.print(next_steps)
+
+         return 0
+
+     except Exception as e:
+         console.print(f"[bold red]Error creating project:[/bold red] {e}")
+         if project_dir.exists():
+             shutil.rmtree(project_dir, ignore_errors=True)
+         console.print("[red]:cross_mark: Cleaned up partially created project.[/red]")
+         return 1
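Both the old and new versions of _render_template rely on plain str.replace over {{key}} tokens rather than a real template engine, and the f'{{{{{key}}}}}' expression is just f-string brace escaping. A minimal standalone sketch of what that expression evaluates to (the template text below is illustrative, not taken from the package):

# Doubled braces in an f-string emit literal braces, so the five-brace groups
# around {key} produce the literal token "{{key}}" with key interpolated.
key = "project_name"
token = f'{{{{{key}}}}}'
assert token == "{{project_name}}"

# The same substitution _render_template performs on a template string.
# Illustrative template text; the real crawlo.cfg.tmpl content is not shown in this diff.
template = "[settings]\ndefault = {{project_name}}.settings\n"
print(template.replace(token, "baidu"))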
crawlo/commands/stats.py CHANGED
@@ -1,167 +1,188 @@
- #!/usr/bin/python
- # -*- coding: UTF-8 -*-
- """
- # @Time : 2025-08-31 22:36
- # @Author : crawl-coder
- # @Desc : Command-line entry point: crawlo stats, view statistics from recent spider runs.
- """
-
- import sys
- import json
- from pathlib import Path
- from datetime import datetime
- from typing import Dict, Any
-
- from crawlo.utils.log import get_logger
-
-
- logger = get_logger(__name__)
-
- # Default storage directory (relative to the project root)
- STATS_DIR = "logs/stats"
-
-
- def get_stats_dir() -> Path:
-     """
-     Return the stats file directory, preferring logs/stats/ under the project root.
-     Falls back to the current directory when not inside a project.
-     """
-     # Try to locate the project root (via crawlo.cfg)
-     current = Path.cwd()
-     for _ in range(10):
-         if (current / "crawlo.cfg").exists():
-             return current / STATS_DIR
-         if current == current.parent:
-             break
-         current = current.parent
-
-     # Fallback: use logs/stats under the current directory
-     return Path.cwd() / STATS_DIR
-
-
- def record_stats(crawler):
-     """
-     [Called at spider runtime] Write the spider's final statistics to a JSON file.
-     Must be invoked from the Crawler's closed callback.
-     """
-     spider_name = getattr(crawler.spider, "name", "unknown")
-     stats = crawler.stats.get_stats() if crawler.stats else {}
-
-     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-     stats_dir = Path(get_stats_dir())
-     stats_dir.mkdir(parents=True, exist_ok=True)
-
-     filename = stats_dir / f"{spider_name}_{timestamp}.json"
-     try:
-         with open(filename, "w", encoding="utf-8") as f:
-             json.dump({
-                 "spider": spider_name,
-                 "timestamp": datetime.now().isoformat(),
-                 "stats": stats
-             }, f, ensure_ascii=False, indent=2, default=str)
-         logger.info(f"📊 Stats saved for spider '{spider_name}' → {filename}")
-     except Exception as e:
-         logger.error(f"Failed to save stats for '{spider_name}': {e}")
-
-
- def load_all_stats() -> Dict[str, list]:
-     """
-     Load all saved stats files, grouped by spider name.
-     Returns: {spider_name: [stats_record, ...]}
-     """
-     stats_dir = get_stats_dir()
-     if not stats_dir.exists():
-         return {}
-
-     result = {}
-     json_files = sorted(stats_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True)
-
-     for file in json_files:
-         try:
-             with open(file, "r", encoding="utf-8") as f:
-                 data = json.load(f)
-             spider_name = data.get("spider", "unknown")
-             result.setdefault(spider_name, []).append(data)
-         except Exception as e:
-             logger.warning(f"Failed to load stats file {file}: {e}")
-     return result
-
-
- def format_value(v: Any) -> str:
-     """Format a value so it stays printable and not overly long."""
-     if isinstance(v, float):
-         return f"{v:.4f}"
-     return str(v)
-
-
- def main(args):
-     """
-     Main entry point: view statistics.
-     Usage:
-         crawlo stats                  → show the last run of every spider
-         crawlo stats myspider         → show the recorded history of the given spider
-         crawlo stats myspider --all   → show the full history (same as above)
-     """
-     if len(args) > 2:
-         print("Usage: crawlo stats [spider_name] [--all]")
-         return 1
-
-     spider_name = None
-     show_all = False
-
-     if args:
-         spider_name = args[0]
-         show_all = "--all" in args or "-a" in args
-
-     # Load all stats
-     all_stats = load_all_stats()
-     if not all_stats:
-         print("📊 No stats found. Run a spider first.")
-         print(f"💡 Stats are saved in: {get_stats_dir()}")
-         return 0
-
-     if not spider_name:
-         # Show the most recent run of each spider
-         print("📊 Recent Spider Statistics (last run):")
-         print("-" * 60)
-         for name, runs in all_stats.items():
-             latest = runs[0]
-             print(f"🕷️ {name} ({latest['timestamp'][:19]})")
-             stats = latest["stats"]
-             for k in sorted(stats.keys()):
-                 print(f"  {k:<30} {format_value(stats[k])}")
-             print()
-         return 0
-
-     else:
-         # Look at a specific spider
-         if spider_name not in all_stats:
-             print(f"📊 No stats found for spider '{spider_name}'")
-             available = ', '.join(all_stats.keys())
-             if available:
-                 print(f"💡 Available spiders: {available}")
-             return 1
-
-         runs = all_stats[spider_name]
-         if show_all:
-             print(f"📊 All runs for '{spider_name}' ({len(runs)} runs):")
-         else:
-             runs = runs[:1]
-             print(f"📊 Last run for '{spider_name}':")
-
-         print("-" * 60)
-         for run in runs:
-             print(f"⏱️ Timestamp: {run['timestamp']}")
-             stats = run["stats"]
-             for k in sorted(stats.keys()):
-                 print(f"  {k:<30} {format_value(stats[k])}")
-             print("─" * 60)
-         return 0
-
-
- if __name__ == "__main__":
-     """
-     Allows running directly:
-         python -m crawlo.commands.stats
-     """
-     sys.exit(main(sys.argv[1:]))
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : Command-line entry point: crawlo stats, view statistics from recent spider runs.
+ """
+ import json
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Dict, Any
+
+ from rich.console import Console
+ from rich.table import Table
+ from rich.panel import Panel
+ from rich.text import Text
+ from rich import box
+
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+ console = Console()
+
+ # Default storage directory (relative to the project root)
+ STATS_DIR = "logs/stats"
+
+
+ def get_stats_dir() -> Path:
+     """
+     Return the stats file directory, preferring logs/stats/ under the project root.
+     Falls back to the current directory when not inside a project.
+     """
+     current = Path.cwd()
+     for _ in range(10):
+         if (current / "crawlo.cfg").exists():
+             return current / STATS_DIR
+         if current == current.parent:
+             break
+         current = current.parent
+     return Path.cwd() / STATS_DIR
+
+
+ def record_stats(crawler):
+     """
+     [Called at spider runtime] Write the spider's final statistics to a JSON file.
+     Must be invoked from the Crawler's closed callback.
+     """
+     spider_name = getattr(crawler.spider, "name", "unknown")
+     stats = crawler.stats.get_stats() if crawler.stats else {}
+
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     stats_dir = Path(get_stats_dir())
+     stats_dir.mkdir(parents=True, exist_ok=True)
+
+     filename = stats_dir / f"{spider_name}_{timestamp}.json"
+     try:
+         with open(filename, "w", encoding="utf-8") as f:
+             json.dump({
+                 "spider": spider_name,
+                 "timestamp": datetime.now().isoformat(),
+                 "stats": stats
+             }, f, ensure_ascii=False, indent=2, default=str)
+         logger.info(f"📊 Stats saved for spider '{spider_name}' {filename}")
+     except Exception as e:
+         logger.error(f"Failed to save stats for '{spider_name}': {e}")
+
+
+ def load_all_stats() -> Dict[str, list]:
+     """
+     Load all saved stats files, grouped by spider name.
+     Returns: {spider_name: [stats_record, ...]}
+     """
+     stats_dir = get_stats_dir()
+     if not stats_dir.exists():
+         return {}
+
+     result = {}
+     json_files = sorted(stats_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True)
+
+     for file in json_files:
+         try:
+             with open(file, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+             spider_name = data.get("spider", "unknown")
+             result.setdefault(spider_name, []).append(data)
+         except Exception as e:
+             logger.warning(f"Failed to load stats file {file}: {e}")
+     return result
+
+
+ def format_value(v: Any) -> str:
+     """Format a value so it stays printable and not overly long."""
+     if isinstance(v, float):
+         return f"{v:.4f}"
+     s = str(v)
+     if len(s) > 80:
+         return s[:77] + "..."
+     return s
+
+
+ def display_stats_table(stats_data: dict, title: str = "Statistics"):
+     """Shared helper: render stats data as a rich table."""
+     table = Table(title=title, box=box.ROUNDED, show_header=True, header_style="bold magenta")
+     table.add_column("Key", style="cyan", no_wrap=True)
+     table.add_column("Value", style="green")
+
+     for k in sorted(stats_data.keys()):
+         table.add_row(k, format_value(stats_data[k]))
+
+     console.print(table)
+
+
+ def main(args):
+     """
+     Main entry point: view statistics.
+     Usage:
+         crawlo stats                  → show the last run of every spider
+         crawlo stats myspider         → show the recorded history of the given spider
+         crawlo stats myspider --all   → show the full history (same as above)
+     """
+     if len(args) > 2:
+         console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo stats[/blue] [spider_name] [--all]")
+         return 1
+
+     spider_name = None
+     show_all = False
+
+     if args:
+         spider_name = args[0]
+         show_all = "--all" in args or "-a" in args
+
+     all_stats = load_all_stats()
+
+     if not all_stats:
+         console.print(Panel(
+             Text.from_markup(
+                 ":chart_with_upwards_trend: [bold]No stats found.[/bold]\n"
+                 "💡 Run a spider first to generate statistics.\n"
+                 f"📁 Stats directory: [cyan]{get_stats_dir()}[/cyan]"
+             ),
+             title="📊 Statistics",
+             border_style="yellow",
+             padding=(1, 2)
+         ))
+         return 0
+
+     # Show the most recent run of every spider
+     if not spider_name:
+         console.print(Panel(
+             "[bold]Recent Spider Statistics (last run)[/bold]",
+             title="📊 Spider Stats Overview",
+             border_style="green",
+             padding=(0, 1)
+         ))
+
+         for name, runs in all_stats.items():
+             latest = runs[0]
+             ts = latest['timestamp'][:19]
+             console.print(f"🕷️ [bold cyan]{name}[/bold cyan] ([green]{ts}[/green])")
+             display_stats_table(latest["stats"], title=f"Stats for {name}")
+             console.print()  # blank line as separator
+
+         return 0
+
+     # Show the history of the given spider
+     if spider_name not in all_stats:
+         console.print(f"[bold red]:cross_mark: No stats found for spider '[cyan]{spider_name}[/cyan]'[/bold red]")
+         available = ', '.join(all_stats.keys())
+         if available:
+             console.print(f":bulb: Available spiders: [green]{available}[/green]")
+         return 1
+
+     runs = all_stats[spider_name]
+     if show_all:
+         console.print(f":bar_chart: [bold]All runs for '[cyan]{spider_name}[/cyan]' ({len(runs)} runs):[/bold]")
+     else:
+         runs = runs[:1]
+         console.print(f":bar_chart: [bold]Last run for '[cyan]{spider_name}[/cyan]':[/bold]")
+
+     for i, run in enumerate(runs, 1):
+         ts = run['timestamp']
+         subtitle = f"Run #{i} · {ts}" if show_all else f"Last Run · {ts}"
+         display_stats_table(run["stats"], title=f"Stats for {spider_name} — {subtitle}")
+         if i < len(runs):
+             console.print("─" * 60)
+
+     return 0
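The new stats viewer simply replays the JSON documents that record_stats writes into logs/stats/, one file per run, grouped by the "spider" field. A short standalone sketch of that record shape, rendered with the same rich table style display_stats_table uses (the stats keys and values below are made-up sample data, not output from crawlo):

from rich import box
from rich.console import Console
from rich.table import Table

# Shape of one file written by record_stats(); values here are illustrative only.
record = {
    "spider": "bai_du",
    "timestamp": "2025-09-01T10:00:00",
    "stats": {
        "downloader/request_count": 42,
        "item_scraped_count": 40,
        "elapsed_time_seconds": 3.1415,
    },
}

# Render it the way display_stats_table does: one row per stats key.
table = Table(title=f"Stats for {record['spider']}", box=box.ROUNDED,
              show_header=True, header_style="bold magenta")
table.add_column("Key", style="cyan", no_wrap=True)
table.add_column("Value", style="green")
for key in sorted(record["stats"]):
    table.add_row(key, str(record["stats"][key]))
Console().print(table)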
crawlo/core/__init__.py CHANGED
@@ -1,2 +1,2 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-