crawlo 1.0.8.tar.gz → 1.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (112)
  1. {crawlo-1.0.8/crawlo.egg-info → crawlo-1.1.0}/PKG-INFO +1 -1
  2. crawlo-1.1.0/crawlo/__version__.py +1 -0
  3. crawlo-1.1.0/crawlo/commands/__init__.py +14 -0
  4. crawlo-1.1.0/crawlo/commands/check.py +156 -0
  5. crawlo-1.1.0/crawlo/commands/list.py +119 -0
  6. crawlo-1.1.0/crawlo/commands/run.py +171 -0
  7. crawlo-1.1.0/crawlo/commands/stats.py +167 -0
  8. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/engine.py +1 -1
  9. {crawlo-1.0.8 → crawlo-1.1.0/crawlo.egg-info}/PKG-INFO +1 -1
  10. crawlo-1.1.0/examples/gxb/spider/__init__.py +2 -0
  11. crawlo-1.0.8/crawlo/__version__.py +0 -1
  12. crawlo-1.0.8/crawlo/commands/__init__.py +0 -10
  13. crawlo-1.0.8/crawlo/commands/check.py +0 -107
  14. crawlo-1.0.8/crawlo/commands/list.py +0 -92
  15. crawlo-1.0.8/crawlo/commands/run.py +0 -181
  16. crawlo-1.0.8/crawlo/commands/stats.py +0 -59
  17. crawlo-1.0.8/examples/gxb/spider/__init__.py +0 -0
  18. {crawlo-1.0.8 → crawlo-1.1.0}/LICENSE +0 -0
  19. {crawlo-1.0.8 → crawlo-1.1.0}/MANIFEST.in +0 -0
  20. {crawlo-1.0.8 → crawlo-1.1.0}/README.md +0 -0
  21. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/__init__.py +0 -0
  22. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/cli.py +0 -0
  23. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/commands/genspider.py +0 -0
  24. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/commands/startproject.py +0 -0
  25. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/__init__.py +0 -0
  26. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/processor.py +0 -0
  27. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/scheduler.py +0 -0
  28. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/crawler.py +0 -0
  29. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/downloader/__init__.py +0 -0
  30. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/downloader/aiohttp_downloader.py +0 -0
  31. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/downloader/cffi_downloader.py +0 -0
  32. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/downloader/httpx_downloader.py +0 -0
  33. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/event.py +0 -0
  34. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/exceptions.py +0 -0
  35. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/extension/__init__.py +0 -0
  36. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/extension/log_interval.py +0 -0
  37. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/extension/log_stats.py +0 -0
  38. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/extension/logging_extension.py +0 -0
  39. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/filters/__init__.py +0 -0
  40. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/filters/aioredis_filter.py +0 -0
  41. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/filters/memory_filter.py +0 -0
  42. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/items/__init__.py +0 -0
  43. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/items/base.py +0 -0
  44. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/items/fields.py +0 -0
  45. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/items/items.py +0 -0
  46. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/__init__.py +0 -0
  47. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/default_header.py +0 -0
  48. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/download_delay.py +0 -0
  49. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/middleware_manager.py +0 -0
  50. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/proxy.py +0 -0
  51. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/request_ignore.py +0 -0
  52. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/response_code.py +0 -0
  53. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/response_filter.py +0 -0
  54. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/retry.py +0 -0
  55. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/network/__init__.py +0 -0
  56. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/network/request.py +0 -0
  57. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/network/response.py +0 -0
  58. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/__init__.py +0 -0
  59. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/console_pipeline.py +0 -0
  60. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/mongo_pipeline.py +0 -0
  61. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/mysql_batch_pipline.py +0 -0
  62. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/mysql_pipeline.py +0 -0
  63. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/pipeline_manager.py +0 -0
  64. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/settings/__init__.py +0 -0
  65. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/settings/default_settings.py +0 -0
  66. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/settings/setting_manager.py +0 -0
  67. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/spider/__init__.py +0 -0
  68. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/stats_collector.py +0 -0
  69. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/subscriber.py +0 -0
  70. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/task_manager.py +0 -0
  71. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  72. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/__init__.py.tmpl +0 -0
  73. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/items.py.tmpl +0 -0
  74. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  75. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  76. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/settings.py.tmpl +0 -0
  77. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  78. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/spider/spider.py.tmpl +0 -0
  79. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/__init__.py +0 -0
  80. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/concurrency_manager.py +0 -0
  81. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/date_tools.py +0 -0
  82. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/db_helper.py +0 -0
  83. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/func_tools.py +0 -0
  84. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/log.py +0 -0
  85. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/pqueue.py +0 -0
  86. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/project.py +0 -0
  87. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/request.py +0 -0
  88. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/spider_loader.py +0 -0
  89. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/system.py +0 -0
  90. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/tools.py +0 -0
  91. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/url.py +0 -0
  92. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/SOURCES.txt +0 -0
  93. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/dependency_links.txt +0 -0
  94. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/entry_points.txt +0 -0
  95. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/requires.txt +0 -0
  96. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/top_level.txt +0 -0
  97. {crawlo-1.0.8 → crawlo-1.1.0}/examples/__init__.py +0 -0
  98. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/__init__.py +0 -0
  99. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/items.py +0 -0
  100. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/run.py +0 -0
  101. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/settings.py +0 -0
  102. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/spider/miit_spider.py +0 -0
  103. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/spider/telecom_device.py +0 -0
  104. {crawlo-1.0.8 → crawlo-1.1.0}/pyproject.toml +0 -0
  105. {crawlo-1.0.8 → crawlo-1.1.0}/requirements.txt +0 -0
  106. {crawlo-1.0.8 → crawlo-1.1.0}/setup.cfg +0 -0
  107. {crawlo-1.0.8 → crawlo-1.1.0}/tests/__init__.py +0 -0
  108. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_health_check.py +0 -0
  109. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_middleware_integration.py +0 -0
  110. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_providers.py +0 -0
  111. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_stats.py +0 -0
  112. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_strategies.py +0 -0

{crawlo-1.0.8/crawlo.egg-info → crawlo-1.1.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.0.8
+Version: 1.1.0
 Summary: Crawlo is a high-performance Python crawler framework based on asynchronous I/O, with support for distributed crawling.
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder

crawlo-1.1.0/crawlo/__version__.py
@@ -0,0 +1 @@
+__version__ = "1.1.0"

crawlo-1.1.0/crawlo/commands/__init__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+
+_commands = {
+    'startproject': 'crawlo.commands.startproject',
+    'genspider': 'crawlo.commands.genspider',
+    'run': 'crawlo.commands.run',
+    'check': 'crawlo.commands.check',
+    'list': 'crawlo.commands.list',
+    'stats': 'crawlo.commands.stats'
+}
+
+def get_commands():
+    return _commands
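
The registry above only maps sub-command names to module paths. A dispatcher such as crawlo/cli.py can then resolve a sub-command lazily and call its main(args); the snippet below is a minimal sketch of that pattern, assuming nothing beyond the get_commands() helper above and the main(args) functions defined by the new command modules. It is not the actual contents of crawlo/cli.py, which is unchanged in this release.

import sys
from importlib import import_module

from crawlo.commands import get_commands


def dispatch(argv):
    # Look the sub-command up in the registry, import its module and run main().
    commands = get_commands()
    if not argv or argv[0] not in commands:
        print(f"Usage: crawlo <{'|'.join(sorted(commands))}> [args]")
        return 1
    module = import_module(commands[argv[0]])  # e.g. 'crawlo.commands.run'
    return module.main(argv[1:])


if __name__ == "__main__":
    sys.exit(dispatch(sys.argv[1:]))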

crawlo-1.1.0/crawlo/commands/check.py
@@ -0,0 +1,156 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:35
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo check, verifies that all spider definitions are well-formed.
+"""
+
+import sys
+import configparser
+from pathlib import Path
+from importlib import import_module
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def get_project_root():
+    """
+    Walk upward from the current directory looking for crawlo.cfg to determine the project root.
+    """
+    current = Path.cwd()
+
+    for _ in range(10):
+        cfg = current / "crawlo.cfg"
+        if cfg.exists():
+            return current
+
+        if current == current.parent:
+            break
+        current = current.parent
+
+    return None
+
+
+def main(args):
+    """
+    Main function: check that all spider definitions are compliant.
+    Usage: crawlo check
+    """
+    if args:
+        print("❌ Usage: crawlo check")
+        return 1
+
+    try:
+        # 1. Locate the project root directory
+        project_root = get_project_root()
+        if not project_root:
+            print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+            print("💡 Tip: Run this command inside your project directory.")
+            return 1
+
+        project_root_str = str(project_root)
+        if project_root_str not in sys.path:
+            sys.path.insert(0, project_root_str)
+
+        # 2. Read crawlo.cfg
+        cfg_file = project_root / "crawlo.cfg"
+        if not cfg_file.exists():
+            print(f"❌ Error: Expected config file not found: {cfg_file}")
+            return 1
+
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding="utf-8")
+
+        if not config.has_section("settings") or not config.has_option("settings", "default"):
+            print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+            return 1
+
+        settings_module = config.get("settings", "default")
+        project_package = settings_module.split(".")[0]
+
+        # 3. Make sure the project package is importable
+        try:
+            import_module(project_package)
+        except ImportError as e:
+            print(f"❌ Failed to import project package '{project_package}': {e}")
+            return 1
+
+        # 4. Load the spiders
+        spider_modules = [f"{project_package}.spiders"]
+        process = CrawlerProcess(spider_modules=spider_modules)
+        spider_names = process.get_spider_names()
+
+        if not spider_names:
+            print("📭 No spiders found.")
+            print("💡 Make sure:")
+            print(" • Spiders are defined in the 'spiders' module")
+            print(" • They have a `name` attribute")
+            print(" • Modules are properly imported")
+            return 1
+
+        print(f"🔍 Checking {len(spider_names)} spider(s)...")
+        print("-" * 60)
+
+        issues_found = False
+
+        for name in sorted(spider_names):
+            cls = process.get_spider_class(name)
+            issues = []
+
+            # Check the name attribute
+            if not getattr(cls, "name", None):
+                issues.append("missing or empty 'name' attribute")
+            elif not isinstance(cls.name, str):
+                issues.append("'name' is not a string")
+
+            # Check that start_requests is callable
+            if not callable(getattr(cls, "start_requests", None)):
+                issues.append("missing or non-callable 'start_requests' method")
+
+            # Check the type of start_urls (it should not be a string)
+            if hasattr(cls, "start_urls") and isinstance(cls.start_urls, str):
+                issues.append("'start_urls' is a string; should be list or tuple")
+
+            # Instantiate the spider and check the parse method (recommended, not required)
+            try:
+                spider = cls.create_instance(None)
+                if not callable(getattr(spider, "parse", None)):
+                    issues.append("no 'parse' method defined (recommended)")
+            except Exception as e:
+                issues.append(f"failed to instantiate spider: {e}")
+
+            # Report the result
+            if issues:
+                print(f"❌ {name:<20} {cls.__name__}")
+                for issue in issues:
+                    print(f" • {issue}")
+                issues_found = True
+            else:
+                print(f"✅ {name:<20} {cls.__name__} (OK)")
+
+        print("-" * 60)
+
+        if issues_found:
+            print("⚠️ Some spiders have issues. Please fix them.")
+            return 1
+        else:
+            print("🎉 All spiders are compliant and well-defined!")
+            return 0
+
+    except Exception as e:
+        print(f"❌ Unexpected error during check: {e}")
+        logger.exception("Exception in 'crawlo check'")
+        return 1
+
+
+if __name__ == "__main__":
+    """
+    Supports running directly:
+    python -m crawlo.commands.check
+    """
+    sys.exit(main(sys.argv[1:]))
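
The check command above, like the list and run commands below, locates the project by walking up to a crawlo.cfg and then reads its [settings] section for the default settings module, whose first dotted component is treated as the project package. A minimal crawlo.cfg that satisfies this lookup would look like the sketch below, where myproject is a hypothetical package name (the real file is generated by crawlo startproject):

[settings]
default = myproject.settings

With that file in place, settings_module is "myproject.settings" and project_package resolves to "myproject".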

crawlo-1.1.0/crawlo/commands/list.py
@@ -0,0 +1,119 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:33
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo list, lists all registered spiders.
+"""
+
+import sys
+import configparser
+from pathlib import Path
+from importlib import import_module
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def get_project_root():
+    """
+    Auto-detect the project root: walk upward from the current directory looking for crawlo.cfg.
+    Returns the directory path (as a string) when found; searches at most 10 levels up.
+    """
+    current = Path.cwd()
+
+    for _ in range(10):
+        cfg = current / "crawlo.cfg"
+        if cfg.exists():
+            return str(current)
+
+        # Reached the filesystem root
+        if current == current.parent:
+            break
+        current = current.parent
+
+    return None  # Not found
+
+
+def main(args):
+    """
+    Main function: list all available spiders.
+    Usage: crawlo list
+    """
+    if args:
+        print("❌ Usage: crawlo list")
+        return 1
+
+    try:
+        # 1. Locate the project root directory
+        project_root = get_project_root()
+        if not project_root:
+            print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+            print("💡 Tip: Run this command inside your project directory, or create a project with 'crawlo startproject'.")
+            return 1
+
+        project_root_path = Path(project_root)
+        project_root_str = str(project_root_path)
+
+        # 2. Add the project root to the Python path so project modules can be imported
+        if project_root_str not in sys.path:
+            sys.path.insert(0, project_root_str)
+
+        # 3. Read crawlo.cfg to find the settings module
+        cfg_file = project_root_path / "crawlo.cfg"
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding="utf-8")
+
+        if not config.has_section("settings") or not config.has_option("settings", "default"):
+            print("❌ Error: Invalid crawlo.cfg — missing [settings] or 'default' option.")
+            return 1
+
+        settings_module = config.get("settings", "default")
+        project_package = settings_module.split(".")[0]
+
+        # 4. Make sure the project package is importable (optional: import it so failures surface here)
+        try:
+            import_module(project_package)
+        except ImportError as e:
+            print(f"❌ Failed to import project package '{project_package}': {e}")
+            return 1
+
+        # 5. Initialise CrawlerProcess and load the spider modules
+        spider_modules = [f"{project_package}.spiders"]
+        process = CrawlerProcess(spider_modules=spider_modules)
+
+        # 6. Collect all spider names
+        spider_names = process.get_spider_names()
+        if not spider_names:
+            print("📭 No spiders found in 'spiders/' directory.")
+            print("💡 Make sure:")
+            print(" • Spider classes inherit from `crawlo.spider.Spider`")
+            print(" • Each spider has a `name` attribute")
+            print(" • Spiders are imported in `spiders/__init__.py` (if using package)")
+            return 1
+
+        # 7. Print the spider list
+        print(f"📋 Found {len(spider_names)} spider(s):")
+        print("-" * 60)
+        for name in sorted(spider_names):
+            spider_cls = process.get_spider_class(name)
+            module_name = spider_cls.__module__.replace(f"{project_package}.", "")
+            print(f"🕷️ {name:<20} {spider_cls.__name__:<25} ({module_name})")
+        print("-" * 60)
+        return 0
+
+    except Exception as e:
+        print(f"❌ Unexpected error: {e}")
+        logger.exception("Exception during 'crawlo list'")
+        return 1
+
+
+if __name__ == "__main__":
+    """
+    Supports running directly:
+    python -m crawlo.commands.list
+    """
+    sys.exit(main(sys.argv[1:]))
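
As a purely hypothetical illustration of the output format produced by the loop above, a project with two spiders named baidu and news (placeholder names) would print something along these lines:

📋 Found 2 spider(s):
------------------------------------------------------------
🕷️ baidu                BaiduSpider               (spiders.baidu)
🕷️ news                 NewsSpider                (spiders.news)
------------------------------------------------------------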

crawlo-1.1.0/crawlo/commands/run.py
@@ -0,0 +1,171 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:36
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo run <spider_name>|all, runs the specified spider(s).
+"""
+import sys
+import asyncio
+import configparser
+from pathlib import Path
+from importlib import import_module
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.log import get_logger
+from crawlo.utils.project import get_settings
+from crawlo.commands.stats import record_stats  # automatically record stats
+
+logger = get_logger(__name__)
+
+
+def get_project_root():
+    """
+    Walk upward looking for crawlo.cfg to determine the project root.
+    """
+    current = Path.cwd()
+
+    for _ in range(10):
+        cfg = current / "crawlo.cfg"
+        if cfg.exists():
+            return current
+
+        if current == current.parent:
+            break
+        current = current.parent
+
+    return None
+
+
+def main(args):
+    """
+    Main function: run the specified spider(s).
+    Usage:
+        crawlo run <spider_name>
+        crawlo run all
+    """
+    if len(args) < 1:
+        print("❌ Usage: crawlo run <spider_name>|all")
+        print("💡 Examples:")
+        print(" crawlo run baidu")
+        print(" crawlo run all")
+        return 1
+
+    spider_arg = args[0]
+
+    try:
+        # 1. Locate the project root directory
+        project_root = get_project_root()
+        if not project_root:
+            print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+            print("💡 Tip: Run this command inside your project directory.")
+            return 1
+
+        project_root_str = str(project_root)
+        if project_root_str not in sys.path:
+            sys.path.insert(0, project_root_str)
+
+        # 2. Read crawlo.cfg to find the settings module
+        cfg_file = project_root / "crawlo.cfg"
+        if not cfg_file.exists():
+            print(f"❌ Error: crawlo.cfg not found in {project_root}")
+            return 1
+
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding="utf-8")
+
+        if not config.has_section("settings") or not config.has_option("settings", "default"):
+            print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+            return 1
+
+        settings_module = config.get("settings", "default")
+        project_package = settings_module.split(".")[0]
+
+        # 3. Make sure the project package is importable
+        try:
+            import_module(project_package)
+        except ImportError as e:
+            print(f"❌ Failed to import project package '{project_package}': {e}")
+            return 1
+
+        # 4. Load the settings and spider modules
+        settings = get_settings()  # safe to call at this point
+        spider_modules = [f"{project_package}.spiders"]
+        process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+        # === Case 1: run all spiders ===
+        if spider_arg.lower() == "all":
+            spider_names = process.get_spider_names()
+            if not spider_names:
+                print("❌ No spiders found.")
+                print("💡 Make sure:")
+                print(" • Spiders are defined in 'spiders/'")
+                print(" • They have a `name` attribute")
+                print(" • Modules are imported (e.g. via __init__.py)")
+                return 1
+
+            print(f"🚀 Starting ALL {len(spider_names)} spider(s):")
+            print("-" * 60)
+            for name in sorted(spider_names):
+                cls = process.get_spider_class(name)
+                print(f"🕷️ {name:<20} {cls.__name__}")
+            print("-" * 60)
+
+            # Register stats recording (saved when each spider closes)
+            for crawler in process.crawlers:
+                crawler.signals.connect(record_stats, signal="spider_closed")
+
+            # Run all spiders in parallel (can be made sequential: for name in ... await process.crawl(name))
+            asyncio.run(process.crawl(spider_names))
+            print("✅ All spiders completed.")
+            return 0
+
+        # === Case 2: run a single spider ===
+        spider_name = spider_arg
+        if not process.is_spider_registered(spider_name):
+            print(f"❌ Spider '{spider_name}' not found.")
+            available = process.get_spider_names()
+            if available:
+                print("💡 Available spiders:")
+                for name in sorted(available):
+                    cls = process.get_spider_class(name)
+                    print(f" • {name} ({cls.__name__})")
+            else:
+                print("💡 No spiders found. Check your spiders module.")
+            return 1
+
+        spider_class = process.get_spider_class(spider_name)
+
+        # Print startup info
+        print(f"🚀 Starting spider: {spider_name}")
+        print(f"📦 Project: {project_package}")
+        print(f"CppClass: {spider_class.__name__}")
+        print(f"📄 Module: {spider_class.__module__}")
+        print("-" * 50)
+
+        # Register stats recording
+        for crawler in process.crawlers:
+            crawler.signals.connect(record_stats, signal="spider_closed")
+
+        # Run the spider
+        asyncio.run(process.crawl(spider_name))
+
+        print("-" * 50)
+        print("✅ Spider completed successfully!")
+        return 0
+
+    except KeyboardInterrupt:
+        print("\n⚠️ Spider interrupted by user.")
+        return 1
+    except Exception as e:
+        print(f"❌ Unexpected error: {e}")
+        logger.exception("Exception during 'crawlo run'")
+        return 1
+
+
+if __name__ == "__main__":
+    """
+    Supports running directly:
+    python -m crawlo.commands.run spider_name
+    """
+    sys.exit(main(sys.argv[1:]))
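
The run command is a thin wrapper around CrawlerProcess, so the same flow can be driven programmatically with the calls used above. The sketch below assumes a hypothetical project package named myproject with a spider named myspider, run from the project root so that get_settings() can locate the project settings; it is illustrative only and not part of the released code:

import asyncio

from crawlo.crawler import CrawlerProcess
from crawlo.utils.project import get_settings

settings = get_settings()  # load the project settings (run this from the project root)
process = CrawlerProcess(settings=settings, spider_modules=["myproject.spiders"])

if process.is_spider_registered("myspider"):
    asyncio.run(process.crawl("myspider"))  # run one spider to completion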

crawlo-1.1.0/crawlo/commands/stats.py
@@ -0,0 +1,167 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:36
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo stats, shows statistics from recent spider runs.
+"""
+
+import sys
+import json
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, Any
+
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+# Default storage directory (relative to the project root)
+STATS_DIR = "logs/stats"
+
+
+def get_stats_dir() -> Path:
+    """
+    Return the directory where stats files are stored, preferring logs/stats/ under the project root.
+    Falls back to the current directory when not inside a project.
+    """
+    # Try to find the project root (via crawlo.cfg)
+    current = Path.cwd()
+    for _ in range(10):
+        if (current / "crawlo.cfg").exists():
+            return current / STATS_DIR
+        if current == current.parent:
+            break
+        current = current.parent
+
+    # Fallback: use logs/stats under the current directory
+    return Path.cwd() / STATS_DIR
+
+
+def record_stats(crawler):
+    """
+    [Called while the crawler runs] Record a spider's final statistics to a JSON file.
+    Should be invoked from the Crawler's closed callback.
+    """
+    spider_name = getattr(crawler.spider, "name", "unknown")
+    stats = crawler.stats.get_stats() if crawler.stats else {}
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    stats_dir = Path(get_stats_dir())
+    stats_dir.mkdir(parents=True, exist_ok=True)
+
+    filename = stats_dir / f"{spider_name}_{timestamp}.json"
+    try:
+        with open(filename, "w", encoding="utf-8") as f:
+            json.dump({
+                "spider": spider_name,
+                "timestamp": datetime.now().isoformat(),
+                "stats": stats
+            }, f, ensure_ascii=False, indent=2, default=str)
+        logger.info(f"📊 Stats saved for spider '{spider_name}' → {filename}")
+    except Exception as e:
+        logger.error(f"Failed to save stats for '{spider_name}': {e}")
+
+
+def load_all_stats() -> Dict[str, list]:
+    """
+    Load all saved stats files, grouped by spider name.
+    Returns: {spider_name: [stats_record, ...]}
+    """
+    stats_dir = get_stats_dir()
+    if not stats_dir.exists():
+        return {}
+
+    result = {}
+    json_files = sorted(stats_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True)
+
+    for file in json_files:
+        try:
+            with open(file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+                spider_name = data.get("spider", "unknown")
+                result.setdefault(spider_name, []).append(data)
+        except Exception as e:
+            logger.warning(f"Failed to load stats file {file}: {e}")
+    return result
+
+
+def format_value(v: Any) -> str:
+    """Format a value so it stays short and printable."""
+    if isinstance(v, float):
+        return f"{v:.4f}"
+    return str(v)
+
+
+def main(args):
+    """
+    Main function: view statistics.
+    Usage:
+        crawlo stats → show the most recent run of every spider
+        crawlo stats myspider → show all recorded runs for the given spider
+        crawlo stats myspider --all → show the full history (same as above)
+    """
+    if len(args) > 2:
+        print("Usage: crawlo stats [spider_name] [--all]")
+        return 1
+
+    spider_name = None
+    show_all = False
+
+    if args:
+        spider_name = args[0]
+        show_all = "--all" in args or "-a" in args
+
+    # Load all stats
+    all_stats = load_all_stats()
+    if not all_stats:
+        print("📊 No stats found. Run a spider first.")
+        print(f"💡 Stats are saved in: {get_stats_dir()}")
+        return 0
+
+    if not spider_name:
+        # Show the most recent run of every spider
+        print("📊 Recent Spider Statistics (last run):")
+        print("-" * 60)
+        for name, runs in all_stats.items():
+            latest = runs[0]
+            print(f"🕷️ {name} ({latest['timestamp'][:19]})")
+            stats = latest["stats"]
+            for k in sorted(stats.keys()):
+                print(f" {k:<30} {format_value(stats[k])}")
+            print()
+        return 0
+
+    else:
+        # Show a specific spider
+        if spider_name not in all_stats:
+            print(f"📊 No stats found for spider '{spider_name}'")
+            available = ', '.join(all_stats.keys())
+            if available:
+                print(f"💡 Available spiders: {available}")
+            return 1
+
+        runs = all_stats[spider_name]
+        if show_all:
+            print(f"📊 All runs for '{spider_name}' ({len(runs)} runs):")
+        else:
+            runs = runs[:1]
+            print(f"📊 Last run for '{spider_name}':")
+
+        print("-" * 60)
+        for run in runs:
+            print(f"⏱️ Timestamp: {run['timestamp']}")
+            stats = run["stats"]
+            for k in sorted(stats.keys()):
+                print(f" {k:<30} {format_value(stats[k])}")
+            print("─" * 60)
+        return 0
+
+
+if __name__ == "__main__":
+    """
+    Supports running directly:
+    python -m crawlo.commands.stats
+    """
+    sys.exit(main(sys.argv[1:]))
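
record_stats writes one JSON document per run to logs/stats/<spider>_<timestamp>.json, which is exactly what load_all_stats() and the stats command read back. As a rough sketch, a saved file might look like the following; the spider name and the individual stat keys are placeholders, since the real keys depend on the framework's stats collector:

{
  "spider": "myspider",
  "timestamp": "2025-09-01T10:15:30.123456",
  "stats": {
    "request_count": 120,
    "response_count": 118,
    "item_count": 95
  }
}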

{crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/engine.py
@@ -42,7 +42,7 @@ class Engine(object):
     def engine_start(self):
         self.running = True
         self.logger.info(
-            f"Crawlo (version {self.settings.get_int('VERSION')}) started. "
+            f"Crawlo (version {self.settings.get_float('VERSION')}) started. "
             f"(project name : {self.settings.get('PROJECT_NAME')})"
         )


{crawlo-1.0.8 → crawlo-1.1.0/crawlo.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.0.8
+Version: 1.1.0
 Summary: Crawlo is a high-performance Python crawler framework based on asynchronous I/O, with support for distributed crawling.
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder

crawlo-1.1.0/examples/gxb/spider/__init__.py
@@ -0,0 +1,2 @@
+from .miit_spider import MiitSpider
+from .telecom_device import TelecomDeviceLicensesSpider

crawlo-1.0.8/crawlo/__version__.py
@@ -1 +0,0 @@
-__version__ = "1.0.8"

crawlo-1.0.8/crawlo/commands/__init__.py
@@ -1,10 +0,0 @@
-# crawlo/commands/__init__.py
-# Define the available commands
-_commands = {
-    'startproject': 'crawlo.commands.startproject',
-    'genspider': 'crawlo.commands.genspider',
-    'run': 'crawlo.commands.run',
-}
-
-def get_commands():
-    return _commands