crawlo 1.0.5.tar.gz → 1.0.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic.

Files changed (112)
  1. {crawlo-1.0.5/crawlo.egg-info → crawlo-1.0.7}/PKG-INFO +1 -1
  2. crawlo-1.0.7/crawlo/__version__.py +1 -0
  3. crawlo-1.0.7/crawlo/cli.py +41 -0
  4. crawlo-1.0.7/crawlo/commands/__init__.py +10 -0
  5. crawlo-1.0.7/crawlo/commands/genspider.py +111 -0
  6. crawlo-1.0.7/crawlo/commands/run.py +149 -0
  7. crawlo-1.0.7/crawlo/commands/startproject.py +101 -0
  8. crawlo-1.0.7/crawlo/crawler.py +219 -0
  9. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/exceptions.py +5 -0
  10. crawlo-1.0.7/crawlo/items/__init__.py +23 -0
  11. crawlo-1.0.7/crawlo/items/base.py +22 -0
  12. crawlo-1.0.5/crawlo/items/__init__.py → crawlo-1.0.7/crawlo/items/fields.py +12 -20
  13. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/items/items.py +10 -20
  14. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/settings/default_settings.py +1 -1
  15. crawlo-1.0.7/crawlo/templates/crawlo.cfg.tmpl +11 -0
  16. crawlo-1.0.7/crawlo/templates/project/__init__.py.tmpl +4 -0
  17. crawlo-1.0.7/crawlo/templates/project/items.py.tmpl +18 -0
  18. crawlo-1.0.7/crawlo/templates/project/middlewares.py.tmpl +76 -0
  19. crawlo-1.0.7/crawlo/templates/project/pipelines.py.tmpl +64 -0
  20. crawlo-1.0.7/crawlo/templates/project/settings.py.tmpl +54 -0
  21. crawlo-1.0.7/crawlo/templates/project/spiders/__init__.py.tmpl +6 -0
  22. crawlo-1.0.7/crawlo/templates/spider/spider.py.tmpl +32 -0
  23. crawlo-1.0.7/crawlo/utils/project.py +199 -0
  24. crawlo-1.0.7/crawlo/utils/spider_loader.py +63 -0
  25. {crawlo-1.0.5 → crawlo-1.0.7/crawlo.egg-info}/PKG-INFO +1 -1
  26. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo.egg-info/SOURCES.txt +17 -7
  27. crawlo-1.0.7/crawlo.egg-info/entry_points.txt +2 -0
  28. {crawlo-1.0.5 → crawlo-1.0.7}/examples/gxb/items.py +1 -1
  29. {crawlo-1.0.5 → crawlo-1.0.7}/examples/gxb/run.py +2 -1
  30. {crawlo-1.0.5 → crawlo-1.0.7}/examples/gxb/settings.py +2 -1
  31. crawlo-1.0.5/examples/gxb/spider/telecom_device_licenses.py → crawlo-1.0.7/examples/gxb/spider/telecom_device.py +1 -1
  32. {crawlo-1.0.5 → crawlo-1.0.7}/setup.cfg +1 -1
  33. crawlo-1.0.5/crawlo/__version__.py +0 -1
  34. crawlo-1.0.5/crawlo/crawler.py +0 -424
  35. crawlo-1.0.5/crawlo/templates/item_template.tmpl +0 -22
  36. crawlo-1.0.5/crawlo/templates/project_template/main.py +0 -33
  37. crawlo-1.0.5/crawlo/templates/project_template/setting.py +0 -190
  38. crawlo-1.0.5/crawlo/templates/spider_template.tmpl +0 -31
  39. crawlo-1.0.5/crawlo/utils/project.py +0 -59
  40. crawlo-1.0.5/crawlo.egg-info/entry_points.txt +0 -2
  41. crawlo-1.0.5/examples/gxb/__init__.py +0 -0
  42. crawlo-1.0.5/examples/gxb/spider/__init__.py +0 -0
  43. {crawlo-1.0.5 → crawlo-1.0.7}/LICENSE +0 -0
  44. {crawlo-1.0.5 → crawlo-1.0.7}/MANIFEST.in +0 -0
  45. {crawlo-1.0.5 → crawlo-1.0.7}/README.md +0 -0
  46. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/__init__.py +0 -0
  47. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/core/__init__.py +0 -0
  48. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/core/engine.py +0 -0
  49. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/core/processor.py +0 -0
  50. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/core/scheduler.py +0 -0
  51. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/downloader/__init__.py +0 -0
  52. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/downloader/aiohttp_downloader.py +0 -0
  53. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/downloader/cffi_downloader.py +0 -0
  54. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/downloader/httpx_downloader.py +0 -0
  55. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/event.py +0 -0
  56. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/extension/__init__.py +0 -0
  57. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/extension/log_interval.py +0 -0
  58. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/extension/log_stats.py +0 -0
  59. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/extension/logging_extension.py +0 -0
  60. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/filters/__init__.py +0 -0
  61. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/filters/aioredis_filter.py +0 -0
  62. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/filters/memory_filter.py +0 -0
  63. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/__init__.py +0 -0
  64. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/default_header.py +0 -0
  65. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/download_delay.py +0 -0
  66. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/middleware_manager.py +0 -0
  67. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/proxy.py +0 -0
  68. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/request_ignore.py +0 -0
  69. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/response_code.py +0 -0
  70. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/response_filter.py +0 -0
  71. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/middleware/retry.py +0 -0
  72. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/network/__init__.py +0 -0
  73. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/network/request.py +0 -0
  74. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/network/response.py +0 -0
  75. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/pipelines/__init__.py +0 -0
  76. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/pipelines/console_pipeline.py +0 -0
  77. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/pipelines/mongo_pipeline.py +0 -0
  78. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/pipelines/mysql_batch_pipline.py +0 -0
  79. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/pipelines/mysql_pipeline.py +0 -0
  80. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/pipelines/pipeline_manager.py +0 -0
  81. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/settings/__init__.py +0 -0
  82. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/settings/setting_manager.py +0 -0
  83. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/spider/__init__.py +0 -0
  84. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/stats_collector.py +0 -0
  85. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/subscriber.py +0 -0
  86. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/task_manager.py +0 -0
  87. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/__init__.py +0 -0
  88. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/concurrency_manager.py +0 -0
  89. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/date_tools.py +0 -0
  90. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/db_helper.py +0 -0
  91. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/func_tools.py +0 -0
  92. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/log.py +0 -0
  93. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/pqueue.py +0 -0
  94. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/request.py +0 -0
  95. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/system.py +0 -0
  96. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/tools.py +0 -0
  97. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo/utils/url.py +0 -0
  98. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo.egg-info/dependency_links.txt +0 -0
  99. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo.egg-info/requires.txt +0 -0
  100. {crawlo-1.0.5 → crawlo-1.0.7}/crawlo.egg-info/top_level.txt +0 -0
  101. {crawlo-1.0.5/crawlo/templates/project_template/items → crawlo-1.0.7/examples}/__init__.py +0 -0
  102. {crawlo-1.0.5/crawlo/templates/project_template/spiders → crawlo-1.0.7/examples/gxb}/__init__.py +0 -0
  103. {crawlo-1.0.5/examples → crawlo-1.0.7/examples/gxb/spider}/__init__.py +0 -0
  104. {crawlo-1.0.5 → crawlo-1.0.7}/examples/gxb/spider/miit_spider.py +0 -0
  105. {crawlo-1.0.5 → crawlo-1.0.7}/pyproject.toml +0 -0
  106. {crawlo-1.0.5 → crawlo-1.0.7}/requirements.txt +0 -0
  107. {crawlo-1.0.5 → crawlo-1.0.7}/tests/__init__.py +0 -0
  108. {crawlo-1.0.5 → crawlo-1.0.7}/tests/test_proxy_health_check.py +0 -0
  109. {crawlo-1.0.5 → crawlo-1.0.7}/tests/test_proxy_middleware_integration.py +0 -0
  110. {crawlo-1.0.5 → crawlo-1.0.7}/tests/test_proxy_providers.py +0 -0
  111. {crawlo-1.0.5 → crawlo-1.0.7}/tests/test_proxy_stats.py +0 -0
  112. {crawlo-1.0.5 → crawlo-1.0.7}/tests/test_proxy_strategies.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.0.5
+ Version: 1.0.7
  Summary: Crawlo is a high-performance Python crawler framework based on asynchronous IO, with support for distributed crawling.
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
@@ -0,0 +1 @@
+ __version__ = "1.0.7"
@@ -0,0 +1,41 @@
+ # crawlo/cli.py
+ # !/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ import sys
+ import argparse
+ from crawlo.commands import get_commands
+
+
+ def main():
+     # Collect all available commands
+     commands = get_commands()
+
+     parser = argparse.ArgumentParser(
+         description="Crawlo: A lightweight web crawler framework.",
+         usage="crawlo <command> [options]"
+     )
+     parser.add_argument('command', help='Available commands: ' + ', '.join(commands.keys()))
+     # Note: no sub-command arguments are added here; each sub-command parses its own
+
+     # Parse only the command itself
+     args, unknown = parser.parse_known_args()
+
+     if args.command not in commands:
+         print(f"Unknown command: {args.command}")
+         print(f"Available commands: {', '.join(commands.keys())}")
+         sys.exit(1)
+
+     # Dynamically import and execute the command module
+     try:
+         module = __import__(commands[args.command], fromlist=['main'])
+         sys.exit(module.main(unknown))
+     except ImportError as e:
+         print(f"Failed to load command '{args.command}': {e}")
+         sys.exit(1)
+     except Exception as e:
+         print(f"Command '{args.command}' failed: {e}")
+         sys.exit(1)
+
+
+ if __name__ == '__main__':
+     main()
@@ -0,0 +1,10 @@
+ # crawlo/commands/__init__.py
+ # Registry of available commands
+ _commands = {
+     'startproject': 'crawlo.commands.startproject',
+     'genspider': 'crawlo.commands.genspider',
+     'run': 'crawlo.commands.run',
+ }
+
+ def get_commands():
+     return _commands
@@ -0,0 +1,111 @@
+ import os
+ import sys
+ from pathlib import Path
+ import configparser
+ import importlib
+
+ TEMPLATES_DIR = Path(__file__).parent.parent / 'templates'
+
+
+ def _render_template(tmpl_path, context):
+     """Read a template file and replace {{key}} placeholders with values from context"""
+     with open(tmpl_path, 'r', encoding='utf-8') as f:
+         content = f.read()
+     for key, value in context.items():
+         content = content.replace(f'{{{{{key}}}}}', str(value))
+     return content
+
+
+ def main(args):
+     if len(args) < 2:
+         print("Usage: crawlo genspider <spider_name> <domain>")
+         return 1
+
+     spider_name = args[0]
+     domain = args[1]
+
+     # Locate the project root directory
+     project_root = None
+     current = Path.cwd()
+     while True:
+         cfg_file = current / 'crawlo.cfg'
+         if cfg_file.exists():
+             project_root = current
+             break
+         parent = current.parent
+         if parent == current:
+             break
+         current = parent
+
+     if not project_root:
+         print("Error: Not a crawlo project. crawlo.cfg not found.")
+         return 1
+
+     # Add the project root to sys.path
+     if str(project_root) not in sys.path:
+         sys.path.insert(0, str(project_root))
+
+     # Read the settings module from crawlo.cfg to get the project package name
+     config = configparser.ConfigParser()
+     try:
+         config.read(cfg_file, encoding='utf-8')
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]  # e.g., myproject.settings -> myproject
+     except Exception as e:
+         print(f"Error reading crawlo.cfg: {e}")
+         return 1
+
+     # Determine the path of the items module
+     items_module_path = f"{project_package}.items"
+
+     # Try to import the items module
+     try:
+         items_module = importlib.import_module(items_module_path)
+         # Collect all classes in the module whose names start with an uppercase letter
+         item_classes = [cls for cls in items_module.__dict__.values()
+                         if isinstance(cls, type) and cls.__name__.isupper()]
+
+         # If classes were found, use the first one as the default
+         if item_classes:
+             default_item_class = item_classes[0].__name__
+         else:
+             default_item_class = "ExampleItem"  # fall back to the example
+     except ImportError as e:
+         print(f"Error importing items module '{items_module_path}': {e}")
+         default_item_class = "ExampleItem"
+
+     # Create the spider file
+     spiders_dir = project_root / project_package / 'spiders'
+     if not spiders_dir.exists():
+         spiders_dir.mkdir(parents=True)
+
+     spider_file = spiders_dir / f'{spider_name}.py'
+     if spider_file.exists():
+         print(f"Error: Spider '{spider_name}' already exists.")
+         return 1
+
+     # ✅ Corrected template path
+     tmpl_path = TEMPLATES_DIR / 'spider' / 'spider.py.tmpl'
+
+     if not tmpl_path.exists():
+         print(f"Error: Template file not found at {tmpl_path}")
+         return 1
+
+     # ✅ Generate the proper class name
+     class_name = f"{spider_name.capitalize()}Spider"
+
+     context = {
+         'spider_name': spider_name,
+         'domain': domain,
+         'project_name': project_package,
+         'item_class': default_item_class,
+         'class_name': class_name  # ✅ pass the prepared class name
+     }
+
+     content = _render_template(tmpl_path, context)
+
+     with open(spider_file, 'w', encoding='utf-8') as f:
+         f.write(content)
+
+     print(f"Spider '{spider_name}' created in {spider_file}")
+     return 0
@@ -0,0 +1,149 @@
+ # crawlo/commands/run.py
+ import asyncio
+ import importlib
+ import sys
+ from pathlib import Path
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+ from crawlo.utils.spider_loader import SpiderLoader
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     """
+     Entry point for running the named spider.
+     Usage: crawlo run <spider_name>
+     """
+     if len(args) < 1:
+         print("Usage: crawlo run <spider_name>")
+         print("Example: crawlo run baidu")
+         return 1
+
+     spider_name = args[0]
+
+     try:
+         # 1. Get the project root directory
+         project_root = get_settings()
+
+         # Add the project root to the Python path
+         if str(project_root) not in sys.path:
+             sys.path.insert(0, str(project_root))
+
+         # 2. Read the config file to get the project package name
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # 3. Find and load the Spider with the given name
+         spider_class = find_spider_by_name(project_package, spider_name)
+         if spider_class is None:
+             return 1
+
+         # 4. Create a CrawlerProcess and run the single spider
+         settings = get_settings()
+         process = CrawlerProcess(settings)
+
+         print(f"🚀 Starting spider: {spider_class.name}")
+         print(f"📁 Project: {project_package}")
+         print(f"🕷️ Class: {spider_class.__name__}")
+         print("-" * 50)
+
+         # Run the single spider
+         asyncio.run(process.crawl(spider_class))
+
+         print("-" * 50)
+         print("✅ Spider completed successfully!")
+         return 0
+
+     except Exception as e:
+         print(f"❌ Error running spider: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ def find_spider_by_name(project_package: str, target_spider_name: str):
+     """Look up a spider via SpiderLoader"""
+     loader = SpiderLoader(project_package)
+     spider_class = loader.load(target_spider_name)
+
+     if spider_class is None:
+         print(f"❌ Error: Spider with name '{target_spider_name}' not found")
+         print("💡 Available spiders:")
+         available_spiders = loader.list()
+         for spider_name in available_spiders:
+             print(f" - {spider_name}")
+         return None
+
+     return spider_class
+
+
+ def list_available_spiders(project_package: str):
+     """
+     List all available spiders
+     """
+     spiders_dir = Path.cwd() / project_package / 'spiders'
+     if not spiders_dir.exists():
+         print(" No spiders directory found")
+         return
+
+     spider_count = 0
+     for py_file in spiders_dir.glob("*.py"):
+         if py_file.name.startswith('_'):
+             continue
+
+         module_name = py_file.stem
+         spider_module_path = f"{project_package}.spiders.{module_name}"
+
+         try:
+             module = importlib.import_module(spider_module_path)
+         except ImportError:
+             continue
+
+         # Find all Spider subclasses in the module
+         from crawlo.spider import Spider
+         for attr_name in dir(module):
+             attr_value = getattr(module, attr_name)
+             if (isinstance(attr_value, type) and
+                     issubclass(attr_value, Spider) and
+                     attr_value != Spider and
+                     hasattr(attr_value, 'name')):
+                 print(f" - {attr_value.name} (class: {attr_value.__name__}, module: {module_name})")
+                 spider_count += 1
+
+     if spider_count == 0:
+         print(" No spiders found")
+
+
+ def run_spider_by_name(spider_name: str, project_root: Path = None):
+     """
+     Run a spider by name directly from code
+     """
+     if project_root:
+         if str(project_root) not in sys.path:
+             sys.path.insert(0, str(project_root))
+
+     args = [spider_name]
+     return main(args)
+
+
+ if __name__ == '__main__':
+     # Allow running directly: python -m crawlo.commands.run <spider_name>
+     import sys
+
+     sys.exit(main(sys.argv[1:]))
@@ -0,0 +1,101 @@
+ # crawlo/commands/startproject.py
+ import os
+ import shutil
+ from pathlib import Path
+
+ TEMPLATES_DIR = Path(__file__).parent.parent / 'templates'
+
+
+ def _render_template(tmpl_path, context):
+     """Read a template file and replace {{key}} placeholders with values from context"""
+     with open(tmpl_path, 'r', encoding='utf-8') as f:
+         content = f.read()
+     for key, value in context.items():
+         content = content.replace(f'{{{{{key}}}}}', str(value))
+     return content
+
+
+ def _copytree_with_templates(src, dst, context):
+     """
+     Recursively copy a directory; .tmpl files are rendered and copied with the .tmpl suffix stripped, all other files are copied as-is.
+     """
+     src_path = Path(src)
+     dst_path = Path(dst)
+     dst_path.mkdir(parents=True, exist_ok=True)
+
+     for item in src_path.rglob('*'):
+         rel_path = item.relative_to(src_path)
+         dst_item = dst_path / rel_path
+
+         if item.is_dir():
+             # Create the target directory
+             dst_item.mkdir(parents=True, exist_ok=True)
+         else:
+             if item.suffix == '.tmpl':
+                 # Render the template file and drop the .tmpl suffix
+                 rendered_content = _render_template(item, context)
+                 final_dst = dst_item.with_suffix('')  # drop .tmpl
+                 final_dst.parent.mkdir(parents=True, exist_ok=True)  # make sure the parent directory exists
+                 with open(final_dst, 'w', encoding='utf-8') as f:
+                     f.write(rendered_content)
+             else:
+                 # Plain file, copy it directly
+                 shutil.copy2(item, dst_item)
+
+
+ def main(args):
+     if len(args) != 1:
+         print("Usage: crawlo startproject <project_name>")
+         return 1
+
+     project_name = args[0]
+     project_dir = Path(project_name)
+
+     if project_dir.exists():
+         print(f"Error: Directory '{project_dir}' already exists.")
+         return 1
+
+     context = {'project_name': project_name}
+     template_dir = TEMPLATES_DIR / 'project'
+
+     try:
+         # 1. Create the project root directory
+         project_dir.mkdir()
+
+         # 2. Handle crawlo.cfg.tmpl: render it separately and write it to the project root
+         cfg_template = TEMPLATES_DIR / 'crawlo.cfg.tmpl'  # ✅ use the template under templates/
+         if cfg_template.exists():
+             cfg_content = _render_template(cfg_template, context)
+             (project_dir / 'crawlo.cfg').write_text(cfg_content, encoding='utf-8')
+         else:
+             print("Warning: crawlo.cfg.tmpl not found in templates.")
+
+         # 3. Copy all other template files into the project package (project_dir / project_name)
+         package_dir = project_dir / project_name
+         # This copies __init__.py.tmpl, items.py.tmpl, settings.py.tmpl, spiders/, etc.
+         # and renders them into .py files
+         _copytree_with_templates(template_dir, package_dir, context)
+
+         # 4. Create the logs directory
+         (project_dir / 'logs').mkdir(exist_ok=True)
+
+         print(f"""
+ ✔ Project '{project_name}' created successfully!
+
+ Enter the project directory:
+     cd {project_name}
+
+ Create a spider:
+     crawlo genspider example example.com
+
+ Run the spider:
+     crawlo run example
+ """)
+         return 0
+
+     except Exception as e:
+         print(f"Error creating project: {e}")
+         # On error, try to clean up the directory that was created
+         if project_dir.exists():
+             shutil.rmtree(project_dir, ignore_errors=True)
+         return 1
@@ -0,0 +1,219 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ import asyncio
+ import signal
+ from typing import Type, Optional, Set, List
+
+ from crawlo.spider import Spider
+ from crawlo.core.engine import Engine
+ from crawlo.utils.log import get_logger
+ from crawlo.subscriber import Subscriber
+ from crawlo.extension import ExtensionManager
+ from crawlo.exceptions import SpiderTypeError
+ from crawlo.stats_collector import StatsCollector
+ from crawlo.event import spider_opened, spider_closed
+ from crawlo.settings.setting_manager import SettingManager
+ from crawlo.utils.project import merge_settings, get_settings
+
+
+ logger = get_logger(__name__)
+
+
+ class Crawler:
+     """A single spider run instance, binding a Spider to the engine"""
+
+     def __init__(self, spider_cls: Type[Spider], settings: SettingManager):
+         self.spider_cls = spider_cls
+         self.spider: Optional[Spider] = None
+         self.engine: Optional[Engine] = None
+         self.stats: Optional[StatsCollector] = None
+         self.subscriber: Optional[Subscriber] = None
+         self.extension: Optional[ExtensionManager] = None
+         self.settings: SettingManager = settings.copy()
+         self._closed = False  # new state flag
+         self._close_lock = asyncio.Lock()
+
+     async def crawl(self):
+         """Run the core spider workflow"""
+         self.subscriber = self._create_subscriber()
+         self.spider = self._create_spider()
+         self.engine = self._create_engine()
+         self.stats = self._create_stats()
+         self.extension = self._create_extension()
+
+         await self.engine.start_spider(self.spider)
+
+     @staticmethod
+     def _create_subscriber() -> Subscriber:
+         return Subscriber()
+
+     def _create_spider(self) -> Spider:
+         spider = self.spider_cls.create_instance(self)
+
+         # --- critical attribute checks ---
+         if not getattr(spider, 'name', None):
+             raise AttributeError(f"Spider class '{self.spider_cls.__name__}' must define a 'name' attribute.")
+
+         if not callable(getattr(spider, 'start_requests', None)):
+             raise AttributeError(f"Spider '{spider.name}' must implement a callable 'start_requests' method.")
+
+         start_urls = getattr(spider, 'start_urls', [])
+         if isinstance(start_urls, str):
+             raise TypeError(f"'start_urls' of spider '{spider.name}' must be a list or tuple, not a string.")
+
+         if not callable(getattr(spider, 'parse', None)):
+             logger.warning(
+                 f"Spider '{spider.name}' does not define a 'parse' method. Make sure every Request specifies a callback, otherwise its response will be ignored.")
+
+         self._set_spider(spider)
+         return spider
+
+     def _create_engine(self) -> Engine:
+         engine = Engine(self)
+         engine.engine_start()
+         return engine
+
+     def _create_stats(self) -> StatsCollector:
+         return StatsCollector(self)
+
+     def _create_extension(self) -> ExtensionManager:
+         return ExtensionManager.create_instance(self)
+
+     def _set_spider(self, spider: Spider):
+         self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
+         self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
+         merge_settings(spider, self.settings)
+
+     async def close(self, reason='finished') -> None:
+         async with self._close_lock:
+             if self._closed:
+                 return
+             self._closed = True
+             await self.subscriber.notify(spider_closed)
+             if self.stats and self.spider:
+                 self.stats.close_spider(spider=self.spider, reason=reason)
+
+
+ class CrawlerProcess:
+     """
+     Crawler process manager with concurrent multi-spider scheduling, semaphore control, live logging, and graceful shutdown
+     """
+
+     def __init__(self, settings: Optional[SettingManager] = None, max_concurrency: Optional[int] = None):
+         self.settings: SettingManager = settings or self._get_default_settings()
+         self.crawlers: Set[Crawler] = set()
+         self._active_tasks: Set[asyncio.Task] = set()
+
+         # Use the dedicated setting, falling back to CONCURRENCY
+         self.max_concurrency: int = (
+             max_concurrency
+             or self.settings.get('MAX_RUNNING_SPIDERS')
+             or self.settings.get('CONCURRENCY', 3)
+         )
+         self.semaphore = asyncio.Semaphore(self.max_concurrency)
+
+         # Register signal handlers
+         signal.signal(signal.SIGINT, self._shutdown)
+         signal.signal(signal.SIGTERM, self._shutdown)
+         logger.info(f"CrawlerProcess initialised, max concurrent spiders: {self.max_concurrency}")
+
+     async def crawl(self, spiders):
+         """
+         Start one or more spiders with streaming scheduling and live progress feedback
+         """
+         spider_classes = self._normalize_spiders(spiders)
+         total = len(spider_classes)
+
+         if total == 0:
+             raise ValueError("At least one spider class must be provided")
+
+         # Sort by name
+         spider_classes.sort(key=lambda cls: cls.__name__.lower())
+
+         logger.info(f"Starting {total} spider(s).")
+
+         # Launch all spider tasks in a streaming fashion
+         tasks = [
+             asyncio.create_task(self._run_spider_with_limit(spider_cls, index + 1, total))
+             for index, spider_cls in enumerate(spider_classes)
+         ]
+
+         # Wait for all tasks to finish (failures do not abort the rest)
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+
+         # Tally exceptions
+         failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
+         if failed:
+             logger.error(f"{len(failed)} spider(s) raised exceptions: {[spider_classes[i].__name__ for i in failed]}")
+
+     @staticmethod
+     def _normalize_spiders(spiders) -> List[Type[Spider]]:
+         """Normalise the input into a list of spider classes"""
+         if isinstance(spiders, type) and issubclass(spiders, Spider):
+             return [spiders]
+         elif isinstance(spiders, (list, tuple)):
+             return list(spiders)
+         else:
+             raise TypeError("spiders must be a spider class or a list/tuple of spider classes")
+
+     async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
+         """
+         Semaphore-limited spider runner with progress logging
+         """
+         task = asyncio.current_task()
+         self._active_tasks.add(task)
+
+         try:
+             # Acquire a concurrency permit
+             await self.semaphore.acquire()
+
+             start_msg = f"[{seq}/{total}] Starting spider: {spider_cls.__name__}"
+             logger.info(start_msg)
+
+             # Create and run the crawler
+             crawler = self._create_crawler(spider_cls)
+             self.crawlers.add(crawler)
+             await crawler.crawl()
+
+             end_msg = f"[{seq}/{total}] Spider finished: {spider_cls.__name__}"
+             logger.info(end_msg)
+
+         except Exception as e:
+             logger.error(f"Spider {spider_cls.__name__} failed: {e}", exc_info=True)
+             raise
+         finally:
+             if task in self._active_tasks:
+                 self._active_tasks.remove(task)
+             self.semaphore.release()  # must be released
+
+     def _create_crawler(self, spider_cls: Type[Spider]) -> Crawler:
+         """Create a crawler instance"""
+         if isinstance(spider_cls, str):
+             raise SpiderTypeError(f"String-based spiders are not supported: {spider_cls}")
+         return Crawler(spider_cls, self.settings)
+
+     def _shutdown(self, _signum, _frame):
+         """Graceful-shutdown signal handler"""
+         logger.warning("Shutdown signal received, stopping all spiders...")
+         for crawler in list(self.crawlers):
+             if crawler.engine:
+                 crawler.engine.running = False
+                 crawler.engine.normal = False
+         asyncio.create_task(self._wait_for_shutdown())
+
+     async def _wait_for_shutdown(self):
+         """Wait for all active tasks to finish"""
+         pending = [t for t in self._active_tasks if not t.done()]
+         if pending:
+             logger.info(f"Waiting for {len(pending)} active task(s) to finish...")
+             await asyncio.gather(*pending, return_exceptions=True)
+         logger.info("All spiders have shut down gracefully")
+
+     @classmethod
+     def _get_default_settings(cls) -> SettingManager:
+         """Load the default settings"""
+         try:
+             return get_settings()
+         except Exception as e:
+             logger.warning(f"Failed to load default settings: {e}")
+             return SettingManager()
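
For orientation, the new run command (see run.py above) drives this CrawlerProcess API directly. A minimal sketch of the same flow from user code follows; the project spider import is a hypothetical placeholder, not part of this release:

    # Sketch only: myproject / ExampleSpider are hypothetical names.
    import asyncio
    from crawlo.crawler import CrawlerProcess
    from crawlo.utils.project import get_settings
    from myproject.spiders.example import ExampleSpider  # hypothetical project spider

    process = CrawlerProcess(get_settings())
    # Equivalent to `crawlo run example`: run a single spider
    asyncio.run(process.crawl(ExampleSpider))
    # Several spiders can be passed as a list; concurrency is capped by
    # MAX_RUNNING_SPIDERS, falling back to CONCURRENCY (default 3):
    # asyncio.run(process.crawl([ExampleSpider, AnotherSpider]))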
@@ -71,3 +71,8 @@ class ReceiverTypeError(Exception):
  class SpiderCreationError(Exception):
      """Exception raised when spider instantiation fails"""
      pass
+
+
+ class ItemValidationError(Exception):
+     """Item field validation error"""
+     pass
@@ -0,0 +1,23 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ The crawlo.items package
+ ========================
+ Provides the Item and Field classes for data definition and validation.
+ """
+ from .items import Item
+ from .fields import Field
+ from .base import ItemMeta
+
+ from crawlo.exceptions import ItemInitError, ItemAttributeError
+
+ __all__ = [
+     'Item',
+     'Field',
+     'ItemMeta',
+     'ItemInitError',
+     'ItemAttributeError'
+ ]
+
+
+
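
The fields.py and items.py bodies are not shown in this diff, so the exact Field signature is unknown. Assuming the Scrapy-style declarative usage that the package docstring and the new ItemValidationError suggest, a project item would look roughly like:

    # Assumed usage sketch: Field() arguments and validation behaviour are guesses;
    # only the Item/Field names and the exception classes come from this diff.
    from crawlo.items import Item, Field

    class ProductItem(Item):
        title = Field()
        price = Field()

    item = ProductItem()
    item['title'] = 'example'
    # Undeclared or invalid fields would presumably raise ItemAttributeError
    # or ItemValidationError.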