crawlo 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of crawlo has been flagged as potentially problematic in its registry.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +68 -42
- crawlo/commands/list.py +102 -93
- crawlo/commands/startproject.py +89 -4
- crawlo/commands/utils.py +187 -0
- crawlo/config.py +280 -0
- crawlo/core/engine.py +16 -3
- crawlo/core/enhanced_engine.py +190 -0
- crawlo/core/scheduler.py +113 -8
- crawlo/crawler.py +840 -307
- crawlo/downloader/__init__.py +181 -17
- crawlo/downloader/aiohttp_downloader.py +15 -2
- crawlo/downloader/cffi_downloader.py +11 -1
- crawlo/downloader/httpx_downloader.py +14 -3
- crawlo/filters/__init__.py +122 -5
- crawlo/filters/aioredis_filter.py +128 -36
- crawlo/filters/memory_filter.py +99 -32
- crawlo/middleware/proxy.py +11 -8
- crawlo/middleware/retry.py +40 -5
- crawlo/mode_manager.py +201 -0
- crawlo/network/__init__.py +17 -3
- crawlo/network/request.py +118 -10
- crawlo/network/response.py +131 -28
- crawlo/pipelines/__init__.py +1 -1
- crawlo/pipelines/csv_pipeline.py +317 -0
- crawlo/pipelines/json_pipeline.py +219 -0
- crawlo/queue/__init__.py +0 -0
- crawlo/queue/pqueue.py +37 -0
- crawlo/queue/queue_manager.py +304 -0
- crawlo/queue/redis_priority_queue.py +192 -0
- crawlo/settings/default_settings.py +68 -9
- crawlo/spider/__init__.py +576 -66
- crawlo/task_manager.py +4 -1
- crawlo/templates/project/middlewares.py.tmpl +56 -45
- crawlo/templates/project/pipelines.py.tmpl +308 -36
- crawlo/templates/project/run.py.tmpl +239 -0
- crawlo/templates/project/settings.py.tmpl +211 -17
- crawlo/templates/spider/spider.py.tmpl +153 -7
- crawlo/utils/controlled_spider_mixin.py +336 -0
- crawlo/utils/large_scale_config.py +287 -0
- crawlo/utils/large_scale_helper.py +344 -0
- crawlo/utils/queue_helper.py +176 -0
- crawlo/utils/request_serializer.py +220 -0
- crawlo-1.1.2.dist-info/METADATA +567 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/RECORD +54 -46
- tests/test_final_validation.py +154 -0
- tests/test_redis_config.py +29 -0
- tests/test_redis_queue.py +225 -0
- tests/test_request_serialization.py +71 -0
- tests/test_scheduler.py +242 -0
- crawlo/pipelines/mysql_batch_pipline.py +0 -273
- crawlo/utils/pqueue.py +0 -174
- crawlo-1.1.1.dist-info/METADATA +0 -220
- examples/baidu_spider/__init__.py +0 -7
- examples/baidu_spider/demo.py +0 -94
- examples/baidu_spider/items.py +0 -46
- examples/baidu_spider/middleware.py +0 -49
- examples/baidu_spider/pipeline.py +0 -55
- examples/baidu_spider/run.py +0 -27
- examples/baidu_spider/settings.py +0 -121
- examples/baidu_spider/spiders/__init__.py +0 -7
- examples/baidu_spider/spiders/bai_du.py +0 -61
- examples/baidu_spider/spiders/miit.py +0 -159
- examples/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
crawlo/__init__.py
CHANGED

```diff
@@ -4,7 +4,7 @@
 Crawlo - an asynchronous crawler framework
 """
 from crawlo.spider import Spider
-from crawlo.items import Item
+from crawlo.items import Item, Field
 from crawlo.network.request import Request
 from crawlo.network.response import Response
 from crawlo.downloader import DownloaderBase
@@ -26,6 +26,7 @@ except Exception:
 __all__ = [
     'Spider',
     'Item',
+    'Field',
     'Request',
     'Response',
     'DownloaderBase',
```
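With `Field` now re-exported at the package top level, item definitions can import everything from `crawlo` directly. A minimal sketch of the new import style; the `ArticleItem` class and the declarative `Field()` usage are illustrative assumptions, since this diff only confirms that `Item` and `Field` are exported:

```python
# Illustrative only: assumes crawlo's Item/Field follow the familiar
# declarative-item pattern; the diff confirms the exports, not the semantics.
from crawlo import Item, Field


class ArticleItem(Item):
    title = Field()
    url = Field()
    published_at = Field()
```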
crawlo/__version__.py
CHANGED

```diff
@@ -1 +1 @@
-__version__ = "1.1.1"
+__version__ = "1.1.2"
```
crawlo/commands/genspider.py
CHANGED

```diff
@@ -11,6 +11,15 @@ import configparser
 import importlib
 from rich.console import Console
 
+from .utils import (
+    get_project_root,
+    validate_project_environment,
+    show_error_panel,
+    show_success_panel,
+    validate_spider_name,
+    is_valid_domain
+)
+
 # Initialize the rich console
 console = Console()
 
@@ -29,41 +38,42 @@ def _render_template(tmpl_path, context):
 def main(args):
     if len(args) < 2:
         console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo genspider[/blue] <spider_name> <domain>")
+        console.print("💡 Examples:")
+        console.print("   [blue]crawlo genspider[/blue] news_spider news.example.com")
+        console.print("   [blue]crawlo genspider[/blue] product_spider shop.example.com")
         return 1
 
     spider_name = args[0]
     domain = args[1]
-
-    # … (removed validation block, truncated in the source diff view)
-    if not …
+
+    # Validate the spider name
+    if not validate_spider_name(spider_name):
+        show_error_panel(
+            "Invalid Spider Name",
+            f"Spider name '[cyan]{spider_name}[/cyan]' is invalid.\n"
+            "💡 Spider name should:\n"
+            "   • Start with lowercase letter\n"
+            "   • Contain only lowercase letters, numbers, and underscores\n"
+            "   • Be a valid Python identifier"
+        )
+        return 1
+
+    # Validate the domain format
+    if not is_valid_domain(domain):
+        show_error_panel(
+            "Invalid Domain",
+            f"Domain '[cyan]{domain}[/cyan]' format is invalid.\n"
+            "💡 Please provide a valid domain name like 'example.com'"
+        )
         return 1
 
-    # … (truncated in the source diff view)
-    # Read the settings module from crawlo.cfg to get the project package name
-    config = configparser.ConfigParser()
-    try:
-        config.read(cfg_file, encoding='utf-8')
-        settings_module = config.get('settings', 'default')
-        project_package = settings_module.split('.')[0]  # e.g., myproject.settings -> myproject
-    except Exception as e:
-        console.print(f"[bold red]:cross_mark: Error reading crawlo.cfg:[/bold red] {e}")
+    # Validate the project environment
+    is_valid, project_package, error_msg = validate_project_environment()
+    if not is_valid:
+        show_error_panel("Not a Crawlo Project", error_msg)
         return 1
+
+    project_root = get_project_root()
 
     # Determine the path of the items module
     items_module_path = f"{project_package}.items"
@@ -93,17 +103,23 @@ def main(args):
 
     spider_file = spiders_dir / f'{spider_name}.py'
     if spider_file.exists():
-        # … (truncated in the source diff view)
+        show_error_panel(
+            "Spider Already Exists",
+            f"Spider '[cyan]{spider_name}[/cyan]' already exists at\n[green]{spider_file}[/green]"
+        )
         return 1
 
     # Template path
     tmpl_path = TEMPLATES_DIR / 'spider' / 'spider.py.tmpl'
     if not tmpl_path.exists():
-        # … (truncated in the source diff view)
+        show_error_panel(
+            "Template Not Found",
+            f"Template file not found at [cyan]{tmpl_path}[/cyan]"
+        )
         return 1
 
     # Generate the class name
-    class_name = f"{spider_name.capitalize()}Spider"
+    class_name = f"{spider_name.replace('_', '').capitalize()}Spider"
 
     context = {
         'spider_name': spider_name,
@@ -113,14 +129,24 @@ def main(args):
         'class_name': class_name
     }
 
-    # … (removed write-and-report block, truncated in the source diff view)
+    try:
+        content = _render_template(tmpl_path, context)
+        with open(spider_file, 'w', encoding='utf-8') as f:
+            f.write(content)
+
+        console.print(f":white_check_mark: [green]Spider '[bold]{spider_name}[/bold]' created successfully![/green]")
+        console.print(f"   → Location: [cyan]{spider_file}[/cyan]")
+        console.print(f"   → Class: [yellow]{class_name}[/yellow]")
+        console.print(f"   → Domain: [blue]{domain}[/blue]")
+        console.print("\n[bold]Next steps:[/bold]")
+        console.print(f"   [blue]crawlo run[/blue] {spider_name}")
+        console.print(f"   [blue]crawlo check[/blue] {spider_name}")
+
+        return 0
+
+    except Exception as e:
+        show_error_panel(
+            "Creation Failed",
+            f"Failed to create spider: {e}"
+        )
+        return 1
```
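`validate_spider_name` and `is_valid_domain` are imported from the new `crawlo/commands/utils.py` (+187 lines), whose body this diff does not show. A plausible sketch, consistent with the rules quoted in the error messages above:

```python
# Hypothetical stand-ins for the validators in crawlo.commands.utils;
# the real implementations are not visible in this diff.
import re


def validate_spider_name(name: str) -> bool:
    # Lowercase letter first, then only lowercase letters, digits, and
    # underscores — which also makes the name a valid Python identifier.
    return bool(re.match(r'^[a-z][a-z0-9_]*$', name))


def is_valid_domain(domain: str) -> bool:
    # Dot-separated labels plus an alphabetic TLD, e.g. 'example.com'.
    return bool(re.match(
        r'^(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$',
        domain.lower()
    ))
```

Note that the revised `class_name` expression strips underscores before calling `capitalize()`, so `news_spider` yields `NewsspiderSpider` rather than `NewsSpiderSpider`, because `capitalize()` uppercases only the first character.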
crawlo/commands/list.py
CHANGED

```diff
@@ -6,7 +6,6 @@
 # @Desc : CLI entry point: `crawlo list`, lists all registered spiders
 """
 import sys
-import configparser
 from pathlib import Path
 from importlib import import_module
 
@@ -18,110 +17,96 @@ from rich import box
 
 from crawlo.crawler import CrawlerProcess
 from crawlo.utils.log import get_logger
+from .utils import validate_project_environment, show_error_panel
 
 logger = get_logger(__name__)
 console = Console()
 
 
-def get_project_root():
-    """
-    Auto-detect the project root: walk upward from the current directory
-    looking for crawlo.cfg (at most 10 levels) and return that directory
-    path as a string.
-    """
-    current = Path.cwd()
-    for _ in range(10):
-        cfg = current / "crawlo.cfg"
-        if cfg.exists():
-            return str(current)
-        if current == current.parent:
-            break
-        current = current.parent
-    return None  # not found
-
-
 def main(args):
     """
     Main function: list all available spiders.
-    Usage: crawlo list
+    Usage: crawlo list [--json]
     """
-    # … (removed argument check, truncated in the source diff view)
+    show_json = "--json" in args
+
+    # After filtering out flags, check for extra positional arguments
+    filtered_args = [arg for arg in args if not arg.startswith('--')]
+    if filtered_args:
+        if show_json:
+            console.print_json(data={"success": False, "error": "Usage: crawlo list [--json]"})
+        else:
+            console.print("[bold red]❌ Error:[/bold red] Usage: [blue]crawlo list[/blue] [--json]")
         return 1
 
     try:
-        # 1. … (removed project-root detection, truncated in the source diff view)
-        if not …
-            "🚀 Or create a new project with:\n"
-            "   [blue]crawlo startproject myproject[/blue]"
-            ),
-            title="❌ Not in a Crawlo Project",
-            border_style="red",
-            padding=(1, 2)
-        ))
-        return 1
-
-        project_root_path = Path(project_root)
-        project_root_str = str(project_root_path)
-
-        # 2. Add the project root to the Python path
-        if project_root_str not in sys.path:
-            sys.path.insert(0, project_root_str)
-
-        # 3. Read crawlo.cfg to get the settings module
-        cfg_file = project_root_path / "crawlo.cfg"
-        config = configparser.ConfigParser()
-        config.read(cfg_file, encoding="utf-8")
-
-        if not config.has_section("settings") or not config.has_option("settings", "default"):
-            console.print(Panel(
-                ":cross_mark: [bold red]Invalid crawlo.cfg[/bold red]\n"
-                "Missing [settings] section or 'default' option.",
-                title="❌ Config Error",
-                border_style="red"
-            ))
-            return 1
-
-        settings_module = config.get("settings", "default")
-        project_package = settings_module.split(".")[0]
-
-        # 4. Make sure the project package is importable
-        try:
-            import_module(project_package)
-        except ImportError as e:
-            console.print(Panel(
-                f":cross_mark: Failed to import project package '[cyan]{project_package}[/cyan]':\n{e}",
-                title="❌ Import Error",
-                border_style="red"
-            ))
+        # Validate the project environment
+        is_valid, project_package, error_msg = validate_project_environment()
+        if not is_valid:
+            if show_json:
+                console.print_json(data={"success": False, "error": error_msg})
+            else:
+                show_error_panel("Not a Crawlo Project", error_msg)
             return 1
 
-        # … (truncated in the source diff view)
+        # Initialize CrawlerProcess and load the spider modules
         spider_modules = [f"{project_package}.spiders"]
         process = CrawlerProcess(spider_modules=spider_modules)
 
-        # … (truncated in the source diff view)
+        # Get all spider names
         spider_names = process.get_spider_names()
         if not spider_names:
-            # … (removed "no spiders" panel, truncated in the source diff view)
+            if show_json:
+                console.print_json(data={
+                    "success": True,
+                    "spiders": [],
+                    "message": "No spiders found in project"
+                })
+            else:
+                console.print(Panel(
+                    Text.from_markup(
+                        ":envelope_with_arrow: [bold]No spiders found[/bold] in '[cyan]spiders/[/cyan]' directory.\n\n"
+                        "[bold]💡 Make sure:[/bold]\n"
+                        "   • Spider classes inherit from [blue]`crawlo.spider.Spider`[/blue]\n"
+                        "   • Each spider has a [green]`name`[/green] attribute\n"
+                        "   • Spiders are imported in [cyan]`spiders/__init__.py`[/cyan] (if using package)"
+                    ),
+                    title="📭 No Spiders Found",
+                    border_style="yellow",
+                    padding=(1, 2)
+                ))
+            return 0
+
+        # Gather spider info
+        spider_info = []
+        for name in sorted(spider_names):
+            spider_cls = process.get_spider_class(name)
+            module_name = spider_cls.__module__.replace(f"{project_package}.", "")
+
+            # Extra metadata
+            start_urls_count = len(getattr(spider_cls, 'start_urls', []))
+            allowed_domains = getattr(spider_cls, 'allowed_domains', [])
+            custom_settings = getattr(spider_cls, 'custom_settings', {})
+
+            spider_info.append({
+                "name": name,
+                "class": spider_cls.__name__,
+                "module": module_name,
+                "start_urls_count": start_urls_count,
+                "allowed_domains": allowed_domains,
+                "has_custom_settings": bool(custom_settings)
+            })
+
+        # JSON output
+        if show_json:
+            console.print_json(data={
+                "success": True,
+                "count": len(spider_info),
+                "spiders": spider_info
+            })
+            return 0
+
+        # Table output
         table = Table(
             title=f"📋 Found {len(spider_names)} spider(s)",
             box=box.ROUNDED,
@@ -132,16 +117,40 @@ def main(args):
         table.add_column("Name", style="cyan", no_wrap=True)
         table.add_column("Class", style="green")
         table.add_column("Module", style="dim")
-        # … (removed column definitions, truncated in the source diff view)
+        table.add_column("URLs", style="blue", justify="center")
+        table.add_column("Domains", style="yellow")
+        table.add_column("Custom Settings", style="magenta", justify="center")
+
+        for info in spider_info:
+            domains_display = ", ".join(info["allowed_domains"][:2])  # show the first 2 domains
+            if len(info["allowed_domains"]) > 2:
+                domains_display += f" (+{len(info['allowed_domains'])-2})"
+            elif not domains_display:
+                domains_display = "-"
+
+            table.add_row(
+                info["name"],
+                info["class"],
+                info["module"],
+                str(info["start_urls_count"]),
+                domains_display,
+                "✓" if info["has_custom_settings"] else "-"
+            )
 
         console.print(table)
+
+        # Show usage hints
+        console.print("\n[bold]🚀 Next steps:[/bold]")
+        console.print("   [blue]crawlo run[/blue] <spider_name>    # Run a specific spider")
+        console.print("   [blue]crawlo run[/blue] all              # Run all spiders")
+        console.print("   [blue]crawlo check[/blue] <spider_name>  # Check spider validity")
 
         return 0
 
     except Exception as e:
-        # … (removed error print, truncated in the source diff view)
+        if show_json:
+            console.print_json(data={"success": False, "error": str(e)})
+        else:
+            console.print(f"[bold red]❌ Unexpected error:[/bold red] {e}")
         logger.exception("Exception during 'crawlo list'")
-        return 1
+        return 1
```
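The new `--json` flag makes the command scriptable. A hypothetical consumer (the spider values are illustrative; the keys mirror the `spider_info` dict built above, and rich's `print_json` should emit plain, uncolored JSON when output is piped):

```python
# Hypothetical script driving `crawlo list --json`; key names follow
# the spider_info dict assembled in list.py above.
import json
import subprocess

result = subprocess.run(
    ["crawlo", "list", "--json"],
    capture_output=True, text=True
)
data = json.loads(result.stdout)
if data.get("success"):
    for spider in data["spiders"]:
        print(f'{spider["name"]}: {spider["class"]} ({spider["module"]})')
else:
    print("error:", data.get("error"))
```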
crawlo/commands/startproject.py
CHANGED

```diff
@@ -6,11 +6,14 @@
 # @Desc : CLI entry point: `crawlo startproject baidu`, creates a project.
 """
 import shutil
+import re
 from pathlib import Path
 from rich.console import Console
 from rich.panel import Panel
 from rich.text import Text
 
+from .utils import show_error_panel, show_success_panel
+
 # Initialize the rich console
 console = Console()
 
@@ -51,16 +54,86 @@ def _copytree_with_templates(src, dst, context):
         shutil.copy2(item, dst_item)
 
 
+def validate_project_name(project_name: str) -> tuple[bool, str]:
+    """
+    Validate whether the project name is acceptable.
+
+    Returns:
+        tuple[bool, str]: (is_valid, error_message)
+    """
+    # Reject empty names
+    if not project_name or not project_name.strip():
+        return False, "Project name cannot be empty"
+
+    project_name = project_name.strip()
+
+    # Length check
+    if len(project_name) > 50:
+        return False, "Project name too long (max 50 characters)"
+
+    # Reject Python keywords
+    python_keywords = {
+        'False', 'None', 'True', 'and', 'as', 'assert', 'break', 'class',
+        'continue', 'def', 'del', 'elif', 'else', 'except', 'finally',
+        'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda',
+        'nonlocal', 'not', 'or', 'pass', 'raise', 'return', 'try',
+        'while', 'with', 'yield'
+    }
+    if project_name in python_keywords:
+        return False, f"'{project_name}' is a Python keyword and cannot be used as project name"
+
+    # Must be a valid Python identifier
+    if not project_name.isidentifier():
+        return False, "Project name must be a valid Python identifier"
+
+    # Enforce snake_case format
+    if not re.match(r'^[a-z][a-z0-9_]*$', project_name):
+        return False, (
+            "Project name should start with lowercase letter and "
+            "contain only lowercase letters, numbers, and underscores"
+        )
+
+    # Disallow trailing digits (discouraged)
+    if project_name[-1].isdigit():
+        return False, "Project name should not end with a number"
+
+    return True, ""
+
+
 def main(args):
     if len(args) != 1:
-        console.print("[bold red]Error:[/bold red] Usage: crawlo startproject <project_name>")
+        console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name>")
+        console.print("💡 Examples:")
+        console.print("   [blue]crawlo startproject[/blue] my_spider_project")
+        console.print("   [blue]crawlo startproject[/blue] news_crawler")
+        console.print("   [blue]crawlo startproject[/blue] ecommerce_spider")
         return 1
 
     project_name = args[0]
+
+    # Validate the project name
+    is_valid, error_msg = validate_project_name(project_name)
+    if not is_valid:
+        show_error_panel(
+            "Invalid Project Name",
+            f"[cyan]{project_name}[/cyan] is not a valid project name.\n"
+            f"❌ {error_msg}\n\n"
+            "💡 Project name should:\n"
+            "   • Start with lowercase letter\n"
+            "   • Contain only lowercase letters, numbers, and underscores\n"
+            "   • Be a valid Python identifier\n"
+            "   • Not be a Python keyword"
+        )
+        return 1
+
     project_dir = Path(project_name)
 
     if project_dir.exists():
-        # … (truncated in the source diff view)
+        show_error_panel(
+            "Directory Exists",
+            f"Directory '[cyan]{project_dir}[/cyan]' already exists.\n"
+            "💡 Choose a different project name or remove the existing directory."
+        )
         return 1
 
     context = {'project_name': project_name}
@@ -87,6 +160,10 @@ def main(args):
         # 4. Create the logs directory
         (project_dir / 'logs').mkdir(exist_ok=True)
         console.print(":white_check_mark: Created logs directory")
+
+        # 5. Create the output directory (for data export)
+        (project_dir / 'output').mkdir(exist_ok=True)
+        console.print(":white_check_mark: Created output directory")
 
         # Success panel
         success_text = Text.from_markup(f"Project '[bold cyan]{project_name}[/bold cyan]' created successfully!")
@@ -94,17 +171,25 @@ def main(args):
 
         # Next-step hints (aligned + syntax-highlighted)
         next_steps = f"""
-[bold]Next steps:[/bold]
+[bold]🚀 Next steps:[/bold]
   [blue]cd[/blue] {project_name}
   [blue]crawlo genspider[/blue] example example.com
   [blue]crawlo run[/blue] example
+
+[bold]📚 Learn more:[/bold]
+  [blue]crawlo list[/blue]            # List all spiders
+  [blue]crawlo check[/blue] example   # Check spider validity
+  [blue]crawlo stats[/blue]           # View statistics
 """.strip()
         console.print(next_steps)
 
         return 0
 
     except Exception as e:
-        # … (removed error print, truncated in the source diff view)
+        show_error_panel(
+            "Creation Failed",
+            f"Failed to create project: {e}"
+        )
         if project_dir.exists():
             shutil.rmtree(project_dir, ignore_errors=True)
             console.print("[red]:cross_mark: Cleaned up partially created project.[/red]")
```
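The checks in `validate_project_name` run in a fixed order (empty, length, keyword, identifier, snake_case, trailing digit), so the first failing rule determines the message. Expected results for a few inputs, traced by hand from the code above:

```python
# Expected outcomes, derived from validate_project_name in the diff above.
from crawlo.commands.startproject import validate_project_name

validate_project_name("news_crawler")  # (True, "")
validate_project_name("class")         # (False, "'class' is a Python keyword ...")
validate_project_name("2fast")         # (False, "... must be a valid Python identifier")
validate_project_name("MyProject")     # (False, "... should start with lowercase letter ...")
validate_project_name("spider2")       # (False, "... should not end with a number")
```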