crawlo-1.1.3-py3-none-any.whl → crawlo-1.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +34 -34
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +285 -285
- crawlo/commands/startproject.py +196 -196
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +279 -279
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +171 -171
- crawlo/core/enhanced_engine.py +189 -189
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +165 -165
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +242 -242
- crawlo/downloader/aiohttp_downloader.py +212 -212
- crawlo/downloader/cffi_downloader.py +251 -251
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +38 -31
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +58 -49
- crawlo/extension/log_stats.py +82 -44
- crawlo/extension/logging_extension.py +44 -35
- crawlo/extension/memory_monitor.py +89 -0
- crawlo/extension/performance_profiler.py +118 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +241 -241
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +248 -248
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +200 -200
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +311 -311
- crawlo/network/response.py +271 -271
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +224 -224
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +132 -117
- crawlo/pipelines/mysql_pipeline.py +317 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/pipelines/redis_dedup_pipeline.py +162 -162
- crawlo/project.py +153 -153
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +307 -307
- crawlo/queue/redis_priority_queue.py +208 -208
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +278 -244
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +131 -106
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +111 -87
- crawlo/templates/project/pipelines.py.tmpl +97 -341
- crawlo/templates/project/run.py.tmpl +251 -251
- crawlo/templates/project/settings.py.tmpl +279 -250
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +142 -178
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.1.4.dist-info/METADATA +403 -0
- crawlo-1.1.4.dist-info/RECORD +117 -0
- examples/__init__.py +7 -7
- examples/controlled_spider_example.py +205 -205
- tests/__init__.py +7 -7
- tests/test_final_validation.py +153 -153
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_redis_config.py +28 -28
- tests/test_redis_queue.py +224 -224
- tests/test_request_serialization.py +70 -70
- tests/test_scheduler.py +241 -241
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/commands/check.py
CHANGED
@@ -1,595 +1,595 @@
The 1.1.3 and 1.1.4 copies of this file are line-for-line identical: the hunk removes and re-adds the entire body (lines 1-594), and only the final line, sys.exit(main(sys.argv[1:])), appears as unchanged context.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
# @Time : 2025-08-31 22:35
# @Author : crawl-coder
# @Desc : Command-line entry point for `crawlo check`: verifies that all spider definitions are compliant.
"""
import sys
import ast
import astor
import re
import time
from pathlib import Path
import configparser
from importlib import import_module

from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from rich import box

from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

from crawlo.crawler import CrawlerProcess
from crawlo.utils.log import get_logger


logger = get_logger(__name__)
console = Console()


def get_project_root():
    """
    Walk upward from the current directory looking for crawlo.cfg to locate the project root.
    """
    current = Path.cwd()
    for _ in range(10):
        cfg = current / "crawlo.cfg"
        if cfg.exists():
            return current
        if current == current.parent:
            break
        current = current.parent
    return None


def auto_fix_spider_file(spider_cls, file_path: Path):
    """Automatically fix common issues in a spider file."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            source = f.read()

        fixed = False
        tree = ast.parse(source)

        # Locate the spider class definition
        class_node = None
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef) and node.name == spider_cls.__name__:
                class_node = node
                break

        if not class_node:
            return False, "Could not find class definition in file."

        # 1. Fix an empty or missing name
        name_assign = None
        for node in class_node.body:
            if isinstance(node, ast.Assign):
                for target in node.targets:
                    if isinstance(target, ast.Name) and target.id == "name":
                        name_assign = node
                        break

        if not name_assign or (
            isinstance(name_assign.value, ast.Constant) and not name_assign.value.value
        ):
            # Generate a default name: class name converted to snake_case
            default_name = re.sub(r'(?<!^)(?=[A-Z])', '_', spider_cls.__name__).lower().replace("_spider", "")
            new_assign = ast.Assign(
                targets=[ast.Name(id="name", ctx=ast.Store())],
                value=ast.Constant(value=default_name)
            )
            if name_assign:
                index = class_node.body.index(name_assign)
                class_node.body[index] = new_assign
            else:
                class_node.body.insert(0, new_assign)
            fixed = True

        # 2. Fix start_urls defined as a string
        start_urls_assign = None
        for node in class_node.body:
            if isinstance(node, ast.Assign):
                for target in node.targets:
                    if isinstance(target, ast.Name) and target.id == "start_urls":
                        start_urls_assign = node
                        break

        if start_urls_assign and isinstance(start_urls_assign.value, ast.Constant) and isinstance(start_urls_assign.value.value, str):
            new_value = ast.List(elts=[ast.Constant(value=start_urls_assign.value.value)], ctx=ast.Load())
            start_urls_assign.value = new_value
            fixed = True

        # 3. Fix a missing parse method
        has_parse = any(
            isinstance(node, ast.FunctionDef) and node.name == "parse"
            for node in class_node.body
        )
        if not has_parse:
            parse_method = ast.FunctionDef(
                name="parse",
                args=ast.arguments(
                    posonlyargs=[],
                    args=[ast.arg(arg="self"), ast.arg(arg="response")],
                    kwonlyargs=[],
                    kw_defaults=[],
                    defaults=[],
                    vararg=None,
                    kwarg=None
                ),
                body=[
                    ast.Expr(value=ast.Constant(value="默认 parse 方法,返回 item 或继续请求")),
                    ast.Pass()
                ],
                decorator_list=[],
                returns=None
            )
            class_node.body.append(parse_method)
            fixed = True

        # 4. Fix allowed_domains defined as a string
        allowed_domains_assign = None
        for node in class_node.body:
            if isinstance(node, ast.Assign):
                for target in node.targets:
                    if isinstance(target, ast.Name) and target.id == "allowed_domains":
                        allowed_domains_assign = node
                        break

        if allowed_domains_assign and isinstance(allowed_domains_assign.value, ast.Constant) and isinstance(allowed_domains_assign.value.value, str):
            new_value = ast.List(elts=[ast.Constant(value=allowed_domains_assign.value.value)], ctx=ast.Load())
            allowed_domains_assign.value = new_value
            fixed = True

        # 5. Fix missing custom_settings
        has_custom_settings = any(
            isinstance(node, ast.Assign) and
            any(isinstance(t, ast.Name) and t.id == "custom_settings" for t in node.targets)
            for node in class_node.body
        )
        if not has_custom_settings:
            new_assign = ast.Assign(
                targets=[ast.Name(id="custom_settings", ctx=ast.Store())],
                value=ast.Dict(keys=[], values=[])
            )
            # Insert right after name
            insert_index = 1
            for i, node in enumerate(class_node.body):
                if isinstance(node, ast.Assign) and any(
                    isinstance(t, ast.Name) and t.id == "name" for t in node.targets
                ):
                    insert_index = i + 1
                    break
            class_node.body.insert(insert_index, new_assign)
            fixed = True

        # 6. Fix a missing start_requests method
        has_start_requests = any(
            isinstance(node, ast.FunctionDef) and node.name == "start_requests"
            for node in class_node.body
        )
        if not has_start_requests:
            start_requests_method = ast.FunctionDef(
                name="start_requests",
                args=ast.arguments(
                    posonlyargs=[],
                    args=[ast.arg(arg="self")],
                    kwonlyargs=[],
                    kw_defaults=[],
                    defaults=[],
                    vararg=None,
                    kwarg=None
                ),
                body=[
                    ast.Expr(value=ast.Constant(value="默认 start_requests,从 start_urls 生成请求")),
                    ast.For(
                        target=ast.Name(id="url", ctx=ast.Store()),
                        iter=ast.Attribute(value=ast.Name(id="self", ctx=ast.Load()), attr="start_urls", ctx=ast.Load()),
                        body=[
                            ast.Expr(
                                value=ast.Call(
                                    func=ast.Attribute(value=ast.Name(id="self", ctx=ast.Load()), attr="make_request", ctx=ast.Load()),
                                    args=[ast.Name(id="url", ctx=ast.Load())],
                                    keywords=[]
                                )
                            )
                        ],
                        orelse=[]
                    )
                ],
                decorator_list=[],
                returns=None
            )
            # Insert after custom_settings or name, before parse
            insert_index = 2
            for i, node in enumerate(class_node.body):
                if isinstance(node, ast.FunctionDef) and node.name == "parse":
                    insert_index = i
                    break
                elif isinstance(node, ast.Assign) and any(
                    isinstance(t, ast.Name) and t.id in ("name", "custom_settings") for t in node.targets
                ):
                    insert_index = i + 1
            class_node.body.insert(insert_index, start_requests_method)
            fixed = True

        if fixed:
            fixed_source = astor.to_source(tree)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(fixed_source)
            return True, "File auto-fixed successfully."
        else:
            return False, "No fixable issues found."

    except Exception as e:
        return False, f"Failed to auto-fix: {e}"


class SpiderChangeHandler(FileSystemEventHandler):
    def __init__(self, project_root, spider_modules, show_fix=False, console=None):
        self.project_root = project_root
        self.spider_modules = spider_modules
        self.show_fix = show_fix
        self.console = console or Console()

    def on_modified(self, event):
        if event.is_directory:
            return
        if event.src_path.endswith(".py") and "spiders" in event.src_path:
            file_path = Path(event.src_path)
            spider_name = file_path.stem
            self.console.print(f"\n:eyes: [bold blue]Detected change in[/bold blue] [cyan]{file_path}[/cyan]")
            self.check_and_fix_spider(spider_name)

    def check_and_fix_spider(self, spider_name):
        try:
            process = CrawlerProcess(spider_modules=self.spider_modules)
            if spider_name not in process.get_spider_names():
                self.console.print(f"[yellow]⚠️ {spider_name} is not a registered spider.[/yellow]")
                return

            cls = process.get_spider_class(spider_name)
            issues = []

            # Simplified checks
            if not getattr(cls, "name", None):
                issues.append("missing or empty 'name' attribute")
            if not callable(getattr(cls, "start_requests", None)):
                issues.append("missing 'start_requests' method")
            if hasattr(cls, "start_urls") and isinstance(cls.start_urls, str):
                issues.append("'start_urls' is string")
            if hasattr(cls, "allowed_domains") and isinstance(cls.allowed_domains, str):
                issues.append("'allowed_domains' is string")

            try:
                spider = cls.create_instance(None)
                if not callable(getattr(spider, "parse", None)):
                    issues.append("no 'parse' method")
            except Exception:
                issues.append("failed to instantiate")

            if issues:
                self.console.print(f"[red]❌ {spider_name} has issues:[/red]")
                for issue in issues:
                    self.console.print(f" • {issue}")

                if self.show_fix:
                    file_path = Path(cls.__file__)
                    fixed, msg = auto_fix_spider_file(cls, file_path)
                    if fixed:
                        self.console.print(f"[green]✅ Auto-fixed: {msg}[/green]")
                    else:
                        self.console.print(f"[yellow]⚠️ Could not fix: {msg}[/yellow]")
            else:
                self.console.print(f"[green]✅ {spider_name} is compliant.[/green]")

        except Exception as e:
            self.console.print(f"[red]❌ Error checking {spider_name}: {e}[/red]")


def watch_spiders(project_root, project_package, show_fix=False):
    console = Console()
    spider_path = project_root / project_package / "spiders"
    if not spider_path.exists():
        console.print(f"[red]❌ Spiders directory not found: {spider_path}[/red]")
        return

    spider_modules = [f"{project_package}.spiders"]
    event_handler = SpiderChangeHandler(project_root, spider_modules, show_fix, console)
    observer = Observer()
    observer.schedule(event_handler, str(spider_path), recursive=False)

    console.print(Panel(
        f":eyes: [bold blue]Watching for changes in[/bold blue] [cyan]{spider_path}[/cyan]\n"
        "Edit any spider file to trigger auto-check...",
        title="🚀 Watch Mode Started",
        border_style="blue"
    ))

    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        console.print("\n[bold red]🛑 Watch mode stopped.[/bold red]")
        observer.stop()
    observer.join()


def main(args):
    """
    Main entry point: check every spider definition for compliance.
    Usage:
        crawlo check
        crawlo check --fix
        crawlo check --ci
        crawlo check --json
        crawlo check --watch
    """
    show_fix = "--fix" in args or "-f" in args
    show_ci = "--ci" in args
    show_json = "--json" in args
    show_watch = "--watch" in args

    valid_args = {"--fix", "-f", "--ci", "--json", "--watch"}
    if any(arg not in valid_args for arg in args):
        console.print("[bold red]❌ Error:[/bold red] Usage: [blue]crawlo check[/blue] [--fix] [--ci] [--json] [--watch]")
        return 1

    try:
        # 1. Locate the project root
        project_root = get_project_root()
        if not project_root:
            msg = ":cross_mark: [bold red]Cannot find 'crawlo.cfg'[/bold red]\n💡 Run this command inside your project directory."
            if show_json:
                console.print_json(data={"success": False, "error": "Project root not found"})
                return 1
            elif show_ci:
                console.print("❌ Project root not found. crawlo.cfg missing.")
                return 1
            else:
                console.print(Panel(
                    Text.from_markup(msg),
                    title="❌ Not in a Crawlo Project",
                    border_style="red",
                    padding=(1, 2)
                ))
                return 1

        project_root_str = str(project_root)
        if project_root_str not in sys.path:
            sys.path.insert(0, project_root_str)

        # 2. Read crawlo.cfg
        cfg_file = project_root / "crawlo.cfg"
        if not cfg_file.exists():
            msg = f"Config file not found: {cfg_file}"
            if show_json:
                console.print_json(data={"success": False, "error": msg})
                return 1
            elif show_ci:
                console.print(f"❌ {msg}")
                return 1
            else:
                console.print(Panel(msg, title="❌ Missing Config", border_style="red"))
                return 1

        config = configparser.ConfigParser()
        config.read(cfg_file, encoding="utf-8")

        if not config.has_section("settings") or not config.has_option("settings", "default"):
            msg = "Missing [settings] section or 'default' option in crawlo.cfg"
            if show_json:
                console.print_json(data={"success": False, "error": msg})
                return 1
            elif show_ci:
                console.print(f"❌ {msg}")
                return 1
            else:
                console.print(Panel(msg, title="❌ Invalid Config", border_style="red"))
                return 1

        settings_module = config.get("settings", "default")
        project_package = settings_module.split(".")[0]

        # 3. Make sure the project package is importable
        try:
            import_module(project_package)
        except ImportError as e:
            msg = f"Failed to import project package '{project_package}': {e}"
            if show_json:
                console.print_json(data={"success": False, "error": msg})
                return 1
            elif show_ci:
                console.print(f"❌ {msg}")
                return 1
            else:
                console.print(Panel(msg, title="❌ Import Error", border_style="red"))
                return 1

        # 4. Load spiders
        spider_modules = [f"{project_package}.spiders"]
        process = CrawlerProcess(spider_modules=spider_modules)
        spider_names = process.get_spider_names()

        if not spider_names:
            msg = "No spiders found."
            if show_json:
                console.print_json(data={"success": True, "warning": msg})
                return 0
            elif show_ci:
                console.print("📭 No spiders found.")
                return 0
            else:
                console.print(Panel(
                    Text.from_markup(
                        ":envelope_with_arrow: [bold]No spiders found[/bold]\n\n"
                        "[bold]💡 Make sure:[/bold]\n"
                        " • Spiders are defined in '[cyan]spiders[/cyan]' module\n"
                        " • They have a [green]`name`[/green] attribute\n"
                        " • Modules are properly imported"
                    ),
                    title="📭 No Spiders Found",
                    border_style="yellow",
                    padding=(1, 2)
                ))
                return 0

        # 5. If watch mode is enabled, start watching
        if show_watch:
            console.print("[bold blue]:eyes: Starting watch mode...[/bold blue]")
            watch_spiders(project_root, project_package, show_fix)
            return 0  # watch mode runs indefinitely and does not return

        # 6. Run the checks (non-watch mode)
        if not show_ci and not show_json:
            console.print(f":mag: [bold]Checking {len(spider_names)} spider(s)...[/bold]\n")

        issues_found = False
        results = []

        for name in sorted(spider_names):
            cls = process.get_spider_class(name)
            issues = []

            # Check the name attribute
            if not getattr(cls, "name", None):
                issues.append("missing or empty 'name' attribute")
            elif not isinstance(cls.name, str):
                issues.append("'name' is not a string")

            # Check that start_requests is callable
            if not callable(getattr(cls, "start_requests", None)):
                issues.append("missing or non-callable 'start_requests' method")

            # Check the start_urls type (must not be a string)
            if hasattr(cls, "start_urls") and isinstance(cls.start_urls, str):
                issues.append("'start_urls' is a string; should be list or tuple")

            # Check the allowed_domains type
            if hasattr(cls, "allowed_domains") and isinstance(cls.allowed_domains, str):
                issues.append("'allowed_domains' is a string; should be list or tuple")

            # Instantiate and check the parse method
            try:
                spider = cls.create_instance(None)
                if not callable(getattr(spider, "parse", None)):
                    issues.append("no 'parse' method defined (recommended)")
            except Exception as e:
                issues.append(f"failed to instantiate spider: {e}")

            # Auto-fix (if enabled)
            if issues and show_fix:
                try:
                    file_path = Path(cls.__file__)
                    fixed, msg = auto_fix_spider_file(cls, file_path)
                    if fixed:
                        if not show_ci and not show_json:
                            console.print(f"[green]🔧 Auto-fixed {name} → {msg}[/green]")
                        issues = []  # treated as fixed
                    else:
                        if not show_ci and not show_json:
                            console.print(f"[yellow]⚠️ Could not auto-fix {name}: {msg}[/yellow]")
                except Exception as e:
                    if not show_ci and not show_json:
                        console.print(f"[yellow]⚠️ Failed to locate source file for {name}: {e}[/yellow]")

            results.append({
                "name": name,
                "class": cls.__name__,
                "file": getattr(cls, "__file__", "unknown"),
                "issues": issues
            })

            if issues:
                issues_found = True

        # 7. Build the report data
        report = {
            "success": not issues_found,
            "total_spiders": len(spider_names),
            "issues": [
                {"name": r["name"], "class": r["class"], "file": r["file"], "problems": r["issues"]}
                for r in results if r["issues"]
            ]
        }

        # 8. Output (depending on mode)
        if show_json:
            console.print_json(data=report)
            return 1 if issues_found else 0

        if show_ci:
            if issues_found:
                console.print("❌ Compliance check failed.")
                for r in results:
                    if r["issues"]:
                        console.print(f" • {r['name']}: {', '.join(r['issues'])}")
            else:
                console.print("✅ All spiders compliant.")
            return 1 if issues_found else 0

        # 9. Default rich output
        table = Table(
            title="🔍 Spider Compliance Check Results",
            box=box.ROUNDED,
            show_header=True,
            header_style="bold magenta",
            title_style="bold green"
        )
        table.add_column("Status", style="bold", width=4)
        table.add_column("Name", style="cyan")
        table.add_column("Class", style="green")
        table.add_column("Issues", style="yellow", overflow="fold")

        for res in results:
            if res["issues"]:
                status = "[red]❌[/red]"
                issues_text = "\n".join(f"• {issue}" for issue in res["issues"])
            else:
                status = "[green]✅[/green]"
                issues_text = "—"

            table.add_row(status, res["name"], res["class"], issues_text)

        console.print(table)
        console.print()

        if issues_found:
            console.print(Panel(
                ":warning: [bold red]Some spiders have issues.[/bold red]\nPlease fix them before running.",
                title="⚠️ Compliance Check Failed",
                border_style="red",
                padding=(1, 2)
            ))
            return 1
        else:
            console.print(Panel(
                ":tada: [bold green]All spiders are compliant and well-defined![/bold green]\nReady to crawl! 🕷️🚀",
                title="🎉 Check Passed",
                border_style="green",
                padding=(1, 2)
            ))
            return 0

    except Exception as e:
        logger.exception("Exception in 'crawlo check'")
        if show_json:
            console.print_json(data={"success": False, "error": str(e)})
        elif show_ci:
            console.print(f"❌ Unexpected error: {e}")
        else:
            console.print(f"[bold red]❌ Unexpected error during check:[/bold red] {e}")
        return 1


if __name__ == "__main__":
    """
    Supports running this module directly:
        python -m crawlo.commands.check
    """
    sys.exit(main(sys.argv[1:]))
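The checks above boil down to a few class-level conventions. As a hypothetical illustration (not part of this release), a spider that passes every check might look like the sketch below; the crawlo.spider import path and the make_request helper are assumptions inferred from this file and the package layout, not confirmed API.

# Hypothetical example: a spider that satisfies the checks run by `crawlo check`.
# Assumption: the Spider base class is importable from crawlo.spider.
from crawlo.spider import Spider


class ExampleSpider(Spider):
    name = "example"                          # must be a non-empty string
    custom_settings = {}                      # dict; the auto-fixer inserts {} when missing
    start_urls = ["https://example.com/"]     # list/tuple, never a bare string
    allowed_domains = ["example.com"]         # list/tuple, never a bare string

    def start_requests(self):
        # Mirrors the method the auto-fixer generates: iterate start_urls and
        # call self.make_request(url) (assumed to exist on the base class).
        for url in self.start_urls:
            self.make_request(url)

    def parse(self, response):
        # A callable parse method is expected once the spider is instantiated
        # via cls.create_instance(None).
        pass

Running crawlo check prints a rich results table by default; --fix rewrites non-compliant spider files in place, --ci and --json switch to plain or JSON output (the JSON report carries success, total_spiders, and an issues list), and --watch re-checks spider files as they change.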