crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.7-py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
crawlo/commands/check.py
CHANGED
@@ -1,595 +1,595 @@

The diff viewer reports every line of this file as removed and re-added, but the removed and re-added content is otherwise identical. The only substantive change is a single import near the top (the removed line appears truncated in the diff view as "from crawlo."); the file's final line, sys.exit(main(sys.argv[1:])) (line 595), is the only line shown as unchanged. The effective change, with surrounding context:

 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 
 from crawlo.crawler import CrawlerProcess
-from crawlo.
+from crawlo.logging import get_logger
 
 
 logger = get_logger(__name__)
 console = Console()