crawlo 1.4.7__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/initialization/core.py
CHANGED
|
@@ -1,241 +1,241 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
核心初始化器 - 协调整个初始化过程
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import threading
|
|
8
|
-
import time
|
|
9
|
-
import signal
|
|
10
|
-
from typing import Optional, Any
|
|
11
|
-
|
|
12
|
-
from .built_in import register_built_in_initializers
|
|
13
|
-
from .context import InitializationContext
|
|
14
|
-
from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition, validate_phase_dependencies
|
|
15
|
-
from .registry import get_global_registry
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
from crawlo.utils.singleton import singleton
|
|
19
|
-
|
|
20
|
-
@singleton
|
|
21
|
-
class CoreInitializer:
|
|
22
|
-
"""
|
|
23
|
-
核心初始化器 - 协调整个框架的初始化过程
|
|
24
|
-
|
|
25
|
-
职责:
|
|
26
|
-
1. 管理初始化阶段的执行顺序
|
|
27
|
-
2. 处理阶段间的依赖关系
|
|
28
|
-
3. 提供统一的初始化入口
|
|
29
|
-
4. 错误处理和降级策略
|
|
30
|
-
"""
|
|
31
|
-
|
|
32
|
-
def __init__(self):
|
|
33
|
-
self._context: Optional[InitializationContext] = None
|
|
34
|
-
self._is_ready = False
|
|
35
|
-
self._init_lock = threading.RLock()
|
|
36
|
-
|
|
37
|
-
# 在注册内置初始化器之前,先验证阶段依赖关系
|
|
38
|
-
is_valid, error_msg = validate_phase_dependencies()
|
|
39
|
-
if not is_valid:
|
|
40
|
-
raise RuntimeError(f"初始化阶段配置错误: {error_msg}")
|
|
41
|
-
|
|
42
|
-
# 注册内置初始化器
|
|
43
|
-
register_built_in_initializers()
|
|
44
|
-
|
|
45
|
-
@property
|
|
46
|
-
def context(self) -> Optional[InitializationContext]:
|
|
47
|
-
"""获取初始化上下文"""
|
|
48
|
-
return self._context
|
|
49
|
-
|
|
50
|
-
@property
|
|
51
|
-
def is_ready(self) -> bool:
|
|
52
|
-
"""检查框架是否已准备就绪"""
|
|
53
|
-
return self._is_ready
|
|
54
|
-
|
|
55
|
-
def initialize(self, settings=None, **kwargs) -> Any:
|
|
56
|
-
"""
|
|
57
|
-
执行框架初始化
|
|
58
|
-
|
|
59
|
-
Args:
|
|
60
|
-
settings: 配置对象
|
|
61
|
-
**kwargs: 额外的配置参数
|
|
62
|
-
|
|
63
|
-
Returns:
|
|
64
|
-
初始化后的配置管理器
|
|
65
|
-
"""
|
|
66
|
-
with self._init_lock:
|
|
67
|
-
# 如果已经初始化完成,直接返回
|
|
68
|
-
if self._is_ready and self._context and self._context.settings:
|
|
69
|
-
return self._context.settings
|
|
70
|
-
|
|
71
|
-
# 创建初始化上下文
|
|
72
|
-
context = InitializationContext()
|
|
73
|
-
context.custom_settings = kwargs
|
|
74
|
-
context.settings = settings
|
|
75
|
-
self._context = context
|
|
76
|
-
|
|
77
|
-
try:
|
|
78
|
-
# 执行初始化阶段
|
|
79
|
-
self._execute_initialization_phases(context)
|
|
80
|
-
|
|
81
|
-
# 检查关键阶段是否完成
|
|
82
|
-
if not context.is_phase_completed(InitializationPhase.SETTINGS):
|
|
83
|
-
raise RuntimeError("Settings initialization failed")
|
|
84
|
-
|
|
85
|
-
self._is_ready = True
|
|
86
|
-
context.finish()
|
|
87
|
-
|
|
88
|
-
return context.settings
|
|
89
|
-
|
|
90
|
-
except Exception as e:
|
|
91
|
-
context.add_error(f"Framework initialization failed: {e}")
|
|
92
|
-
context.finish()
|
|
93
|
-
|
|
94
|
-
# 降级策略
|
|
95
|
-
return self._fallback_initialization(settings, **kwargs)
|
|
96
|
-
|
|
97
|
-
def _execute_initialization_phases(self, context: InitializationContext):
|
|
98
|
-
"""执行初始化阶段"""
|
|
99
|
-
registry = get_global_registry()
|
|
100
|
-
execution_order = get_execution_order()
|
|
101
|
-
|
|
102
|
-
# 只执行已注册的阶段
|
|
103
|
-
registered_phases = set(registry.get_all_phases())
|
|
104
|
-
|
|
105
|
-
for phase in execution_order:
|
|
106
|
-
if phase == InitializationPhase.ERROR:
|
|
107
|
-
continue
|
|
108
|
-
|
|
109
|
-
# 只执行已注册的阶段
|
|
110
|
-
if phase not in registered_phases:
|
|
111
|
-
continue
|
|
112
|
-
|
|
113
|
-
context.set_current_phase(phase)
|
|
114
|
-
|
|
115
|
-
# 检查依赖关系
|
|
116
|
-
if not self._check_dependencies(phase, context):
|
|
117
|
-
phase_def = get_phase_definition(phase)
|
|
118
|
-
if not (phase_def and phase_def.optional):
|
|
119
|
-
raise RuntimeError(f"Dependencies not satisfied for phase {phase}")
|
|
120
|
-
else:
|
|
121
|
-
# 可选阶段,跳过
|
|
122
|
-
continue
|
|
123
|
-
|
|
124
|
-
# 执行阶段(带超时控制)
|
|
125
|
-
start_time = time.time()
|
|
126
|
-
try:
|
|
127
|
-
result = self._execute_phase_with_timeout(phase, context, registry)
|
|
128
|
-
result.duration = time.time() - start_time
|
|
129
|
-
|
|
130
|
-
context.mark_phase_completed(phase, result)
|
|
131
|
-
|
|
132
|
-
if not result.success and not self._is_phase_optional(phase):
|
|
133
|
-
raise RuntimeError(f"Phase {phase} failed: {result.error}")
|
|
134
|
-
|
|
135
|
-
except Exception as e:
|
|
136
|
-
duration = time.time() - start_time
|
|
137
|
-
result = PhaseResult(
|
|
138
|
-
phase=phase,
|
|
139
|
-
success=False,
|
|
140
|
-
duration=duration,
|
|
141
|
-
error=e
|
|
142
|
-
)
|
|
143
|
-
context.mark_phase_completed(phase, result)
|
|
144
|
-
|
|
145
|
-
if not self._is_phase_optional(phase):
|
|
146
|
-
raise
|
|
147
|
-
|
|
148
|
-
def _execute_phase_with_timeout(self, phase: InitializationPhase,
|
|
149
|
-
context: InitializationContext,
|
|
150
|
-
registry) -> PhaseResult:
|
|
151
|
-
"""
|
|
152
|
-
执行阶段并支持超时控制
|
|
153
|
-
|
|
154
|
-
Args:
|
|
155
|
-
phase: 初始化阶段
|
|
156
|
-
context: 初始化上下文
|
|
157
|
-
registry: 初始化器注册表
|
|
158
|
-
|
|
159
|
-
Returns:
|
|
160
|
-
PhaseResult: 阶段执行结果
|
|
161
|
-
|
|
162
|
-
Raises:
|
|
163
|
-
TimeoutError: 阶段执行超时
|
|
164
|
-
"""
|
|
165
|
-
phase_def = get_phase_definition(phase)
|
|
166
|
-
timeout = phase_def.timeout if phase_def else 30.0
|
|
167
|
-
|
|
168
|
-
# 使用线程执行,支持超时
|
|
169
|
-
result_container: list[Optional[PhaseResult]] = [None]
|
|
170
|
-
exception_container: list[Optional[Exception]] = [None]
|
|
171
|
-
|
|
172
|
-
def execute_in_thread():
|
|
173
|
-
try:
|
|
174
|
-
result_container[0] = registry.execute_phase(phase, context)
|
|
175
|
-
except Exception as e:
|
|
176
|
-
exception_container[0] = e
|
|
177
|
-
|
|
178
|
-
thread = threading.Thread(target=execute_in_thread, daemon=True)
|
|
179
|
-
thread.start()
|
|
180
|
-
thread.join(timeout=timeout)
|
|
181
|
-
|
|
182
|
-
if thread.is_alive():
|
|
183
|
-
# 超时了
|
|
184
|
-
error_msg = f"Phase {phase.value} execution timeout after {timeout} seconds"
|
|
185
|
-
context.add_warning(error_msg)
|
|
186
|
-
return PhaseResult(
|
|
187
|
-
phase=phase,
|
|
188
|
-
success=False,
|
|
189
|
-
error=TimeoutError(error_msg)
|
|
190
|
-
)
|
|
191
|
-
|
|
192
|
-
# 检查是否有异常
|
|
193
|
-
if exception_container[0]:
|
|
194
|
-
raise exception_container[0]
|
|
195
|
-
|
|
196
|
-
# 返回结果(已经确保不为None)
|
|
197
|
-
if result_container[0] is None:
|
|
198
|
-
raise RuntimeError(f"Phase {phase.value} returned None result")
|
|
199
|
-
return result_container[0]
|
|
200
|
-
|
|
201
|
-
def _check_dependencies(self, phase: InitializationPhase,
|
|
202
|
-
context: InitializationContext) -> bool:
|
|
203
|
-
"""检查阶段依赖关系"""
|
|
204
|
-
phase_def = get_phase_definition(phase)
|
|
205
|
-
if not phase_def:
|
|
206
|
-
return True
|
|
207
|
-
|
|
208
|
-
for dependency in phase_def.dependencies:
|
|
209
|
-
if not context.is_phase_completed(dependency):
|
|
210
|
-
return False
|
|
211
|
-
|
|
212
|
-
return True
|
|
213
|
-
|
|
214
|
-
def _is_phase_optional(self, phase: InitializationPhase) -> bool:
|
|
215
|
-
"""检查阶段是否可选"""
|
|
216
|
-
phase_def = get_phase_definition(phase)
|
|
217
|
-
return phase_def.optional if phase_def else False
|
|
218
|
-
|
|
219
|
-
def _fallback_initialization(self, settings=None, **kwargs):
|
|
220
|
-
"""降级初始化策略"""
|
|
221
|
-
try:
|
|
222
|
-
# 尝试创建基本的配置管理器
|
|
223
|
-
from crawlo.settings.setting_manager import SettingManager
|
|
224
|
-
|
|
225
|
-
if settings:
|
|
226
|
-
return settings
|
|
227
|
-
else:
|
|
228
|
-
fallback_settings = SettingManager()
|
|
229
|
-
if kwargs:
|
|
230
|
-
fallback_settings.update_attributes(kwargs)
|
|
231
|
-
return fallback_settings
|
|
232
|
-
|
|
233
|
-
except Exception:
|
|
234
|
-
# 如果连降级都失败,返回None
|
|
235
|
-
return None
|
|
236
|
-
|
|
237
|
-
def reset(self):
|
|
238
|
-
"""重置初始化状态(主要用于测试)"""
|
|
239
|
-
with self._init_lock:
|
|
240
|
-
self._context = None
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
核心初始化器 - 协调整个初始化过程
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
import signal
|
|
10
|
+
from typing import Optional, Any
|
|
11
|
+
|
|
12
|
+
from .built_in import register_built_in_initializers
|
|
13
|
+
from .context import InitializationContext
|
|
14
|
+
from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition, validate_phase_dependencies
|
|
15
|
+
from .registry import get_global_registry
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
from crawlo.utils.singleton import singleton
|
|
19
|
+
|
|
20
|
+
@singleton
|
|
21
|
+
class CoreInitializer:
|
|
22
|
+
"""
|
|
23
|
+
核心初始化器 - 协调整个框架的初始化过程
|
|
24
|
+
|
|
25
|
+
职责:
|
|
26
|
+
1. 管理初始化阶段的执行顺序
|
|
27
|
+
2. 处理阶段间的依赖关系
|
|
28
|
+
3. 提供统一的初始化入口
|
|
29
|
+
4. 错误处理和降级策略
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
def __init__(self):
|
|
33
|
+
self._context: Optional[InitializationContext] = None
|
|
34
|
+
self._is_ready = False
|
|
35
|
+
self._init_lock = threading.RLock()
|
|
36
|
+
|
|
37
|
+
# 在注册内置初始化器之前,先验证阶段依赖关系
|
|
38
|
+
is_valid, error_msg = validate_phase_dependencies()
|
|
39
|
+
if not is_valid:
|
|
40
|
+
raise RuntimeError(f"初始化阶段配置错误: {error_msg}")
|
|
41
|
+
|
|
42
|
+
# 注册内置初始化器
|
|
43
|
+
register_built_in_initializers()
|
|
44
|
+
|
|
45
|
+
@property
|
|
46
|
+
def context(self) -> Optional[InitializationContext]:
|
|
47
|
+
"""获取初始化上下文"""
|
|
48
|
+
return self._context
|
|
49
|
+
|
|
50
|
+
@property
|
|
51
|
+
def is_ready(self) -> bool:
|
|
52
|
+
"""检查框架是否已准备就绪"""
|
|
53
|
+
return self._is_ready
|
|
54
|
+
|
|
55
|
+
def initialize(self, settings=None, **kwargs) -> Any:
|
|
56
|
+
"""
|
|
57
|
+
执行框架初始化
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
settings: 配置对象
|
|
61
|
+
**kwargs: 额外的配置参数
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
初始化后的配置管理器
|
|
65
|
+
"""
|
|
66
|
+
with self._init_lock:
|
|
67
|
+
# 如果已经初始化完成,直接返回
|
|
68
|
+
if self._is_ready and self._context and self._context.settings:
|
|
69
|
+
return self._context.settings
|
|
70
|
+
|
|
71
|
+
# 创建初始化上下文
|
|
72
|
+
context = InitializationContext()
|
|
73
|
+
context.custom_settings = kwargs
|
|
74
|
+
context.settings = settings
|
|
75
|
+
self._context = context
|
|
76
|
+
|
|
77
|
+
try:
|
|
78
|
+
# 执行初始化阶段
|
|
79
|
+
self._execute_initialization_phases(context)
|
|
80
|
+
|
|
81
|
+
# 检查关键阶段是否完成
|
|
82
|
+
if not context.is_phase_completed(InitializationPhase.SETTINGS):
|
|
83
|
+
raise RuntimeError("Settings initialization failed")
|
|
84
|
+
|
|
85
|
+
self._is_ready = True
|
|
86
|
+
context.finish()
|
|
87
|
+
|
|
88
|
+
return context.settings
|
|
89
|
+
|
|
90
|
+
except Exception as e:
|
|
91
|
+
context.add_error(f"Framework initialization failed: {e}")
|
|
92
|
+
context.finish()
|
|
93
|
+
|
|
94
|
+
# 降级策略
|
|
95
|
+
return self._fallback_initialization(settings, **kwargs)
|
|
96
|
+
|
|
97
|
+
def _execute_initialization_phases(self, context: InitializationContext):
|
|
98
|
+
"""执行初始化阶段"""
|
|
99
|
+
registry = get_global_registry()
|
|
100
|
+
execution_order = get_execution_order()
|
|
101
|
+
|
|
102
|
+
# 只执行已注册的阶段
|
|
103
|
+
registered_phases = set(registry.get_all_phases())
|
|
104
|
+
|
|
105
|
+
for phase in execution_order:
|
|
106
|
+
if phase == InitializationPhase.ERROR:
|
|
107
|
+
continue
|
|
108
|
+
|
|
109
|
+
# 只执行已注册的阶段
|
|
110
|
+
if phase not in registered_phases:
|
|
111
|
+
continue
|
|
112
|
+
|
|
113
|
+
context.set_current_phase(phase)
|
|
114
|
+
|
|
115
|
+
# 检查依赖关系
|
|
116
|
+
if not self._check_dependencies(phase, context):
|
|
117
|
+
phase_def = get_phase_definition(phase)
|
|
118
|
+
if not (phase_def and phase_def.optional):
|
|
119
|
+
raise RuntimeError(f"Dependencies not satisfied for phase {phase}")
|
|
120
|
+
else:
|
|
121
|
+
# 可选阶段,跳过
|
|
122
|
+
continue
|
|
123
|
+
|
|
124
|
+
# 执行阶段(带超时控制)
|
|
125
|
+
start_time = time.time()
|
|
126
|
+
try:
|
|
127
|
+
result = self._execute_phase_with_timeout(phase, context, registry)
|
|
128
|
+
result.duration = time.time() - start_time
|
|
129
|
+
|
|
130
|
+
context.mark_phase_completed(phase, result)
|
|
131
|
+
|
|
132
|
+
if not result.success and not self._is_phase_optional(phase):
|
|
133
|
+
raise RuntimeError(f"Phase {phase} failed: {result.error}")
|
|
134
|
+
|
|
135
|
+
except Exception as e:
|
|
136
|
+
duration = time.time() - start_time
|
|
137
|
+
result = PhaseResult(
|
|
138
|
+
phase=phase,
|
|
139
|
+
success=False,
|
|
140
|
+
duration=duration,
|
|
141
|
+
error=e
|
|
142
|
+
)
|
|
143
|
+
context.mark_phase_completed(phase, result)
|
|
144
|
+
|
|
145
|
+
if not self._is_phase_optional(phase):
|
|
146
|
+
raise
|
|
147
|
+
|
|
148
|
+
def _execute_phase_with_timeout(self, phase: InitializationPhase,
|
|
149
|
+
context: InitializationContext,
|
|
150
|
+
registry) -> PhaseResult:
|
|
151
|
+
"""
|
|
152
|
+
执行阶段并支持超时控制
|
|
153
|
+
|
|
154
|
+
Args:
|
|
155
|
+
phase: 初始化阶段
|
|
156
|
+
context: 初始化上下文
|
|
157
|
+
registry: 初始化器注册表
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
PhaseResult: 阶段执行结果
|
|
161
|
+
|
|
162
|
+
Raises:
|
|
163
|
+
TimeoutError: 阶段执行超时
|
|
164
|
+
"""
|
|
165
|
+
phase_def = get_phase_definition(phase)
|
|
166
|
+
timeout = phase_def.timeout if phase_def else 30.0
|
|
167
|
+
|
|
168
|
+
# 使用线程执行,支持超时
|
|
169
|
+
result_container: list[Optional[PhaseResult]] = [None]
|
|
170
|
+
exception_container: list[Optional[Exception]] = [None]
|
|
171
|
+
|
|
172
|
+
def execute_in_thread():
|
|
173
|
+
try:
|
|
174
|
+
result_container[0] = registry.execute_phase(phase, context)
|
|
175
|
+
except Exception as e:
|
|
176
|
+
exception_container[0] = e
|
|
177
|
+
|
|
178
|
+
thread = threading.Thread(target=execute_in_thread, daemon=True)
|
|
179
|
+
thread.start()
|
|
180
|
+
thread.join(timeout=timeout)
|
|
181
|
+
|
|
182
|
+
if thread.is_alive():
|
|
183
|
+
# 超时了
|
|
184
|
+
error_msg = f"Phase {phase.value} execution timeout after {timeout} seconds"
|
|
185
|
+
context.add_warning(error_msg)
|
|
186
|
+
return PhaseResult(
|
|
187
|
+
phase=phase,
|
|
188
|
+
success=False,
|
|
189
|
+
error=TimeoutError(error_msg)
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
# 检查是否有异常
|
|
193
|
+
if exception_container[0]:
|
|
194
|
+
raise exception_container[0]
|
|
195
|
+
|
|
196
|
+
# 返回结果(已经确保不为None)
|
|
197
|
+
if result_container[0] is None:
|
|
198
|
+
raise RuntimeError(f"Phase {phase.value} returned None result")
|
|
199
|
+
return result_container[0]
|
|
200
|
+
|
|
201
|
+
def _check_dependencies(self, phase: InitializationPhase,
|
|
202
|
+
context: InitializationContext) -> bool:
|
|
203
|
+
"""检查阶段依赖关系"""
|
|
204
|
+
phase_def = get_phase_definition(phase)
|
|
205
|
+
if not phase_def:
|
|
206
|
+
return True
|
|
207
|
+
|
|
208
|
+
for dependency in phase_def.dependencies:
|
|
209
|
+
if not context.is_phase_completed(dependency):
|
|
210
|
+
return False
|
|
211
|
+
|
|
212
|
+
return True
|
|
213
|
+
|
|
214
|
+
def _is_phase_optional(self, phase: InitializationPhase) -> bool:
|
|
215
|
+
"""检查阶段是否可选"""
|
|
216
|
+
phase_def = get_phase_definition(phase)
|
|
217
|
+
return phase_def.optional if phase_def else False
|
|
218
|
+
|
|
219
|
+
def _fallback_initialization(self, settings=None, **kwargs):
|
|
220
|
+
"""降级初始化策略"""
|
|
221
|
+
try:
|
|
222
|
+
# 尝试创建基本的配置管理器
|
|
223
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
224
|
+
|
|
225
|
+
if settings:
|
|
226
|
+
return settings
|
|
227
|
+
else:
|
|
228
|
+
fallback_settings = SettingManager()
|
|
229
|
+
if kwargs:
|
|
230
|
+
fallback_settings.update_attributes(kwargs)
|
|
231
|
+
return fallback_settings
|
|
232
|
+
|
|
233
|
+
except Exception:
|
|
234
|
+
# 如果连降级都失败,返回None
|
|
235
|
+
return None
|
|
236
|
+
|
|
237
|
+
def reset(self):
|
|
238
|
+
"""重置初始化状态(主要用于测试)"""
|
|
239
|
+
with self._init_lock:
|
|
240
|
+
self._context = None
|
|
241
241
|
self._is_ready = False
|