crawlo 1.4.7__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
|
@@ -1,144 +1,144 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
初始化器注册表 - 管理所有初始化器的注册和执行
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import threading
|
|
8
|
-
from typing import Dict, Optional, Callable, List
|
|
9
|
-
from .context import InitializationContext
|
|
10
|
-
from .phases import InitializationPhase, PhaseResult
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class Initializer:
|
|
14
|
-
"""初始化器基类"""
|
|
15
|
-
|
|
16
|
-
def __init__(self, phase: InitializationPhase):
|
|
17
|
-
self._phase = phase
|
|
18
|
-
|
|
19
|
-
@property
|
|
20
|
-
def phase(self) -> InitializationPhase:
|
|
21
|
-
"""获取初始化阶段"""
|
|
22
|
-
return self._phase
|
|
23
|
-
|
|
24
|
-
def initialize(self, context: InitializationContext) -> PhaseResult:
|
|
25
|
-
"""执行初始化 - 子类必须实现"""
|
|
26
|
-
raise NotImplementedError("Subclasses must implement initialize method")
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class BaseInitializer(Initializer):
|
|
30
|
-
"""基础初始化器类 - 为向后兼容保留"""
|
|
31
|
-
|
|
32
|
-
def __init__(self, phase: InitializationPhase):
|
|
33
|
-
super().__init__(phase)
|
|
34
|
-
|
|
35
|
-
def _create_result(self, success: bool, duration: float = 0.0,
|
|
36
|
-
artifacts: Optional[Dict] = None, error: Optional[Exception] = None) -> PhaseResult:
|
|
37
|
-
"""创建初始化结果"""
|
|
38
|
-
from .utils import create_initialization_result
|
|
39
|
-
return create_initialization_result(
|
|
40
|
-
phase=self.phase,
|
|
41
|
-
success=success,
|
|
42
|
-
duration=duration,
|
|
43
|
-
artifacts=artifacts,
|
|
44
|
-
error=error
|
|
45
|
-
)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class InitializerRegistry:
|
|
49
|
-
"""
|
|
50
|
-
初始化器注册表 - 管理所有初始化器的注册和执行
|
|
51
|
-
|
|
52
|
-
特点:
|
|
53
|
-
1. 线程安全的注册和执行
|
|
54
|
-
2. 支持函数式和类式初始化器
|
|
55
|
-
3. 统一的结果处理
|
|
56
|
-
"""
|
|
57
|
-
|
|
58
|
-
def __init__(self):
|
|
59
|
-
self._initializers: Dict[InitializationPhase, Initializer] = {}
|
|
60
|
-
self._lock = threading.RLock()
|
|
61
|
-
|
|
62
|
-
def register(self, initializer: Initializer):
|
|
63
|
-
"""注册初始化器"""
|
|
64
|
-
with self._lock:
|
|
65
|
-
phase = initializer.phase
|
|
66
|
-
if phase in self._initializers:
|
|
67
|
-
raise ValueError(f"Initializer for phase {phase} already registered")
|
|
68
|
-
self._initializers[phase] = initializer
|
|
69
|
-
|
|
70
|
-
def register_function(self, phase: InitializationPhase,
|
|
71
|
-
init_func: Callable[[InitializationContext], PhaseResult]):
|
|
72
|
-
"""注册函数式初始化器"""
|
|
73
|
-
|
|
74
|
-
class FunctionInitializer(Initializer):
|
|
75
|
-
def __init__(self, phase: InitializationPhase, func: Callable):
|
|
76
|
-
super().__init__(phase)
|
|
77
|
-
self._phase = phase
|
|
78
|
-
self._func = func
|
|
79
|
-
|
|
80
|
-
def initialize(self, context: InitializationContext) -> PhaseResult:
|
|
81
|
-
return self._func(context)
|
|
82
|
-
|
|
83
|
-
self.register(FunctionInitializer(phase, init_func))
|
|
84
|
-
|
|
85
|
-
def get_initializer(self, phase: InitializationPhase) -> Optional[Initializer]:
|
|
86
|
-
"""获取指定阶段的初始化器"""
|
|
87
|
-
with self._lock:
|
|
88
|
-
return self._initializers.get(phase)
|
|
89
|
-
|
|
90
|
-
def get_all_phases(self) -> List[InitializationPhase]:
|
|
91
|
-
"""获取所有已注册的阶段"""
|
|
92
|
-
with self._lock:
|
|
93
|
-
return list(self._initializers.keys())
|
|
94
|
-
|
|
95
|
-
def has_initializer(self, phase: InitializationPhase) -> bool:
|
|
96
|
-
"""检查是否有指定阶段的初始化器"""
|
|
97
|
-
with self._lock:
|
|
98
|
-
return phase in self._initializers
|
|
99
|
-
|
|
100
|
-
def clear(self):
|
|
101
|
-
"""清空注册表"""
|
|
102
|
-
with self._lock:
|
|
103
|
-
self._initializers.clear()
|
|
104
|
-
|
|
105
|
-
def execute_phase(self, phase: InitializationPhase,
|
|
106
|
-
context: InitializationContext) -> PhaseResult:
|
|
107
|
-
"""执行指定阶段的初始化"""
|
|
108
|
-
initializer = self.get_initializer(phase)
|
|
109
|
-
if not initializer:
|
|
110
|
-
error = ValueError(f"No initializer registered for phase {phase}")
|
|
111
|
-
return PhaseResult(
|
|
112
|
-
phase=phase,
|
|
113
|
-
success=False,
|
|
114
|
-
error=error
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
try:
|
|
118
|
-
return initializer.initialize(context)
|
|
119
|
-
except Exception as e:
|
|
120
|
-
return PhaseResult(
|
|
121
|
-
phase=phase,
|
|
122
|
-
success=False,
|
|
123
|
-
error=e
|
|
124
|
-
)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
# 全局注册表实例
|
|
128
|
-
_global_registry = InitializerRegistry()
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
def get_global_registry() -> InitializerRegistry:
|
|
132
|
-
"""获取全局注册表"""
|
|
133
|
-
return _global_registry
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
def register_initializer(initializer: Initializer):
|
|
137
|
-
"""注册初始化器到全局注册表"""
|
|
138
|
-
_global_registry.register(initializer)
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def register_phase_function(phase: InitializationPhase,
|
|
142
|
-
init_func: Callable[[InitializationContext], PhaseResult]):
|
|
143
|
-
"""注册函数式初始化器到全局注册表"""
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
初始化器注册表 - 管理所有初始化器的注册和执行
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import threading
|
|
8
|
+
from typing import Dict, Optional, Callable, List
|
|
9
|
+
from .context import InitializationContext
|
|
10
|
+
from .phases import InitializationPhase, PhaseResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Initializer:
|
|
14
|
+
"""初始化器基类"""
|
|
15
|
+
|
|
16
|
+
def __init__(self, phase: InitializationPhase):
|
|
17
|
+
self._phase = phase
|
|
18
|
+
|
|
19
|
+
@property
|
|
20
|
+
def phase(self) -> InitializationPhase:
|
|
21
|
+
"""获取初始化阶段"""
|
|
22
|
+
return self._phase
|
|
23
|
+
|
|
24
|
+
def initialize(self, context: InitializationContext) -> PhaseResult:
|
|
25
|
+
"""执行初始化 - 子类必须实现"""
|
|
26
|
+
raise NotImplementedError("Subclasses must implement initialize method")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class BaseInitializer(Initializer):
|
|
30
|
+
"""基础初始化器类 - 为向后兼容保留"""
|
|
31
|
+
|
|
32
|
+
def __init__(self, phase: InitializationPhase):
|
|
33
|
+
super().__init__(phase)
|
|
34
|
+
|
|
35
|
+
def _create_result(self, success: bool, duration: float = 0.0,
|
|
36
|
+
artifacts: Optional[Dict] = None, error: Optional[Exception] = None) -> PhaseResult:
|
|
37
|
+
"""创建初始化结果"""
|
|
38
|
+
from .utils import create_initialization_result
|
|
39
|
+
return create_initialization_result(
|
|
40
|
+
phase=self.phase,
|
|
41
|
+
success=success,
|
|
42
|
+
duration=duration,
|
|
43
|
+
artifacts=artifacts,
|
|
44
|
+
error=error
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class InitializerRegistry:
|
|
49
|
+
"""
|
|
50
|
+
初始化器注册表 - 管理所有初始化器的注册和执行
|
|
51
|
+
|
|
52
|
+
特点:
|
|
53
|
+
1. 线程安全的注册和执行
|
|
54
|
+
2. 支持函数式和类式初始化器
|
|
55
|
+
3. 统一的结果处理
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
def __init__(self):
|
|
59
|
+
self._initializers: Dict[InitializationPhase, Initializer] = {}
|
|
60
|
+
self._lock = threading.RLock()
|
|
61
|
+
|
|
62
|
+
def register(self, initializer: Initializer):
|
|
63
|
+
"""注册初始化器"""
|
|
64
|
+
with self._lock:
|
|
65
|
+
phase = initializer.phase
|
|
66
|
+
if phase in self._initializers:
|
|
67
|
+
raise ValueError(f"Initializer for phase {phase} already registered")
|
|
68
|
+
self._initializers[phase] = initializer
|
|
69
|
+
|
|
70
|
+
def register_function(self, phase: InitializationPhase,
|
|
71
|
+
init_func: Callable[[InitializationContext], PhaseResult]):
|
|
72
|
+
"""注册函数式初始化器"""
|
|
73
|
+
|
|
74
|
+
class FunctionInitializer(Initializer):
|
|
75
|
+
def __init__(self, phase: InitializationPhase, func: Callable):
|
|
76
|
+
super().__init__(phase)
|
|
77
|
+
self._phase = phase
|
|
78
|
+
self._func = func
|
|
79
|
+
|
|
80
|
+
def initialize(self, context: InitializationContext) -> PhaseResult:
|
|
81
|
+
return self._func(context)
|
|
82
|
+
|
|
83
|
+
self.register(FunctionInitializer(phase, init_func))
|
|
84
|
+
|
|
85
|
+
def get_initializer(self, phase: InitializationPhase) -> Optional[Initializer]:
|
|
86
|
+
"""获取指定阶段的初始化器"""
|
|
87
|
+
with self._lock:
|
|
88
|
+
return self._initializers.get(phase)
|
|
89
|
+
|
|
90
|
+
def get_all_phases(self) -> List[InitializationPhase]:
|
|
91
|
+
"""获取所有已注册的阶段"""
|
|
92
|
+
with self._lock:
|
|
93
|
+
return list(self._initializers.keys())
|
|
94
|
+
|
|
95
|
+
def has_initializer(self, phase: InitializationPhase) -> bool:
|
|
96
|
+
"""检查是否有指定阶段的初始化器"""
|
|
97
|
+
with self._lock:
|
|
98
|
+
return phase in self._initializers
|
|
99
|
+
|
|
100
|
+
def clear(self):
|
|
101
|
+
"""清空注册表"""
|
|
102
|
+
with self._lock:
|
|
103
|
+
self._initializers.clear()
|
|
104
|
+
|
|
105
|
+
def execute_phase(self, phase: InitializationPhase,
|
|
106
|
+
context: InitializationContext) -> PhaseResult:
|
|
107
|
+
"""执行指定阶段的初始化"""
|
|
108
|
+
initializer = self.get_initializer(phase)
|
|
109
|
+
if not initializer:
|
|
110
|
+
error = ValueError(f"No initializer registered for phase {phase}")
|
|
111
|
+
return PhaseResult(
|
|
112
|
+
phase=phase,
|
|
113
|
+
success=False,
|
|
114
|
+
error=error
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
try:
|
|
118
|
+
return initializer.initialize(context)
|
|
119
|
+
except Exception as e:
|
|
120
|
+
return PhaseResult(
|
|
121
|
+
phase=phase,
|
|
122
|
+
success=False,
|
|
123
|
+
error=e
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# 全局注册表实例
|
|
128
|
+
_global_registry = InitializerRegistry()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def get_global_registry() -> InitializerRegistry:
|
|
132
|
+
"""获取全局注册表"""
|
|
133
|
+
return _global_registry
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def register_initializer(initializer: Initializer):
|
|
137
|
+
"""注册初始化器到全局注册表"""
|
|
138
|
+
_global_registry.register(initializer)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def register_phase_function(phase: InitializationPhase,
|
|
142
|
+
init_func: Callable[[InitializationContext], PhaseResult]):
|
|
143
|
+
"""注册函数式初始化器到全局注册表"""
|
|
144
144
|
_global_registry.register_function(phase, init_func)
|
crawlo/initialization/utils.py
CHANGED
|
@@ -1,49 +1,49 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
初始化工具模块 - 提供通用的初始化工具函数
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import time
|
|
8
|
-
from typing import Optional, Dict, Any
|
|
9
|
-
from .phases import PhaseResult, InitializationPhase
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def create_initialization_result(
|
|
13
|
-
phase: 'InitializationPhase',
|
|
14
|
-
success: bool,
|
|
15
|
-
duration: float = 0.0,
|
|
16
|
-
artifacts: Optional[Dict[str, Any]] = None,
|
|
17
|
-
error: Optional[Exception] = None
|
|
18
|
-
) -> PhaseResult:
|
|
19
|
-
"""
|
|
20
|
-
创建标准化的初始化结果
|
|
21
|
-
|
|
22
|
-
Args:
|
|
23
|
-
phase: 初始化阶段
|
|
24
|
-
success: 是否成功
|
|
25
|
-
duration: 执行时长
|
|
26
|
-
artifacts: 产生的工件数据
|
|
27
|
-
error: 异常对象
|
|
28
|
-
|
|
29
|
-
Returns:
|
|
30
|
-
PhaseResult: 标准化的初始化结果
|
|
31
|
-
"""
|
|
32
|
-
return PhaseResult(
|
|
33
|
-
phase=phase,
|
|
34
|
-
success=success,
|
|
35
|
-
duration=duration,
|
|
36
|
-
artifacts=artifacts or {},
|
|
37
|
-
error=error
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
class InitializationTimer:
|
|
42
|
-
"""初始化计时器"""
|
|
43
|
-
|
|
44
|
-
def __init__(self):
|
|
45
|
-
self.start_time = time.time()
|
|
46
|
-
|
|
47
|
-
def get_duration(self) -> float:
|
|
48
|
-
"""获取经过的时间"""
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
初始化工具模块 - 提供通用的初始化工具函数
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import time
|
|
8
|
+
from typing import Optional, Dict, Any
|
|
9
|
+
from .phases import PhaseResult, InitializationPhase
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def create_initialization_result(
|
|
13
|
+
phase: 'InitializationPhase',
|
|
14
|
+
success: bool,
|
|
15
|
+
duration: float = 0.0,
|
|
16
|
+
artifacts: Optional[Dict[str, Any]] = None,
|
|
17
|
+
error: Optional[Exception] = None
|
|
18
|
+
) -> PhaseResult:
|
|
19
|
+
"""
|
|
20
|
+
创建标准化的初始化结果
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
phase: 初始化阶段
|
|
24
|
+
success: 是否成功
|
|
25
|
+
duration: 执行时长
|
|
26
|
+
artifacts: 产生的工件数据
|
|
27
|
+
error: 异常对象
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
PhaseResult: 标准化的初始化结果
|
|
31
|
+
"""
|
|
32
|
+
return PhaseResult(
|
|
33
|
+
phase=phase,
|
|
34
|
+
success=success,
|
|
35
|
+
duration=duration,
|
|
36
|
+
artifacts=artifacts or {},
|
|
37
|
+
error=error
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class InitializationTimer:
|
|
42
|
+
"""初始化计时器"""
|
|
43
|
+
|
|
44
|
+
def __init__(self):
|
|
45
|
+
self.start_time = time.time()
|
|
46
|
+
|
|
47
|
+
def get_duration(self) -> float:
|
|
48
|
+
"""获取经过的时间"""
|
|
49
49
|
return time.time() - self.start_time
|
crawlo/interfaces.py
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import List, Type, Protocol
|
|
3
|
-
|
|
4
|
-
from crawlo.spider import Spider
|
|
5
|
-
from crawlo.network.request import Request
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class ISpiderLoader(Protocol):
|
|
9
|
-
"""Spider loader interface"""
|
|
10
|
-
|
|
11
|
-
@abstractmethod
|
|
12
|
-
def load(self, spider_name: str) -> Type[Spider]:
|
|
13
|
-
"""Load a spider by name"""
|
|
14
|
-
pass
|
|
15
|
-
|
|
16
|
-
@abstractmethod
|
|
17
|
-
def list(self) -> List[str]:
|
|
18
|
-
"""List all available spider names"""
|
|
19
|
-
pass
|
|
20
|
-
|
|
21
|
-
@abstractmethod
|
|
22
|
-
def find_by_request(self, request: Request) -> List[str]:
|
|
23
|
-
"""Find spider names that can handle the given request"""
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Type, Protocol
|
|
3
|
+
|
|
4
|
+
from crawlo.spider import Spider
|
|
5
|
+
from crawlo.network.request import Request
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ISpiderLoader(Protocol):
|
|
9
|
+
"""Spider loader interface"""
|
|
10
|
+
|
|
11
|
+
@abstractmethod
|
|
12
|
+
def load(self, spider_name: str) -> Type[Spider]:
|
|
13
|
+
"""Load a spider by name"""
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
@abstractmethod
|
|
17
|
+
def list(self) -> List[str]:
|
|
18
|
+
"""List all available spider names"""
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
@abstractmethod
|
|
22
|
+
def find_by_request(self, request: Request) -> List[str]:
|
|
23
|
+
"""Find spider names that can handle the given request"""
|
|
24
24
|
pass
|
crawlo/items/__init__.py
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
crawlo.items 包
|
|
5
|
-
===============
|
|
6
|
-
提供 Item 和 Field 类用于数据定义和验证。
|
|
7
|
-
"""
|
|
8
|
-
from .items import Item
|
|
9
|
-
from .fields import Field
|
|
10
|
-
from .base import ItemMeta
|
|
11
|
-
|
|
12
|
-
from crawlo.exceptions import ItemInitError, ItemAttributeError
|
|
13
|
-
|
|
14
|
-
__all__ = [
|
|
15
|
-
'Item',
|
|
16
|
-
'Field',
|
|
17
|
-
'ItemMeta',
|
|
18
|
-
'ItemInitError',
|
|
19
|
-
'ItemAttributeError'
|
|
20
|
-
]
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
crawlo.items 包
|
|
5
|
+
===============
|
|
6
|
+
提供 Item 和 Field 类用于数据定义和验证。
|
|
7
|
+
"""
|
|
8
|
+
from .items import Item
|
|
9
|
+
from .fields import Field
|
|
10
|
+
from .base import ItemMeta
|
|
11
|
+
|
|
12
|
+
from crawlo.exceptions import ItemInitError, ItemAttributeError
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
'Item',
|
|
16
|
+
'Field',
|
|
17
|
+
'ItemMeta',
|
|
18
|
+
'ItemInitError',
|
|
19
|
+
'ItemAttributeError'
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|
crawlo/items/base.py
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
基础元类定义
|
|
5
|
-
"""
|
|
6
|
-
from abc import ABCMeta
|
|
7
|
-
from .fields import Field
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class ItemMeta(ABCMeta):
|
|
11
|
-
def __new__(mcs, name, bases, attrs):
|
|
12
|
-
fields = {}
|
|
13
|
-
cls_attrs = {}
|
|
14
|
-
|
|
15
|
-
for attr_name, attr_value in attrs.items():
|
|
16
|
-
if isinstance(attr_value, Field):
|
|
17
|
-
fields[attr_name] = attr_value
|
|
18
|
-
else:
|
|
19
|
-
cls_attrs[attr_name] = attr_value
|
|
20
|
-
|
|
21
|
-
cls_instance = super().__new__(mcs, name, bases, cls_attrs)
|
|
22
|
-
cls_instance.FIELDS = fields
|
|
23
|
-
return cls_instance
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
基础元类定义
|
|
5
|
+
"""
|
|
6
|
+
from abc import ABCMeta
|
|
7
|
+
from .fields import Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ItemMeta(ABCMeta):
    """Metaclass that gathers ``Field`` declarations into a ``FIELDS`` mapping.

    Attributes of the class body that are ``Field`` instances are removed
    from the class namespace and stored, keyed by attribute name, in the
    class-level ``FIELDS`` dict. All other attributes are passed through to
    the created class unchanged.
    """

    def __new__(mcs, name, bases, attrs):
        """Create the class and attach its ``FIELDS`` mapping.

        Args:
            mcs: The metaclass itself.
            name: Name of the class being created.
            bases: Tuple of base classes.
            attrs: Namespace dict of the class body.

        Returns:
            The newly created class, with ``FIELDS`` containing the fields
            inherited from ``bases`` plus the fields declared in ``attrs``.
        """
        # Start from the fields already collected on the parent classes so
        # that subclassing extends, rather than discards, the parent schema.
        # Bases are walked right-to-left so the left-most base wins on a
        # name clash, mirroring normal attribute lookup order.
        fields = {}
        for base in reversed(bases):
            inherited = getattr(base, "FIELDS", None)
            if isinstance(inherited, dict):
                fields.update(inherited)

        cls_attrs = {}
        for attr_name, attr_value in attrs.items():
            if isinstance(attr_value, Field):
                # Fields declared on this class override inherited ones.
                fields[attr_name] = attr_value
            else:
                cls_attrs[attr_name] = attr_value

        cls_instance = super().__new__(mcs, name, bases, cls_attrs)
        # A fresh dict per class: mutating one class's FIELDS never touches
        # the mapping of its parents.
        cls_instance.FIELDS = fields
        return cls_instance
|
crawlo/items/fields.py
CHANGED
|
@@ -1,53 +1,53 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
Field 类定义
|
|
5
|
-
"""
|
|
6
|
-
from typing import Any, Optional, Type
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class Field:
|
|
10
|
-
"""
|
|
11
|
-
字段定义类,用于定义 Item 的字段属性和验证规则
|
|
12
|
-
"""
|
|
13
|
-
def __init__(
|
|
14
|
-
self,
|
|
15
|
-
nullable: bool = True,
|
|
16
|
-
*,
|
|
17
|
-
default: Any = None,
|
|
18
|
-
field_type: Optional[Type] = None,
|
|
19
|
-
max_length: Optional[int] = None,
|
|
20
|
-
description: str = ""
|
|
21
|
-
):
|
|
22
|
-
self.nullable = nullable
|
|
23
|
-
self.default = default
|
|
24
|
-
self.field_type = field_type
|
|
25
|
-
self.max_length = max_length
|
|
26
|
-
self.description = description
|
|
27
|
-
|
|
28
|
-
def validate(self, value: Any, field_name: str = "") -> Any:
|
|
29
|
-
"""
|
|
30
|
-
验证字段值是否符合规则
|
|
31
|
-
"""
|
|
32
|
-
if value is None or (isinstance(value, str) and value.strip() == ""):
|
|
33
|
-
if self.default is not None:
|
|
34
|
-
return self.default
|
|
35
|
-
elif not self.nullable:
|
|
36
|
-
raise ValueError(
|
|
37
|
-
f"字段 '{field_name}' 不允许为空。"
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
if value is not None and not (isinstance(value, str) and value.strip() == ""):
|
|
41
|
-
if self.field_type and not isinstance(value, self.field_type):
|
|
42
|
-
raise TypeError(
|
|
43
|
-
f"字段 '{field_name}' 类型错误:期望类型 {self.field_type}, 得到 {type(value)},值:{value!r}"
|
|
44
|
-
)
|
|
45
|
-
if self.max_length and len(str(value)) > self.max_length:
|
|
46
|
-
raise ValueError(
|
|
47
|
-
f"字段 '{field_name}' 长度超限:最大长度 {self.max_length},当前长度 {len(str(value))},值:{value!r}"
|
|
48
|
-
)
|
|
49
|
-
|
|
50
|
-
return value
|
|
51
|
-
|
|
52
|
-
def __repr__(self):
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Field 类定义
|
|
5
|
+
"""
|
|
6
|
+
from typing import Any, Optional, Type
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Field:
    """Declarative description of a single Item field and its validation rules.

    Attributes:
        nullable: Whether an empty value (``None`` or a whitespace-only
            string) is accepted when no default is configured.
        default: Replacement returned by :meth:`validate` for empty values;
            ``None`` means "no default".
        field_type: Type that non-empty values must be an instance of;
            ``None`` disables the check.
        max_length: Upper bound on ``len(str(value))`` for non-empty values;
            ``None`` disables the check.
        description: Free-form text stored on the field; it is not used by
            validation.
    """

    def __init__(
        self,
        nullable: bool = True,
        *,
        default: Any = None,
        field_type: Optional[Type] = None,
        max_length: Optional[int] = None,
        description: str = ""
    ):
        self.nullable = nullable
        self.default = default
        self.field_type = field_type
        self.max_length = max_length
        self.description = description

    def validate(self, value: Any, field_name: str = "") -> Any:
        """Check ``value`` against this field's rules.

        Args:
            value: The candidate value.
            field_name: Name used in error messages.

        Returns:
            ``self.default`` when ``value`` is empty and a default is set;
            otherwise ``value`` itself, unchanged.

        Raises:
            ValueError: If ``value`` is empty, no default is set and the
                field is not nullable; or if ``len(str(value))`` exceeds
                ``max_length``.
            TypeError: If ``field_type`` is set and ``value`` is not an
                instance of it.
        """
        # "Empty" means None or a string that is blank after stripping.
        is_empty = value is None or (isinstance(value, str) and value.strip() == "")

        if is_empty:
            if self.default is not None:
                return self.default
            if not self.nullable:
                raise ValueError(
                    f"字段 '{field_name}' 不允许为空。"
                )
            # Nullable field without default: the empty value passes through
            # as-is and skips the type/length checks.
            return value

        # Compare against None rather than relying on truthiness so that an
        # explicit limit of 0 is still enforced instead of being ignored.
        if self.field_type is not None and not isinstance(value, self.field_type):
            raise TypeError(
                f"字段 '{field_name}' 类型错误:期望类型 {self.field_type}, 得到 {type(value)},值:{value!r}"
            )
        if self.max_length is not None:
            length = len(str(value))
            if length > self.max_length:
                raise ValueError(
                    f"字段 '{field_name}' 长度超限:最大长度 {self.max_length},当前长度 {length},值:{value!r}"
                )

        return value

    def __repr__(self):
        return f"<Field nullable={self.nullable} type={self.field_type} default={self.default}>"
|