crawlo 1.3.3__py3-none-any.whl → 1.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (279) hide show
  1. crawlo/__init__.py +87 -63
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -323
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +46 -2
  16. crawlo/core/engine.py +439 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +257 -256
  19. crawlo/crawler.py +639 -1167
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +228 -226
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +61 -52
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +28 -0
  40. crawlo/factories/base.py +69 -0
  41. crawlo/factories/crawler.py +104 -0
  42. crawlo/factories/registry.py +85 -0
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -234
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -0
  47. crawlo/initialization/__init__.py +40 -0
  48. crawlo/initialization/built_in.py +426 -0
  49. crawlo/initialization/context.py +142 -0
  50. crawlo/initialization/core.py +194 -0
  51. crawlo/initialization/phases.py +149 -0
  52. crawlo/initialization/registry.py +146 -0
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -22
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +38 -0
  58. crawlo/logging/config.py +97 -0
  59. crawlo/logging/factory.py +129 -0
  60. crawlo/logging/manager.py +112 -0
  61. crawlo/middleware/__init__.py +21 -21
  62. crawlo/middleware/default_header.py +132 -132
  63. crawlo/middleware/download_delay.py +104 -104
  64. crawlo/middleware/middleware_manager.py +135 -135
  65. crawlo/middleware/offsite.py +123 -123
  66. crawlo/middleware/proxy.py +386 -386
  67. crawlo/middleware/request_ignore.py +86 -86
  68. crawlo/middleware/response_code.py +163 -163
  69. crawlo/middleware/response_filter.py +136 -136
  70. crawlo/middleware/retry.py +124 -124
  71. crawlo/middleware/simple_proxy.py +65 -65
  72. crawlo/mode_manager.py +212 -187
  73. crawlo/network/__init__.py +21 -21
  74. crawlo/network/request.py +379 -379
  75. crawlo/network/response.py +359 -359
  76. crawlo/pipelines/__init__.py +21 -21
  77. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  78. crawlo/pipelines/console_pipeline.py +39 -39
  79. crawlo/pipelines/csv_pipeline.py +316 -316
  80. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  81. crawlo/pipelines/json_pipeline.py +218 -218
  82. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  83. crawlo/pipelines/mongo_pipeline.py +131 -131
  84. crawlo/pipelines/mysql_pipeline.py +318 -318
  85. crawlo/pipelines/pipeline_manager.py +76 -75
  86. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  87. crawlo/project.py +327 -325
  88. crawlo/queue/pqueue.py +43 -37
  89. crawlo/queue/queue_manager.py +503 -379
  90. crawlo/queue/redis_priority_queue.py +326 -306
  91. crawlo/settings/__init__.py +7 -7
  92. crawlo/settings/default_settings.py +321 -225
  93. crawlo/settings/setting_manager.py +214 -198
  94. crawlo/spider/__init__.py +657 -639
  95. crawlo/stats_collector.py +73 -59
  96. crawlo/subscriber.py +129 -129
  97. crawlo/task_manager.py +139 -30
  98. crawlo/templates/crawlo.cfg.tmpl +10 -10
  99. crawlo/templates/project/__init__.py.tmpl +3 -3
  100. crawlo/templates/project/items.py.tmpl +17 -17
  101. crawlo/templates/project/middlewares.py.tmpl +118 -118
  102. crawlo/templates/project/pipelines.py.tmpl +96 -96
  103. crawlo/templates/project/settings.py.tmpl +168 -267
  104. crawlo/templates/project/settings_distributed.py.tmpl +167 -180
  105. crawlo/templates/project/settings_gentle.py.tmpl +167 -61
  106. crawlo/templates/project/settings_high_performance.py.tmpl +168 -131
  107. crawlo/templates/project/settings_minimal.py.tmpl +66 -35
  108. crawlo/templates/project/settings_simple.py.tmpl +165 -102
  109. crawlo/templates/project/spiders/__init__.py.tmpl +10 -6
  110. crawlo/templates/run.py.tmpl +34 -38
  111. crawlo/templates/spider/spider.py.tmpl +143 -143
  112. crawlo/templates/spiders_init.py.tmpl +10 -0
  113. crawlo/tools/__init__.py +200 -200
  114. crawlo/tools/anti_crawler.py +268 -268
  115. crawlo/tools/authenticated_proxy.py +240 -240
  116. crawlo/tools/data_formatter.py +225 -225
  117. crawlo/tools/data_validator.py +180 -180
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +388 -388
  120. crawlo/tools/encoding_converter.py +127 -127
  121. crawlo/tools/network_diagnostic.py +365 -0
  122. crawlo/tools/request_tools.py +82 -82
  123. crawlo/tools/retry_mechanism.py +224 -224
  124. crawlo/tools/scenario_adapter.py +262 -262
  125. crawlo/tools/text_cleaner.py +232 -232
  126. crawlo/utils/__init__.py +34 -34
  127. crawlo/utils/batch_processor.py +259 -259
  128. crawlo/utils/class_loader.py +26 -0
  129. crawlo/utils/controlled_spider_mixin.py +439 -439
  130. crawlo/utils/db_helper.py +343 -343
  131. crawlo/utils/enhanced_error_handler.py +356 -356
  132. crawlo/utils/env_config.py +142 -142
  133. crawlo/utils/error_handler.py +165 -124
  134. crawlo/utils/func_tools.py +82 -82
  135. crawlo/utils/large_scale_config.py +286 -286
  136. crawlo/utils/large_scale_helper.py +344 -344
  137. crawlo/utils/log.py +44 -200
  138. crawlo/utils/performance_monitor.py +285 -285
  139. crawlo/utils/queue_helper.py +175 -175
  140. crawlo/utils/redis_connection_pool.py +388 -351
  141. crawlo/utils/redis_key_validator.py +198 -198
  142. crawlo/utils/request.py +267 -267
  143. crawlo/utils/request_serializer.py +225 -218
  144. crawlo/utils/spider_loader.py +61 -61
  145. crawlo/utils/system.py +11 -11
  146. crawlo/utils/tools.py +4 -4
  147. crawlo/utils/url.py +39 -39
  148. {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/METADATA +1126 -1020
  149. crawlo-1.3.4.dist-info/RECORD +278 -0
  150. examples/__init__.py +7 -7
  151. tests/__init__.py +7 -7
  152. tests/advanced_tools_example.py +275 -275
  153. tests/authenticated_proxy_example.py +107 -107
  154. tests/baidu_performance_test.py +109 -0
  155. tests/baidu_test.py +60 -0
  156. tests/cleaners_example.py +160 -160
  157. tests/comprehensive_framework_test.py +213 -0
  158. tests/comprehensive_test.py +82 -0
  159. tests/comprehensive_testing_summary.md +187 -0
  160. tests/config_validation_demo.py +142 -142
  161. tests/controlled_spider_example.py +205 -205
  162. tests/date_tools_example.py +180 -180
  163. tests/debug_configure.py +70 -0
  164. tests/debug_framework_logger.py +85 -0
  165. tests/debug_log_levels.py +64 -0
  166. tests/debug_pipelines.py +66 -66
  167. tests/distributed_test.py +67 -0
  168. tests/distributed_test_debug.py +77 -0
  169. tests/dynamic_loading_example.py +523 -523
  170. tests/dynamic_loading_test.py +104 -104
  171. tests/env_config_example.py +133 -133
  172. tests/error_handling_example.py +171 -171
  173. tests/final_command_test_report.md +0 -0
  174. tests/final_comprehensive_test.py +152 -0
  175. tests/final_validation_test.py +183 -0
  176. tests/framework_performance_test.py +203 -0
  177. tests/optimized_performance_test.py +212 -0
  178. tests/performance_comparison.py +246 -0
  179. tests/queue_blocking_test.py +114 -0
  180. tests/queue_test.py +90 -0
  181. tests/redis_key_validation_demo.py +130 -130
  182. tests/request_params_example.py +150 -150
  183. tests/response_improvements_example.py +144 -144
  184. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  185. tests/scrapy_comparison/scrapy_test.py +134 -0
  186. tests/simple_command_test.py +120 -0
  187. tests/simple_crawlo_test.py +128 -0
  188. tests/simple_log_test.py +58 -0
  189. tests/simple_optimization_test.py +129 -0
  190. tests/simple_spider_test.py +50 -0
  191. tests/simple_test.py +48 -0
  192. tests/test_advanced_tools.py +148 -148
  193. tests/test_all_commands.py +231 -0
  194. tests/test_all_redis_key_configs.py +145 -145
  195. tests/test_authenticated_proxy.py +141 -141
  196. tests/test_batch_processor.py +179 -0
  197. tests/test_cleaners.py +54 -54
  198. tests/test_component_factory.py +175 -0
  199. tests/test_comprehensive.py +146 -146
  200. tests/test_config_consistency.py +80 -80
  201. tests/test_config_merge.py +152 -152
  202. tests/test_config_validator.py +182 -182
  203. tests/test_controlled_spider_mixin.py +80 -0
  204. tests/test_crawlo_proxy_integration.py +108 -108
  205. tests/test_date_tools.py +123 -123
  206. tests/test_default_header_middleware.py +158 -158
  207. tests/test_distributed.py +65 -65
  208. tests/test_double_crawlo_fix.py +207 -207
  209. tests/test_double_crawlo_fix_simple.py +124 -124
  210. tests/test_download_delay_middleware.py +221 -221
  211. tests/test_downloader_proxy_compatibility.py +268 -268
  212. tests/test_dynamic_downloaders_proxy.py +124 -124
  213. tests/test_dynamic_proxy.py +92 -92
  214. tests/test_dynamic_proxy_config.py +146 -146
  215. tests/test_dynamic_proxy_real.py +109 -109
  216. tests/test_edge_cases.py +303 -303
  217. tests/test_enhanced_error_handler.py +270 -270
  218. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  219. tests/test_env_config.py +121 -121
  220. tests/test_error_handler_compatibility.py +112 -112
  221. tests/test_factories.py +253 -0
  222. tests/test_final_validation.py +153 -153
  223. tests/test_framework_env_usage.py +103 -103
  224. tests/test_framework_logger.py +67 -0
  225. tests/test_framework_startup.py +65 -0
  226. tests/test_integration.py +169 -169
  227. tests/test_item_dedup_redis_key.py +122 -122
  228. tests/test_large_scale_config.py +113 -0
  229. tests/test_large_scale_helper.py +236 -0
  230. tests/test_mode_change.py +73 -0
  231. tests/test_mode_consistency.py +51 -51
  232. tests/test_offsite_middleware.py +221 -221
  233. tests/test_parsel.py +29 -29
  234. tests/test_performance.py +327 -327
  235. tests/test_performance_monitor.py +116 -0
  236. tests/test_proxy_api.py +264 -264
  237. tests/test_proxy_health_check.py +32 -32
  238. tests/test_proxy_middleware.py +121 -121
  239. tests/test_proxy_middleware_enhanced.py +216 -216
  240. tests/test_proxy_middleware_integration.py +136 -136
  241. tests/test_proxy_middleware_refactored.py +184 -184
  242. tests/test_proxy_providers.py +56 -56
  243. tests/test_proxy_stats.py +19 -19
  244. tests/test_proxy_strategies.py +59 -59
  245. tests/test_queue_empty_check.py +42 -0
  246. tests/test_queue_manager_double_crawlo.py +173 -173
  247. tests/test_queue_manager_redis_key.py +176 -176
  248. tests/test_random_user_agent.py +72 -72
  249. tests/test_real_scenario_proxy.py +195 -195
  250. tests/test_redis_config.py +28 -28
  251. tests/test_redis_connection_pool.py +294 -294
  252. tests/test_redis_key_naming.py +181 -181
  253. tests/test_redis_key_validator.py +123 -123
  254. tests/test_redis_queue.py +224 -224
  255. tests/test_request_ignore_middleware.py +182 -182
  256. tests/test_request_params.py +111 -111
  257. tests/test_request_serialization.py +70 -70
  258. tests/test_response_code_middleware.py +349 -349
  259. tests/test_response_filter_middleware.py +427 -427
  260. tests/test_response_improvements.py +152 -152
  261. tests/test_retry_middleware.py +241 -241
  262. tests/test_scheduler.py +252 -252
  263. tests/test_scheduler_config_update.py +133 -133
  264. tests/test_simple_response.py +61 -61
  265. tests/test_telecom_spider_redis_key.py +205 -205
  266. tests/test_template_content.py +87 -87
  267. tests/test_template_redis_key.py +134 -134
  268. tests/test_tools.py +159 -159
  269. tests/test_user_agents.py +96 -96
  270. tests/tools_example.py +260 -260
  271. tests/untested_features_report.md +139 -0
  272. tests/verify_debug.py +52 -0
  273. tests/verify_distributed.py +117 -117
  274. tests/verify_log_fix.py +112 -0
  275. crawlo-1.3.3.dist-info/RECORD +0 -219
  276. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  277. {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
  278. {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
  279. {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 核心初始化器 - 协调整个初始化过程
5
+ """
6
+
7
+ import time
8
+ import threading
9
+ from typing import Optional, Any
10
+
11
+ from .context import InitializationContext
12
+ from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
13
+ from .registry import get_global_registry, BaseInitializer, register_initializer
14
+ from .built_in import register_built_in_initializers
15
+
16
+
17
class CoreInitializer:
    """
    Core initializer that coordinates the framework's initialization process.

    Responsibilities:
    1. Run the initialization phases in their defined execution order.
    2. Check inter-phase dependencies before each phase is executed.
    3. Provide a single entry point (``initialize``) for framework start-up.
    4. Handle errors and fall back to a minimal configuration on failure.

    The class is a singleton: every ``CoreInitializer()`` call returns the
    same instance, and ``__init__`` only performs its work once.
    """

    _instance: Optional['CoreInitializer'] = None
    _lock = threading.Lock()

    def __new__(cls) -> 'CoreInitializer':
        # Check, take the lock, then check again so that only one instance
        # is ever created even when several threads race here.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super(CoreInitializer, cls).__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every CoreInitializer() call; skip repeat set-up.
        if hasattr(self, '_initialized') and self._initialized:
            return

        self._context: Optional[InitializationContext] = None
        self._is_ready = False
        self._init_lock = threading.RLock()

        # Register the built-in initializers
        register_built_in_initializers()

        self._initialized = True

    @property
    def context(self) -> Optional[InitializationContext]:
        """Return the current initialization context (None before the first run or after reset)."""
        return self._context

    @property
    def is_ready(self) -> bool:
        """Return True once an initialization run has completed successfully."""
        return self._is_ready

    def initialize(self, settings=None, **kwargs) -> Any:
        """
        Run the framework initialization.

        Args:
            settings: Configuration object. It is only consulted by the
                fallback path when the phased initialization fails.
            **kwargs: Extra configuration values; stored on the context as
                ``custom_settings``.

        Returns:
            ``context.settings`` when the phases succeed. On failure, the
            result of the fallback strategy: the ``settings`` argument, a
            fresh ``SettingManager``, or ``None``.
        """
        with self._init_lock:
            # Already initialized: return the cached settings.
            if self._is_ready and self._context and self._context.settings:
                return self._context.settings

            # Create a fresh context for this run.
            context = InitializationContext()
            context.custom_settings = kwargs
            self._context = context

            try:
                self._execute_initialization_phases(context)

                # The SETTINGS phase is mandatory for a usable framework.
                if not context.is_phase_completed(InitializationPhase.SETTINGS):
                    raise RuntimeError("Settings initialization failed")

                self._is_ready = True
                context.finish()

                return context.settings

            except Exception as e:
                context.add_error(f"Framework initialization failed: {e}")
                context.finish()

                # Degrade gracefully instead of propagating the error.
                return self._fallback_initialization(settings, **kwargs)

    def _execute_initialization_phases(self, context: InitializationContext):
        """
        Execute every registered phase in execution order.

        Phases without a registered initializer and the ERROR pseudo-phase
        are skipped. An optional phase is skipped when its dependencies are
        unmet and tolerated when it fails.

        Raises:
            RuntimeError: If a non-optional phase has unmet dependencies or
                reports an unsuccessful result.
            Exception: Any exception raised while running or recording a
                non-optional phase is re-raised.
        """
        registry = get_global_registry()
        execution_order = get_execution_order()

        # Only phases that actually have an initializer are executed.
        registered_phases = set(registry.get_all_phases())

        for phase in execution_order:
            if phase == InitializationPhase.ERROR:
                continue

            if phase not in registered_phases:
                continue

            context.set_current_phase(phase)
            is_optional = self._is_phase_optional(phase)

            # Check dependencies; optional phases are silently skipped.
            if not self._check_dependencies(phase, context):
                if is_optional:
                    continue
                raise RuntimeError(f"Dependencies not satisfied for phase {phase}")

            start_time = time.time()
            try:
                result = registry.execute_phase(phase, context)
                result.duration = time.time() - start_time
                context.mark_phase_completed(phase, result)
            except Exception as e:
                # Unexpected failure while running or recording the phase.
                result = PhaseResult(
                    phase=phase,
                    success=False,
                    duration=time.time() - start_time,
                    error=e
                )
                context.mark_phase_completed(phase, result)

                if not is_optional:
                    raise
                continue

            # Checked outside the try block so this error is not caught by
            # the handler above, which would record the phase a second time
            # and overwrite the initializer's original error.
            if not result.success and not is_optional:
                raise RuntimeError(f"Phase {phase} failed: {result.error}")

    def _check_dependencies(self, phase: InitializationPhase,
                            context: InitializationContext) -> bool:
        """Return True when every dependency of *phase* is completed (or the phase has no definition)."""
        phase_def = get_phase_definition(phase)
        if not phase_def:
            return True

        for dependency in phase_def.dependencies:
            if not context.is_phase_completed(dependency):
                return False

        return True

    def _is_phase_optional(self, phase: InitializationPhase) -> bool:
        """Return the phase's ``optional`` flag; phases without a definition are treated as required."""
        phase_def = get_phase_definition(phase)
        return phase_def.optional if phase_def else False

    def _fallback_initialization(self, settings=None, **kwargs):
        """
        Fallback strategy used when the phased initialization fails.

        Returns:
            *settings* when it is truthy; otherwise a new ``SettingManager``
            updated with *kwargs*; ``None`` if even that fails.
        """
        try:
            # Prefer caller-supplied settings. Checked before the import so
            # that an import failure cannot discard usable settings.
            if settings:
                return settings

            from crawlo.settings.setting_manager import SettingManager

            fallback_settings = SettingManager()
            if kwargs:
                fallback_settings.update_attributes(kwargs)
            return fallback_settings

        except Exception:
            # Best effort: if even the fallback fails, return None.
            return None

    def reset(self):
        """Reset the initialization state (mainly intended for tests)."""
        with self._init_lock:
            self._context = None
            self._is_ready = False
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 初始化阶段定义
5
+ """
6
+
7
+ from enum import Enum
8
+ from dataclasses import dataclass
9
+ from typing import List, Optional
10
+
11
+
12
class InitializationPhase(Enum):
    """Enumeration of the framework's initialization phases."""

    # Phase 0: preparation
    PREPARING = "preparing"

    # Phase 1: logging system initialization
    LOGGING = "logging"

    # Phase 2: settings/configuration initialization
    SETTINGS = "settings"

    # Phase 3: core component initialization
    CORE_COMPONENTS = "core_components"

    # Phase 4: extension initialization
    EXTENSIONS = "extensions"

    # Phase 5: framework start-up log output
    FRAMEWORK_STARTUP_LOG = "framework_startup_log"

    # Phase 6: completed
    COMPLETED = "completed"

    # Error state
    ERROR = "error"
38
+
39
+
40
@dataclass
class PhaseResult:
    """Result of executing a single initialization phase."""
    phase: InitializationPhase
    success: bool
    duration: float = 0.0
    error: Optional[Exception] = None
    artifacts: Optional[dict] = None  # artifacts produced by the phase; None becomes {} in __post_init__

    def __post_init__(self):
        # Give every instance its own dict instead of sharing a mutable default.
        if self.artifacts is None:
            self.artifacts = {}
52
+
53
+
54
@dataclass
class PhaseDefinition:
    """Static definition of an initialization phase."""
    phase: InitializationPhase
    name: str
    description: str
    dependencies: Optional[List[InitializationPhase]] = None  # None becomes [] in __post_init__
    optional: bool = False
    timeout: float = 30.0  # timeout in seconds

    def __post_init__(self):
        # Give every instance its own list instead of sharing a mutable default.
        if self.dependencies is None:
            self.dependencies = []
67
+
68
+
69
# Predefined initialization phases. The list order is also the execution
# order returned by get_execution_order().
PHASE_DEFINITIONS = [
    PhaseDefinition(
        phase=InitializationPhase.PREPARING,
        name="准备阶段",
        description="初始化基础环境和检查前置条件",
        dependencies=[],
        timeout=5.0
    ),
    PhaseDefinition(
        phase=InitializationPhase.LOGGING,
        name="日志系统",
        description="配置和初始化日志系统",
        dependencies=[],  # the dependency on PREPARING was removed
        timeout=10.0
    ),
    PhaseDefinition(
        phase=InitializationPhase.SETTINGS,
        name="配置系统",
        description="加载和验证配置",
        dependencies=[InitializationPhase.LOGGING],
        timeout=15.0
    ),
    PhaseDefinition(
        phase=InitializationPhase.CORE_COMPONENTS,
        name="核心组件",
        description="初始化框架核心组件",
        dependencies=[InitializationPhase.SETTINGS],
        timeout=20.0
    ),
    PhaseDefinition(
        phase=InitializationPhase.EXTENSIONS,
        name="扩展组件",
        description="加载和初始化扩展组件",
        dependencies=[InitializationPhase.CORE_COMPONENTS],
        optional=True,
        timeout=15.0
    ),
    PhaseDefinition(
        phase=InitializationPhase.FRAMEWORK_STARTUP_LOG,
        name="框架启动日志",
        description="记录框架启动相关信息",
        dependencies=[InitializationPhase.LOGGING, InitializationPhase.SETTINGS],
        timeout=5.0
    ),
    PhaseDefinition(
        phase=InitializationPhase.COMPLETED,
        name="初始化完成",
        description="框架初始化完成",
        dependencies=[
            InitializationPhase.CORE_COMPONENTS,
            InitializationPhase.FRAMEWORK_STARTUP_LOG
        ],  # EXTENSIONS is optional, so it is not listed as a dependency
        timeout=5.0
    )
]
125
+
126
+
127
def get_phase_definition(phase: InitializationPhase) -> Optional[PhaseDefinition]:
    """Return the first predefined definition for *phase*, or None if there is none."""
    matches = (entry for entry in PHASE_DEFINITIONS if entry.phase == phase)
    return next(matches, None)
133
+
134
+
135
def get_execution_order() -> List[InitializationPhase]:
    """Return the phases in the order they appear in PHASE_DEFINITIONS."""
    ordered: List[InitializationPhase] = []
    for entry in PHASE_DEFINITIONS:
        ordered.append(entry.phase)
    return ordered
138
+
139
+
140
def validate_dependencies() -> bool:
    """Return True when every declared dependency refers to a defined phase."""
    known_phases = {entry.phase for entry in PHASE_DEFINITIONS}
    return all(
        required in known_phases
        for entry in PHASE_DEFINITIONS
        for required in entry.dependencies
    )
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 初始化器注册表 - 管理所有初始化器的注册和执行
5
+ """
6
+
7
+ import threading
8
+ from typing import Dict, Optional, Callable, List
9
+ from .context import InitializationContext
10
+ from .phases import InitializationPhase, PhaseResult
11
+
12
+
13
class Initializer:
    """Base class for initializers; each instance is bound to one phase."""

    def __init__(self, phase: InitializationPhase):
        self._phase = phase

    @property
    def phase(self) -> InitializationPhase:
        """Return the initialization phase this initializer handles."""
        return self._phase

    def initialize(self, context: InitializationContext) -> PhaseResult:
        """Run the initialization; subclasses must override this method."""
        raise NotImplementedError("Subclasses must implement initialize method")
27
+
28
+
29
class BaseInitializer(Initializer):
    """Basic initializer class, kept for backward compatibility."""

    def __init__(self, phase: InitializationPhase):
        super().__init__(phase)

    def _create_result(self, success: bool, duration: float = 0.0,
                       artifacts: Optional[Dict] = None, error: Optional[Exception] = None) -> PhaseResult:
        """Build a PhaseResult for this initializer's phase."""
        # A missing or empty artifacts mapping is replaced by a new dict.
        produced = artifacts if artifacts else {}
        outcome = PhaseResult(
            phase=self.phase,
            success=success,
            duration=duration,
            artifacts=produced,
            error=error,
        )
        return outcome
45
+
46
+
47
class InitializerRegistry:
    """
    Registry that maps each initialization phase to the initializer running it.

    Features:
    1. Registration and lookup are guarded by a re-entrant lock.
    2. Accepts both class-based and plain-function initializers.
    3. Missing initializers and raised exceptions are reported as failed
       ``PhaseResult`` objects instead of propagating.
    """

    def __init__(self):
        self._initializers: Dict[InitializationPhase, Initializer] = {}
        self._lock = threading.RLock()

    def register(self, initializer: Initializer):
        """
        Register *initializer* for its phase.

        Raises:
            ValueError: If an initializer is already registered for that phase.
        """
        with self._lock:
            phase = initializer.phase
            if phase in self._initializers:
                raise ValueError(f"Initializer for phase {phase} already registered")
            self._initializers[phase] = initializer

    def register_function(self, phase: InitializationPhase,
                          init_func: Callable[[InitializationContext], PhaseResult]):
        """
        Register a plain function as the initializer for *phase*.

        Raises:
            ValueError: If an initializer is already registered for that phase.
        """

        # Subclass Initializer so function-based initializers satisfy the
        # type that register() and the registry mapping are declared with.
        class FunctionInitializer(Initializer):
            def __init__(self, phase: InitializationPhase, func: Callable):
                super().__init__(phase)
                self._func = func

            def initialize(self, context: InitializationContext) -> PhaseResult:
                return self._func(context)

        self.register(FunctionInitializer(phase, init_func))

    def get_initializer(self, phase: InitializationPhase) -> Optional[Initializer]:
        """Return the initializer registered for *phase*, or None."""
        with self._lock:
            return self._initializers.get(phase)

    def get_all_phases(self) -> List[InitializationPhase]:
        """Return a list of all phases that have an initializer registered."""
        with self._lock:
            return list(self._initializers.keys())

    def has_initializer(self, phase: InitializationPhase) -> bool:
        """Return True when an initializer is registered for *phase*."""
        with self._lock:
            return phase in self._initializers

    def clear(self):
        """Remove every registered initializer."""
        with self._lock:
            self._initializers.clear()

    def execute_phase(self, phase: InitializationPhase,
                      context: InitializationContext) -> PhaseResult:
        """
        Run the initializer registered for *phase*.

        Returns:
            Whatever the initializer returns. If no initializer is
            registered, or the initializer raises, a failed ``PhaseResult``
            carrying the error is returned instead.
        """
        # Only the lookup happens under the lock; the initializer runs outside it.
        initializer = self.get_initializer(phase)
        if not initializer:
            error = ValueError(f"No initializer registered for phase {phase}")
            return PhaseResult(
                phase=phase,
                success=False,
                error=error
            )

        try:
            return initializer.initialize(context)
        except Exception as e:
            # Convert the failure into a result so the caller decides how to react.
            return PhaseResult(
                phase=phase,
                success=False,
                error=e
            )
127
+
128
+
129
# Module-level registry instance shared by the helper functions below
_global_registry = InitializerRegistry()
131
+
132
+
133
def get_global_registry() -> InitializerRegistry:
    """Return the module-level registry instance."""
    return _global_registry
136
+
137
+
138
def register_initializer(initializer: Initializer):
    """Register *initializer* with the module-level registry."""
    _global_registry.register(initializer)
141
+
142
+
143
def register_phase_function(phase: InitializationPhase,
                            init_func: Callable[[InitializationContext], PhaseResult]):
    """Register a plain function as the initializer for *phase* in the module-level registry."""
    _global_registry.register_function(phase, init_func)
crawlo/items/__init__.py CHANGED
@@ -1,23 +1,23 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- crawlo.items 包
5
- ===============
6
- 提供 Item 和 Field 类用于数据定义和验证。
7
- """
8
- from .items import Item
9
- from .fields import Field
10
- from .base import ItemMeta
11
-
12
- from crawlo.exceptions import ItemInitError, ItemAttributeError
13
-
14
- __all__ = [
15
- 'Item',
16
- 'Field',
17
- 'ItemMeta',
18
- 'ItemInitError',
19
- 'ItemAttributeError'
20
- ]
21
-
22
-
23
-
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ crawlo.items 包
5
+ ===============
6
+ 提供 Item 和 Field 类用于数据定义和验证。
7
+ """
8
+ from .items import Item
9
+ from .fields import Field
10
+ from .base import ItemMeta
11
+
12
+ from crawlo.exceptions import ItemInitError, ItemAttributeError
13
+
14
+ __all__ = [
15
+ 'Item',
16
+ 'Field',
17
+ 'ItemMeta',
18
+ 'ItemInitError',
19
+ 'ItemAttributeError'
20
+ ]
21
+
22
+
23
+
crawlo/items/base.py CHANGED
@@ -1,22 +1,23 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 基础元类定义
5
- """
6
- from abc import ABCMeta
7
- from .fields import Field
8
-
9
class ItemMeta(ABCMeta):
    """Metaclass that gathers Field attributes of a class body into ``FIELDS``."""

    def __new__(mcs, name, bases, attrs):
        fields = {}
        cls_attrs = {}

        # Split the class namespace: Field instances go into `fields`,
        # everything else stays a regular class attribute.
        for attr_name, attr_value in attrs.items():
            if isinstance(attr_value, Field):
                fields[attr_name] = attr_value
            else:
                cls_attrs[attr_name] = attr_value

        cls_instance = super().__new__(mcs, name, bases, cls_attrs)
        # Only the fields declared in this class body are stored here.
        cls_instance.FIELDS = fields
        return cls_instance
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 基础元类定义
5
+ """
6
+ from abc import ABCMeta
7
+ from .fields import Field
8
+
9
+
10
class ItemMeta(ABCMeta):
    """Metaclass that gathers Field attributes of a class body into ``FIELDS``."""

    def __new__(mcs, name, bases, attrs):
        # Field instances are pulled out of the namespace ...
        declared_fields = {
            key: member for key, member in attrs.items()
            if isinstance(member, Field)
        }
        # ... and everything else remains an ordinary class attribute.
        namespace = {
            key: member for key, member in attrs.items()
            if not isinstance(member, Field)
        }

        new_cls = super().__new__(mcs, name, bases, namespace)
        # Only the fields declared in this class body are stored here.
        new_cls.FIELDS = declared_fields
        return new_cls
crawlo/items/fields.py CHANGED
@@ -1,53 +1,53 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- Field 类定义
5
- """
6
- from typing import Any, Optional, Type
7
-
8
-
9
- class Field:
10
- """
11
- 字段定义类,用于定义 Item 的字段属性和验证规则
12
- """
13
- def __init__(
14
- self,
15
- nullable: bool = True,
16
- *,
17
- default: Any = None,
18
- field_type: Optional[Type] = None,
19
- max_length: Optional[int] = None,
20
- description: str = ""
21
- ):
22
- self.nullable = nullable
23
- self.default = default
24
- self.field_type = field_type
25
- self.max_length = max_length
26
- self.description = description
27
-
28
- def validate(self, value: Any, field_name: str = "") -> Any:
29
- """
30
- 验证字段值是否符合规则
31
- """
32
- if value is None or (isinstance(value, str) and value.strip() == ""):
33
- if self.default is not None:
34
- return self.default
35
- elif not self.nullable:
36
- raise ValueError(
37
- f"字段 '{field_name}' 不允许为空。"
38
- )
39
-
40
- if value is not None and not (isinstance(value, str) and value.strip() == ""):
41
- if self.field_type and not isinstance(value, self.field_type):
42
- raise TypeError(
43
- f"字段 '{field_name}' 类型错误:期望类型 {self.field_type}, 得到 {type(value)},值:{value!r}"
44
- )
45
- if self.max_length and len(str(value)) > self.max_length:
46
- raise ValueError(
47
- f"字段 '{field_name}' 长度超限:最大长度 {self.max_length},当前长度 {len(str(value))},值:{value!r}"
48
- )
49
-
50
- return value
51
-
52
- def __repr__(self):
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ Field 类定义
5
+ """
6
+ from typing import Any, Optional, Type
7
+
8
+
9
class Field:
    """
    Field definition describing an Item field's attributes and validation rules.
    """
    def __init__(
        self,
        nullable: bool = True,
        *,
        default: Any = None,
        field_type: Optional[Type] = None,
        max_length: Optional[int] = None,
        description: str = ""
    ):
        self.description = description
        self.max_length = max_length
        self.field_type = field_type
        self.default = default
        self.nullable = nullable

    def validate(self, value: Any, field_name: str = "") -> Any:
        """
        Check *value* against this field's rules and return the accepted value.

        A value of None or a whitespace-only string counts as empty: the
        default is returned when one is set, otherwise the empty value is
        returned unchanged if the field is nullable.

        Raises:
            ValueError: If the value is empty, no default is set and the
                field is not nullable, or if the value is too long.
            TypeError: If the value is not an instance of ``field_type``.
        """
        is_empty = value is None or (isinstance(value, str) and value.strip() == "")

        if is_empty:
            if self.default is not None:
                return self.default
            if not self.nullable:
                raise ValueError(
                    f"字段 '{field_name}' 不允许为空。"
                )
            return value

        # Type and length checks only apply to non-empty values.
        if self.field_type and not isinstance(value, self.field_type):
            raise TypeError(
                f"字段 '{field_name}' 类型错误:期望类型 {self.field_type}, 得到 {type(value)},值:{value!r}"
            )
        if self.max_length and len(str(value)) > self.max_length:
            raise ValueError(
                f"字段 '{field_name}' 长度超限:最大长度 {self.max_length},当前长度 {len(str(value))},值:{value!r}"
            )

        return value

    def __repr__(self):
        return f"<Field nullable={self.nullable} type={self.field_type} default={self.default}>"