crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/initialization/core.py CHANGED
@@ -6,14 +6,18 @@
6
6
 
7
7
  import threading
8
8
  import time
9
+ import signal
9
10
  from typing import Optional, Any
10
11
 
11
12
  from .built_in import register_built_in_initializers
12
13
  from .context import InitializationContext
13
- from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
14
+ from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition, validate_phase_dependencies
14
15
  from .registry import get_global_registry
15
16
 
16
17
 
18
+ from crawlo.utils.singleton import singleton
19
+
20
+ @singleton
17
21
  class CoreInitializer:
18
22
  """
19
23
  核心初始化器 - 协调整个框架的初始化过程
@@ -25,29 +29,18 @@ class CoreInitializer:
25
29
  4. 错误处理和降级策略
26
30
  """
27
31
 
28
- _instance: Optional['CoreInitializer'] = None
29
- _lock = threading.Lock()
30
-
31
- def __new__(cls) -> 'CoreInitializer':
32
- if cls._instance is None:
33
- with cls._lock:
34
- if cls._instance is None:
35
- cls._instance = super(CoreInitializer, cls).__new__(cls)
36
- cls._instance._initialized = False
37
- return cls._instance
38
-
39
32
  def __init__(self):
40
- if hasattr(self, '_initialized') and self._initialized:
41
- return
42
-
43
33
  self._context: Optional[InitializationContext] = None
44
34
  self._is_ready = False
45
35
  self._init_lock = threading.RLock()
46
36
 
37
+ # 在注册内置初始化器之前,先验证阶段依赖关系
38
+ is_valid, error_msg = validate_phase_dependencies()
39
+ if not is_valid:
40
+ raise RuntimeError(f"初始化阶段配置错误: {error_msg}")
41
+
47
42
  # 注册内置初始化器
48
43
  register_built_in_initializers()
49
-
50
- self._initialized = True
51
44
 
52
45
  @property
53
46
  def context(self) -> Optional[InitializationContext]:
@@ -128,10 +121,10 @@ class CoreInitializer:
128
121
  # 可选阶段,跳过
129
122
  continue
130
123
 
131
- # 执行阶段
124
+ # 执行阶段(带超时控制)
132
125
  start_time = time.time()
133
126
  try:
134
- result = registry.execute_phase(phase, context)
127
+ result = self._execute_phase_with_timeout(phase, context, registry)
135
128
  result.duration = time.time() - start_time
136
129
 
137
130
  context.mark_phase_completed(phase, result)
@@ -152,6 +145,59 @@ class CoreInitializer:
152
145
  if not self._is_phase_optional(phase):
153
146
  raise
154
147
 
148
+ def _execute_phase_with_timeout(self, phase: InitializationPhase,
149
+ context: InitializationContext,
150
+ registry) -> PhaseResult:
151
+ """
152
+ 执行阶段并支持超时控制
153
+
154
+ Args:
155
+ phase: 初始化阶段
156
+ context: 初始化上下文
157
+ registry: 初始化器注册表
158
+
159
+ Returns:
160
+ PhaseResult: 阶段执行结果
161
+
162
+ Raises:
163
+ TimeoutError: 阶段执行超时
164
+ """
165
+ phase_def = get_phase_definition(phase)
166
+ timeout = phase_def.timeout if phase_def else 30.0
167
+
168
+ # 使用线程执行,支持超时
169
+ result_container: list[Optional[PhaseResult]] = [None]
170
+ exception_container: list[Optional[Exception]] = [None]
171
+
172
+ def execute_in_thread():
173
+ try:
174
+ result_container[0] = registry.execute_phase(phase, context)
175
+ except Exception as e:
176
+ exception_container[0] = e
177
+
178
+ thread = threading.Thread(target=execute_in_thread, daemon=True)
179
+ thread.start()
180
+ thread.join(timeout=timeout)
181
+
182
+ if thread.is_alive():
183
+ # 超时了
184
+ error_msg = f"Phase {phase.value} execution timeout after {timeout} seconds"
185
+ context.add_warning(error_msg)
186
+ return PhaseResult(
187
+ phase=phase,
188
+ success=False,
189
+ error=TimeoutError(error_msg)
190
+ )
191
+
192
+ # 检查是否有异常
193
+ if exception_container[0]:
194
+ raise exception_container[0]
195
+
196
+ # 返回结果(已经确保不为None)
197
+ if result_container[0] is None:
198
+ raise RuntimeError(f"Phase {phase.value} returned None result")
199
+ return result_container[0]
200
+
155
201
  def _check_dependencies(self, phase: InitializationPhase,
156
202
  context: InitializationContext) -> bool:
157
203
  """检查阶段依赖关系"""
crawlo/initialization/phases.py CHANGED
@@ -6,7 +6,7 @@
6
6
 
7
7
  from enum import Enum
8
8
  from dataclasses import dataclass
9
- from typing import List, Optional
9
+ from typing import List, Optional, Dict
10
10
 
11
11
 
12
12
  class InitializationPhase(Enum):
@@ -146,4 +146,85 @@ def validate_dependencies() -> bool:
146
146
  if dependency not in phases:
147
147
  return False
148
148
 
149
- return True
149
+ return True
150
+
151
+
152
+ def detect_circular_dependencies() -> Optional[List[InitializationPhase]]:
153
+ """
154
+ 检测循环依赖
155
+
156
+ 使用DFS(深度优先搜索)算法检测初始化阶段的循环依赖。
157
+
158
+ Returns:
159
+ Optional[List[InitializationPhase]]: 如果存在循环,返回循环路径;否则返回None
160
+
161
+ 算法说明:
162
+ 使用三色标记法:
163
+ - 白色(0):未访问
164
+ - 灰色(1):正在访问(在当前DFS路径中)
165
+ - 黑色(2):已完成访问
166
+
167
+ 如果在DFS过程中遇到灰色节点,说明存在循环依赖。
168
+ """
169
+ # 构建依赖图
170
+ dependency_graph: Dict[InitializationPhase, List[InitializationPhase]] = {}
171
+ for definition in PHASE_DEFINITIONS:
172
+ dependency_graph[definition.phase] = definition.dependencies.copy()
173
+
174
+ # 三色标记:0-白色(未访问),1-灰色(访问中),2-黑色(已完成)
175
+ color: Dict[InitializationPhase, int] = {phase: 0 for phase in dependency_graph}
176
+ parent: Dict[InitializationPhase, Optional[InitializationPhase]] = {phase: None for phase in dependency_graph}
177
+
178
+ def dfs(node: InitializationPhase) -> Optional[List[InitializationPhase]]:
179
+ """DFS遍历检测循环"""
180
+ color[node] = 1 # 标记为灰色(访问中)
181
+
182
+ for neighbor in dependency_graph.get(node, []):
183
+ if color[neighbor] == 1: # 遇到灰色节点,发现循环
184
+ # 重建循环路径
185
+ cycle = [neighbor]
186
+ current: Optional[InitializationPhase] = node
187
+ while current is not None and current != neighbor:
188
+ cycle.append(current)
189
+ current = parent.get(current)
190
+ cycle.append(neighbor)
191
+ cycle.reverse()
192
+ return cycle
193
+
194
+ if color[neighbor] == 0: # 未访问的节点
195
+ parent[neighbor] = node
196
+ result = dfs(neighbor)
197
+ if result:
198
+ return result
199
+
200
+ color[node] = 2 # 标记为黑色(已完成)
201
+ return None
202
+
203
+ # 对所有未访问的节点执行DFS
204
+ for phase in dependency_graph:
205
+ if color[phase] == 0:
206
+ cycle = dfs(phase)
207
+ if cycle:
208
+ return cycle
209
+
210
+ return None
211
+
212
+
213
+ def validate_phase_dependencies() -> tuple[bool, Optional[str]]:
214
+ """
215
+ 全面验证阶段依赖关系
216
+
217
+ Returns:
218
+ tuple[bool, Optional[str]]: (是否有效, 错误信息)
219
+ """
220
+ # 1. 检查依赖是否存在
221
+ if not validate_dependencies():
222
+ return False, "存在未定义的依赖阶段"
223
+
224
+ # 2. 检查循环依赖
225
+ cycle = detect_circular_dependencies()
226
+ if cycle:
227
+ cycle_path = ' -> '.join([phase.value for phase in cycle])
228
+ return False, f"检测到循环依赖: {cycle_path}"
229
+
230
+ return True, None
crawlo/initialization/registry.py CHANGED
@@ -35,11 +35,12 @@ class BaseInitializer(Initializer):
35
35
  def _create_result(self, success: bool, duration: float = 0.0,
36
36
  artifacts: Optional[Dict] = None, error: Optional[Exception] = None) -> PhaseResult:
37
37
  """创建初始化结果"""
38
- return PhaseResult(
38
+ from .utils import create_initialization_result
39
+ return create_initialization_result(
39
40
  phase=self.phase,
40
41
  success=success,
41
42
  duration=duration,
42
- artifacts=artifacts or {},
43
+ artifacts=artifacts,
43
44
  error=error
44
45
  )
45
46
 
@@ -70,15 +71,12 @@ class InitializerRegistry:
70
71
  init_func: Callable[[InitializationContext], PhaseResult]):
71
72
  """注册函数式初始化器"""
72
73
 
73
- class FunctionInitializer:
74
+ class FunctionInitializer(Initializer):
74
75
  def __init__(self, phase: InitializationPhase, func: Callable):
76
+ super().__init__(phase)
75
77
  self._phase = phase
76
78
  self._func = func
77
79
 
78
- @property
79
- def phase(self) -> InitializationPhase:
80
- return self._phase
81
-
82
80
  def initialize(self, context: InitializationContext) -> PhaseResult:
83
81
  return self._func(context)
84
82
 
crawlo/initialization/utils.py ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 初始化工具模块 - 提供通用的初始化工具函数
5
+ """
6
+
7
+ import time
8
+ from typing import Optional, Dict, Any
9
+ from .phases import PhaseResult, InitializationPhase
10
+
11
+
12
+ def create_initialization_result(
13
+ phase: 'InitializationPhase',
14
+ success: bool,
15
+ duration: float = 0.0,
16
+ artifacts: Optional[Dict[str, Any]] = None,
17
+ error: Optional[Exception] = None
18
+ ) -> PhaseResult:
19
+ """
20
+ 创建标准化的初始化结果
21
+
22
+ Args:
23
+ phase: 初始化阶段
24
+ success: 是否成功
25
+ duration: 执行时长
26
+ artifacts: 产生的工件数据
27
+ error: 异常对象
28
+
29
+ Returns:
30
+ PhaseResult: 标准化的初始化结果
31
+ """
32
+ return PhaseResult(
33
+ phase=phase,
34
+ success=success,
35
+ duration=duration,
36
+ artifacts=artifacts or {},
37
+ error=error
38
+ )
39
+
40
+
41
+ class InitializationTimer:
42
+ """初始化计时器"""
43
+
44
+ def __init__(self):
45
+ self.start_time = time.time()
46
+
47
+ def get_duration(self) -> float:
48
+ """获取经过的时间"""
49
+ return time.time() - self.start_time
crawlo/logging/__init__.py CHANGED
@@ -14,33 +14,29 @@ Crawlo统一日志系统
14
14
  from .manager import LogManager
15
15
  from .factory import LoggerFactory
16
16
  from .config import LogConfig
17
- from .monitor import LogPerformanceMonitor
17
+
18
18
 
19
19
  # 统一的公共接口
20
20
  def get_logger(name: str = 'default'):
21
21
  """获取logger实例"""
22
22
  return LoggerFactory.get_logger(name)
23
23
 
24
+
24
25
  def configure_logging(settings=None, **kwargs):
25
26
  """配置日志系统"""
26
27
  return LogManager().configure(settings, **kwargs)
27
28
 
29
+
28
30
  def is_configured() -> bool:
29
31
  """检查日志系统是否已配置"""
30
32
  return LogManager().is_configured
31
33
 
32
- def get_monitor() -> LogPerformanceMonitor:
33
- """获取日志性能监控器"""
34
- from .monitor import get_monitor as _get_monitor
35
- return _get_monitor()
36
34
 
37
35
  __all__ = [
38
36
  'LogManager',
39
- 'LoggerFactory',
37
+ 'LoggerFactory',
40
38
  'LogConfig',
41
- 'LogPerformanceMonitor',
42
39
  'get_logger',
43
40
  'configure_logging',
44
- 'is_configured',
45
- 'get_monitor'
46
- ]
41
+ 'is_configured'
42
+ ]
crawlo/logging/config.py CHANGED
@@ -13,6 +13,45 @@ from typing import Optional, Dict, Any
13
13
  class LogConfig:
14
14
  """日志配置数据类 - 简单明确的配置结构"""
15
15
 
16
+ # 预设配置模板
17
+ TEMPLATES = {
18
+ 'minimal': {
19
+ 'level': 'INFO',
20
+ 'format': '%(asctime)s - %(levelname)s: %(message)s',
21
+ 'console_enabled': True,
22
+ 'file_enabled': False
23
+ },
24
+ 'standard': {
25
+ 'level': 'INFO',
26
+ 'format': '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s',
27
+ 'console_enabled': True,
28
+ 'file_enabled': True,
29
+ 'file_path': 'logs/crawlo.log',
30
+ # 注意:standard模板未指定max_bytes和backup_count,
31
+ # 将使用类定义的默认值(10MB, 5个备份)或用户在settings.py中设置的值
32
+ # 如果用户不想要日志轮转,可以在settings.py中设置LOG_MAX_BYTES=0
33
+ # 当max_bytes或backup_count为0时,日志轮转将被禁用,文件会持续增长
34
+ },
35
+ 'detailed': {
36
+ 'level': 'DEBUG',
37
+ 'format': '%(asctime)s - [%(name)s] - %(levelname)s - %(pathname)s:%(lineno)d: %(message)s',
38
+ 'console_enabled': True,
39
+ 'file_enabled': True,
40
+ 'file_path': 'logs/crawlo.log',
41
+ 'max_bytes': 20 * 1024 * 1024, # 20MB,适用于大多数生产环境
42
+ 'backup_count': 10 # 10个备份文件,可保留约10次轮转的历史
43
+ },
44
+ 'production': {
45
+ 'level': 'WARNING',
46
+ 'format': '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s',
47
+ 'console_enabled': False, # 生产环境通常禁用控制台输出
48
+ 'file_enabled': True,
49
+ 'file_path': 'logs/crawlo.log',
50
+ 'max_bytes': 50 * 1024 * 1024, # 50MB,适用于高负载生产环境
51
+ 'backup_count': 20 # 20个备份文件,可保留较长时间的历史记录
52
+ }
53
+ }
54
+
16
55
  # 基本配置
17
56
  level: str = "INFO"
18
57
  format: str = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
@@ -54,21 +93,45 @@ class LogConfig:
54
93
  # 获取默认值
55
94
  format_default_value = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
56
95
 
96
+ # 确保类型安全
97
+ def safe_get_str(key: str, default: str = '') -> str:
98
+ value = get_val(key, default)
99
+ return str(value) if value is not None else default
100
+
101
+ def safe_get_int(key: str, default: int) -> int:
102
+ value = get_val(key, default)
103
+ try:
104
+ return int(value) if value is not None else default
105
+ except (ValueError, TypeError):
106
+ return default
107
+
108
+ def safe_get_bool(key: str, default: bool) -> bool:
109
+ value = get_val(key, default)
110
+ if isinstance(value, bool):
111
+ return value
112
+ if isinstance(value, str):
113
+ return value.lower() in ('1', 'true', 'yes', 'on')
114
+ return bool(value) if value is not None else default
115
+
116
+ def safe_get_dict(key: str, default: dict) -> dict:
117
+ value = get_val(key, default)
118
+ return value if isinstance(value, dict) else default
119
+
57
120
  return cls(
58
- level=get_val('LOG_LEVEL', 'INFO'),
59
- format=get_val('LOG_FORMAT', format_default_value),
60
- encoding=get_val('LOG_ENCODING', 'utf-8'),
61
- file_path=get_val('LOG_FILE'),
62
- max_bytes=get_val('LOG_MAX_BYTES', 10 * 1024 * 1024),
63
- backup_count=get_val('LOG_BACKUP_COUNT', 5),
64
- console_enabled=get_val('LOG_CONSOLE_ENABLED', True),
65
- file_enabled=get_val('LOG_FILE_ENABLED', True),
66
- console_level=get_val('LOG_CONSOLE_LEVEL'), # 允许单独设置控制台级别
67
- file_level=get_val('LOG_FILE_LEVEL'), # 允许单独设置文件级别
68
- include_thread_id=get_val('LOG_INCLUDE_THREAD_ID', False),
69
- include_process_id=get_val('LOG_INCLUDE_PROCESS_ID', False),
70
- include_module_path=get_val('LOG_INCLUDE_MODULE_PATH', False),
71
- module_levels=get_val('LOG_LEVELS', {})
121
+ level=safe_get_str('LOG_LEVEL', 'INFO'),
122
+ format=safe_get_str('LOG_FORMAT', format_default_value),
123
+ encoding=safe_get_str('LOG_ENCODING', 'utf-8'),
124
+ file_path=safe_get_str('LOG_FILE'),
125
+ max_bytes=safe_get_int('LOG_MAX_BYTES', 10 * 1024 * 1024), # 从200MB改为10MB以保持一致性
126
+ backup_count=safe_get_int('LOG_BACKUP_COUNT', 5),
127
+ console_enabled=safe_get_bool('LOG_CONSOLE_ENABLED', True),
128
+ file_enabled=safe_get_bool('LOG_FILE_ENABLED', True),
129
+ console_level=safe_get_str('LOG_CONSOLE_LEVEL'), # 允许单独设置控制台级别
130
+ file_level=safe_get_str('LOG_FILE_LEVEL'), # 允许单独设置文件级别
131
+ include_thread_id=safe_get_bool('LOG_INCLUDE_THREAD_ID', False),
132
+ include_process_id=safe_get_bool('LOG_INCLUDE_PROCESS_ID', False),
133
+ include_module_path=safe_get_bool('LOG_INCLUDE_MODULE_PATH', False),
134
+ module_levels=safe_get_dict('LOG_LEVELS', {})
72
135
  )
73
136
 
74
137
  @classmethod
@@ -101,6 +164,22 @@ class LogConfig:
101
164
 
102
165
  return cls(**mapped_dict)
103
166
 
167
+ @classmethod
168
+ def from_template(cls, template_name: str) -> 'LogConfig':
169
+ """从模板创建配置
170
+
171
+ Args:
172
+ template_name: 模板名称 (minimal, standard, detailed, production)
173
+
174
+ Returns:
175
+ LogConfig: 配置对象
176
+ """
177
+ if template_name not in cls.TEMPLATES:
178
+ raise ValueError(f"未知的模板名称: {template_name},可用模板: {', '.join(cls.TEMPLATES.keys())}")
179
+
180
+ template_config = cls.TEMPLATES[template_name]
181
+ return cls(**template_config)
182
+
104
183
  def get_module_level(self, module_name: str) -> str:
105
184
  """获取模块的日志级别"""
106
185
  # 先查找精确匹配
@@ -169,21 +248,25 @@ class LogConfig:
169
248
 
170
249
  return base_format
171
250
 
172
- def validate(self) -> bool:
173
- """验证配置有效性"""
251
+ def validate(self) -> tuple[bool, str]:
252
+ """验证配置有效性
253
+
254
+ Returns:
255
+ tuple[bool, str]: (是否有效, 错误信息)
256
+ """
174
257
  valid_levels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'}
175
258
 
176
259
  # 验证主级别
177
260
  if self.level.upper() not in valid_levels:
178
- return False
261
+ return False, f"无效的日志级别: {self.level},有效级别为: {', '.join(valid_levels)}"
179
262
 
180
263
  # 验证控制台级别
181
264
  if self.console_level and self.console_level.upper() not in valid_levels:
182
- return False
265
+ return False, f"无效的控制台日志级别: {self.console_level},有效级别为: {', '.join(valid_levels)}"
183
266
 
184
267
  # 验证文件级别
185
268
  if self.file_level and self.file_level.upper() not in valid_levels:
186
- return False
269
+ return False, f"无效的文件日志级别: {self.file_level},有效级别为: {', '.join(valid_levels)}"
187
270
 
188
271
  # 确保日志目录存在
189
272
  if self.file_path and self.file_enabled:
@@ -191,7 +274,8 @@ class LogConfig:
191
274
  log_dir = os.path.dirname(self.file_path)
192
275
  if log_dir and not os.path.exists(log_dir):
193
276
  os.makedirs(log_dir, exist_ok=True)
194
- except (OSError, PermissionError):
195
- return False
277
+ except (OSError, PermissionError) as e:
278
+ log_dir = os.path.dirname(self.file_path) if self.file_path else "未知"
279
+ return False, f"无法创建日志目录 {log_dir}: {e}"
196
280
 
197
- return True
281
+ return True, "配置有效"
crawlo/logging/factory.py CHANGED
@@ -8,16 +8,17 @@ import logging
8
8
  import os
9
9
  import sys
10
10
  import threading
11
- from typing import Dict, Optional
12
11
  from weakref import WeakValueDictionary
13
12
 
14
13
  # 尝试导入concurrent-log-handler,如果不可用则回退到标准库
15
14
  try:
16
15
  from concurrent_log_handler import ConcurrentRotatingFileHandler
17
16
  USE_CONCURRENT_HANDLER = True
17
+ RotatingFileHandler = ConcurrentRotatingFileHandler # 别名以避免未绑定错误
18
18
  except ImportError:
19
19
  from logging.handlers import RotatingFileHandler
20
20
  USE_CONCURRENT_HANDLER = False
21
+ ConcurrentRotatingFileHandler = RotatingFileHandler # 别名以避免未绑定错误
21
22
 
22
23
  from .manager import get_config, is_configured, configure
23
24
  from .config import LogConfig
@@ -69,7 +70,7 @@ class LoggerFactory:
69
70
  """创建新的Logger实例"""
70
71
  config = get_config()
71
72
  if not config:
72
- raise RuntimeError("Log system not configured")
73
+ raise RuntimeError("日志系统未配置,请先调用 configure_logging() 进行配置")
73
74
 
74
75
  # 创建Logger
75
76
  logger = logging.getLogger(name)
@@ -103,6 +104,7 @@ class LoggerFactory:
103
104
  os.makedirs(log_dir, exist_ok=True)
104
105
 
105
106
  # 根据平台选择合适的Handler
107
+ file_handler = None
106
108
  if USE_CONCURRENT_HANDLER:
107
109
  file_handler = ConcurrentRotatingFileHandler(
108
110
  filename=config.file_path,
@@ -133,12 +135,14 @@ class LoggerFactory:
133
135
  encoding=config.encoding
134
136
  )
135
137
 
136
- file_handler.setFormatter(formatter)
137
- # 使用专门的文件级别或模块级别
138
- file_level = config.get_file_level()
139
- level = getattr(logging, file_level.upper(), logging.INFO)
140
- file_handler.setLevel(level)
141
- logger.addHandler(file_handler)
138
+ # 添加文件处理器(如果创建成功)
139
+ if file_handler is not None:
140
+ file_handler.setFormatter(formatter)
141
+ # 使用专门的文件级别或模块级别
142
+ file_level = config.get_file_level()
143
+ level = getattr(logging, file_level.upper(), logging.INFO)
144
+ file_handler.setLevel(level)
145
+ logger.addHandler(file_handler)
142
146
  except Exception as e:
143
147
  # 文件Handler创建失败时,至少保证控制台输出
144
148
  console_handler = logging.StreamHandler()
crawlo/logging/manager.py CHANGED
@@ -5,10 +5,12 @@
5
5
  """
6
6
 
7
7
  import threading
8
- from typing import Optional, Any
8
+ from typing import Optional
9
9
  from .config import LogConfig
10
+ from crawlo.utils.singleton import singleton
10
11
 
11
12
 
13
+ @singleton
12
14
  class LogManager:
13
15
  """
14
16
  日志管理器 - 单例模式
@@ -18,38 +20,23 @@ class LogManager:
18
20
  2. 配置状态跟踪
19
21
  3. 线程安全的配置更新
20
22
  """
21
-
22
- _instance: Optional['LogManager'] = None
23
- _lock = threading.Lock()
24
-
25
- def __new__(cls) -> 'LogManager':
26
- if cls._instance is None:
27
- with cls._lock:
28
- if cls._instance is None:
29
- cls._instance = super(LogManager, cls).__new__(cls)
30
- cls._instance._initialized = False
31
- return cls._instance
32
-
23
+
33
24
  def __init__(self):
34
- if hasattr(self, '_initialized') and self._initialized:
35
- return
36
-
37
25
  self._config: Optional[LogConfig] = None
38
26
  self._configured = False
39
27
  self._config_lock = threading.RLock()
40
- self._initialized = True
41
-
28
+
42
29
  @property
43
30
  def config(self) -> Optional[LogConfig]:
44
31
  """获取当前配置"""
45
32
  with self._config_lock:
46
33
  return self._config
47
-
34
+
48
35
  @property
49
36
  def is_configured(self) -> bool:
50
37
  """检查是否已配置"""
51
38
  return self._configured
52
-
39
+
53
40
  def configure(self, settings=None, **kwargs) -> LogConfig:
54
41
  """
55
42
  配置日志系统
@@ -74,16 +61,17 @@ class LogManager:
74
61
  config = LogConfig.from_dict(kwargs)
75
62
  else:
76
63
  config = LogConfig() # 使用默认配置
77
-
64
+
78
65
  # 验证配置
79
- if not config.validate():
80
- raise ValueError("Invalid log configuration")
81
-
66
+ is_valid, error_msg = config.validate()
67
+ if not is_valid:
68
+ raise ValueError(f"Invalid log configuration: {error_msg}")
69
+
82
70
  self._config = config
83
71
  self._configured = True
84
-
72
+
85
73
  return config
86
-
74
+
87
75
  def reset(self):
88
76
  """重置配置(主要用于测试)"""
89
77
  with self._config_lock:
@@ -94,19 +82,23 @@ class LogManager:
94
82
  # 全局实例
95
83
  _log_manager = LogManager()
96
84
 
85
+
97
86
  # 模块级便捷函数
98
87
  def configure(settings=None, **kwargs) -> LogConfig:
99
88
  """配置日志系统"""
100
89
  return _log_manager.configure(settings, **kwargs)
101
90
 
91
+
102
92
  def is_configured() -> bool:
103
93
  """检查是否已配置"""
104
94
  return _log_manager.is_configured
105
95
 
96
+
106
97
  def get_config() -> Optional[LogConfig]:
107
98
  """获取当前配置"""
108
99
  return _log_manager.config
109
100
 
101
+
110
102
  def reset():
111
103
  """重置配置"""
112
- _log_manager.reset()
104
+ _log_manager.reset()