crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/initialization/core.py
CHANGED
|
@@ -6,14 +6,18 @@
|
|
|
6
6
|
|
|
7
7
|
import threading
|
|
8
8
|
import time
|
|
9
|
+
import signal
|
|
9
10
|
from typing import Optional, Any
|
|
10
11
|
|
|
11
12
|
from .built_in import register_built_in_initializers
|
|
12
13
|
from .context import InitializationContext
|
|
13
|
-
from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
|
|
14
|
+
from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition, validate_phase_dependencies
|
|
14
15
|
from .registry import get_global_registry
|
|
15
16
|
|
|
16
17
|
|
|
18
|
+
from crawlo.utils.singleton import singleton
|
|
19
|
+
|
|
20
|
+
@singleton
|
|
17
21
|
class CoreInitializer:
|
|
18
22
|
"""
|
|
19
23
|
核心初始化器 - 协调整个框架的初始化过程
|
|
@@ -25,29 +29,18 @@ class CoreInitializer:
|
|
|
25
29
|
4. 错误处理和降级策略
|
|
26
30
|
"""
|
|
27
31
|
|
|
28
|
-
_instance: Optional['CoreInitializer'] = None
|
|
29
|
-
_lock = threading.Lock()
|
|
30
|
-
|
|
31
|
-
def __new__(cls) -> 'CoreInitializer':
|
|
32
|
-
if cls._instance is None:
|
|
33
|
-
with cls._lock:
|
|
34
|
-
if cls._instance is None:
|
|
35
|
-
cls._instance = super(CoreInitializer, cls).__new__(cls)
|
|
36
|
-
cls._instance._initialized = False
|
|
37
|
-
return cls._instance
|
|
38
|
-
|
|
39
32
|
def __init__(self):
|
|
40
|
-
if hasattr(self, '_initialized') and self._initialized:
|
|
41
|
-
return
|
|
42
|
-
|
|
43
33
|
self._context: Optional[InitializationContext] = None
|
|
44
34
|
self._is_ready = False
|
|
45
35
|
self._init_lock = threading.RLock()
|
|
46
36
|
|
|
37
|
+
# 在注册内置初始化器之前,先验证阶段依赖关系
|
|
38
|
+
is_valid, error_msg = validate_phase_dependencies()
|
|
39
|
+
if not is_valid:
|
|
40
|
+
raise RuntimeError(f"初始化阶段配置错误: {error_msg}")
|
|
41
|
+
|
|
47
42
|
# 注册内置初始化器
|
|
48
43
|
register_built_in_initializers()
|
|
49
|
-
|
|
50
|
-
self._initialized = True
|
|
51
44
|
|
|
52
45
|
@property
|
|
53
46
|
def context(self) -> Optional[InitializationContext]:
|
|
@@ -128,10 +121,10 @@ class CoreInitializer:
|
|
|
128
121
|
# 可选阶段,跳过
|
|
129
122
|
continue
|
|
130
123
|
|
|
131
|
-
#
|
|
124
|
+
# 执行阶段(带超时控制)
|
|
132
125
|
start_time = time.time()
|
|
133
126
|
try:
|
|
134
|
-
result =
|
|
127
|
+
result = self._execute_phase_with_timeout(phase, context, registry)
|
|
135
128
|
result.duration = time.time() - start_time
|
|
136
129
|
|
|
137
130
|
context.mark_phase_completed(phase, result)
|
|
@@ -152,6 +145,59 @@ class CoreInitializer:
|
|
|
152
145
|
if not self._is_phase_optional(phase):
|
|
153
146
|
raise
|
|
154
147
|
|
|
148
|
+
def _execute_phase_with_timeout(self, phase: InitializationPhase,
|
|
149
|
+
context: InitializationContext,
|
|
150
|
+
registry) -> PhaseResult:
|
|
151
|
+
"""
|
|
152
|
+
执行阶段并支持超时控制
|
|
153
|
+
|
|
154
|
+
Args:
|
|
155
|
+
phase: 初始化阶段
|
|
156
|
+
context: 初始化上下文
|
|
157
|
+
registry: 初始化器注册表
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
PhaseResult: 阶段执行结果
|
|
161
|
+
|
|
162
|
+
Raises:
|
|
163
|
+
TimeoutError: 阶段执行超时
|
|
164
|
+
"""
|
|
165
|
+
phase_def = get_phase_definition(phase)
|
|
166
|
+
timeout = phase_def.timeout if phase_def else 30.0
|
|
167
|
+
|
|
168
|
+
# 使用线程执行,支持超时
|
|
169
|
+
result_container: list[Optional[PhaseResult]] = [None]
|
|
170
|
+
exception_container: list[Optional[Exception]] = [None]
|
|
171
|
+
|
|
172
|
+
def execute_in_thread():
|
|
173
|
+
try:
|
|
174
|
+
result_container[0] = registry.execute_phase(phase, context)
|
|
175
|
+
except Exception as e:
|
|
176
|
+
exception_container[0] = e
|
|
177
|
+
|
|
178
|
+
thread = threading.Thread(target=execute_in_thread, daemon=True)
|
|
179
|
+
thread.start()
|
|
180
|
+
thread.join(timeout=timeout)
|
|
181
|
+
|
|
182
|
+
if thread.is_alive():
|
|
183
|
+
# 超时了
|
|
184
|
+
error_msg = f"Phase {phase.value} execution timeout after {timeout} seconds"
|
|
185
|
+
context.add_warning(error_msg)
|
|
186
|
+
return PhaseResult(
|
|
187
|
+
phase=phase,
|
|
188
|
+
success=False,
|
|
189
|
+
error=TimeoutError(error_msg)
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
# 检查是否有异常
|
|
193
|
+
if exception_container[0]:
|
|
194
|
+
raise exception_container[0]
|
|
195
|
+
|
|
196
|
+
# 返回结果(已经确保不为None)
|
|
197
|
+
if result_container[0] is None:
|
|
198
|
+
raise RuntimeError(f"Phase {phase.value} returned None result")
|
|
199
|
+
return result_container[0]
|
|
200
|
+
|
|
155
201
|
def _check_dependencies(self, phase: InitializationPhase,
|
|
156
202
|
context: InitializationContext) -> bool:
|
|
157
203
|
"""检查阶段依赖关系"""
|
crawlo/initialization/phases.py
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
from enum import Enum
|
|
8
8
|
from dataclasses import dataclass
|
|
9
|
-
from typing import List, Optional
|
|
9
|
+
from typing import List, Optional, Dict
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class InitializationPhase(Enum):
|
|
@@ -146,4 +146,85 @@ def validate_dependencies() -> bool:
|
|
|
146
146
|
if dependency not in phases:
|
|
147
147
|
return False
|
|
148
148
|
|
|
149
|
-
return True
|
|
149
|
+
return True
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def detect_circular_dependencies() -> Optional[List[InitializationPhase]]:
|
|
153
|
+
"""
|
|
154
|
+
检测循环依赖
|
|
155
|
+
|
|
156
|
+
使用DFS(深度优先搜索)算法检测初始化阶段的循环依赖。
|
|
157
|
+
|
|
158
|
+
Returns:
|
|
159
|
+
Optional[List[InitializationPhase]]: 如果存在循环,返回循环路径;否则返回None
|
|
160
|
+
|
|
161
|
+
算法说明:
|
|
162
|
+
使用三色标记法:
|
|
163
|
+
- 白色(0):未访问
|
|
164
|
+
- 灰色(1):正在访问(在当前DFS路径中)
|
|
165
|
+
- 黑色(2):已完成访问
|
|
166
|
+
|
|
167
|
+
如果在DFS过程中遇到灰色节点,说明存在循环依赖。
|
|
168
|
+
"""
|
|
169
|
+
# 构建依赖图
|
|
170
|
+
dependency_graph: Dict[InitializationPhase, List[InitializationPhase]] = {}
|
|
171
|
+
for definition in PHASE_DEFINITIONS:
|
|
172
|
+
dependency_graph[definition.phase] = definition.dependencies.copy()
|
|
173
|
+
|
|
174
|
+
# 三色标记:0-白色(未访问),1-灰色(访问中),2-黑色(已完成)
|
|
175
|
+
color: Dict[InitializationPhase, int] = {phase: 0 for phase in dependency_graph}
|
|
176
|
+
parent: Dict[InitializationPhase, Optional[InitializationPhase]] = {phase: None for phase in dependency_graph}
|
|
177
|
+
|
|
178
|
+
def dfs(node: InitializationPhase) -> Optional[List[InitializationPhase]]:
|
|
179
|
+
"""DFS遍历检测循环"""
|
|
180
|
+
color[node] = 1 # 标记为灰色(访问中)
|
|
181
|
+
|
|
182
|
+
for neighbor in dependency_graph.get(node, []):
|
|
183
|
+
if color[neighbor] == 1: # 遇到灰色节点,发现循环
|
|
184
|
+
# 重建循环路径
|
|
185
|
+
cycle = [neighbor]
|
|
186
|
+
current: Optional[InitializationPhase] = node
|
|
187
|
+
while current is not None and current != neighbor:
|
|
188
|
+
cycle.append(current)
|
|
189
|
+
current = parent.get(current)
|
|
190
|
+
cycle.append(neighbor)
|
|
191
|
+
cycle.reverse()
|
|
192
|
+
return cycle
|
|
193
|
+
|
|
194
|
+
if color[neighbor] == 0: # 未访问的节点
|
|
195
|
+
parent[neighbor] = node
|
|
196
|
+
result = dfs(neighbor)
|
|
197
|
+
if result:
|
|
198
|
+
return result
|
|
199
|
+
|
|
200
|
+
color[node] = 2 # 标记为黑色(已完成)
|
|
201
|
+
return None
|
|
202
|
+
|
|
203
|
+
# 对所有未访问的节点执行DFS
|
|
204
|
+
for phase in dependency_graph:
|
|
205
|
+
if color[phase] == 0:
|
|
206
|
+
cycle = dfs(phase)
|
|
207
|
+
if cycle:
|
|
208
|
+
return cycle
|
|
209
|
+
|
|
210
|
+
return None
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def validate_phase_dependencies() -> tuple[bool, Optional[str]]:
|
|
214
|
+
"""
|
|
215
|
+
全面验证阶段依赖关系
|
|
216
|
+
|
|
217
|
+
Returns:
|
|
218
|
+
tuple[bool, Optional[str]]: (是否有效, 错误信息)
|
|
219
|
+
"""
|
|
220
|
+
# 1. 检查依赖是否存在
|
|
221
|
+
if not validate_dependencies():
|
|
222
|
+
return False, "存在未定义的依赖阶段"
|
|
223
|
+
|
|
224
|
+
# 2. 检查循环依赖
|
|
225
|
+
cycle = detect_circular_dependencies()
|
|
226
|
+
if cycle:
|
|
227
|
+
cycle_path = ' -> '.join([phase.value for phase in cycle])
|
|
228
|
+
return False, f"检测到循环依赖: {cycle_path}"
|
|
229
|
+
|
|
230
|
+
return True, None
|
|
@@ -35,11 +35,12 @@ class BaseInitializer(Initializer):
|
|
|
35
35
|
def _create_result(self, success: bool, duration: float = 0.0,
|
|
36
36
|
artifacts: Optional[Dict] = None, error: Optional[Exception] = None) -> PhaseResult:
|
|
37
37
|
"""创建初始化结果"""
|
|
38
|
-
|
|
38
|
+
from .utils import create_initialization_result
|
|
39
|
+
return create_initialization_result(
|
|
39
40
|
phase=self.phase,
|
|
40
41
|
success=success,
|
|
41
42
|
duration=duration,
|
|
42
|
-
artifacts=artifacts
|
|
43
|
+
artifacts=artifacts,
|
|
43
44
|
error=error
|
|
44
45
|
)
|
|
45
46
|
|
|
@@ -70,15 +71,12 @@ class InitializerRegistry:
|
|
|
70
71
|
init_func: Callable[[InitializationContext], PhaseResult]):
|
|
71
72
|
"""注册函数式初始化器"""
|
|
72
73
|
|
|
73
|
-
class FunctionInitializer:
|
|
74
|
+
class FunctionInitializer(Initializer):
|
|
74
75
|
def __init__(self, phase: InitializationPhase, func: Callable):
|
|
76
|
+
super().__init__(phase)
|
|
75
77
|
self._phase = phase
|
|
76
78
|
self._func = func
|
|
77
79
|
|
|
78
|
-
@property
|
|
79
|
-
def phase(self) -> InitializationPhase:
|
|
80
|
-
return self._phase
|
|
81
|
-
|
|
82
80
|
def initialize(self, context: InitializationContext) -> PhaseResult:
|
|
83
81
|
return self._func(context)
|
|
84
82
|
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
初始化工具模块 - 提供通用的初始化工具函数
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import time
|
|
8
|
+
from typing import Optional, Dict, Any
|
|
9
|
+
from .phases import PhaseResult, InitializationPhase
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def create_initialization_result(
|
|
13
|
+
phase: 'InitializationPhase',
|
|
14
|
+
success: bool,
|
|
15
|
+
duration: float = 0.0,
|
|
16
|
+
artifacts: Optional[Dict[str, Any]] = None,
|
|
17
|
+
error: Optional[Exception] = None
|
|
18
|
+
) -> PhaseResult:
|
|
19
|
+
"""
|
|
20
|
+
创建标准化的初始化结果
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
phase: 初始化阶段
|
|
24
|
+
success: 是否成功
|
|
25
|
+
duration: 执行时长
|
|
26
|
+
artifacts: 产生的工件数据
|
|
27
|
+
error: 异常对象
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
PhaseResult: 标准化的初始化结果
|
|
31
|
+
"""
|
|
32
|
+
return PhaseResult(
|
|
33
|
+
phase=phase,
|
|
34
|
+
success=success,
|
|
35
|
+
duration=duration,
|
|
36
|
+
artifacts=artifacts or {},
|
|
37
|
+
error=error
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class InitializationTimer:
|
|
42
|
+
"""初始化计时器"""
|
|
43
|
+
|
|
44
|
+
def __init__(self):
|
|
45
|
+
self.start_time = time.time()
|
|
46
|
+
|
|
47
|
+
def get_duration(self) -> float:
|
|
48
|
+
"""获取经过的时间"""
|
|
49
|
+
return time.time() - self.start_time
|
crawlo/logging/__init__.py
CHANGED
|
@@ -14,33 +14,29 @@ Crawlo统一日志系统
|
|
|
14
14
|
from .manager import LogManager
|
|
15
15
|
from .factory import LoggerFactory
|
|
16
16
|
from .config import LogConfig
|
|
17
|
-
|
|
17
|
+
|
|
18
18
|
|
|
19
19
|
# 统一的公共接口
|
|
20
20
|
def get_logger(name: str = 'default'):
|
|
21
21
|
"""获取logger实例"""
|
|
22
22
|
return LoggerFactory.get_logger(name)
|
|
23
23
|
|
|
24
|
+
|
|
24
25
|
def configure_logging(settings=None, **kwargs):
|
|
25
26
|
"""配置日志系统"""
|
|
26
27
|
return LogManager().configure(settings, **kwargs)
|
|
27
28
|
|
|
29
|
+
|
|
28
30
|
def is_configured() -> bool:
|
|
29
31
|
"""检查日志系统是否已配置"""
|
|
30
32
|
return LogManager().is_configured
|
|
31
33
|
|
|
32
|
-
def get_monitor() -> LogPerformanceMonitor:
|
|
33
|
-
"""获取日志性能监控器"""
|
|
34
|
-
from .monitor import get_monitor as _get_monitor
|
|
35
|
-
return _get_monitor()
|
|
36
34
|
|
|
37
35
|
__all__ = [
|
|
38
36
|
'LogManager',
|
|
39
|
-
'LoggerFactory',
|
|
37
|
+
'LoggerFactory',
|
|
40
38
|
'LogConfig',
|
|
41
|
-
'LogPerformanceMonitor',
|
|
42
39
|
'get_logger',
|
|
43
40
|
'configure_logging',
|
|
44
|
-
'is_configured'
|
|
45
|
-
|
|
46
|
-
]
|
|
41
|
+
'is_configured'
|
|
42
|
+
]
|
crawlo/logging/config.py
CHANGED
|
@@ -13,6 +13,45 @@ from typing import Optional, Dict, Any
|
|
|
13
13
|
class LogConfig:
|
|
14
14
|
"""日志配置数据类 - 简单明确的配置结构"""
|
|
15
15
|
|
|
16
|
+
# 预设配置模板
|
|
17
|
+
TEMPLATES = {
|
|
18
|
+
'minimal': {
|
|
19
|
+
'level': 'INFO',
|
|
20
|
+
'format': '%(asctime)s - %(levelname)s: %(message)s',
|
|
21
|
+
'console_enabled': True,
|
|
22
|
+
'file_enabled': False
|
|
23
|
+
},
|
|
24
|
+
'standard': {
|
|
25
|
+
'level': 'INFO',
|
|
26
|
+
'format': '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s',
|
|
27
|
+
'console_enabled': True,
|
|
28
|
+
'file_enabled': True,
|
|
29
|
+
'file_path': 'logs/crawlo.log',
|
|
30
|
+
# 注意:standard模板未指定max_bytes和backup_count,
|
|
31
|
+
# 将使用类定义的默认值(10MB, 5个备份)或用户在settings.py中设置的值
|
|
32
|
+
# 如果用户不想要日志轮转,可以在settings.py中设置LOG_MAX_BYTES=0
|
|
33
|
+
# 当max_bytes或backup_count为0时,日志轮转将被禁用,文件会持续增长
|
|
34
|
+
},
|
|
35
|
+
'detailed': {
|
|
36
|
+
'level': 'DEBUG',
|
|
37
|
+
'format': '%(asctime)s - [%(name)s] - %(levelname)s - %(pathname)s:%(lineno)d: %(message)s',
|
|
38
|
+
'console_enabled': True,
|
|
39
|
+
'file_enabled': True,
|
|
40
|
+
'file_path': 'logs/crawlo.log',
|
|
41
|
+
'max_bytes': 20 * 1024 * 1024, # 20MB,适用于大多数生产环境
|
|
42
|
+
'backup_count': 10 # 10个备份文件,可保留约10次轮转的历史
|
|
43
|
+
},
|
|
44
|
+
'production': {
|
|
45
|
+
'level': 'WARNING',
|
|
46
|
+
'format': '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s',
|
|
47
|
+
'console_enabled': False, # 生产环境通常禁用控制台输出
|
|
48
|
+
'file_enabled': True,
|
|
49
|
+
'file_path': 'logs/crawlo.log',
|
|
50
|
+
'max_bytes': 50 * 1024 * 1024, # 50MB,适用于高负载生产环境
|
|
51
|
+
'backup_count': 20 # 20个备份文件,可保留较长时间的历史记录
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
16
55
|
# 基本配置
|
|
17
56
|
level: str = "INFO"
|
|
18
57
|
format: str = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
|
|
@@ -54,21 +93,45 @@ class LogConfig:
|
|
|
54
93
|
# 获取默认值
|
|
55
94
|
format_default_value = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
|
|
56
95
|
|
|
96
|
+
# 确保类型安全
|
|
97
|
+
def safe_get_str(key: str, default: str = '') -> str:
|
|
98
|
+
value = get_val(key, default)
|
|
99
|
+
return str(value) if value is not None else default
|
|
100
|
+
|
|
101
|
+
def safe_get_int(key: str, default: int) -> int:
|
|
102
|
+
value = get_val(key, default)
|
|
103
|
+
try:
|
|
104
|
+
return int(value) if value is not None else default
|
|
105
|
+
except (ValueError, TypeError):
|
|
106
|
+
return default
|
|
107
|
+
|
|
108
|
+
def safe_get_bool(key: str, default: bool) -> bool:
|
|
109
|
+
value = get_val(key, default)
|
|
110
|
+
if isinstance(value, bool):
|
|
111
|
+
return value
|
|
112
|
+
if isinstance(value, str):
|
|
113
|
+
return value.lower() in ('1', 'true', 'yes', 'on')
|
|
114
|
+
return bool(value) if value is not None else default
|
|
115
|
+
|
|
116
|
+
def safe_get_dict(key: str, default: dict) -> dict:
|
|
117
|
+
value = get_val(key, default)
|
|
118
|
+
return value if isinstance(value, dict) else default
|
|
119
|
+
|
|
57
120
|
return cls(
|
|
58
|
-
level=
|
|
59
|
-
format=
|
|
60
|
-
encoding=
|
|
61
|
-
file_path=
|
|
62
|
-
max_bytes=
|
|
63
|
-
backup_count=
|
|
64
|
-
console_enabled=
|
|
65
|
-
file_enabled=
|
|
66
|
-
console_level=
|
|
67
|
-
file_level=
|
|
68
|
-
include_thread_id=
|
|
69
|
-
include_process_id=
|
|
70
|
-
include_module_path=
|
|
71
|
-
module_levels=
|
|
121
|
+
level=safe_get_str('LOG_LEVEL', 'INFO'),
|
|
122
|
+
format=safe_get_str('LOG_FORMAT', format_default_value),
|
|
123
|
+
encoding=safe_get_str('LOG_ENCODING', 'utf-8'),
|
|
124
|
+
file_path=safe_get_str('LOG_FILE'),
|
|
125
|
+
max_bytes=safe_get_int('LOG_MAX_BYTES', 10 * 1024 * 1024), # 从200MB改为10MB以保持一致性
|
|
126
|
+
backup_count=safe_get_int('LOG_BACKUP_COUNT', 5),
|
|
127
|
+
console_enabled=safe_get_bool('LOG_CONSOLE_ENABLED', True),
|
|
128
|
+
file_enabled=safe_get_bool('LOG_FILE_ENABLED', True),
|
|
129
|
+
console_level=safe_get_str('LOG_CONSOLE_LEVEL'), # 允许单独设置控制台级别
|
|
130
|
+
file_level=safe_get_str('LOG_FILE_LEVEL'), # 允许单独设置文件级别
|
|
131
|
+
include_thread_id=safe_get_bool('LOG_INCLUDE_THREAD_ID', False),
|
|
132
|
+
include_process_id=safe_get_bool('LOG_INCLUDE_PROCESS_ID', False),
|
|
133
|
+
include_module_path=safe_get_bool('LOG_INCLUDE_MODULE_PATH', False),
|
|
134
|
+
module_levels=safe_get_dict('LOG_LEVELS', {})
|
|
72
135
|
)
|
|
73
136
|
|
|
74
137
|
@classmethod
|
|
@@ -101,6 +164,22 @@ class LogConfig:
|
|
|
101
164
|
|
|
102
165
|
return cls(**mapped_dict)
|
|
103
166
|
|
|
167
|
+
@classmethod
|
|
168
|
+
def from_template(cls, template_name: str) -> 'LogConfig':
|
|
169
|
+
"""从模板创建配置
|
|
170
|
+
|
|
171
|
+
Args:
|
|
172
|
+
template_name: 模板名称 (minimal, standard, detailed, production)
|
|
173
|
+
|
|
174
|
+
Returns:
|
|
175
|
+
LogConfig: 配置对象
|
|
176
|
+
"""
|
|
177
|
+
if template_name not in cls.TEMPLATES:
|
|
178
|
+
raise ValueError(f"未知的模板名称: {template_name},可用模板: {', '.join(cls.TEMPLATES.keys())}")
|
|
179
|
+
|
|
180
|
+
template_config = cls.TEMPLATES[template_name]
|
|
181
|
+
return cls(**template_config)
|
|
182
|
+
|
|
104
183
|
def get_module_level(self, module_name: str) -> str:
|
|
105
184
|
"""获取模块的日志级别"""
|
|
106
185
|
# 先查找精确匹配
|
|
@@ -169,21 +248,25 @@ class LogConfig:
|
|
|
169
248
|
|
|
170
249
|
return base_format
|
|
171
250
|
|
|
172
|
-
def validate(self) -> bool:
|
|
173
|
-
"""验证配置有效性
|
|
251
|
+
def validate(self) -> tuple[bool, str]:
|
|
252
|
+
"""验证配置有效性
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
tuple[bool, str]: (是否有效, 错误信息)
|
|
256
|
+
"""
|
|
174
257
|
valid_levels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'}
|
|
175
258
|
|
|
176
259
|
# 验证主级别
|
|
177
260
|
if self.level.upper() not in valid_levels:
|
|
178
|
-
return False
|
|
261
|
+
return False, f"无效的日志级别: {self.level},有效级别为: {', '.join(valid_levels)}"
|
|
179
262
|
|
|
180
263
|
# 验证控制台级别
|
|
181
264
|
if self.console_level and self.console_level.upper() not in valid_levels:
|
|
182
|
-
return False
|
|
265
|
+
return False, f"无效的控制台日志级别: {self.console_level},有效级别为: {', '.join(valid_levels)}"
|
|
183
266
|
|
|
184
267
|
# 验证文件级别
|
|
185
268
|
if self.file_level and self.file_level.upper() not in valid_levels:
|
|
186
|
-
return False
|
|
269
|
+
return False, f"无效的文件日志级别: {self.file_level},有效级别为: {', '.join(valid_levels)}"
|
|
187
270
|
|
|
188
271
|
# 确保日志目录存在
|
|
189
272
|
if self.file_path and self.file_enabled:
|
|
@@ -191,7 +274,8 @@ class LogConfig:
|
|
|
191
274
|
log_dir = os.path.dirname(self.file_path)
|
|
192
275
|
if log_dir and not os.path.exists(log_dir):
|
|
193
276
|
os.makedirs(log_dir, exist_ok=True)
|
|
194
|
-
except (OSError, PermissionError):
|
|
195
|
-
|
|
277
|
+
except (OSError, PermissionError) as e:
|
|
278
|
+
log_dir = os.path.dirname(self.file_path) if self.file_path else "未知"
|
|
279
|
+
return False, f"无法创建日志目录 {log_dir}: {e}"
|
|
196
280
|
|
|
197
|
-
return True
|
|
281
|
+
return True, "配置有效"
|
crawlo/logging/factory.py
CHANGED
|
@@ -8,16 +8,17 @@ import logging
|
|
|
8
8
|
import os
|
|
9
9
|
import sys
|
|
10
10
|
import threading
|
|
11
|
-
from typing import Dict, Optional
|
|
12
11
|
from weakref import WeakValueDictionary
|
|
13
12
|
|
|
14
13
|
# 尝试导入concurrent-log-handler,如果不可用则回退到标准库
|
|
15
14
|
try:
|
|
16
15
|
from concurrent_log_handler import ConcurrentRotatingFileHandler
|
|
17
16
|
USE_CONCURRENT_HANDLER = True
|
|
17
|
+
RotatingFileHandler = ConcurrentRotatingFileHandler # 别名以避免未绑定错误
|
|
18
18
|
except ImportError:
|
|
19
19
|
from logging.handlers import RotatingFileHandler
|
|
20
20
|
USE_CONCURRENT_HANDLER = False
|
|
21
|
+
ConcurrentRotatingFileHandler = RotatingFileHandler # 别名以避免未绑定错误
|
|
21
22
|
|
|
22
23
|
from .manager import get_config, is_configured, configure
|
|
23
24
|
from .config import LogConfig
|
|
@@ -69,7 +70,7 @@ class LoggerFactory:
|
|
|
69
70
|
"""创建新的Logger实例"""
|
|
70
71
|
config = get_config()
|
|
71
72
|
if not config:
|
|
72
|
-
raise RuntimeError("
|
|
73
|
+
raise RuntimeError("日志系统未配置,请先调用 configure_logging() 进行配置")
|
|
73
74
|
|
|
74
75
|
# 创建Logger
|
|
75
76
|
logger = logging.getLogger(name)
|
|
@@ -103,6 +104,7 @@ class LoggerFactory:
|
|
|
103
104
|
os.makedirs(log_dir, exist_ok=True)
|
|
104
105
|
|
|
105
106
|
# 根据平台选择合适的Handler
|
|
107
|
+
file_handler = None
|
|
106
108
|
if USE_CONCURRENT_HANDLER:
|
|
107
109
|
file_handler = ConcurrentRotatingFileHandler(
|
|
108
110
|
filename=config.file_path,
|
|
@@ -133,12 +135,14 @@ class LoggerFactory:
|
|
|
133
135
|
encoding=config.encoding
|
|
134
136
|
)
|
|
135
137
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
138
|
+
# 添加文件处理器(如果创建成功)
|
|
139
|
+
if file_handler is not None:
|
|
140
|
+
file_handler.setFormatter(formatter)
|
|
141
|
+
# 使用专门的文件级别或模块级别
|
|
142
|
+
file_level = config.get_file_level()
|
|
143
|
+
level = getattr(logging, file_level.upper(), logging.INFO)
|
|
144
|
+
file_handler.setLevel(level)
|
|
145
|
+
logger.addHandler(file_handler)
|
|
142
146
|
except Exception as e:
|
|
143
147
|
# 文件Handler创建失败时,至少保证控制台输出
|
|
144
148
|
console_handler = logging.StreamHandler()
|
crawlo/logging/manager.py
CHANGED
|
@@ -5,10 +5,12 @@
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import threading
|
|
8
|
-
from typing import Optional
|
|
8
|
+
from typing import Optional
|
|
9
9
|
from .config import LogConfig
|
|
10
|
+
from crawlo.utils.singleton import singleton
|
|
10
11
|
|
|
11
12
|
|
|
13
|
+
@singleton
|
|
12
14
|
class LogManager:
|
|
13
15
|
"""
|
|
14
16
|
日志管理器 - 单例模式
|
|
@@ -18,38 +20,23 @@ class LogManager:
|
|
|
18
20
|
2. 配置状态跟踪
|
|
19
21
|
3. 线程安全的配置更新
|
|
20
22
|
"""
|
|
21
|
-
|
|
22
|
-
_instance: Optional['LogManager'] = None
|
|
23
|
-
_lock = threading.Lock()
|
|
24
|
-
|
|
25
|
-
def __new__(cls) -> 'LogManager':
|
|
26
|
-
if cls._instance is None:
|
|
27
|
-
with cls._lock:
|
|
28
|
-
if cls._instance is None:
|
|
29
|
-
cls._instance = super(LogManager, cls).__new__(cls)
|
|
30
|
-
cls._instance._initialized = False
|
|
31
|
-
return cls._instance
|
|
32
|
-
|
|
23
|
+
|
|
33
24
|
def __init__(self):
|
|
34
|
-
if hasattr(self, '_initialized') and self._initialized:
|
|
35
|
-
return
|
|
36
|
-
|
|
37
25
|
self._config: Optional[LogConfig] = None
|
|
38
26
|
self._configured = False
|
|
39
27
|
self._config_lock = threading.RLock()
|
|
40
|
-
|
|
41
|
-
|
|
28
|
+
|
|
42
29
|
@property
|
|
43
30
|
def config(self) -> Optional[LogConfig]:
|
|
44
31
|
"""获取当前配置"""
|
|
45
32
|
with self._config_lock:
|
|
46
33
|
return self._config
|
|
47
|
-
|
|
34
|
+
|
|
48
35
|
@property
|
|
49
36
|
def is_configured(self) -> bool:
|
|
50
37
|
"""检查是否已配置"""
|
|
51
38
|
return self._configured
|
|
52
|
-
|
|
39
|
+
|
|
53
40
|
def configure(self, settings=None, **kwargs) -> LogConfig:
|
|
54
41
|
"""
|
|
55
42
|
配置日志系统
|
|
@@ -74,16 +61,17 @@ class LogManager:
|
|
|
74
61
|
config = LogConfig.from_dict(kwargs)
|
|
75
62
|
else:
|
|
76
63
|
config = LogConfig() # 使用默认配置
|
|
77
|
-
|
|
64
|
+
|
|
78
65
|
# 验证配置
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
66
|
+
is_valid, error_msg = config.validate()
|
|
67
|
+
if not is_valid:
|
|
68
|
+
raise ValueError(f"Invalid log configuration: {error_msg}")
|
|
69
|
+
|
|
82
70
|
self._config = config
|
|
83
71
|
self._configured = True
|
|
84
|
-
|
|
72
|
+
|
|
85
73
|
return config
|
|
86
|
-
|
|
74
|
+
|
|
87
75
|
def reset(self):
|
|
88
76
|
"""重置配置(主要用于测试)"""
|
|
89
77
|
with self._config_lock:
|
|
@@ -94,19 +82,23 @@ class LogManager:
|
|
|
94
82
|
# 全局实例
|
|
95
83
|
_log_manager = LogManager()
|
|
96
84
|
|
|
85
|
+
|
|
97
86
|
# 模块级便捷函数
|
|
98
87
|
def configure(settings=None, **kwargs) -> LogConfig:
|
|
99
88
|
"""配置日志系统"""
|
|
100
89
|
return _log_manager.configure(settings, **kwargs)
|
|
101
90
|
|
|
91
|
+
|
|
102
92
|
def is_configured() -> bool:
|
|
103
93
|
"""检查是否已配置"""
|
|
104
94
|
return _log_manager.is_configured
|
|
105
95
|
|
|
96
|
+
|
|
106
97
|
def get_config() -> Optional[LogConfig]:
|
|
107
98
|
"""获取当前配置"""
|
|
108
99
|
return _log_manager.config
|
|
109
100
|
|
|
101
|
+
|
|
110
102
|
def reset():
|
|
111
103
|
"""重置配置"""
|
|
112
|
-
_log_manager.reset()
|
|
104
|
+
_log_manager.reset()
|