crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo has been flagged for review. See the registry's advisory page for this release for details.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding:UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
编码工具模块
|
|
5
|
+
==================
|
|
6
|
+
提供用于处理HTTP响应编码检测的辅助函数,作为w3lib库的替代实现。
|
|
7
|
+
|
|
8
|
+
该模块包含以下主要函数:
|
|
9
|
+
- html_body_declared_encoding: 从HTML meta标签中检测编码声明
|
|
10
|
+
- http_content_type_encoding: 从HTTP Content-Type头部检测编码
|
|
11
|
+
- read_bom: 检测字节顺序标记(BOM)
|
|
12
|
+
- resolve_encoding: 解析编码名称
|
|
13
|
+
- html_to_unicode: 将HTML内容转换为Unicode字符串
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import re
from typing import Callable, Optional, Tuple, Union
+
def html_body_declared_encoding(html_body_str: Union[bytes, str]) -> Optional[str]:
    """
    Detect the encoding declared in HTML ``<meta>`` tags (w3lib replacement).

    Fix: the parameter was annotated ``bytes`` even though the body
    explicitly accepts ``str`` (it is encoded to UTF-8 first); the
    annotation now reflects the actual contract.

    Only the first 4 KB of the document is inspected, since encoding
    declarations live in ``<head>``.

    :param html_body_str: HTML content as bytes (or str, as a convenience)
    :return: lower-cased declared encoding, or None if none was found
    """
    if isinstance(html_body_str, str):
        html_body_str = html_body_str.encode('utf-8')

    # Declarations appear near the top of the document; 4KB keeps this cheap.
    html_start = html_body_str[:4096]

    try:
        # Decode as ASCII, ignoring errors — the meta markup itself is ASCII.
        html_text = html_start.decode('ascii', errors='ignore')

        # HTML5 form: <meta charset="utf-8">
        charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
        if charset_match:
            return charset_match.group(1).lower()

        # HTML4 form: <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
        content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
        if content_match:
            return content_match.group(1).lower()

    except Exception:
        # Best-effort detection: any parsing problem simply means "unknown".
        pass

    return None
+
def http_content_type_encoding(content_type: str) -> Optional[str]:
    """
    Extract the charset from an HTTP ``Content-Type`` header value.

    :param content_type: Content-Type header value (may be empty/None)
    :return: lower-cased charset name, or None when no charset is present
    """
    if not content_type:
        return None

    match = re.search(r"charset=([\w-]+)", content_type, re.I)
    return match.group(1).lower() if match else None
+
def read_bom(data: bytes) -> Tuple[Optional[str], bytes]:
    """
    Detect a Unicode byte-order mark (BOM) at the start of *data*.

    Fix: the original tested the 2-byte UTF-16 BOMs before the 4-byte
    UTF-32 BOMs. Since the UTF-32-LE BOM (FF FE 00 00) begins with the
    UTF-16-LE BOM (FF FE), the UTF-32-LE branch was unreachable. BOMs
    are now matched longest-first.

    :param data: raw byte data
    :return: (encoding, data with the BOM stripped), or (None, data)
        when no BOM is present
    """
    # Longest BOMs first — order matters (see docstring).
    _BOMS = (
        ('utf-32-le', b'\xff\xfe\x00\x00'),
        ('utf-32-be', b'\x00\x00\xfe\xff'),
        ('utf-8', b'\xef\xbb\xbf'),
        ('utf-16-le', b'\xff\xfe'),
        ('utf-16-be', b'\xfe\xff'),
    )
    for encoding, bom in _BOMS:
        if data.startswith(bom):
            return encoding, data[len(bom):]
    return None, data
+
def resolve_encoding(encoding: str) -> Optional[str]:
    """
    Normalize an encoding name to its canonical form.

    :param encoding: raw encoding name (any case, surrounding spaces allowed)
    :return: canonical lower-cased encoding name, or None for empty input
    """
    if not encoding:
        return None

    # Alias -> canonical name table for frequently seen variants.
    aliases = {
        'utf8': 'utf-8',
        'utf-8-sig': 'utf-8',
        'ucs2': 'utf-16',
        'ucs-2': 'utf-16',
        'ucs4': 'utf-32',
        'ucs-4': 'utf-32',
        'iso-8859-1': 'latin1',
        'iso-latin-1': 'latin1',
        'cp936': 'gbk',
        'ms936': 'gbk',
        'gb2312': 'gbk',
        'gb_2312': 'gbk',
        'gb_2312-80': 'gbk',
        'csgb2312': 'gbk',
        'big5-hkscs': 'big5',
        'shift_jis': 'shift-jis',
        'sjis': 'shift-jis',
        'windows-31j': 'shift-jis',
        'cskoi8r': 'koi8-r',
        'koi8_r': 'koi8-r',
    }

    normalized = encoding.lower().strip()
    return aliases.get(normalized, normalized)
+
def html_to_unicode(content_type_header: str,
                    html_body_str: bytes,
                    auto_detect_fun: Optional[Callable[[bytes], Optional[str]]] = None,
                    default_encoding: str = 'utf-8') -> Tuple[str, str]:
    """
    Decode an HTML payload to unicode, trying encoding sources in order.

    Precedence: BOM, then the Content-Type header, then the HTML meta
    declaration, then the caller-supplied detector, then *default_encoding*
    (falling back to a lossy UTF-8 decode that never fails). Each later
    source is consulted only when every earlier one failed, so the
    detector callback is not invoked unnecessarily.

    :param content_type_header: HTTP Content-Type header value
    :param html_body_str: raw HTML bytes
    :param auto_detect_fun: optional callback that guesses an encoding
    :param default_encoding: encoding used when nothing else matches
    :return: (encoding used, decoded unicode string)
    """
    # 1. A BOM takes absolute precedence and is stripped from the body.
    bom_enc, html_body_str = read_bom(html_body_str)

    def _attempt(enc: Optional[str]) -> Optional[Tuple[str, str]]:
        # Try decoding with *enc*; None means "keep looking".
        if not enc:
            return None
        try:
            return enc, html_body_str.decode(enc)
        except (UnicodeDecodeError, LookupError):
            return None

    result = _attempt(bom_enc)
    if result is None:
        # 2. Encoding advertised by the HTTP header.
        result = _attempt(http_content_type_encoding(content_type_header))
    if result is None:
        # 3. Encoding declared inside the (BOM-stripped) HTML itself.
        result = _attempt(html_body_declared_encoding(html_body_str))
    if result is None and auto_detect_fun:
        # 4. Caller-supplied auto-detection.
        result = _attempt(auto_detect_fun(html_body_str))
    if result is None:
        # 5. Configured default.
        result = _attempt(default_encoding)
    if result is not None:
        return result

    # Last resort: lossy UTF-8 decoding, which always succeeds.
    return 'utf-8', html_body_str.decode('utf-8', errors='replace')
+
# Public API of this module (drop-in replacements for the w3lib helpers).
__all__ = [
    "html_body_declared_encoding",
    "http_content_type_encoding",
    "read_bom",
    "resolve_encoding",
    "html_to_unicode"
]
crawlo/utils/error_handler.py
CHANGED
|
@@ -9,7 +9,7 @@ from functools import wraps
|
|
|
9
9
|
from datetime import datetime
|
|
10
10
|
from typing import Optional, Callable, Any, Dict, List
|
|
11
11
|
|
|
12
|
-
from crawlo.
|
|
12
|
+
from crawlo.logging import get_logger
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class ErrorContext:
|
|
@@ -66,7 +66,7 @@ class EnhancedErrorHandler:
|
|
|
66
66
|
"""增强版错误处理器"""
|
|
67
67
|
|
|
68
68
|
def __init__(self, logger_name: str = __name__, log_level: str = 'ERROR'):
|
|
69
|
-
self.logger = get_logger(logger_name
|
|
69
|
+
self.logger = get_logger(logger_name)
|
|
70
70
|
self.error_history: List[Dict] = [] # 错误历史记录
|
|
71
71
|
self.max_history_size = 100 # 最大历史记录数
|
|
72
72
|
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
资源泄露检测器
|
|
5
|
+
==============
|
|
6
|
+
|
|
7
|
+
提供资源泄露检测和分析功能
|
|
8
|
+
"""
|
|
9
|
+
import gc
|
|
10
|
+
import time
|
|
11
|
+
import psutil
|
|
12
|
+
from typing import Dict, List, Any, Optional
|
|
13
|
+
from collections import defaultdict
|
|
14
|
+
|
|
15
|
+
from crawlo.logging import get_logger
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ResourceSnapshot:
    """Point-in-time capture of process resources and GC state."""

    def __init__(self, name: str = ""):
        self.name = name
        self.timestamp = time.time()

        # Process-level metrics.
        proc = psutil.Process()
        self.memory_mb = proc.memory_info().rss / 1024 / 1024
        self.cpu_percent = proc.cpu_percent()
        self.num_threads = proc.num_threads()
        # num_fds() is POSIX-only; report 0 where it is unavailable.
        self.num_fds = proc.num_fds() if hasattr(proc, 'num_fds') else 0

        # Garbage-collector metrics (collect first for a stable object count).
        gc.collect()
        self.gc_objects = len(gc.get_objects())
        self.gc_stats = gc.get_stats()

        # Counts of the most numerous live object types.
        self.object_types = self._count_object_types()

    def _count_object_types(self, top_n: int = 20) -> Dict[str, int]:
        """Tally live objects by type name and keep the *top_n* largest."""
        counts = defaultdict(int)
        for obj in gc.get_objects():
            counts[type(obj).__name__] += 1

        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        return dict(ranked[:top_n])

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the snapshot into a plain dict."""
        return {
            'name': self.name,
            'timestamp': self.timestamp,
            'memory_mb': round(self.memory_mb, 2),
            'cpu_percent': self.cpu_percent,
            'num_threads': self.num_threads,
            'num_fds': self.num_fds,
            'gc_objects': self.gc_objects,
            'object_types': self.object_types,
        }
|
|
66
|
+
class LeakDetector:
    """
    Resource-leak detector.

    Capabilities:
    1. Record resource snapshots on demand
    2. Analyse resource growth between snapshots
    3. Flag likely leak sources (memory, objects, file descriptors, threads)
    4. Produce a human-readable diagnostic report
    """

    def __init__(self, name: str = "default"):
        # Detector name; tags the logger and the generated report.
        self.name = name
        self._snapshots: List[ResourceSnapshot] = []
        self._logger = get_logger(f"LeakDetector.{name}")
        # Optional explicit baseline; analyze() falls back to the first snapshot.
        self._baseline: Optional[ResourceSnapshot] = None

    def set_baseline(self, name: str = "baseline") -> None:
        """Capture the baseline snapshot that analyze() compares against."""
        self._baseline = ResourceSnapshot(name)
        self._logger.info(f"Baseline set: {self._baseline.memory_mb:.2f}MB")

    def snapshot(self, name: str = "") -> ResourceSnapshot:
        """Record and return a snapshot of current resource usage."""
        snapshot = ResourceSnapshot(name or f"snapshot_{len(self._snapshots)}")
        self._snapshots.append(snapshot)

        self._logger.debug(
            f"Snapshot '{snapshot.name}': {snapshot.memory_mb:.2f}MB, "
            f"{snapshot.gc_objects} objects"
        )

        return snapshot

    def analyze(self, threshold_mb: float = 10.0) -> Dict[str, Any]:
        """
        Analyse resource usage between the baseline (or first snapshot)
        and the latest snapshot.

        Args:
            threshold_mb: memory-growth threshold (MB); growth beyond it
                is reported as a potential leak.

        Returns:
            Analysis result dict (status, changes, potential_leaks, ...).
        """
        # Two data points are required before any growth can be computed.
        if len(self._snapshots) < 2:
            return {
                'status': 'insufficient_data',
                'message': 'Need at least 2 snapshots for analysis',
                'snapshot_count': len(self._snapshots),
                'changes': {
                    'memory_mb': 0.0,
                    'memory_percent': 0.0,
                    'objects': 0,
                    'objects_percent': 0.0,
                    'file_descriptors': 0,
                    'threads': 0,
                },
                'potential_leaks': [],
                'type_changes': [],
            }

        first = self._baseline or self._snapshots[0]
        latest = self._snapshots[-1]

        # Memory growth (absolute and relative).
        memory_growth_mb = latest.memory_mb - first.memory_mb
        memory_growth_percent = (memory_growth_mb / first.memory_mb) * 100 if first.memory_mb > 0 else 0

        # Live-object count growth.
        object_growth = latest.gc_objects - first.gc_objects
        object_growth_percent = (object_growth / first.gc_objects) * 100 if first.gc_objects > 0 else 0

        # File-descriptor growth.
        fd_growth = latest.num_fds - first.num_fds

        # Thread-count growth.
        thread_growth = latest.num_threads - first.num_threads

        # Leak detection against fixed heuristics.
        potential_leaks = []

        if memory_growth_mb > threshold_mb:
            potential_leaks.append({
                'type': 'memory',
                'severity': 'high' if memory_growth_mb > threshold_mb * 2 else 'medium',
                'growth_mb': round(memory_growth_mb, 2),
                'growth_percent': round(memory_growth_percent, 2),
            })

        if object_growth > 1000:
            potential_leaks.append({
                'type': 'objects',
                'severity': 'medium',
                'growth': object_growth,
                'growth_percent': round(object_growth_percent, 2),
            })

        if fd_growth > 10:
            potential_leaks.append({
                'type': 'file_descriptors',
                'severity': 'high',
                'growth': fd_growth,
            })

        if thread_growth > 5:
            potential_leaks.append({
                'type': 'threads',
                'severity': 'medium',
                'growth': thread_growth,
            })

        # Per-type object-count change analysis.
        type_changes = self._analyze_type_changes(first, latest)

        result = {
            'status': 'leak_detected' if potential_leaks else 'healthy',
            'duration_seconds': latest.timestamp - first.timestamp,
            'baseline': first.to_dict(),
            'latest': latest.to_dict(),
            'changes': {
                'memory_mb': round(memory_growth_mb, 2),
                'memory_percent': round(memory_growth_percent, 2),
                'objects': object_growth,
                'objects_percent': round(object_growth_percent, 2),
                'file_descriptors': fd_growth,
                'threads': thread_growth,
            },
            'potential_leaks': potential_leaks,
            'type_changes': type_changes,
            'snapshot_count': len(self._snapshots),
        }

        # Log the outcome at a severity matching the result.
        if potential_leaks:
            self._logger.warning(
                f"Potential leaks detected: {len(potential_leaks)} issue(s), "
                f"memory growth: {memory_growth_mb:.2f}MB ({memory_growth_percent:.1f}%)"
            )
        else:
            self._logger.info(
                f"No leaks detected, memory growth: {memory_growth_mb:.2f}MB "
                f"({memory_growth_percent:.1f}%)"
            )

        return result

    def _analyze_type_changes(self, first: ResourceSnapshot, latest: ResourceSnapshot, top_n: int = 10) -> List[Dict[str, Any]]:
        """Rank object types by how much their live count grew between snapshots."""
        changes = []

        # Consider every type seen in either snapshot.
        for type_name in set(list(first.object_types.keys()) + list(latest.object_types.keys())):
            old_count = first.object_types.get(type_name, 0)
            new_count = latest.object_types.get(type_name, 0)
            growth = new_count - old_count

            if growth > 0:
                changes.append({
                    'type': type_name,
                    'old_count': old_count,
                    'new_count': new_count,
                    'growth': growth,
                    'growth_percent': round((growth / old_count) * 100, 2) if old_count > 0 else float('inf')
                })

        # Largest absolute growth first.
        changes.sort(key=lambda x: x['growth'], reverse=True)

        return changes[:top_n]

    def get_trend(self, metric: str = 'memory_mb') -> List[float]:
        """Return the per-snapshot series of *metric* (a ResourceSnapshot attribute)."""
        return [getattr(s, metric) for s in self._snapshots]

    def generate_report(self) -> str:
        """Build a human-readable diagnostic report string."""
        if not self._snapshots:
            return "No snapshots available"

        analysis = self.analyze()

        # With insufficient data, emit a short report and stop.
        if analysis['status'] == 'insufficient_data':
            return (
                "=" * 60 + "\n" +
                "资源泄露检测报告\n" +
                "=" * 60 + "\n" +
                f"检测器: {self.name}\n" +
                f"快照数量: {analysis['snapshot_count']}\n" +
                "\n" +
                "⚠️ 数据不足: " + analysis['message'] + "\n" +
                "=" * 60
            )

        report = []
        report.append("=" * 60)
        report.append("资源泄露检测报告")
        report.append("=" * 60)
        report.append(f"检测器: {self.name}")
        report.append(f"快照数量: {analysis['snapshot_count']}")
        report.append(f"持续时间: {analysis['duration_seconds']:.2f}秒")
        report.append("")

        report.append("资源变化:")
        report.append("-" * 60)
        changes = analysis['changes']
        report.append(f"  内存: {changes['memory_mb']:+.2f}MB ({changes['memory_percent']:+.2f}%)")
        report.append(f"  对象数: {changes['objects']:+d} ({changes['objects_percent']:+.2f}%)")
        report.append(f"  文件描述符: {changes['file_descriptors']:+d}")
        report.append(f"  线程数: {changes['threads']:+d}")
        report.append("")

        if analysis['potential_leaks']:
            report.append("⚠️ 潜在泄露:")
            report.append("-" * 60)
            for leak in analysis['potential_leaks']:
                report.append(f"  - {leak['type']}: {leak['severity']} severity")
                if 'growth_mb' in leak:
                    report.append(f"    增长: {leak['growth_mb']:.2f}MB ({leak['growth_percent']:.2f}%)")
                elif 'growth' in leak:
                    report.append(f"    增长: {leak['growth']}")
            report.append("")
        else:
            report.append("✅ 未检测到明显泄露")
            report.append("")

        if analysis['type_changes']:
            report.append("对象类型变化(Top 10):")
            report.append("-" * 60)
            for change in analysis['type_changes'][:10]:
                report.append(
                    f"  {change['type']}: {change['old_count']} -> {change['new_count']} "
                    f"(+{change['growth']})"
                )
            report.append("")

        report.append("=" * 60)

        return "\n".join(report)

    def clear(self) -> None:
        """Discard all snapshots and the baseline."""
        self._snapshots.clear()
        self._baseline = None
        self._logger.debug("Snapshots cleared")
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
# Global registry of named detector instances.
_global_detectors: Dict[str, LeakDetector] = {}


def get_leak_detector(name: str = "default") -> LeakDetector:
    """
    Return the shared LeakDetector for *name*, creating it on first use.

    Args:
        name: detector name

    Returns:
        The singleton LeakDetector instance registered under that name.
    """
    detector = _global_detectors.get(name)
    if detector is None:
        detector = LeakDetector(name)
        _global_detectors[name] = detector
    return detector


def cleanup_detectors():
    """Drop every registered detector from the global registry."""
    global _global_detectors
    _global_detectors.clear()
|