crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo has been flagged as potentially problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/encoding_helper.py (new file)
@@ -0,0 +1,190 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Encoding helper module
+ ==================
+ Helper functions for detecting the encoding of HTTP responses, intended as a replacement for the w3lib library.
+
+ The module provides the following main functions:
+ - html_body_declared_encoding: detect an encoding declared in HTML meta tags
+ - http_content_type_encoding: detect the encoding in an HTTP Content-Type header
+ - read_bom: detect a byte order mark (BOM)
+ - resolve_encoding: normalize an encoding name
+ - html_to_unicode: convert HTML content to a Unicode string
+ """
+
+ import re
+ from typing import Optional, Tuple, Callable
+
+
+ def html_body_declared_encoding(html_body_str: bytes) -> Optional[str]:
+     """
+     Replacement for w3lib's HTML meta-tag encoding detection.
+
+     :param html_body_str: HTML content as bytes
+     :return: the detected encoding, or None
+     """
+     if isinstance(html_body_str, str):
+         html_body_str = html_body_str.encode('utf-8')
+
+     # Only inspect the first 4 KB of content
+     html_start = html_body_str[:4096]
+
+     try:
+         # Decode as ASCII, ignoring errors
+         html_text = html_start.decode('ascii', errors='ignore')
+
+         # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
+         # <meta charset="utf-8">
+         charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
+         if charset_match:
+             return charset_match.group(1).lower()
+
+         # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+         content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
+         if content_match:
+             return content_match.group(1).lower()
+
+     except Exception:
+         pass
+
+     return None
+
+
+ def http_content_type_encoding(content_type: str) -> Optional[str]:
+     """
+     Replacement for w3lib's HTTP Content-Type encoding detection.
+
+     :param content_type: the Content-Type header value
+     :return: the detected encoding, or None
+     """
+     if not content_type:
+         return None
+
+     charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+     if charset_match:
+         return charset_match.group(1).lower()
+
+     return None
+
+
+ def read_bom(data: bytes) -> Tuple[Optional[str], bytes]:
+     """
+     Replacement for w3lib's byte-order-mark (BOM) detection.
+
+     :param data: raw bytes
+     :return: (encoding, data with the BOM stripped)
+     """
+     # Check the 4-byte UTF-32 BOMs before the 2-byte UTF-16 BOMs:
+     # the UTF-32-LE BOM (ff fe 00 00) starts with the UTF-16-LE BOM (ff fe).
+     if data.startswith(b'\xff\xfe\x00\x00'):
+         return 'utf-32-le', data[4:]
+     elif data.startswith(b'\x00\x00\xfe\xff'):
+         return 'utf-32-be', data[4:]
+     elif data.startswith(b'\xff\xfe'):
+         return 'utf-16-le', data[2:]
+     elif data.startswith(b'\xfe\xff'):
+         return 'utf-16-be', data[2:]
+     elif data.startswith(b'\xef\xbb\xbf'):
+         return 'utf-8', data[3:]
+     else:
+         return None, data
+
+
+ def resolve_encoding(encoding: str) -> Optional[str]:
+     """
+     Replacement for w3lib's encoding-name resolution.
+
+     :param encoding: an encoding name
+     :return: the normalized encoding name, or None
+     """
+     if not encoding:
+         return None
+
+     # Common encoding aliases
+     encoding_aliases = {
+         'utf8': 'utf-8',
+         'utf-8-sig': 'utf-8',
+         'ucs2': 'utf-16',
+         'ucs-2': 'utf-16',
+         'ucs4': 'utf-32',
+         'ucs-4': 'utf-32',
+         'iso-8859-1': 'latin1',
+         'iso-latin-1': 'latin1',
+         'cp936': 'gbk',
+         'ms936': 'gbk',
+         'gb2312': 'gbk',
+         'gb_2312': 'gbk',
+         'gb_2312-80': 'gbk',
+         'csgb2312': 'gbk',
+         'big5-hkscs': 'big5',
+         'shift_jis': 'shift-jis',
+         'sjis': 'shift-jis',
+         'windows-31j': 'shift-jis',
+         'cskoi8r': 'koi8-r',
+         'koi8_r': 'koi8-r',
+     }
+
+     encoding = encoding.lower().strip()
+     return encoding_aliases.get(encoding, encoding)
+
+
+ def html_to_unicode(content_type_header: str,
+                     html_body_str: bytes,
+                     auto_detect_fun: Optional[Callable[[bytes], Optional[str]]] = None,
+                     default_encoding: str = 'utf-8') -> Tuple[str, str]:
+     """
+     Replacement for w3lib's HTML-to-Unicode conversion.
+
+     :param content_type_header: the Content-Type header
+     :param html_body_str: HTML content as bytes
+     :param auto_detect_fun: optional callback for automatic encoding detection
+     :param default_encoding: the fallback encoding
+     :return: (encoding, Unicode string)
+     """
+     # 1. Check for a BOM
+     bom_enc, html_body_str = read_bom(html_body_str)
+     if bom_enc:
+         try:
+             return bom_enc, html_body_str.decode(bom_enc)
+         except (UnicodeDecodeError, LookupError):
+             pass
+
+     # 2. Take the encoding from the Content-Type header
+     header_enc = http_content_type_encoding(content_type_header)
+     if header_enc:
+         try:
+             return header_enc, html_body_str.decode(header_enc)
+         except (UnicodeDecodeError, LookupError):
+             pass
+
+     # 3. Take the encoding from the HTML meta tags
+     meta_enc = html_body_declared_encoding(html_body_str)
+     if meta_enc:
+         try:
+             return meta_enc, html_body_str.decode(meta_enc)
+         except (UnicodeDecodeError, LookupError):
+             pass
+
+     # 4. Use the auto-detect callback
+     if auto_detect_fun:
+         auto_enc = auto_detect_fun(html_body_str)
+         if auto_enc:
+             try:
+                 return auto_enc, html_body_str.decode(auto_enc)
+             except (UnicodeDecodeError, LookupError):
+                 pass
+
+     # 5. Fall back to the default encoding
+     try:
+         return default_encoding, html_body_str.decode(default_encoding)
+     except (UnicodeDecodeError, LookupError):
+         # As a last resort, decode with error replacement
+         return 'utf-8', html_body_str.decode('utf-8', errors='replace')
+
+
+ __all__ = [
+     "html_body_declared_encoding",
+     "http_content_type_encoding",
+     "read_bom",
+     "resolve_encoding",
+     "html_to_unicode"
+ ]
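
To show how the new helpers compose, here is a minimal usage sketch. It is not from the package: it assumes crawlo.utils.encoding_helper is importable as shown, and the header and body values are illustrative only.

from crawlo.utils.encoding_helper import html_to_unicode, read_bom, resolve_encoding

body = b'\xef\xbb\xbf<html><head><meta charset="utf-8"></head><body>ok</body></html>'

# read_bom names the encoding and strips the marker
enc, stripped = read_bom(body)  # ('utf-8', b'<html>...')

# html_to_unicode tries BOM, then the Content-Type header, then meta tags,
# then the optional detector callback, then the default encoding
encoding, text = html_to_unicode('text/html; charset=utf-8', body)

print(resolve_encoding('GB2312'), encoding)  # gbk utf-8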
crawlo/utils/error_handler.py
@@ -9,7 +9,7 @@ from functools import wraps
  from datetime import datetime
  from typing import Optional, Callable, Any, Dict, List

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class ErrorContext:
@@ -66,7 +66,7 @@ class EnhancedErrorHandler:
      """Enhanced error handler"""

      def __init__(self, logger_name: str = __name__, log_level: str = 'ERROR'):
-         self.logger = get_logger(logger_name, log_level)
+         self.logger = get_logger(logger_name)
          self.error_history: List[Dict] = []  # error history
          self.max_history_size = 100  # maximum number of history entries
crawlo/utils/large_scale_helper.py
@@ -8,7 +8,7 @@ import json
  import time
  from typing import Generator, List, Dict, Any

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class LargeScaleHelper:
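
The two hunks above share the release-wide logging migration: get_logger moves from crawlo.utils.log to crawlo.logging and no longer takes a per-call level argument. A hypothetical before/after sketch for downstream code follows; the idea that the level is now set centrally by the logging configuration is an inference from the removed argument, not something this diff states.

# 1.4.6 style (removed):
#   from crawlo.utils.log import get_logger
#   logger = get_logger(__name__, 'ERROR')

# 1.4.8 style:
from crawlo.logging import get_logger

logger = get_logger(__name__)  # level presumably comes from the logging configuration
logger.error("request failed")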
crawlo/utils/leak_detector.py (new file)
@@ -0,0 +1,335 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Resource leak detector
+ ==============
+
+ Provides resource-leak detection and analysis.
+ """
+ import gc
+ import time
+ import psutil
+ from typing import Dict, List, Any, Optional
+ from collections import defaultdict
+
+ from crawlo.logging import get_logger
+
+
+ class ResourceSnapshot:
+     """A point-in-time snapshot of process resources"""
+
+     def __init__(self, name: str = ""):
+         self.name = name
+         self.timestamp = time.time()
+
+         # Process information
+         process = psutil.Process()
+         self.memory_mb = process.memory_info().rss / 1024 / 1024
+         self.cpu_percent = process.cpu_percent()
+         self.num_threads = process.num_threads()
+         self.num_fds = process.num_fds() if hasattr(process, 'num_fds') else 0
+
+         # GC information
+         gc.collect()  # run a collection first
+         self.gc_objects = len(gc.get_objects())
+         self.gc_stats = gc.get_stats()
+
+         # Object-type statistics
+         self.object_types = self._count_object_types()
+
+     def _count_object_types(self, top_n: int = 20) -> Dict[str, int]:
+         """Count the top N object types"""
+         type_counts = defaultdict(int)
+
+         for obj in gc.get_objects():
+             type_name = type(obj).__name__
+             type_counts[type_name] += 1
+
+         # Return the top N
+         sorted_types = sorted(type_counts.items(), key=lambda x: x[1], reverse=True)
+         return dict(sorted_types[:top_n])
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to a dictionary"""
+         return {
+             'name': self.name,
+             'timestamp': self.timestamp,
+             'memory_mb': round(self.memory_mb, 2),
+             'cpu_percent': self.cpu_percent,
+             'num_threads': self.num_threads,
+             'num_fds': self.num_fds,
+             'gc_objects': self.gc_objects,
+             'object_types': self.object_types,
+         }
+
+
+ class LeakDetector:
+     """
+     Resource leak detector
+
+     Features:
+     1. Record resource snapshots periodically
+     2. Analyze resource growth trends
+     3. Identify likely leak points
+     4. Generate diagnostic reports
+     """
+
+     def __init__(self, name: str = "default"):
+         self.name = name
+         self._snapshots: List[ResourceSnapshot] = []
+         self._logger = get_logger(f"LeakDetector.{name}")
+         self._baseline: Optional[ResourceSnapshot] = None
+
+     def set_baseline(self, name: str = "baseline"):
+         """Record the baseline snapshot"""
+         self._baseline = ResourceSnapshot(name)
+         self._logger.info(f"Baseline set: {self._baseline.memory_mb:.2f}MB")
+
+     def snapshot(self, name: str = ""):
+         """Record a snapshot of current resource usage"""
+         snapshot = ResourceSnapshot(name or f"snapshot_{len(self._snapshots)}")
+         self._snapshots.append(snapshot)
+
+         self._logger.debug(
+             f"Snapshot '{snapshot.name}': {snapshot.memory_mb:.2f}MB, "
+             f"{snapshot.gc_objects} objects"
+         )
+
+         return snapshot
+
+     def analyze(self, threshold_mb: float = 10.0) -> Dict[str, Any]:
+         """
+         Analyze resource usage.
+
+         Args:
+             threshold_mb: memory-growth threshold in MB; growth beyond it is treated as a possible leak
+
+         Returns:
+             The analysis result
+         """
+         if len(self._snapshots) < 2:
+             return {
+                 'status': 'insufficient_data',
+                 'message': 'Need at least 2 snapshots for analysis',
+                 'snapshot_count': len(self._snapshots),
+                 'changes': {
+                     'memory_mb': 0.0,
+                     'memory_percent': 0.0,
+                     'objects': 0,
+                     'objects_percent': 0.0,
+                     'file_descriptors': 0,
+                     'threads': 0,
+                 },
+                 'potential_leaks': [],
+                 'type_changes': [],
+             }
+
+         first = self._baseline or self._snapshots[0]
+         latest = self._snapshots[-1]
+
+         # Memory growth
+         memory_growth_mb = latest.memory_mb - first.memory_mb
+         memory_growth_percent = (memory_growth_mb / first.memory_mb) * 100 if first.memory_mb > 0 else 0
+
+         # Object-count growth
+         object_growth = latest.gc_objects - first.gc_objects
+         object_growth_percent = (object_growth / first.gc_objects) * 100 if first.gc_objects > 0 else 0
+
+         # File-descriptor growth
+         fd_growth = latest.num_fds - first.num_fds
+
+         # Thread-count growth
+         thread_growth = latest.num_threads - first.num_threads
+
+         # Leak detection
+         potential_leaks = []
+
+         if memory_growth_mb > threshold_mb:
+             potential_leaks.append({
+                 'type': 'memory',
+                 'severity': 'high' if memory_growth_mb > threshold_mb * 2 else 'medium',
+                 'growth_mb': round(memory_growth_mb, 2),
+                 'growth_percent': round(memory_growth_percent, 2),
+             })
+
+         if object_growth > 1000:
+             potential_leaks.append({
+                 'type': 'objects',
+                 'severity': 'medium',
+                 'growth': object_growth,
+                 'growth_percent': round(object_growth_percent, 2),
+             })
+
+         if fd_growth > 10:
+             potential_leaks.append({
+                 'type': 'file_descriptors',
+                 'severity': 'high',
+                 'growth': fd_growth,
+             })
+
+         if thread_growth > 5:
+             potential_leaks.append({
+                 'type': 'threads',
+                 'severity': 'medium',
+                 'growth': thread_growth,
+             })
+
+         # Object-type change analysis
+         type_changes = self._analyze_type_changes(first, latest)
+
+         result = {
+             'status': 'leak_detected' if potential_leaks else 'healthy',
+             'duration_seconds': latest.timestamp - first.timestamp,
+             'baseline': first.to_dict(),
+             'latest': latest.to_dict(),
+             'changes': {
+                 'memory_mb': round(memory_growth_mb, 2),
+                 'memory_percent': round(memory_growth_percent, 2),
+                 'objects': object_growth,
+                 'objects_percent': round(object_growth_percent, 2),
+                 'file_descriptors': fd_growth,
+                 'threads': thread_growth,
+             },
+             'potential_leaks': potential_leaks,
+             'type_changes': type_changes,
+             'snapshot_count': len(self._snapshots),
+         }
+
+         # Log the analysis result
+         if potential_leaks:
+             self._logger.warning(
+                 f"Potential leaks detected: {len(potential_leaks)} issue(s), "
+                 f"memory growth: {memory_growth_mb:.2f}MB ({memory_growth_percent:.1f}%)"
+             )
+         else:
+             self._logger.info(
+                 f"No leaks detected, memory growth: {memory_growth_mb:.2f}MB "
+                 f"({memory_growth_percent:.1f}%)"
+             )
+
+         return result
+
+     def _analyze_type_changes(self, first: ResourceSnapshot, latest: ResourceSnapshot, top_n: int = 10) -> List[Dict[str, Any]]:
+         """Analyze changes in object-type counts"""
+         changes = []
+
+         # Find the types that grew the most
+         for type_name in set(list(first.object_types.keys()) + list(latest.object_types.keys())):
+             old_count = first.object_types.get(type_name, 0)
+             new_count = latest.object_types.get(type_name, 0)
+             growth = new_count - old_count
+
+             if growth > 0:
+                 changes.append({
+                     'type': type_name,
+                     'old_count': old_count,
+                     'new_count': new_count,
+                     'growth': growth,
+                     'growth_percent': round((growth / old_count) * 100, 2) if old_count > 0 else float('inf')
+                 })
+
+         # Sort by absolute growth
+         changes.sort(key=lambda x: x['growth'], reverse=True)
+
+         return changes[:top_n]
+
+     def get_trend(self, metric: str = 'memory_mb') -> List[float]:
+         """Return the trend of a metric across snapshots"""
+         return [getattr(s, metric) for s in self._snapshots]
+
+     def generate_report(self) -> str:
+         """Generate a diagnostic report"""
+         if not self._snapshots:
+             return "No snapshots available"
+
+         analysis = self.analyze()
+
+         # With insufficient data, return a short report
+         if analysis['status'] == 'insufficient_data':
+             return (
+                 "=" * 60 + "\n" +
+                 "Resource Leak Detection Report\n" +
+                 "=" * 60 + "\n" +
+                 f"Detector: {self.name}\n" +
+                 f"Snapshots: {analysis['snapshot_count']}\n" +
+                 "\n" +
+                 "⚠️ Insufficient data: " + analysis['message'] + "\n" +
+                 "=" * 60
+             )
+
+         report = []
+         report.append("=" * 60)
+         report.append("Resource Leak Detection Report")
+         report.append("=" * 60)
+         report.append(f"Detector: {self.name}")
+         report.append(f"Snapshots: {analysis['snapshot_count']}")
+         report.append(f"Duration: {analysis['duration_seconds']:.2f}s")
+         report.append("")
+
+         report.append("Resource changes:")
+         report.append("-" * 60)
+         changes = analysis['changes']
+         report.append(f"  Memory: {changes['memory_mb']:+.2f}MB ({changes['memory_percent']:+.2f}%)")
+         report.append(f"  Objects: {changes['objects']:+d} ({changes['objects_percent']:+.2f}%)")
+         report.append(f"  File descriptors: {changes['file_descriptors']:+d}")
+         report.append(f"  Threads: {changes['threads']:+d}")
+         report.append("")
+
+         if analysis['potential_leaks']:
+             report.append("⚠️ Potential leaks:")
+             report.append("-" * 60)
+             for leak in analysis['potential_leaks']:
+                 report.append(f"  - {leak['type']}: {leak['severity']} severity")
+                 if 'growth_mb' in leak:
+                     report.append(f"    growth: {leak['growth_mb']:.2f}MB ({leak['growth_percent']:.2f}%)")
+                 elif 'growth' in leak:
+                     report.append(f"    growth: {leak['growth']}")
+             report.append("")
+         else:
+             report.append("✅ No obvious leaks detected")
+             report.append("")
+
+         if analysis['type_changes']:
+             report.append("Object type changes (top 10):")
+             report.append("-" * 60)
+             for change in analysis['type_changes'][:10]:
+                 report.append(
+                     f"  {change['type']}: {change['old_count']} -> {change['new_count']} "
+                     f"(+{change['growth']})"
+                 )
+             report.append("")
+
+         report.append("=" * 60)
+
+         return "\n".join(report)
+
+     def clear(self):
+         """Clear all snapshots"""
+         self._snapshots.clear()
+         self._baseline = None
+         self._logger.debug("Snapshots cleared")
+
+
+ # Global detector registry
+ _global_detectors: Dict[str, LeakDetector] = {}
+
+
+ def get_leak_detector(name: str = "default") -> LeakDetector:
+     """
+     Get a leak-detector instance (one singleton per name).
+
+     Args:
+         name: detector name
+
+     Returns:
+         A LeakDetector instance
+     """
+     if name not in _global_detectors:
+         _global_detectors[name] = LeakDetector(name)
+     return _global_detectors[name]
+
+
+ def cleanup_detectors():
+     """Remove all registered detectors"""
+     global _global_detectors
+     _global_detectors.clear()
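
A minimal usage sketch for the detector, based on the public surface shown above; it is not from the package, and the snapshot cadence and threshold are illustrative choices.

from crawlo.utils.leak_detector import get_leak_detector

detector = get_leak_detector("crawler")  # per-name singleton
detector.set_baseline()

for batch in range(3):
    # ... run a unit of work that might leak ...
    detector.snapshot(f"after_batch_{batch}")

result = detector.analyze(threshold_mb=10.0)  # growth beyond 10 MB flags a leak
if result['status'] == 'leak_detected':
    print(detector.generate_report())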