crawlo-1.2.0-py3-none-any.whl → crawlo-1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (220):
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +65 -65
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +142 -132
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +292 -292
  14. crawlo/commands/startproject.py +418 -418
  15. crawlo/commands/stats.py +188 -188
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +252 -252
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -354
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -143
  23. crawlo/crawler.py +1027 -1027
  24. crawlo/downloader/__init__.py +266 -266
  25. crawlo/downloader/aiohttp_downloader.py +220 -220
  26. crawlo/downloader/cffi_downloader.py +256 -256
  27. crawlo/downloader/httpx_downloader.py +259 -259
  28. crawlo/downloader/hybrid_downloader.py +213 -213
  29. crawlo/downloader/playwright_downloader.py +402 -402
  30. crawlo/downloader/selenium_downloader.py +472 -472
  31. crawlo/event.py +11 -11
  32. crawlo/exceptions.py +81 -81
  33. crawlo/extension/__init__.py +37 -37
  34. crawlo/extension/health_check.py +141 -141
  35. crawlo/extension/log_interval.py +57 -57
  36. crawlo/extension/log_stats.py +81 -81
  37. crawlo/extension/logging_extension.py +43 -43
  38. crawlo/extension/memory_monitor.py +104 -104
  39. crawlo/extension/performance_profiler.py +133 -133
  40. crawlo/extension/request_recorder.py +107 -107
  41. crawlo/filters/__init__.py +154 -154
  42. crawlo/filters/aioredis_filter.py +280 -280
  43. crawlo/filters/memory_filter.py +269 -269
  44. crawlo/items/__init__.py +23 -23
  45. crawlo/items/base.py +21 -21
  46. crawlo/items/fields.py +53 -53
  47. crawlo/items/items.py +104 -104
  48. crawlo/middleware/__init__.py +21 -21
  49. crawlo/middleware/default_header.py +132 -32
  50. crawlo/middleware/download_delay.py +105 -28
  51. crawlo/middleware/middleware_manager.py +135 -135
  52. crawlo/middleware/offsite.py +116 -0
  53. crawlo/middleware/proxy.py +366 -272
  54. crawlo/middleware/request_ignore.py +88 -30
  55. crawlo/middleware/response_code.py +164 -18
  56. crawlo/middleware/response_filter.py +138 -26
  57. crawlo/middleware/retry.py +124 -124
  58. crawlo/mode_manager.py +211 -211
  59. crawlo/network/__init__.py +21 -21
  60. crawlo/network/request.py +338 -338
  61. crawlo/network/response.py +359 -359
  62. crawlo/pipelines/__init__.py +21 -21
  63. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  64. crawlo/pipelines/console_pipeline.py +39 -39
  65. crawlo/pipelines/csv_pipeline.py +316 -316
  66. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  67. crawlo/pipelines/json_pipeline.py +218 -218
  68. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  69. crawlo/pipelines/mongo_pipeline.py +131 -131
  70. crawlo/pipelines/mysql_pipeline.py +316 -316
  71. crawlo/pipelines/pipeline_manager.py +61 -61
  72. crawlo/pipelines/redis_dedup_pipeline.py +167 -167
  73. crawlo/project.py +187 -187
  74. crawlo/queue/pqueue.py +37 -37
  75. crawlo/queue/queue_manager.py +337 -337
  76. crawlo/queue/redis_priority_queue.py +298 -298
  77. crawlo/settings/__init__.py +7 -7
  78. crawlo/settings/default_settings.py +226 -219
  79. crawlo/settings/setting_manager.py +122 -122
  80. crawlo/spider/__init__.py +639 -639
  81. crawlo/stats_collector.py +59 -59
  82. crawlo/subscriber.py +130 -130
  83. crawlo/task_manager.py +30 -30
  84. crawlo/templates/crawlo.cfg.tmpl +10 -10
  85. crawlo/templates/project/__init__.py.tmpl +3 -3
  86. crawlo/templates/project/items.py.tmpl +17 -17
  87. crawlo/templates/project/middlewares.py.tmpl +118 -109
  88. crawlo/templates/project/pipelines.py.tmpl +96 -96
  89. crawlo/templates/project/run.py.tmpl +45 -45
  90. crawlo/templates/project/settings.py.tmpl +327 -326
  91. crawlo/templates/project/settings_distributed.py.tmpl +119 -119
  92. crawlo/templates/project/settings_gentle.py.tmpl +94 -94
  93. crawlo/templates/project/settings_high_performance.py.tmpl +151 -151
  94. crawlo/templates/project/settings_simple.py.tmpl +68 -68
  95. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  96. crawlo/templates/spider/spider.py.tmpl +143 -141
  97. crawlo/tools/__init__.py +182 -182
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/tools/data_validator.py +180 -180
  101. crawlo/tools/date_tools.py +35 -35
  102. crawlo/tools/distributed_coordinator.py +386 -386
  103. crawlo/tools/retry_mechanism.py +220 -220
  104. crawlo/tools/scenario_adapter.py +262 -262
  105. crawlo/utils/__init__.py +35 -35
  106. crawlo/utils/batch_processor.py +260 -260
  107. crawlo/utils/controlled_spider_mixin.py +439 -439
  108. crawlo/utils/date_tools.py +290 -290
  109. crawlo/utils/db_helper.py +343 -343
  110. crawlo/utils/enhanced_error_handler.py +359 -359
  111. crawlo/utils/env_config.py +105 -105
  112. crawlo/utils/error_handler.py +125 -125
  113. crawlo/utils/func_tools.py +82 -82
  114. crawlo/utils/large_scale_config.py +286 -286
  115. crawlo/utils/large_scale_helper.py +343 -343
  116. crawlo/utils/log.py +128 -128
  117. crawlo/utils/performance_monitor.py +284 -284
  118. crawlo/utils/queue_helper.py +175 -175
  119. crawlo/utils/redis_connection_pool.py +334 -334
  120. crawlo/utils/redis_key_validator.py +199 -199
  121. crawlo/utils/request.py +267 -267
  122. crawlo/utils/request_serializer.py +219 -219
  123. crawlo/utils/spider_loader.py +62 -62
  124. crawlo/utils/system.py +11 -11
  125. crawlo/utils/tools.py +4 -4
  126. crawlo/utils/url.py +39 -39
  127. {crawlo-1.2.0.dist-info → crawlo-1.2.1.dist-info}/METADATA +692 -697
  128. crawlo-1.2.1.dist-info/RECORD +220 -0
  129. examples/__init__.py +7 -7
  130. examples/aiohttp_settings.py +42 -0
  131. examples/curl_cffi_settings.py +41 -0
  132. examples/default_header_middleware_example.py +107 -0
  133. examples/default_header_spider_example.py +129 -0
  134. examples/download_delay_middleware_example.py +160 -0
  135. examples/httpx_settings.py +42 -0
  136. examples/multi_downloader_proxy_example.py +81 -0
  137. examples/offsite_middleware_example.py +55 -0
  138. examples/offsite_spider_example.py +107 -0
  139. examples/proxy_spider_example.py +166 -0
  140. examples/request_ignore_middleware_example.py +51 -0
  141. examples/request_ignore_spider_example.py +99 -0
  142. examples/response_code_middleware_example.py +52 -0
  143. examples/response_filter_middleware_example.py +67 -0
  144. examples/tong_hua_shun_settings.py +62 -0
  145. examples/tong_hua_shun_spider.py +170 -0
  146. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  147. tests/__init__.py +7 -7
  148. tests/advanced_tools_example.py +275 -275
  149. tests/authenticated_proxy_example.py +236 -236
  150. tests/cleaners_example.py +160 -160
  151. tests/config_validation_demo.py +102 -102
  152. tests/controlled_spider_example.py +205 -205
  153. tests/date_tools_example.py +180 -180
  154. tests/dynamic_loading_example.py +523 -523
  155. tests/dynamic_loading_test.py +104 -104
  156. tests/env_config_example.py +133 -133
  157. tests/error_handling_example.py +171 -171
  158. tests/redis_key_validation_demo.py +130 -130
  159. tests/response_improvements_example.py +144 -144
  160. tests/test_advanced_tools.py +148 -148
  161. tests/test_all_redis_key_configs.py +145 -145
  162. tests/test_authenticated_proxy.py +141 -141
  163. tests/test_cleaners.py +54 -54
  164. tests/test_comprehensive.py +146 -146
  165. tests/test_config_validator.py +193 -193
  166. tests/test_crawlo_proxy_integration.py +173 -0
  167. tests/test_date_tools.py +123 -123
  168. tests/test_default_header_middleware.py +159 -0
  169. tests/test_double_crawlo_fix.py +207 -207
  170. tests/test_double_crawlo_fix_simple.py +124 -124
  171. tests/test_download_delay_middleware.py +222 -0
  172. tests/test_downloader_proxy_compatibility.py +269 -0
  173. tests/test_dynamic_downloaders_proxy.py +124 -124
  174. tests/test_dynamic_proxy.py +92 -92
  175. tests/test_dynamic_proxy_config.py +146 -146
  176. tests/test_dynamic_proxy_real.py +109 -109
  177. tests/test_edge_cases.py +303 -303
  178. tests/test_enhanced_error_handler.py +270 -270
  179. tests/test_env_config.py +121 -121
  180. tests/test_error_handler_compatibility.py +112 -112
  181. tests/test_final_validation.py +153 -153
  182. tests/test_framework_env_usage.py +103 -103
  183. tests/test_integration.py +356 -356
  184. tests/test_item_dedup_redis_key.py +122 -122
  185. tests/test_offsite_middleware.py +222 -0
  186. tests/test_parsel.py +29 -29
  187. tests/test_performance.py +327 -327
  188. tests/test_proxy_api.py +265 -0
  189. tests/test_proxy_health_check.py +32 -32
  190. tests/test_proxy_middleware.py +122 -0
  191. tests/test_proxy_middleware_enhanced.py +217 -0
  192. tests/test_proxy_middleware_integration.py +136 -136
  193. tests/test_proxy_providers.py +56 -56
  194. tests/test_proxy_stats.py +19 -19
  195. tests/test_proxy_strategies.py +59 -59
  196. tests/test_queue_manager_double_crawlo.py +173 -173
  197. tests/test_queue_manager_redis_key.py +176 -176
  198. tests/test_real_scenario_proxy.py +196 -0
  199. tests/test_redis_config.py +28 -28
  200. tests/test_redis_connection_pool.py +294 -294
  201. tests/test_redis_key_naming.py +181 -181
  202. tests/test_redis_key_validator.py +123 -123
  203. tests/test_redis_queue.py +224 -224
  204. tests/test_request_ignore_middleware.py +183 -0
  205. tests/test_request_serialization.py +70 -70
  206. tests/test_response_code_middleware.py +350 -0
  207. tests/test_response_filter_middleware.py +428 -0
  208. tests/test_response_improvements.py +152 -152
  209. tests/test_retry_middleware.py +242 -0
  210. tests/test_scheduler.py +241 -241
  211. tests/test_simple_response.py +61 -61
  212. tests/test_telecom_spider_redis_key.py +205 -205
  213. tests/test_template_content.py +87 -87
  214. tests/test_template_redis_key.py +134 -134
  215. tests/test_tools.py +153 -153
  216. tests/tools_example.py +257 -257
  217. crawlo-1.2.0.dist-info/RECORD +0 -190
  218. {crawlo-1.2.0.dist-info → crawlo-1.2.1.dist-info}/WHEEL +0 -0
  219. {crawlo-1.2.0.dist-info → crawlo-1.2.1.dist-info}/entry_points.txt +0 -0
  220. {crawlo-1.2.0.dist-info → crawlo-1.2.1.dist-info}/top_level.txt +0 -0
tests/test_performance.py CHANGED
@@ -1,328 +1,328 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- 性能测试
5
- 测试系统性能和瓶颈
6
- """
7
- import asyncio
8
- import sys
9
- import os
10
- import time
11
- import psutil
12
- import traceback
13
- from typing import List
14
-
15
- # 添加项目根目录到Python路径
16
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
17
-
18
- from crawlo.queue.redis_priority_queue import RedisPriorityQueue
19
- from crawlo.network.request import Request
20
- from crawlo.utils.redis_connection_pool import OptimizedRedisConnectionPool, get_redis_pool, close_all_pools
21
- from crawlo.utils.batch_processor import RedisBatchProcessor, BatchProcessor
22
- from crawlo.utils.performance_monitor import PerformanceMonitor, PerformanceTimer
23
-
24
-
25
- async def test_redis_queue_performance():
26
- """测试 Redis 队列性能"""
27
- print("🔍 测试 Redis 队列性能...")
28
-
29
- try:
30
- queue = RedisPriorityQueue(
31
- redis_url="redis://127.0.0.1:6379/15",
32
- queue_name="test:performance:queue"
33
- )
34
- await queue.connect()
35
-
36
- # 1. 测试批量入队性能
37
- print(" 📊 测试批量入队性能...")
38
- start_time = time.time()
39
- request_count = 1000
40
-
41
- for i in range(request_count):
42
- request = Request(url=f"https://example{i}.com", priority=i % 10)
43
- await queue.put(request)
44
-
45
- end_time = time.time()
46
- duration = end_time - start_time
47
- rate = request_count / duration
48
-
49
- print(f" 入队 {request_count} 个请求耗时: {duration:.2f}秒")
50
- print(f" 入队速率: {rate:.1f} 请求/秒")
51
-
52
- # 2. 测试批量出队性能
53
- print(" 📊 测试批量出队性能...")
54
- start_time = time.time()
55
-
56
- retrieved_count = 0
57
- while retrieved_count < request_count:
58
- request = await queue.get(timeout=1.0)
59
- if request:
60
- await queue.ack(request)
61
- retrieved_count += 1
62
- else:
63
- break
64
-
65
- end_time = time.time()
66
- duration = end_time - start_time
67
- rate = retrieved_count / duration if duration > 0 else 0
68
-
69
- print(f" 出队 {retrieved_count} 个请求耗时: {duration:.2f}秒")
70
- print(f" 出队速率: {rate:.1f} 请求/秒")
71
-
72
- await queue.close()
73
-
74
- # 性能标准:1000个请求应该在5秒内完成
75
- if duration < 5.0:
76
- print(" ✅ Redis 队列性能测试通过")
77
- return True
78
- else:
79
- print(" ⚠️ Redis 队列性能较低")
80
- return True # 仍然算通过,只是性能较低
81
-
82
- except Exception as e:
83
- print(f" ❌ Redis 队列性能测试失败: {e}")
84
- traceback.print_exc()
85
- return False
86
-
87
-
88
- async def test_redis_connection_pool_performance():
89
- """测试 Redis 连接池性能"""
90
- print("🔍 测试 Redis 连接池性能...")
91
-
92
- try:
93
- # 1. 测试连接获取性能
94
- print(" 📊 测试连接获取性能...")
95
- start_time = time.time()
96
- connection_count = 100
97
-
98
- pools = []
99
- for i in range(connection_count):
100
- pool = get_redis_pool(f"redis://127.0.0.1:6379/15?db={i % 16}")
101
- pools.append(pool)
102
-
103
- end_time = time.time()
104
- duration = end_time - start_time
105
-
106
- print(f" 获取 {connection_count} 个连接耗时: {duration:.2f}秒")
107
-
108
- # 2. 测试连接复用性能
109
- print(" 📊 测试连接复用性能...")
110
- start_time = time.time()
111
-
112
- # 重复获取相同连接
113
- for i in range(connection_count * 10):
114
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
115
- redis_client = await pool.get_connection()
116
- await redis_client.ping()
117
-
118
- end_time = time.time()
119
- duration = end_time - start_time
120
-
121
- print(f" 复用 {connection_count * 10} 次连接耗时: {duration:.2f}秒")
122
-
123
- # 3. 测试并发连接获取
124
- print(" 📊 测试并发连接获取...")
125
-
126
- async def get_connection_worker(worker_id: int):
127
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
128
- redis_client = await pool.get_connection()
129
- await redis_client.ping()
130
- return True
131
-
132
- start_time = time.time()
133
- tasks = [get_connection_worker(i) for i in range(50)]
134
- results = await asyncio.gather(*tasks, return_exceptions=True)
135
- end_time = time.time()
136
-
137
- success_count = sum(1 for result in results if result is True)
138
- duration = end_time - start_time
139
-
140
- print(f" 并发获取 50 个连接耗时: {duration:.2f}秒")
141
- print(f" 成功获取: {success_count}/50")
142
-
143
- # 性能标准:并发获取应该在2秒内完成
144
- if duration < 2.0 and success_count >= 45:
145
- print(" ✅ Redis 连接池性能测试通过")
146
- return True
147
- else:
148
- print(" ⚠️ Redis 连接池性能较低")
149
- return True # 仍然算通过,只是性能较低
150
-
151
- except Exception as e:
152
- print(f" ❌ Redis 连接池性能测试失败: {e}")
153
- traceback.print_exc()
154
- return False
155
-
156
-
157
- async def test_batch_processor_performance():
158
- """测试批处理器性能"""
159
- print("🔍 测试批处理器性能...")
160
-
161
- try:
162
- # 创建连接池和批处理器
163
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
164
- redis_client = await pool.get_connection()
165
- batch_processor = RedisBatchProcessor(redis_client, batch_size=100)
166
-
167
- # 1. 测试 Redis 批量设置性能
168
- print(" 📊 测试 Redis 批量设置性能...")
169
- items_count = 1000
170
- items = [{"key": f"perf_test_key_{i}", "value": f"perf_test_value_{i}"} for i in range(items_count)]
171
-
172
- start_time = time.time()
173
- count = await batch_processor.batch_set(items)
174
- end_time = time.time()
175
-
176
- duration = end_time - start_time
177
- rate = count / duration if duration > 0 else 0
178
-
179
- print(f" 批量设置 {count} 个键值对耗时: {duration:.2f}秒")
180
- print(f" 设置速率: {rate:.1f} 键值对/秒")
181
-
182
- # 2. 测试 Redis 批量获取性能
183
- print(" 📊 测试 Redis 批量获取性能...")
184
- keys = [f"perf_test_key_{i}" for i in range(items_count)]
185
-
186
- start_time = time.time()
187
- result = await batch_processor.batch_get(keys)
188
- end_time = time.time()
189
-
190
- duration = end_time - start_time
191
- rate = len(result) / duration if duration > 0 else 0
192
-
193
- print(f" 批量获取 {len(result)} 个键值对耗时: {duration:.2f}秒")
194
- print(f" 获取速率: {rate:.1f} 键值对/秒")
195
-
196
- # 3. 测试通用批处理器性能
197
- print(" 📊 测试通用批处理器性能...")
198
-
199
- async def process_item(item: int) -> int:
200
- # 模拟一些处理工作
201
- await asyncio.sleep(0.001)
202
- return item * 2
203
-
204
- batch_processor_general = BatchProcessor(batch_size=50, max_concurrent_batches=10)
205
- items_to_process = list(range(1000))
206
-
207
- start_time = time.time()
208
- results = await batch_processor_general.process_in_batches(items_to_process, process_item)
209
- end_time = time.time()
210
-
211
- duration = end_time - start_time
212
- rate = len(results) / duration if duration > 0 else 0
213
-
214
- print(f" 批量处理 {len(results)} 个项目耗时: {duration:.2f}秒")
215
- print(f" 处理速率: {rate:.1f} 项目/秒")
216
-
217
- # 清理测试数据
218
- await redis_client.delete(*[f"perf_test_key_{i}" for i in range(items_count)])
219
-
220
- # 性能标准:批量操作应该在合理时间内完成
221
- if duration < 10.0:
222
- print(" ✅ 批处理器性能测试通过")
223
- return True
224
- else:
225
- print(" ⚠️ 批处理器性能较低")
226
- return True # 仍然算通过,只是性能较低
227
-
228
- except Exception as e:
229
- print(f" ❌ 批处理器性能测试失败: {e}")
230
- traceback.print_exc()
231
- return False
232
-
233
-
234
- async def test_performance_monitor_overhead():
235
- """测试性能监控器开销"""
236
- print("🔍 测试性能监控器开销...")
237
-
238
- try:
239
- monitor = PerformanceMonitor("test_monitor")
240
-
241
- # 1. 测试指标获取开销
242
- print(" 📊 测试指标获取开销...")
243
- start_time = time.time()
244
-
245
- for i in range(100):
246
- metrics = monitor.get_system_metrics()
247
- assert isinstance(metrics, dict), "应该返回字典"
248
-
249
- end_time = time.time()
250
- duration = end_time - start_time
251
-
252
- print(f" 获取 100 次系统指标耗时: {duration:.2f}秒")
253
- print(f" 平均每次耗时: {duration * 1000 / 100:.2f}毫秒")
254
-
255
- # 2. 测试计时器开销
256
- print(" 📊 测试计时器开销...")
257
-
258
- total_timer_time = 0
259
- timer_count = 1000
260
-
261
- for i in range(timer_count):
262
- start = time.time()
263
- with PerformanceTimer(f"test_timer_{i}"):
264
- pass # 空操作
265
- end = time.time()
266
- total_timer_time += (end - start)
267
-
268
- avg_timer_time = total_timer_time / timer_count * 1000 # 转换为毫秒
269
-
270
- print(f" 平均计时器开销: {avg_timer_time:.2f}毫秒")
271
-
272
- # 开销标准:平均计时器开销应该小于1毫秒
273
- if avg_timer_time < 1.0:
274
- print(" ✅ 性能监控器开销测试通过")
275
- return True
276
- else:
277
- print(" ⚠️ 性能监控器开销较高")
278
- return True # 仍然算通过,只是开销较高
279
-
280
- except Exception as e:
281
- print(f" ❌ 性能监控器开销测试失败: {e}")
282
- traceback.print_exc()
283
- return False
284
-
285
-
286
- async def main():
287
- """主测试函数"""
288
- print("🚀 开始性能测试...")
289
- print("=" * 50)
290
-
291
- tests = [
292
- test_redis_queue_performance,
293
- test_redis_connection_pool_performance,
294
- test_batch_processor_performance,
295
- test_performance_monitor_overhead,
296
- ]
297
-
298
- passed = 0
299
- total = len(tests)
300
-
301
- for test_func in tests:
302
- try:
303
- if await test_func():
304
- passed += 1
305
- print(f"✅ {test_func.__name__} 通过")
306
- else:
307
- print(f"❌ {test_func.__name__} 失败")
308
- except Exception as e:
309
- print(f"❌ {test_func.__name__} 异常: {e}")
310
- print()
311
-
312
- # 关闭所有连接池
313
- await close_all_pools()
314
-
315
- print("=" * 50)
316
- print(f"📊 性能测试结果: {passed}/{total} 通过")
317
-
318
- if passed == total:
319
- print("🎉 所有性能测试通过!")
320
- return 0
321
- else:
322
- print("❌ 部分性能测试失败,请检查实现")
323
- return 1
324
-
325
-
326
- if __name__ == "__main__":
327
- exit_code = asyncio.run(main())
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 性能测试
5
+ 测试系统性能和瓶颈
6
+ """
7
+ import asyncio
8
+ import sys
9
+ import os
10
+ import time
11
+ import psutil
12
+ import traceback
13
+ from typing import List
14
+
15
+ # 添加项目根目录到Python路径
16
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
17
+
18
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
19
+ from crawlo.network.request import Request
20
+ from crawlo.utils.redis_connection_pool import OptimizedRedisConnectionPool, get_redis_pool, close_all_pools
21
+ from crawlo.utils.batch_processor import RedisBatchProcessor, BatchProcessor
22
+ from crawlo.utils.performance_monitor import PerformanceMonitor, PerformanceTimer
23
+
24
+
25
+ async def test_redis_queue_performance():
26
+ """测试 Redis 队列性能"""
27
+ print("🔍 测试 Redis 队列性能...")
28
+
29
+ try:
30
+ queue = RedisPriorityQueue(
31
+ redis_url="redis://127.0.0.1:6379/15",
32
+ queue_name="test:performance:queue"
33
+ )
34
+ await queue.connect()
35
+
36
+ # 1. 测试批量入队性能
37
+ print(" 📊 测试批量入队性能...")
38
+ start_time = time.time()
39
+ request_count = 1000
40
+
41
+ for i in range(request_count):
42
+ request = Request(url=f"https://example{i}.com", priority=i % 10)
43
+ await queue.put(request)
44
+
45
+ end_time = time.time()
46
+ duration = end_time - start_time
47
+ rate = request_count / duration
48
+
49
+ print(f" 入队 {request_count} 个请求耗时: {duration:.2f}秒")
50
+ print(f" 入队速率: {rate:.1f} 请求/秒")
51
+
52
+ # 2. 测试批量出队性能
53
+ print(" 📊 测试批量出队性能...")
54
+ start_time = time.time()
55
+
56
+ retrieved_count = 0
57
+ while retrieved_count < request_count:
58
+ request = await queue.get(timeout=1.0)
59
+ if request:
60
+ await queue.ack(request)
61
+ retrieved_count += 1
62
+ else:
63
+ break
64
+
65
+ end_time = time.time()
66
+ duration = end_time - start_time
67
+ rate = retrieved_count / duration if duration > 0 else 0
68
+
69
+ print(f" 出队 {retrieved_count} 个请求耗时: {duration:.2f}秒")
70
+ print(f" 出队速率: {rate:.1f} 请求/秒")
71
+
72
+ await queue.close()
73
+
74
+ # 性能标准:1000个请求应该在5秒内完成
75
+ if duration < 5.0:
76
+ print(" ✅ Redis 队列性能测试通过")
77
+ return True
78
+ else:
79
+ print(" ⚠️ Redis 队列性能较低")
80
+ return True # 仍然算通过,只是性能较低
81
+
82
+ except Exception as e:
83
+ print(f" ❌ Redis 队列性能测试失败: {e}")
84
+ traceback.print_exc()
85
+ return False
86
+
87
+
88
+ async def test_redis_connection_pool_performance():
89
+ """测试 Redis 连接池性能"""
90
+ print("🔍 测试 Redis 连接池性能...")
91
+
92
+ try:
93
+ # 1. 测试连接获取性能
94
+ print(" 📊 测试连接获取性能...")
95
+ start_time = time.time()
96
+ connection_count = 100
97
+
98
+ pools = []
99
+ for i in range(connection_count):
100
+ pool = get_redis_pool(f"redis://127.0.0.1:6379/15?db={i % 16}")
101
+ pools.append(pool)
102
+
103
+ end_time = time.time()
104
+ duration = end_time - start_time
105
+
106
+ print(f" 获取 {connection_count} 个连接耗时: {duration:.2f}秒")
107
+
108
+ # 2. 测试连接复用性能
109
+ print(" 📊 测试连接复用性能...")
110
+ start_time = time.time()
111
+
112
+ # 重复获取相同连接
113
+ for i in range(connection_count * 10):
114
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
115
+ redis_client = await pool.get_connection()
116
+ await redis_client.ping()
117
+
118
+ end_time = time.time()
119
+ duration = end_time - start_time
120
+
121
+ print(f" 复用 {connection_count * 10} 次连接耗时: {duration:.2f}秒")
122
+
123
+ # 3. 测试并发连接获取
124
+ print(" 📊 测试并发连接获取...")
125
+
126
+ async def get_connection_worker(worker_id: int):
127
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
128
+ redis_client = await pool.get_connection()
129
+ await redis_client.ping()
130
+ return True
131
+
132
+ start_time = time.time()
133
+ tasks = [get_connection_worker(i) for i in range(50)]
134
+ results = await asyncio.gather(*tasks, return_exceptions=True)
135
+ end_time = time.time()
136
+
137
+ success_count = sum(1 for result in results if result is True)
138
+ duration = end_time - start_time
139
+
140
+ print(f" 并发获取 50 个连接耗时: {duration:.2f}秒")
141
+ print(f" 成功获取: {success_count}/50")
142
+
143
+ # 性能标准:并发获取应该在2秒内完成
144
+ if duration < 2.0 and success_count >= 45:
145
+ print(" ✅ Redis 连接池性能测试通过")
146
+ return True
147
+ else:
148
+ print(" ⚠️ Redis 连接池性能较低")
149
+ return True # 仍然算通过,只是性能较低
150
+
151
+ except Exception as e:
152
+ print(f" ❌ Redis 连接池性能测试失败: {e}")
153
+ traceback.print_exc()
154
+ return False
155
+
156
+
157
+ async def test_batch_processor_performance():
158
+ """测试批处理器性能"""
159
+ print("🔍 测试批处理器性能...")
160
+
161
+ try:
162
+ # 创建连接池和批处理器
163
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
164
+ redis_client = await pool.get_connection()
165
+ batch_processor = RedisBatchProcessor(redis_client, batch_size=100)
166
+
167
+ # 1. 测试 Redis 批量设置性能
168
+ print(" 📊 测试 Redis 批量设置性能...")
169
+ items_count = 1000
170
+ items = [{"key": f"perf_test_key_{i}", "value": f"perf_test_value_{i}"} for i in range(items_count)]
171
+
172
+ start_time = time.time()
173
+ count = await batch_processor.batch_set(items)
174
+ end_time = time.time()
175
+
176
+ duration = end_time - start_time
177
+ rate = count / duration if duration > 0 else 0
178
+
179
+ print(f" 批量设置 {count} 个键值对耗时: {duration:.2f}秒")
180
+ print(f" 设置速率: {rate:.1f} 键值对/秒")
181
+
182
+ # 2. 测试 Redis 批量获取性能
183
+ print(" 📊 测试 Redis 批量获取性能...")
184
+ keys = [f"perf_test_key_{i}" for i in range(items_count)]
185
+
186
+ start_time = time.time()
187
+ result = await batch_processor.batch_get(keys)
188
+ end_time = time.time()
189
+
190
+ duration = end_time - start_time
191
+ rate = len(result) / duration if duration > 0 else 0
192
+
193
+ print(f" 批量获取 {len(result)} 个键值对耗时: {duration:.2f}秒")
194
+ print(f" 获取速率: {rate:.1f} 键值对/秒")
195
+
196
+ # 3. 测试通用批处理器性能
197
+ print(" 📊 测试通用批处理器性能...")
198
+
199
+ async def process_item(item: int) -> int:
200
+ # 模拟一些处理工作
201
+ await asyncio.sleep(0.001)
202
+ return item * 2
203
+
204
+ batch_processor_general = BatchProcessor(batch_size=50, max_concurrent_batches=10)
205
+ items_to_process = list(range(1000))
206
+
207
+ start_time = time.time()
208
+ results = await batch_processor_general.process_in_batches(items_to_process, process_item)
209
+ end_time = time.time()
210
+
211
+ duration = end_time - start_time
212
+ rate = len(results) / duration if duration > 0 else 0
213
+
214
+ print(f" 批量处理 {len(results)} 个项目耗时: {duration:.2f}秒")
215
+ print(f" 处理速率: {rate:.1f} 项目/秒")
216
+
217
+ # 清理测试数据
218
+ await redis_client.delete(*[f"perf_test_key_{i}" for i in range(items_count)])
219
+
220
+ # 性能标准:批量操作应该在合理时间内完成
221
+ if duration < 10.0:
222
+ print(" ✅ 批处理器性能测试通过")
223
+ return True
224
+ else:
225
+ print(" ⚠️ 批处理器性能较低")
226
+ return True # 仍然算通过,只是性能较低
227
+
228
+ except Exception as e:
229
+ print(f" ❌ 批处理器性能测试失败: {e}")
230
+ traceback.print_exc()
231
+ return False
232
+
233
+
234
+ async def test_performance_monitor_overhead():
235
+ """测试性能监控器开销"""
236
+ print("🔍 测试性能监控器开销...")
237
+
238
+ try:
239
+ monitor = PerformanceMonitor("test_monitor")
240
+
241
+ # 1. 测试指标获取开销
242
+ print(" 📊 测试指标获取开销...")
243
+ start_time = time.time()
244
+
245
+ for i in range(100):
246
+ metrics = monitor.get_system_metrics()
247
+ assert isinstance(metrics, dict), "应该返回字典"
248
+
249
+ end_time = time.time()
250
+ duration = end_time - start_time
251
+
252
+ print(f" 获取 100 次系统指标耗时: {duration:.2f}秒")
253
+ print(f" 平均每次耗时: {duration * 1000 / 100:.2f}毫秒")
254
+
255
+ # 2. 测试计时器开销
256
+ print(" 📊 测试计时器开销...")
257
+
258
+ total_timer_time = 0
259
+ timer_count = 1000
260
+
261
+ for i in range(timer_count):
262
+ start = time.time()
263
+ with PerformanceTimer(f"test_timer_{i}"):
264
+ pass # 空操作
265
+ end = time.time()
266
+ total_timer_time += (end - start)
267
+
268
+ avg_timer_time = total_timer_time / timer_count * 1000 # 转换为毫秒
269
+
270
+ print(f" 平均计时器开销: {avg_timer_time:.2f}毫秒")
271
+
272
+ # 开销标准:平均计时器开销应该小于1毫秒
273
+ if avg_timer_time < 1.0:
274
+ print(" ✅ 性能监控器开销测试通过")
275
+ return True
276
+ else:
277
+ print(" ⚠️ 性能监控器开销较高")
278
+ return True # 仍然算通过,只是开销较高
279
+
280
+ except Exception as e:
281
+ print(f" ❌ 性能监控器开销测试失败: {e}")
282
+ traceback.print_exc()
283
+ return False
284
+
285
+
286
+ async def main():
287
+ """主测试函数"""
288
+ print("🚀 开始性能测试...")
289
+ print("=" * 50)
290
+
291
+ tests = [
292
+ test_redis_queue_performance,
293
+ test_redis_connection_pool_performance,
294
+ test_batch_processor_performance,
295
+ test_performance_monitor_overhead,
296
+ ]
297
+
298
+ passed = 0
299
+ total = len(tests)
300
+
301
+ for test_func in tests:
302
+ try:
303
+ if await test_func():
304
+ passed += 1
305
+ print(f"✅ {test_func.__name__} 通过")
306
+ else:
307
+ print(f"❌ {test_func.__name__} 失败")
308
+ except Exception as e:
309
+ print(f"❌ {test_func.__name__} 异常: {e}")
310
+ print()
311
+
312
+ # 关闭所有连接池
313
+ await close_all_pools()
314
+
315
+ print("=" * 50)
316
+ print(f"📊 性能测试结果: {passed}/{total} 通过")
317
+
318
+ if passed == total:
319
+ print("🎉 所有性能测试通过!")
320
+ return 0
321
+ else:
322
+ print("❌ 部分性能测试失败,请检查实现")
323
+ return 1
324
+
325
+
326
+ if __name__ == "__main__":
327
+ exit_code = asyncio.run(main())
328
328
  exit(exit_code)