crawlo 1.2.8__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (221)
  1. crawlo/__init__.py +63 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +314 -323
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +186 -186
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -251
  15. crawlo/core/__init__.py +2 -2
  16. crawlo/core/engine.py +365 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +256 -251
  19. crawlo/crawler.py +1097 -1099
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -107
  22. crawlo/downloader/__init__.py +273 -266
  23. crawlo/downloader/aiohttp_downloader.py +226 -228
  24. crawlo/downloader/cffi_downloader.py +245 -256
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +45 -43
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/filters/__init__.py +154 -154
  40. crawlo/filters/aioredis_filter.py +234 -234
  41. crawlo/filters/memory_filter.py +269 -269
  42. crawlo/items/__init__.py +23 -23
  43. crawlo/items/base.py +21 -21
  44. crawlo/items/fields.py +52 -52
  45. crawlo/items/items.py +104 -104
  46. crawlo/middleware/__init__.py +21 -21
  47. crawlo/middleware/default_header.py +132 -132
  48. crawlo/middleware/download_delay.py +104 -104
  49. crawlo/middleware/middleware_manager.py +136 -136
  50. crawlo/middleware/offsite.py +114 -114
  51. crawlo/middleware/proxy.py +386 -368
  52. crawlo/middleware/request_ignore.py +86 -86
  53. crawlo/middleware/response_code.py +163 -163
  54. crawlo/middleware/response_filter.py +136 -136
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/middleware/simple_proxy.py +65 -0
  57. crawlo/mode_manager.py +212 -211
  58. crawlo/network/__init__.py +21 -21
  59. crawlo/network/request.py +379 -338
  60. crawlo/network/response.py +359 -359
  61. crawlo/pipelines/__init__.py +21 -21
  62. crawlo/pipelines/bloom_dedup_pipeline.py +157 -157
  63. crawlo/pipelines/console_pipeline.py +39 -39
  64. crawlo/pipelines/csv_pipeline.py +316 -316
  65. crawlo/pipelines/database_dedup_pipeline.py +223 -223
  66. crawlo/pipelines/json_pipeline.py +218 -218
  67. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  68. crawlo/pipelines/mongo_pipeline.py +131 -131
  69. crawlo/pipelines/mysql_pipeline.py +317 -317
  70. crawlo/pipelines/pipeline_manager.py +74 -62
  71. crawlo/pipelines/redis_dedup_pipeline.py +167 -167
  72. crawlo/project.py +284 -315
  73. crawlo/queue/pqueue.py +37 -37
  74. crawlo/queue/queue_manager.py +379 -378
  75. crawlo/queue/redis_priority_queue.py +306 -306
  76. crawlo/settings/__init__.py +7 -7
  77. crawlo/settings/default_settings.py +216 -220
  78. crawlo/settings/setting_manager.py +175 -122
  79. crawlo/spider/__init__.py +639 -639
  80. crawlo/stats_collector.py +59 -59
  81. crawlo/subscriber.py +129 -129
  82. crawlo/task_manager.py +30 -30
  83. crawlo/templates/crawlo.cfg.tmpl +10 -10
  84. crawlo/templates/project/__init__.py.tmpl +3 -3
  85. crawlo/templates/project/items.py.tmpl +17 -17
  86. crawlo/templates/project/middlewares.py.tmpl +118 -118
  87. crawlo/templates/project/pipelines.py.tmpl +96 -96
  88. crawlo/templates/project/settings.py.tmpl +261 -288
  89. crawlo/templates/project/settings_distributed.py.tmpl +174 -157
  90. crawlo/templates/project/settings_gentle.py.tmpl +95 -100
  91. crawlo/templates/project/settings_high_performance.py.tmpl +125 -134
  92. crawlo/templates/project/settings_minimal.py.tmpl +30 -0
  93. crawlo/templates/project/settings_simple.py.tmpl +96 -98
  94. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  95. crawlo/templates/run.py.tmpl +47 -47
  96. crawlo/templates/spider/spider.py.tmpl +143 -143
  97. crawlo/tools/__init__.py +200 -182
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/{cleaners → tools}/data_formatter.py +225 -225
  101. crawlo/tools/data_validator.py +180 -180
  102. crawlo/tools/date_tools.py +290 -36
  103. crawlo/tools/distributed_coordinator.py +388 -387
  104. crawlo/{cleaners → tools}/encoding_converter.py +127 -126
  105. crawlo/tools/request_tools.py +83 -0
  106. crawlo/tools/retry_mechanism.py +224 -221
  107. crawlo/tools/scenario_adapter.py +262 -262
  108. crawlo/{cleaners → tools}/text_cleaner.py +232 -232
  109. crawlo/utils/__init__.py +35 -35
  110. crawlo/utils/batch_processor.py +259 -259
  111. crawlo/utils/controlled_spider_mixin.py +439 -439
  112. crawlo/utils/db_helper.py +343 -343
  113. crawlo/utils/enhanced_error_handler.py +356 -356
  114. crawlo/utils/env_config.py +142 -142
  115. crawlo/utils/error_handler.py +123 -123
  116. crawlo/utils/func_tools.py +82 -82
  117. crawlo/utils/large_scale_config.py +286 -286
  118. crawlo/utils/large_scale_helper.py +344 -344
  119. crawlo/utils/log.py +146 -128
  120. crawlo/utils/performance_monitor.py +285 -285
  121. crawlo/utils/queue_helper.py +175 -175
  122. crawlo/utils/redis_connection_pool.py +351 -351
  123. crawlo/utils/redis_key_validator.py +198 -198
  124. crawlo/utils/request.py +267 -267
  125. crawlo/utils/request_serializer.py +218 -218
  126. crawlo/utils/spider_loader.py +61 -61
  127. crawlo/utils/system.py +11 -11
  128. crawlo/utils/tools.py +4 -4
  129. crawlo/utils/url.py +39 -39
  130. {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/METADATA +1011 -764
  131. crawlo-1.3.0.dist-info/RECORD +219 -0
  132. examples/__init__.py +7 -7
  133. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  134. tests/__init__.py +7 -7
  135. tests/advanced_tools_example.py +275 -275
  136. tests/authenticated_proxy_example.py +107 -237
  137. tests/cleaners_example.py +160 -160
  138. tests/config_validation_demo.py +143 -103
  139. tests/controlled_spider_example.py +205 -205
  140. tests/date_tools_example.py +180 -180
  141. tests/debug_pipelines.py +67 -0
  142. tests/dynamic_loading_example.py +523 -523
  143. tests/dynamic_loading_test.py +104 -104
  144. tests/env_config_example.py +133 -133
  145. tests/error_handling_example.py +171 -171
  146. tests/redis_key_validation_demo.py +130 -130
  147. tests/request_params_example.py +151 -0
  148. tests/response_improvements_example.py +144 -144
  149. tests/test_advanced_tools.py +148 -148
  150. tests/test_all_redis_key_configs.py +145 -145
  151. tests/test_authenticated_proxy.py +141 -141
  152. tests/test_cleaners.py +54 -54
  153. tests/test_comprehensive.py +146 -146
  154. tests/test_config_consistency.py +80 -80
  155. tests/test_config_merge.py +153 -0
  156. tests/test_config_validator.py +182 -193
  157. tests/test_crawlo_proxy_integration.py +109 -173
  158. tests/test_date_tools.py +123 -123
  159. tests/test_default_header_middleware.py +158 -158
  160. tests/test_distributed.py +65 -0
  161. tests/test_double_crawlo_fix.py +207 -207
  162. tests/test_double_crawlo_fix_simple.py +124 -124
  163. tests/test_download_delay_middleware.py +221 -221
  164. tests/test_downloader_proxy_compatibility.py +268 -268
  165. tests/test_dynamic_downloaders_proxy.py +124 -124
  166. tests/test_dynamic_proxy.py +92 -92
  167. tests/test_dynamic_proxy_config.py +146 -146
  168. tests/test_dynamic_proxy_real.py +109 -109
  169. tests/test_edge_cases.py +303 -303
  170. tests/test_enhanced_error_handler.py +270 -270
  171. tests/test_env_config.py +121 -121
  172. tests/test_error_handler_compatibility.py +112 -112
  173. tests/test_final_validation.py +153 -153
  174. tests/test_framework_env_usage.py +103 -103
  175. tests/test_integration.py +169 -357
  176. tests/test_item_dedup_redis_key.py +122 -122
  177. tests/test_mode_consistency.py +51 -51
  178. tests/test_offsite_middleware.py +221 -221
  179. tests/test_parsel.py +29 -29
  180. tests/test_performance.py +327 -327
  181. tests/test_proxy_api.py +264 -264
  182. tests/test_proxy_health_check.py +32 -32
  183. tests/test_proxy_middleware.py +121 -121
  184. tests/test_proxy_middleware_enhanced.py +216 -216
  185. tests/test_proxy_middleware_integration.py +136 -136
  186. tests/test_proxy_middleware_refactored.py +185 -0
  187. tests/test_proxy_providers.py +56 -56
  188. tests/test_proxy_stats.py +19 -19
  189. tests/test_proxy_strategies.py +59 -59
  190. tests/test_queue_manager_double_crawlo.py +173 -173
  191. tests/test_queue_manager_redis_key.py +176 -176
  192. tests/test_random_user_agent.py +73 -0
  193. tests/test_real_scenario_proxy.py +195 -195
  194. tests/test_redis_config.py +28 -28
  195. tests/test_redis_connection_pool.py +294 -294
  196. tests/test_redis_key_naming.py +181 -181
  197. tests/test_redis_key_validator.py +123 -123
  198. tests/test_redis_queue.py +224 -224
  199. tests/test_request_ignore_middleware.py +182 -182
  200. tests/test_request_params.py +112 -0
  201. tests/test_request_serialization.py +70 -70
  202. tests/test_response_code_middleware.py +349 -349
  203. tests/test_response_filter_middleware.py +427 -427
  204. tests/test_response_improvements.py +152 -152
  205. tests/test_retry_middleware.py +241 -241
  206. tests/test_scheduler.py +252 -252
  207. tests/test_scheduler_config_update.py +133 -133
  208. tests/test_simple_response.py +61 -61
  209. tests/test_telecom_spider_redis_key.py +205 -205
  210. tests/test_template_content.py +87 -87
  211. tests/test_template_redis_key.py +134 -134
  212. tests/test_tools.py +159 -153
  213. tests/test_user_agents.py +97 -0
  214. tests/tools_example.py +260 -257
  215. tests/verify_distributed.py +117 -0
  216. crawlo/cleaners/__init__.py +0 -61
  217. crawlo/utils/date_tools.py +0 -290
  218. crawlo-1.2.8.dist-info/RECORD +0 -209
  219. {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/WHEEL +0 -0
  220. {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/entry_points.txt +0 -0
  221. {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/top_level.txt +0 -0
tests/test_performance.py CHANGED
@@ -1,328 +1,328 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- 性能测试
5
- 测试系统性能和瓶颈
6
- """
7
- import asyncio
8
- import sys
9
- import os
10
- import time
11
- import psutil
12
- import traceback
13
- from typing import List
14
-
15
- # 添加项目根目录到Python路径
16
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
17
-
18
- from crawlo.queue.redis_priority_queue import RedisPriorityQueue
19
- from crawlo.network.request import Request
20
- from crawlo.utils.redis_connection_pool import OptimizedRedisConnectionPool, get_redis_pool, close_all_pools
21
- from crawlo.utils.batch_processor import RedisBatchProcessor, BatchProcessor
22
- from crawlo.utils.performance_monitor import PerformanceMonitor, PerformanceTimer
23
-
24
-
25
- async def test_redis_queue_performance():
26
- """测试 Redis 队列性能"""
27
- print("🔍 测试 Redis 队列性能...")
28
-
29
- try:
30
- queue = RedisPriorityQueue(
31
- redis_url="redis://127.0.0.1:6379/15",
32
- queue_name="test:performance:queue"
33
- )
34
- await queue.connect()
35
-
36
- # 1. 测试批量入队性能
37
- print(" 📊 测试批量入队性能...")
38
- start_time = time.time()
39
- request_count = 1000
40
-
41
- for i in range(request_count):
42
- request = Request(url=f"https://example{i}.com", priority=i % 10)
43
- await queue.put(request)
44
-
45
- end_time = time.time()
46
- duration = end_time - start_time
47
- rate = request_count / duration
48
-
49
- print(f" 入队 {request_count} 个请求耗时: {duration:.2f}秒")
50
- print(f" 入队速率: {rate:.1f} 请求/秒")
51
-
52
- # 2. 测试批量出队性能
53
- print(" 📊 测试批量出队性能...")
54
- start_time = time.time()
55
-
56
- retrieved_count = 0
57
- while retrieved_count < request_count:
58
- request = await queue.get(timeout=1.0)
59
- if request:
60
- await queue.ack(request)
61
- retrieved_count += 1
62
- else:
63
- break
64
-
65
- end_time = time.time()
66
- duration = end_time - start_time
67
- rate = retrieved_count / duration if duration > 0 else 0
68
-
69
- print(f" 出队 {retrieved_count} 个请求耗时: {duration:.2f}秒")
70
- print(f" 出队速率: {rate:.1f} 请求/秒")
71
-
72
- await queue.close()
73
-
74
- # 性能标准:1000个请求应该在5秒内完成
75
- if duration < 5.0:
76
- print(" Redis 队列性能测试通过")
77
- return True
78
- else:
79
- print(" ⚠️ Redis 队列性能较低")
80
- return True # 仍然算通过,只是性能较低
81
-
82
- except Exception as e:
83
- print(f" Redis 队列性能测试失败: {e}")
84
- traceback.print_exc()
85
- return False
86
-
87
-
88
- async def test_redis_connection_pool_performance():
89
- """测试 Redis 连接池性能"""
90
- print("🔍 测试 Redis 连接池性能...")
91
-
92
- try:
93
- # 1. 测试连接获取性能
94
- print(" 📊 测试连接获取性能...")
95
- start_time = time.time()
96
- connection_count = 100
97
-
98
- pools = []
99
- for i in range(connection_count):
100
- pool = get_redis_pool(f"redis://127.0.0.1:6379/15?db={i % 16}")
101
- pools.append(pool)
102
-
103
- end_time = time.time()
104
- duration = end_time - start_time
105
-
106
- print(f" 获取 {connection_count} 个连接耗时: {duration:.2f}秒")
107
-
108
- # 2. 测试连接复用性能
109
- print(" 📊 测试连接复用性能...")
110
- start_time = time.time()
111
-
112
- # 重复获取相同连接
113
- for i in range(connection_count * 10):
114
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
115
- redis_client = await pool.get_connection()
116
- await redis_client.ping()
117
-
118
- end_time = time.time()
119
- duration = end_time - start_time
120
-
121
- print(f" 复用 {connection_count * 10} 次连接耗时: {duration:.2f}秒")
122
-
123
- # 3. 测试并发连接获取
124
- print(" 📊 测试并发连接获取...")
125
-
126
- async def get_connection_worker(worker_id: int):
127
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
128
- redis_client = await pool.get_connection()
129
- await redis_client.ping()
130
- return True
131
-
132
- start_time = time.time()
133
- tasks = [get_connection_worker(i) for i in range(50)]
134
- results = await asyncio.gather(*tasks, return_exceptions=True)
135
- end_time = time.time()
136
-
137
- success_count = sum(1 for result in results if result is True)
138
- duration = end_time - start_time
139
-
140
- print(f" 并发获取 50 个连接耗时: {duration:.2f}秒")
141
- print(f" 成功获取: {success_count}/50")
142
-
143
- # 性能标准:并发获取应该在2秒内完成
144
- if duration < 2.0 and success_count >= 45:
145
- print(" Redis 连接池性能测试通过")
146
- return True
147
- else:
148
- print(" ⚠️ Redis 连接池性能较低")
149
- return True # 仍然算通过,只是性能较低
150
-
151
- except Exception as e:
152
- print(f" Redis 连接池性能测试失败: {e}")
153
- traceback.print_exc()
154
- return False
155
-
156
-
157
- async def test_batch_processor_performance():
158
- """测试批处理器性能"""
159
- print("🔍 测试批处理器性能...")
160
-
161
- try:
162
- # 创建连接池和批处理器
163
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
164
- redis_client = await pool.get_connection()
165
- batch_processor = RedisBatchProcessor(redis_client, batch_size=100)
166
-
167
- # 1. 测试 Redis 批量设置性能
168
- print(" 📊 测试 Redis 批量设置性能...")
169
- items_count = 1000
170
- items = [{"key": f"perf_test_key_{i}", "value": f"perf_test_value_{i}"} for i in range(items_count)]
171
-
172
- start_time = time.time()
173
- count = await batch_processor.batch_set(items)
174
- end_time = time.time()
175
-
176
- duration = end_time - start_time
177
- rate = count / duration if duration > 0 else 0
178
-
179
- print(f" 批量设置 {count} 个键值对耗时: {duration:.2f}秒")
180
- print(f" 设置速率: {rate:.1f} 键值对/秒")
181
-
182
- # 2. 测试 Redis 批量获取性能
183
- print(" 📊 测试 Redis 批量获取性能...")
184
- keys = [f"perf_test_key_{i}" for i in range(items_count)]
185
-
186
- start_time = time.time()
187
- result = await batch_processor.batch_get(keys)
188
- end_time = time.time()
189
-
190
- duration = end_time - start_time
191
- rate = len(result) / duration if duration > 0 else 0
192
-
193
- print(f" 批量获取 {len(result)} 个键值对耗时: {duration:.2f}秒")
194
- print(f" 获取速率: {rate:.1f} 键值对/秒")
195
-
196
- # 3. 测试通用批处理器性能
197
- print(" 📊 测试通用批处理器性能...")
198
-
199
- async def process_item(item: int) -> int:
200
- # 模拟一些处理工作
201
- await asyncio.sleep(0.001)
202
- return item * 2
203
-
204
- batch_processor_general = BatchProcessor(batch_size=50, max_concurrent_batches=10)
205
- items_to_process = list(range(1000))
206
-
207
- start_time = time.time()
208
- results = await batch_processor_general.process_in_batches(items_to_process, process_item)
209
- end_time = time.time()
210
-
211
- duration = end_time - start_time
212
- rate = len(results) / duration if duration > 0 else 0
213
-
214
- print(f" 批量处理 {len(results)} 个项目耗时: {duration:.2f}秒")
215
- print(f" 处理速率: {rate:.1f} 项目/秒")
216
-
217
- # 清理测试数据
218
- await redis_client.delete(*[f"perf_test_key_{i}" for i in range(items_count)])
219
-
220
- # 性能标准:批量操作应该在合理时间内完成
221
- if duration < 10.0:
222
- print(" 批处理器性能测试通过")
223
- return True
224
- else:
225
- print(" ⚠️ 批处理器性能较低")
226
- return True # 仍然算通过,只是性能较低
227
-
228
- except Exception as e:
229
- print(f" 批处理器性能测试失败: {e}")
230
- traceback.print_exc()
231
- return False
232
-
233
-
234
- async def test_performance_monitor_overhead():
235
- """测试性能监控器开销"""
236
- print("🔍 测试性能监控器开销...")
237
-
238
- try:
239
- monitor = PerformanceMonitor("test_monitor")
240
-
241
- # 1. 测试指标获取开销
242
- print(" 📊 测试指标获取开销...")
243
- start_time = time.time()
244
-
245
- for i in range(100):
246
- metrics = monitor.get_system_metrics()
247
- assert isinstance(metrics, dict), "应该返回字典"
248
-
249
- end_time = time.time()
250
- duration = end_time - start_time
251
-
252
- print(f" 获取 100 次系统指标耗时: {duration:.2f}秒")
253
- print(f" 平均每次耗时: {duration * 1000 / 100:.2f}毫秒")
254
-
255
- # 2. 测试计时器开销
256
- print(" 📊 测试计时器开销...")
257
-
258
- total_timer_time = 0
259
- timer_count = 1000
260
-
261
- for i in range(timer_count):
262
- start = time.time()
263
- with PerformanceTimer(f"test_timer_{i}"):
264
- pass # 空操作
265
- end = time.time()
266
- total_timer_time += (end - start)
267
-
268
- avg_timer_time = total_timer_time / timer_count * 1000 # 转换为毫秒
269
-
270
- print(f" 平均计时器开销: {avg_timer_time:.2f}毫秒")
271
-
272
- # 开销标准:平均计时器开销应该小于1毫秒
273
- if avg_timer_time < 1.0:
274
- print(" 性能监控器开销测试通过")
275
- return True
276
- else:
277
- print(" ⚠️ 性能监控器开销较高")
278
- return True # 仍然算通过,只是开销较高
279
-
280
- except Exception as e:
281
- print(f" 性能监控器开销测试失败: {e}")
282
- traceback.print_exc()
283
- return False
284
-
285
-
286
- async def main():
287
- """主测试函数"""
288
- print("🚀 开始性能测试...")
289
- print("=" * 50)
290
-
291
- tests = [
292
- test_redis_queue_performance,
293
- test_redis_connection_pool_performance,
294
- test_batch_processor_performance,
295
- test_performance_monitor_overhead,
296
- ]
297
-
298
- passed = 0
299
- total = len(tests)
300
-
301
- for test_func in tests:
302
- try:
303
- if await test_func():
304
- passed += 1
305
- print(f"{test_func.__name__} 通过")
306
- else:
307
- print(f"{test_func.__name__} 失败")
308
- except Exception as e:
309
- print(f"{test_func.__name__} 异常: {e}")
310
- print()
311
-
312
- # 关闭所有连接池
313
- await close_all_pools()
314
-
315
- print("=" * 50)
316
- print(f"📊 性能测试结果: {passed}/{total} 通过")
317
-
318
- if passed == total:
319
- print("🎉 所有性能测试通过!")
320
- return 0
321
- else:
322
- print("部分性能测试失败,请检查实现")
323
- return 1
324
-
325
-
326
- if __name__ == "__main__":
327
- exit_code = asyncio.run(main())
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 性能测试
5
+ 测试系统性能和瓶颈
6
+ """
7
+ import asyncio
8
+ import sys
9
+ import os
10
+ import time
11
+ import psutil
12
+ import traceback
13
+ from typing import List
14
+
15
+ # 添加项目根目录到Python路径
16
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
17
+
18
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
19
+ from crawlo.network.request import Request
20
+ from crawlo.utils.redis_connection_pool import OptimizedRedisConnectionPool, get_redis_pool, close_all_pools
21
+ from crawlo.utils.batch_processor import RedisBatchProcessor, BatchProcessor
22
+ from crawlo.utils.performance_monitor import PerformanceMonitor, PerformanceTimer
23
+
24
+
25
+ async def test_redis_queue_performance():
26
+ """测试 Redis 队列性能"""
27
+ print("测试 Redis 队列性能...")
28
+
29
+ try:
30
+ queue = RedisPriorityQueue(
31
+ redis_url="redis://127.0.0.1:6379/15",
32
+ queue_name="test:performance:queue"
33
+ )
34
+ await queue.connect()
35
+
36
+ # 1. 测试批量入队性能
37
+ print(" 测试批量入队性能...")
38
+ start_time = time.time()
39
+ request_count = 1000
40
+
41
+ for i in range(request_count):
42
+ request = Request(url=f"https://example{i}.com", priority=i % 10)
43
+ await queue.put(request)
44
+
45
+ end_time = time.time()
46
+ duration = end_time - start_time
47
+ rate = request_count / duration
48
+
49
+ print(f" 入队 {request_count} 个请求耗时: {duration:.2f}秒")
50
+ print(f" 入队速率: {rate:.1f} 请求/秒")
51
+
52
+ # 2. 测试批量出队性能
53
+ print(" 测试批量出队性能...")
54
+ start_time = time.time()
55
+
56
+ retrieved_count = 0
57
+ while retrieved_count < request_count:
58
+ request = await queue.get(timeout=1.0)
59
+ if request:
60
+ await queue.ack(request)
61
+ retrieved_count += 1
62
+ else:
63
+ break
64
+
65
+ end_time = time.time()
66
+ duration = end_time - start_time
67
+ rate = retrieved_count / duration if duration > 0 else 0
68
+
69
+ print(f" 出队 {retrieved_count} 个请求耗时: {duration:.2f}秒")
70
+ print(f" 出队速率: {rate:.1f} 请求/秒")
71
+
72
+ await queue.close()
73
+
74
+ # 性能标准:1000个请求应该在5秒内完成
75
+ if duration < 5.0:
76
+ print(" Redis 队列性能测试通过")
77
+ return True
78
+ else:
79
+ print(" Redis 队列性能较低")
80
+ return True # 仍然算通过,只是性能较低
81
+
82
+ except Exception as e:
83
+ print(f" Redis 队列性能测试失败: {e}")
84
+ traceback.print_exc()
85
+ return False
86
+
87
+
88
+ async def test_redis_connection_pool_performance():
89
+ """测试 Redis 连接池性能"""
90
+ print("测试 Redis 连接池性能...")
91
+
92
+ try:
93
+ # 1. 测试连接获取性能
94
+ print(" 测试连接获取性能...")
95
+ start_time = time.time()
96
+ connection_count = 100
97
+
98
+ pools = []
99
+ for i in range(connection_count):
100
+ pool = get_redis_pool(f"redis://127.0.0.1:6379/15?db={i % 16}")
101
+ pools.append(pool)
102
+
103
+ end_time = time.time()
104
+ duration = end_time - start_time
105
+
106
+ print(f" 获取 {connection_count} 个连接耗时: {duration:.2f}秒")
107
+
108
+ # 2. 测试连接复用性能
109
+ print(" 测试连接复用性能...")
110
+ start_time = time.time()
111
+
112
+ # 重复获取相同连接
113
+ for i in range(connection_count * 10):
114
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
115
+ redis_client = await pool.get_connection()
116
+ await redis_client.ping()
117
+
118
+ end_time = time.time()
119
+ duration = end_time - start_time
120
+
121
+ print(f" 复用 {connection_count * 10} 次连接耗时: {duration:.2f}秒")
122
+
123
+ # 3. 测试并发连接获取
124
+ print(" 测试并发连接获取...")
125
+
126
+ async def get_connection_worker(worker_id: int):
127
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
128
+ redis_client = await pool.get_connection()
129
+ await redis_client.ping()
130
+ return True
131
+
132
+ start_time = time.time()
133
+ tasks = [get_connection_worker(i) for i in range(50)]
134
+ results = await asyncio.gather(*tasks, return_exceptions=True)
135
+ end_time = time.time()
136
+
137
+ success_count = sum(1 for result in results if result is True)
138
+ duration = end_time - start_time
139
+
140
+ print(f" 并发获取 50 个连接耗时: {duration:.2f}秒")
141
+ print(f" 成功获取: {success_count}/50")
142
+
143
+ # 性能标准:并发获取应该在2秒内完成
144
+ if duration < 2.0 and success_count >= 45:
145
+ print(" Redis 连接池性能测试通过")
146
+ return True
147
+ else:
148
+ print(" Redis 连接池性能较低")
149
+ return True # 仍然算通过,只是性能较低
150
+
151
+ except Exception as e:
152
+ print(f" Redis 连接池性能测试失败: {e}")
153
+ traceback.print_exc()
154
+ return False
155
+
156
+
157
+ async def test_batch_processor_performance():
158
+ """测试批处理器性能"""
159
+ print("测试批处理器性能...")
160
+
161
+ try:
162
+ # 创建连接池和批处理器
163
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
164
+ redis_client = await pool.get_connection()
165
+ batch_processor = RedisBatchProcessor(redis_client, batch_size=100)
166
+
167
+ # 1. 测试 Redis 批量设置性能
168
+ print(" 测试 Redis 批量设置性能...")
169
+ items_count = 1000
170
+ items = [{"key": f"perf_test_key_{i}", "value": f"perf_test_value_{i}"} for i in range(items_count)]
171
+
172
+ start_time = time.time()
173
+ count = await batch_processor.batch_set(items)
174
+ end_time = time.time()
175
+
176
+ duration = end_time - start_time
177
+ rate = count / duration if duration > 0 else 0
178
+
179
+ print(f" 批量设置 {count} 个键值对耗时: {duration:.2f}秒")
180
+ print(f" 设置速率: {rate:.1f} 键值对/秒")
181
+
182
+ # 2. 测试 Redis 批量获取性能
183
+ print(" 测试 Redis 批量获取性能...")
184
+ keys = [f"perf_test_key_{i}" for i in range(items_count)]
185
+
186
+ start_time = time.time()
187
+ result = await batch_processor.batch_get(keys)
188
+ end_time = time.time()
189
+
190
+ duration = end_time - start_time
191
+ rate = len(result) / duration if duration > 0 else 0
192
+
193
+ print(f" 批量获取 {len(result)} 个键值对耗时: {duration:.2f}秒")
194
+ print(f" 获取速率: {rate:.1f} 键值对/秒")
195
+
196
+ # 3. 测试通用批处理器性能
197
+ print(" 测试通用批处理器性能...")
198
+
199
+ async def process_item(item: int) -> int:
200
+ # 模拟一些处理工作
201
+ await asyncio.sleep(0.001)
202
+ return item * 2
203
+
204
+ batch_processor_general = BatchProcessor(batch_size=50, max_concurrent_batches=10)
205
+ items_to_process = list(range(1000))
206
+
207
+ start_time = time.time()
208
+ results = await batch_processor_general.process_in_batches(items_to_process, process_item)
209
+ end_time = time.time()
210
+
211
+ duration = end_time - start_time
212
+ rate = len(results) / duration if duration > 0 else 0
213
+
214
+ print(f" 批量处理 {len(results)} 个项目耗时: {duration:.2f}秒")
215
+ print(f" 处理速率: {rate:.1f} 项目/秒")
216
+
217
+ # 清理测试数据
218
+ await redis_client.delete(*[f"perf_test_key_{i}" for i in range(items_count)])
219
+
220
+ # 性能标准:批量操作应该在合理时间内完成
221
+ if duration < 10.0:
222
+ print(" 批处理器性能测试通过")
223
+ return True
224
+ else:
225
+ print(" 批处理器性能较低")
226
+ return True # 仍然算通过,只是性能较低
227
+
228
+ except Exception as e:
229
+ print(f" 批处理器性能测试失败: {e}")
230
+ traceback.print_exc()
231
+ return False
232
+
233
+
234
+ async def test_performance_monitor_overhead():
235
+ """测试性能监控器开销"""
236
+ print("🔍 测试性能监控器开销...")
237
+
238
+ try:
239
+ monitor = PerformanceMonitor("test_monitor")
240
+
241
+ # 1. 测试指标获取开销
242
+ print(" 测试指标获取开销...")
243
+ start_time = time.time()
244
+
245
+ for i in range(100):
246
+ metrics = monitor.get_system_metrics()
247
+ assert isinstance(metrics, dict), "应该返回字典"
248
+
249
+ end_time = time.time()
250
+ duration = end_time - start_time
251
+
252
+ print(f" 获取 100 次系统指标耗时: {duration:.2f}秒")
253
+ print(f" 平均每次耗时: {duration * 1000 / 100:.2f}毫秒")
254
+
255
+ # 2. 测试计时器开销
256
+ print(" 测试计时器开销...")
257
+
258
+ total_timer_time = 0
259
+ timer_count = 1000
260
+
261
+ for i in range(timer_count):
262
+ start = time.time()
263
+ with PerformanceTimer(f"test_timer_{i}"):
264
+ pass # 空操作
265
+ end = time.time()
266
+ total_timer_time += (end - start)
267
+
268
+ avg_timer_time = total_timer_time / timer_count * 1000 # 转换为毫秒
269
+
270
+ print(f" 平均计时器开销: {avg_timer_time:.2f}毫秒")
271
+
272
+ # 开销标准:平均计时器开销应该小于1毫秒
273
+ if avg_timer_time < 1.0:
274
+ print(" 性能监控器开销测试通过")
275
+ return True
276
+ else:
277
+ print(" 性能监控器开销较高")
278
+ return True # 仍然算通过,只是开销较高
279
+
280
+ except Exception as e:
281
+ print(f" 性能监控器开销测试失败: {e}")
282
+ traceback.print_exc()
283
+ return False
284
+
285
+
286
+ async def main():
287
+ """主测试函数"""
288
+ print("开始性能测试...")
289
+ print("=" * 50)
290
+
291
+ tests = [
292
+ test_redis_queue_performance,
293
+ test_redis_connection_pool_performance,
294
+ test_batch_processor_performance,
295
+ test_performance_monitor_overhead,
296
+ ]
297
+
298
+ passed = 0
299
+ total = len(tests)
300
+
301
+ for test_func in tests:
302
+ try:
303
+ if await test_func():
304
+ passed += 1
305
+ print(f"{test_func.__name__} 通过")
306
+ else:
307
+ print(f"{test_func.__name__} 失败")
308
+ except Exception as e:
309
+ print(f"{test_func.__name__} 异常: {e}")
310
+ print()
311
+
312
+ # 关闭所有连接池
313
+ await close_all_pools()
314
+
315
+ print("=" * 50)
316
+ print(f"性能测试结果: {passed}/{total} 通过")
317
+
318
+ if passed == total:
319
+ print("所有性能测试通过!")
320
+ return 0
321
+ else:
322
+ print("部分性能测试失败,请检查实现")
323
+ return 1
324
+
325
+
326
+ if __name__ == "__main__":
327
+ exit_code = asyncio.run(main())
328
328
  exit(exit_code)