crawlo 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (219) hide show
  1. crawlo/__init__.py +63 -63
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +322 -314
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +2 -2
  16. crawlo/core/engine.py +365 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +256 -256
  19. crawlo/crawler.py +1166 -1168
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +226 -226
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +52 -45
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/filters/__init__.py +154 -154
  40. crawlo/filters/aioredis_filter.py +234 -234
  41. crawlo/filters/memory_filter.py +269 -269
  42. crawlo/items/__init__.py +23 -23
  43. crawlo/items/base.py +21 -21
  44. crawlo/items/fields.py +52 -52
  45. crawlo/items/items.py +104 -104
  46. crawlo/middleware/__init__.py +21 -21
  47. crawlo/middleware/default_header.py +132 -132
  48. crawlo/middleware/download_delay.py +104 -104
  49. crawlo/middleware/middleware_manager.py +135 -135
  50. crawlo/middleware/offsite.py +123 -115
  51. crawlo/middleware/proxy.py +386 -386
  52. crawlo/middleware/request_ignore.py +86 -86
  53. crawlo/middleware/response_code.py +163 -163
  54. crawlo/middleware/response_filter.py +136 -136
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/middleware/simple_proxy.py +65 -65
  57. crawlo/mode_manager.py +187 -148
  58. crawlo/network/__init__.py +21 -21
  59. crawlo/network/request.py +379 -379
  60. crawlo/network/response.py +359 -359
  61. crawlo/pipelines/__init__.py +21 -21
  62. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  63. crawlo/pipelines/console_pipeline.py +39 -39
  64. crawlo/pipelines/csv_pipeline.py +316 -316
  65. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  66. crawlo/pipelines/json_pipeline.py +218 -218
  67. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  68. crawlo/pipelines/mongo_pipeline.py +131 -131
  69. crawlo/pipelines/mysql_pipeline.py +318 -318
  70. crawlo/pipelines/pipeline_manager.py +75 -75
  71. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  72. crawlo/project.py +325 -297
  73. crawlo/queue/pqueue.py +37 -37
  74. crawlo/queue/queue_manager.py +379 -379
  75. crawlo/queue/redis_priority_queue.py +306 -306
  76. crawlo/settings/__init__.py +7 -7
  77. crawlo/settings/default_settings.py +225 -225
  78. crawlo/settings/setting_manager.py +198 -198
  79. crawlo/spider/__init__.py +639 -639
  80. crawlo/stats_collector.py +59 -59
  81. crawlo/subscriber.py +129 -129
  82. crawlo/task_manager.py +30 -30
  83. crawlo/templates/crawlo.cfg.tmpl +10 -10
  84. crawlo/templates/project/__init__.py.tmpl +3 -3
  85. crawlo/templates/project/items.py.tmpl +17 -17
  86. crawlo/templates/project/middlewares.py.tmpl +118 -118
  87. crawlo/templates/project/pipelines.py.tmpl +96 -96
  88. crawlo/templates/project/settings.py.tmpl +266 -261
  89. crawlo/templates/project/settings_distributed.py.tmpl +179 -174
  90. crawlo/templates/project/settings_gentle.py.tmpl +60 -95
  91. crawlo/templates/project/settings_high_performance.py.tmpl +130 -125
  92. crawlo/templates/project/settings_minimal.py.tmpl +34 -29
  93. crawlo/templates/project/settings_simple.py.tmpl +101 -96
  94. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  95. crawlo/templates/run.py.tmpl +38 -47
  96. crawlo/templates/spider/spider.py.tmpl +143 -143
  97. crawlo/tools/__init__.py +200 -200
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/tools/data_formatter.py +225 -225
  101. crawlo/tools/data_validator.py +180 -180
  102. crawlo/tools/date_tools.py +289 -289
  103. crawlo/tools/distributed_coordinator.py +388 -388
  104. crawlo/tools/encoding_converter.py +127 -127
  105. crawlo/tools/request_tools.py +82 -82
  106. crawlo/tools/retry_mechanism.py +224 -224
  107. crawlo/tools/scenario_adapter.py +262 -262
  108. crawlo/tools/text_cleaner.py +232 -232
  109. crawlo/utils/__init__.py +34 -34
  110. crawlo/utils/batch_processor.py +259 -259
  111. crawlo/utils/controlled_spider_mixin.py +439 -439
  112. crawlo/utils/db_helper.py +343 -343
  113. crawlo/utils/enhanced_error_handler.py +356 -356
  114. crawlo/utils/env_config.py +142 -142
  115. crawlo/utils/error_handler.py +123 -123
  116. crawlo/utils/func_tools.py +82 -82
  117. crawlo/utils/large_scale_config.py +286 -286
  118. crawlo/utils/large_scale_helper.py +344 -344
  119. crawlo/utils/log.py +199 -146
  120. crawlo/utils/performance_monitor.py +285 -285
  121. crawlo/utils/queue_helper.py +175 -175
  122. crawlo/utils/redis_connection_pool.py +351 -351
  123. crawlo/utils/redis_key_validator.py +198 -198
  124. crawlo/utils/request.py +267 -267
  125. crawlo/utils/request_serializer.py +218 -218
  126. crawlo/utils/spider_loader.py +61 -61
  127. crawlo/utils/system.py +11 -11
  128. crawlo/utils/tools.py +4 -4
  129. crawlo/utils/url.py +39 -39
  130. {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/METADATA +1020 -1020
  131. crawlo-1.3.3.dist-info/RECORD +219 -0
  132. examples/__init__.py +7 -7
  133. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  134. tests/__init__.py +7 -7
  135. tests/advanced_tools_example.py +275 -275
  136. tests/authenticated_proxy_example.py +107 -107
  137. tests/cleaners_example.py +160 -160
  138. tests/config_validation_demo.py +142 -142
  139. tests/controlled_spider_example.py +205 -205
  140. tests/date_tools_example.py +180 -180
  141. tests/debug_pipelines.py +66 -66
  142. tests/dynamic_loading_example.py +523 -523
  143. tests/dynamic_loading_test.py +104 -104
  144. tests/env_config_example.py +133 -133
  145. tests/error_handling_example.py +171 -171
  146. tests/redis_key_validation_demo.py +130 -130
  147. tests/request_params_example.py +150 -150
  148. tests/response_improvements_example.py +144 -144
  149. tests/test_advanced_tools.py +148 -148
  150. tests/test_all_redis_key_configs.py +145 -145
  151. tests/test_authenticated_proxy.py +141 -141
  152. tests/test_cleaners.py +54 -54
  153. tests/test_comprehensive.py +146 -146
  154. tests/test_config_consistency.py +80 -80
  155. tests/test_config_merge.py +152 -152
  156. tests/test_config_validator.py +182 -182
  157. tests/test_crawlo_proxy_integration.py +108 -108
  158. tests/test_date_tools.py +123 -123
  159. tests/test_default_header_middleware.py +158 -158
  160. tests/test_distributed.py +65 -65
  161. tests/test_double_crawlo_fix.py +207 -207
  162. tests/test_double_crawlo_fix_simple.py +124 -124
  163. tests/test_download_delay_middleware.py +221 -221
  164. tests/test_downloader_proxy_compatibility.py +268 -268
  165. tests/test_dynamic_downloaders_proxy.py +124 -124
  166. tests/test_dynamic_proxy.py +92 -92
  167. tests/test_dynamic_proxy_config.py +146 -146
  168. tests/test_dynamic_proxy_real.py +109 -109
  169. tests/test_edge_cases.py +303 -303
  170. tests/test_enhanced_error_handler.py +270 -270
  171. tests/test_env_config.py +121 -121
  172. tests/test_error_handler_compatibility.py +112 -112
  173. tests/test_final_validation.py +153 -153
  174. tests/test_framework_env_usage.py +103 -103
  175. tests/test_integration.py +169 -169
  176. tests/test_item_dedup_redis_key.py +122 -122
  177. tests/test_mode_consistency.py +51 -51
  178. tests/test_offsite_middleware.py +221 -221
  179. tests/test_parsel.py +29 -29
  180. tests/test_performance.py +327 -327
  181. tests/test_proxy_api.py +264 -264
  182. tests/test_proxy_health_check.py +32 -32
  183. tests/test_proxy_middleware.py +121 -121
  184. tests/test_proxy_middleware_enhanced.py +216 -216
  185. tests/test_proxy_middleware_integration.py +136 -136
  186. tests/test_proxy_middleware_refactored.py +184 -184
  187. tests/test_proxy_providers.py +56 -56
  188. tests/test_proxy_stats.py +19 -19
  189. tests/test_proxy_strategies.py +59 -59
  190. tests/test_queue_manager_double_crawlo.py +173 -173
  191. tests/test_queue_manager_redis_key.py +176 -176
  192. tests/test_random_user_agent.py +72 -72
  193. tests/test_real_scenario_proxy.py +195 -195
  194. tests/test_redis_config.py +28 -28
  195. tests/test_redis_connection_pool.py +294 -294
  196. tests/test_redis_key_naming.py +181 -181
  197. tests/test_redis_key_validator.py +123 -123
  198. tests/test_redis_queue.py +224 -224
  199. tests/test_request_ignore_middleware.py +182 -182
  200. tests/test_request_params.py +111 -111
  201. tests/test_request_serialization.py +70 -70
  202. tests/test_response_code_middleware.py +349 -349
  203. tests/test_response_filter_middleware.py +427 -427
  204. tests/test_response_improvements.py +152 -152
  205. tests/test_retry_middleware.py +241 -241
  206. tests/test_scheduler.py +252 -252
  207. tests/test_scheduler_config_update.py +133 -133
  208. tests/test_simple_response.py +61 -61
  209. tests/test_telecom_spider_redis_key.py +205 -205
  210. tests/test_template_content.py +87 -87
  211. tests/test_template_redis_key.py +134 -134
  212. tests/test_tools.py +159 -159
  213. tests/test_user_agents.py +96 -96
  214. tests/tools_example.py +260 -260
  215. tests/verify_distributed.py +117 -117
  216. crawlo-1.3.1.dist-info/RECORD +0 -219
  217. {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/WHEEL +0 -0
  218. {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/entry_points.txt +0 -0
  219. {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/top_level.txt +0 -0
tests/test_performance.py CHANGED
@@ -1,328 +1,328 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- 性能测试
5
- 测试系统性能和瓶颈
6
- """
7
- import asyncio
8
- import sys
9
- import os
10
- import time
11
- import psutil
12
- import traceback
13
- from typing import List
14
-
15
- # 添加项目根目录到Python路径
16
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
17
-
18
- from crawlo.queue.redis_priority_queue import RedisPriorityQueue
19
- from crawlo.network.request import Request
20
- from crawlo.utils.redis_connection_pool import OptimizedRedisConnectionPool, get_redis_pool, close_all_pools
21
- from crawlo.utils.batch_processor import RedisBatchProcessor, BatchProcessor
22
- from crawlo.utils.performance_monitor import PerformanceMonitor, PerformanceTimer
23
-
24
-
25
- async def test_redis_queue_performance():
26
- """测试 Redis 队列性能"""
27
- print("测试 Redis 队列性能...")
28
-
29
- try:
30
- queue = RedisPriorityQueue(
31
- redis_url="redis://127.0.0.1:6379/15",
32
- queue_name="test:performance:queue"
33
- )
34
- await queue.connect()
35
-
36
- # 1. 测试批量入队性能
37
- print(" 测试批量入队性能...")
38
- start_time = time.time()
39
- request_count = 1000
40
-
41
- for i in range(request_count):
42
- request = Request(url=f"https://example{i}.com", priority=i % 10)
43
- await queue.put(request)
44
-
45
- end_time = time.time()
46
- duration = end_time - start_time
47
- rate = request_count / duration
48
-
49
- print(f" 入队 {request_count} 个请求耗时: {duration:.2f}秒")
50
- print(f" 入队速率: {rate:.1f} 请求/秒")
51
-
52
- # 2. 测试批量出队性能
53
- print(" 测试批量出队性能...")
54
- start_time = time.time()
55
-
56
- retrieved_count = 0
57
- while retrieved_count < request_count:
58
- request = await queue.get(timeout=1.0)
59
- if request:
60
- await queue.ack(request)
61
- retrieved_count += 1
62
- else:
63
- break
64
-
65
- end_time = time.time()
66
- duration = end_time - start_time
67
- rate = retrieved_count / duration if duration > 0 else 0
68
-
69
- print(f" 出队 {retrieved_count} 个请求耗时: {duration:.2f}秒")
70
- print(f" 出队速率: {rate:.1f} 请求/秒")
71
-
72
- await queue.close()
73
-
74
- # 性能标准:1000个请求应该在5秒内完成
75
- if duration < 5.0:
76
- print(" Redis 队列性能测试通过")
77
- return True
78
- else:
79
- print(" Redis 队列性能较低")
80
- return True # 仍然算通过,只是性能较低
81
-
82
- except Exception as e:
83
- print(f" Redis 队列性能测试失败: {e}")
84
- traceback.print_exc()
85
- return False
86
-
87
-
88
- async def test_redis_connection_pool_performance():
89
- """测试 Redis 连接池性能"""
90
- print("测试 Redis 连接池性能...")
91
-
92
- try:
93
- # 1. 测试连接获取性能
94
- print(" 测试连接获取性能...")
95
- start_time = time.time()
96
- connection_count = 100
97
-
98
- pools = []
99
- for i in range(connection_count):
100
- pool = get_redis_pool(f"redis://127.0.0.1:6379/15?db={i % 16}")
101
- pools.append(pool)
102
-
103
- end_time = time.time()
104
- duration = end_time - start_time
105
-
106
- print(f" 获取 {connection_count} 个连接耗时: {duration:.2f}秒")
107
-
108
- # 2. 测试连接复用性能
109
- print(" 测试连接复用性能...")
110
- start_time = time.time()
111
-
112
- # 重复获取相同连接
113
- for i in range(connection_count * 10):
114
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
115
- redis_client = await pool.get_connection()
116
- await redis_client.ping()
117
-
118
- end_time = time.time()
119
- duration = end_time - start_time
120
-
121
- print(f" 复用 {connection_count * 10} 次连接耗时: {duration:.2f}秒")
122
-
123
- # 3. 测试并发连接获取
124
- print(" 测试并发连接获取...")
125
-
126
- async def get_connection_worker(worker_id: int):
127
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
128
- redis_client = await pool.get_connection()
129
- await redis_client.ping()
130
- return True
131
-
132
- start_time = time.time()
133
- tasks = [get_connection_worker(i) for i in range(50)]
134
- results = await asyncio.gather(*tasks, return_exceptions=True)
135
- end_time = time.time()
136
-
137
- success_count = sum(1 for result in results if result is True)
138
- duration = end_time - start_time
139
-
140
- print(f" 并发获取 50 个连接耗时: {duration:.2f}秒")
141
- print(f" 成功获取: {success_count}/50")
142
-
143
- # 性能标准:并发获取应该在2秒内完成
144
- if duration < 2.0 and success_count >= 45:
145
- print(" Redis 连接池性能测试通过")
146
- return True
147
- else:
148
- print(" Redis 连接池性能较低")
149
- return True # 仍然算通过,只是性能较低
150
-
151
- except Exception as e:
152
- print(f" Redis 连接池性能测试失败: {e}")
153
- traceback.print_exc()
154
- return False
155
-
156
-
157
- async def test_batch_processor_performance():
158
- """测试批处理器性能"""
159
- print("测试批处理器性能...")
160
-
161
- try:
162
- # 创建连接池和批处理器
163
- pool = get_redis_pool("redis://127.0.0.1:6379/15")
164
- redis_client = await pool.get_connection()
165
- batch_processor = RedisBatchProcessor(redis_client, batch_size=100)
166
-
167
- # 1. 测试 Redis 批量设置性能
168
- print(" 测试 Redis 批量设置性能...")
169
- items_count = 1000
170
- items = [{"key": f"perf_test_key_{i}", "value": f"perf_test_value_{i}"} for i in range(items_count)]
171
-
172
- start_time = time.time()
173
- count = await batch_processor.batch_set(items)
174
- end_time = time.time()
175
-
176
- duration = end_time - start_time
177
- rate = count / duration if duration > 0 else 0
178
-
179
- print(f" 批量设置 {count} 个键值对耗时: {duration:.2f}秒")
180
- print(f" 设置速率: {rate:.1f} 键值对/秒")
181
-
182
- # 2. 测试 Redis 批量获取性能
183
- print(" 测试 Redis 批量获取性能...")
184
- keys = [f"perf_test_key_{i}" for i in range(items_count)]
185
-
186
- start_time = time.time()
187
- result = await batch_processor.batch_get(keys)
188
- end_time = time.time()
189
-
190
- duration = end_time - start_time
191
- rate = len(result) / duration if duration > 0 else 0
192
-
193
- print(f" 批量获取 {len(result)} 个键值对耗时: {duration:.2f}秒")
194
- print(f" 获取速率: {rate:.1f} 键值对/秒")
195
-
196
- # 3. 测试通用批处理器性能
197
- print(" 测试通用批处理器性能...")
198
-
199
- async def process_item(item: int) -> int:
200
- # 模拟一些处理工作
201
- await asyncio.sleep(0.001)
202
- return item * 2
203
-
204
- batch_processor_general = BatchProcessor(batch_size=50, max_concurrent_batches=10)
205
- items_to_process = list(range(1000))
206
-
207
- start_time = time.time()
208
- results = await batch_processor_general.process_in_batches(items_to_process, process_item)
209
- end_time = time.time()
210
-
211
- duration = end_time - start_time
212
- rate = len(results) / duration if duration > 0 else 0
213
-
214
- print(f" 批量处理 {len(results)} 个项目耗时: {duration:.2f}秒")
215
- print(f" 处理速率: {rate:.1f} 项目/秒")
216
-
217
- # 清理测试数据
218
- await redis_client.delete(*[f"perf_test_key_{i}" for i in range(items_count)])
219
-
220
- # 性能标准:批量操作应该在合理时间内完成
221
- if duration < 10.0:
222
- print(" 批处理器性能测试通过")
223
- return True
224
- else:
225
- print(" 批处理器性能较低")
226
- return True # 仍然算通过,只是性能较低
227
-
228
- except Exception as e:
229
- print(f" 批处理器性能测试失败: {e}")
230
- traceback.print_exc()
231
- return False
232
-
233
-
234
- async def test_performance_monitor_overhead():
235
- """测试性能监控器开销"""
236
- print("🔍 测试性能监控器开销...")
237
-
238
- try:
239
- monitor = PerformanceMonitor("test_monitor")
240
-
241
- # 1. 测试指标获取开销
242
- print(" 测试指标获取开销...")
243
- start_time = time.time()
244
-
245
- for i in range(100):
246
- metrics = monitor.get_system_metrics()
247
- assert isinstance(metrics, dict), "应该返回字典"
248
-
249
- end_time = time.time()
250
- duration = end_time - start_time
251
-
252
- print(f" 获取 100 次系统指标耗时: {duration:.2f}秒")
253
- print(f" 平均每次耗时: {duration * 1000 / 100:.2f}毫秒")
254
-
255
- # 2. 测试计时器开销
256
- print(" 测试计时器开销...")
257
-
258
- total_timer_time = 0
259
- timer_count = 1000
260
-
261
- for i in range(timer_count):
262
- start = time.time()
263
- with PerformanceTimer(f"test_timer_{i}"):
264
- pass # 空操作
265
- end = time.time()
266
- total_timer_time += (end - start)
267
-
268
- avg_timer_time = total_timer_time / timer_count * 1000 # 转换为毫秒
269
-
270
- print(f" 平均计时器开销: {avg_timer_time:.2f}毫秒")
271
-
272
- # 开销标准:平均计时器开销应该小于1毫秒
273
- if avg_timer_time < 1.0:
274
- print(" 性能监控器开销测试通过")
275
- return True
276
- else:
277
- print(" 性能监控器开销较高")
278
- return True # 仍然算通过,只是开销较高
279
-
280
- except Exception as e:
281
- print(f" 性能监控器开销测试失败: {e}")
282
- traceback.print_exc()
283
- return False
284
-
285
-
286
- async def main():
287
- """主测试函数"""
288
- print("开始性能测试...")
289
- print("=" * 50)
290
-
291
- tests = [
292
- test_redis_queue_performance,
293
- test_redis_connection_pool_performance,
294
- test_batch_processor_performance,
295
- test_performance_monitor_overhead,
296
- ]
297
-
298
- passed = 0
299
- total = len(tests)
300
-
301
- for test_func in tests:
302
- try:
303
- if await test_func():
304
- passed += 1
305
- print(f"{test_func.__name__} 通过")
306
- else:
307
- print(f"{test_func.__name__} 失败")
308
- except Exception as e:
309
- print(f"{test_func.__name__} 异常: {e}")
310
- print()
311
-
312
- # 关闭所有连接池
313
- await close_all_pools()
314
-
315
- print("=" * 50)
316
- print(f"性能测试结果: {passed}/{total} 通过")
317
-
318
- if passed == total:
319
- print("所有性能测试通过!")
320
- return 0
321
- else:
322
- print("部分性能测试失败,请检查实现")
323
- return 1
324
-
325
-
326
- if __name__ == "__main__":
327
- exit_code = asyncio.run(main())
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 性能测试
5
+ 测试系统性能和瓶颈
6
+ """
7
+ import asyncio
8
+ import sys
9
+ import os
10
+ import time
11
+ import psutil
12
+ import traceback
13
+ from typing import List
14
+
15
+ # 添加项目根目录到Python路径
16
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
17
+
18
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
19
+ from crawlo.network.request import Request
20
+ from crawlo.utils.redis_connection_pool import OptimizedRedisConnectionPool, get_redis_pool, close_all_pools
21
+ from crawlo.utils.batch_processor import RedisBatchProcessor, BatchProcessor
22
+ from crawlo.utils.performance_monitor import PerformanceMonitor, PerformanceTimer
23
+
24
+
25
+ async def test_redis_queue_performance():
26
+ """测试 Redis 队列性能"""
27
+ print("测试 Redis 队列性能...")
28
+
29
+ try:
30
+ queue = RedisPriorityQueue(
31
+ redis_url="redis://127.0.0.1:6379/15",
32
+ queue_name="test:performance:queue"
33
+ )
34
+ await queue.connect()
35
+
36
+ # 1. 测试批量入队性能
37
+ print(" 测试批量入队性能...")
38
+ start_time = time.time()
39
+ request_count = 1000
40
+
41
+ for i in range(request_count):
42
+ request = Request(url=f"https://example{i}.com", priority=i % 10)
43
+ await queue.put(request)
44
+
45
+ end_time = time.time()
46
+ duration = end_time - start_time
47
+ rate = request_count / duration
48
+
49
+ print(f" 入队 {request_count} 个请求耗时: {duration:.2f}秒")
50
+ print(f" 入队速率: {rate:.1f} 请求/秒")
51
+
52
+ # 2. 测试批量出队性能
53
+ print(" 测试批量出队性能...")
54
+ start_time = time.time()
55
+
56
+ retrieved_count = 0
57
+ while retrieved_count < request_count:
58
+ request = await queue.get(timeout=1.0)
59
+ if request:
60
+ await queue.ack(request)
61
+ retrieved_count += 1
62
+ else:
63
+ break
64
+
65
+ end_time = time.time()
66
+ duration = end_time - start_time
67
+ rate = retrieved_count / duration if duration > 0 else 0
68
+
69
+ print(f" 出队 {retrieved_count} 个请求耗时: {duration:.2f}秒")
70
+ print(f" 出队速率: {rate:.1f} 请求/秒")
71
+
72
+ await queue.close()
73
+
74
+ # 性能标准:1000个请求应该在5秒内完成
75
+ if duration < 5.0:
76
+ print(" Redis 队列性能测试通过")
77
+ return True
78
+ else:
79
+ print(" Redis 队列性能较低")
80
+ return True # 仍然算通过,只是性能较低
81
+
82
+ except Exception as e:
83
+ print(f" Redis 队列性能测试失败: {e}")
84
+ traceback.print_exc()
85
+ return False
86
+
87
+
88
+ async def test_redis_connection_pool_performance():
89
+ """测试 Redis 连接池性能"""
90
+ print("测试 Redis 连接池性能...")
91
+
92
+ try:
93
+ # 1. 测试连接获取性能
94
+ print(" 测试连接获取性能...")
95
+ start_time = time.time()
96
+ connection_count = 100
97
+
98
+ pools = []
99
+ for i in range(connection_count):
100
+ pool = get_redis_pool(f"redis://127.0.0.1:6379/15?db={i % 16}")
101
+ pools.append(pool)
102
+
103
+ end_time = time.time()
104
+ duration = end_time - start_time
105
+
106
+ print(f" 获取 {connection_count} 个连接耗时: {duration:.2f}秒")
107
+
108
+ # 2. 测试连接复用性能
109
+ print(" 测试连接复用性能...")
110
+ start_time = time.time()
111
+
112
+ # 重复获取相同连接
113
+ for i in range(connection_count * 10):
114
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
115
+ redis_client = await pool.get_connection()
116
+ await redis_client.ping()
117
+
118
+ end_time = time.time()
119
+ duration = end_time - start_time
120
+
121
+ print(f" 复用 {connection_count * 10} 次连接耗时: {duration:.2f}秒")
122
+
123
+ # 3. 测试并发连接获取
124
+ print(" 测试并发连接获取...")
125
+
126
+ async def get_connection_worker(worker_id: int):
127
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
128
+ redis_client = await pool.get_connection()
129
+ await redis_client.ping()
130
+ return True
131
+
132
+ start_time = time.time()
133
+ tasks = [get_connection_worker(i) for i in range(50)]
134
+ results = await asyncio.gather(*tasks, return_exceptions=True)
135
+ end_time = time.time()
136
+
137
+ success_count = sum(1 for result in results if result is True)
138
+ duration = end_time - start_time
139
+
140
+ print(f" 并发获取 50 个连接耗时: {duration:.2f}秒")
141
+ print(f" 成功获取: {success_count}/50")
142
+
143
+ # 性能标准:并发获取应该在2秒内完成
144
+ if duration < 2.0 and success_count >= 45:
145
+ print(" Redis 连接池性能测试通过")
146
+ return True
147
+ else:
148
+ print(" Redis 连接池性能较低")
149
+ return True # 仍然算通过,只是性能较低
150
+
151
+ except Exception as e:
152
+ print(f" Redis 连接池性能测试失败: {e}")
153
+ traceback.print_exc()
154
+ return False
155
+
156
+
157
+ async def test_batch_processor_performance():
158
+ """测试批处理器性能"""
159
+ print("测试批处理器性能...")
160
+
161
+ try:
162
+ # 创建连接池和批处理器
163
+ pool = get_redis_pool("redis://127.0.0.1:6379/15")
164
+ redis_client = await pool.get_connection()
165
+ batch_processor = RedisBatchProcessor(redis_client, batch_size=100)
166
+
167
+ # 1. 测试 Redis 批量设置性能
168
+ print(" 测试 Redis 批量设置性能...")
169
+ items_count = 1000
170
+ items = [{"key": f"perf_test_key_{i}", "value": f"perf_test_value_{i}"} for i in range(items_count)]
171
+
172
+ start_time = time.time()
173
+ count = await batch_processor.batch_set(items)
174
+ end_time = time.time()
175
+
176
+ duration = end_time - start_time
177
+ rate = count / duration if duration > 0 else 0
178
+
179
+ print(f" 批量设置 {count} 个键值对耗时: {duration:.2f}秒")
180
+ print(f" 设置速率: {rate:.1f} 键值对/秒")
181
+
182
+ # 2. 测试 Redis 批量获取性能
183
+ print(" 测试 Redis 批量获取性能...")
184
+ keys = [f"perf_test_key_{i}" for i in range(items_count)]
185
+
186
+ start_time = time.time()
187
+ result = await batch_processor.batch_get(keys)
188
+ end_time = time.time()
189
+
190
+ duration = end_time - start_time
191
+ rate = len(result) / duration if duration > 0 else 0
192
+
193
+ print(f" 批量获取 {len(result)} 个键值对耗时: {duration:.2f}秒")
194
+ print(f" 获取速率: {rate:.1f} 键值对/秒")
195
+
196
+ # 3. 测试通用批处理器性能
197
+ print(" 测试通用批处理器性能...")
198
+
199
+ async def process_item(item: int) -> int:
200
+ # 模拟一些处理工作
201
+ await asyncio.sleep(0.001)
202
+ return item * 2
203
+
204
+ batch_processor_general = BatchProcessor(batch_size=50, max_concurrent_batches=10)
205
+ items_to_process = list(range(1000))
206
+
207
+ start_time = time.time()
208
+ results = await batch_processor_general.process_in_batches(items_to_process, process_item)
209
+ end_time = time.time()
210
+
211
+ duration = end_time - start_time
212
+ rate = len(results) / duration if duration > 0 else 0
213
+
214
+ print(f" 批量处理 {len(results)} 个项目耗时: {duration:.2f}秒")
215
+ print(f" 处理速率: {rate:.1f} 项目/秒")
216
+
217
+ # 清理测试数据
218
+ await redis_client.delete(*[f"perf_test_key_{i}" for i in range(items_count)])
219
+
220
+ # 性能标准:批量操作应该在合理时间内完成
221
+ if duration < 10.0:
222
+ print(" 批处理器性能测试通过")
223
+ return True
224
+ else:
225
+ print(" 批处理器性能较低")
226
+ return True # 仍然算通过,只是性能较低
227
+
228
+ except Exception as e:
229
+ print(f" 批处理器性能测试失败: {e}")
230
+ traceback.print_exc()
231
+ return False
232
+
233
+
234
+ async def test_performance_monitor_overhead():
235
+ """测试性能监控器开销"""
236
+ print("🔍 测试性能监控器开销...")
237
+
238
+ try:
239
+ monitor = PerformanceMonitor("test_monitor")
240
+
241
+ # 1. 测试指标获取开销
242
+ print(" 测试指标获取开销...")
243
+ start_time = time.time()
244
+
245
+ for i in range(100):
246
+ metrics = monitor.get_system_metrics()
247
+ assert isinstance(metrics, dict), "应该返回字典"
248
+
249
+ end_time = time.time()
250
+ duration = end_time - start_time
251
+
252
+ print(f" 获取 100 次系统指标耗时: {duration:.2f}秒")
253
+ print(f" 平均每次耗时: {duration * 1000 / 100:.2f}毫秒")
254
+
255
+ # 2. 测试计时器开销
256
+ print(" 测试计时器开销...")
257
+
258
+ total_timer_time = 0
259
+ timer_count = 1000
260
+
261
+ for i in range(timer_count):
262
+ start = time.time()
263
+ with PerformanceTimer(f"test_timer_{i}"):
264
+ pass # 空操作
265
+ end = time.time()
266
+ total_timer_time += (end - start)
267
+
268
+ avg_timer_time = total_timer_time / timer_count * 1000 # 转换为毫秒
269
+
270
+ print(f" 平均计时器开销: {avg_timer_time:.2f}毫秒")
271
+
272
+ # 开销标准:平均计时器开销应该小于1毫秒
273
+ if avg_timer_time < 1.0:
274
+ print(" 性能监控器开销测试通过")
275
+ return True
276
+ else:
277
+ print(" 性能监控器开销较高")
278
+ return True # 仍然算通过,只是开销较高
279
+
280
+ except Exception as e:
281
+ print(f" 性能监控器开销测试失败: {e}")
282
+ traceback.print_exc()
283
+ return False
284
+
285
+
286
+ async def main():
287
+ """主测试函数"""
288
+ print("开始性能测试...")
289
+ print("=" * 50)
290
+
291
+ tests = [
292
+ test_redis_queue_performance,
293
+ test_redis_connection_pool_performance,
294
+ test_batch_processor_performance,
295
+ test_performance_monitor_overhead,
296
+ ]
297
+
298
+ passed = 0
299
+ total = len(tests)
300
+
301
+ for test_func in tests:
302
+ try:
303
+ if await test_func():
304
+ passed += 1
305
+ print(f"{test_func.__name__} 通过")
306
+ else:
307
+ print(f"{test_func.__name__} 失败")
308
+ except Exception as e:
309
+ print(f"{test_func.__name__} 异常: {e}")
310
+ print()
311
+
312
+ # 关闭所有连接池
313
+ await close_all_pools()
314
+
315
+ print("=" * 50)
316
+ print(f"性能测试结果: {passed}/{total} 通过")
317
+
318
+ if passed == total:
319
+ print("所有性能测试通过!")
320
+ return 0
321
+ else:
322
+ print("部分性能测试失败,请检查实现")
323
+ return 1
324
+
325
+
326
+ if __name__ == "__main__":
327
+ exit_code = asyncio.run(main())
328
328
  exit(exit_code)