crawlo 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +28 -1
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +61 -0
- crawlo/cleaners/data_formatter.py +226 -0
- crawlo/cleaners/encoding_converter.py +126 -0
- crawlo/cleaners/text_cleaner.py +233 -0
- crawlo/commands/startproject.py +117 -13
- crawlo/config.py +30 -0
- crawlo/config_validator.py +253 -0
- crawlo/core/engine.py +185 -11
- crawlo/core/scheduler.py +49 -78
- crawlo/crawler.py +6 -6
- crawlo/downloader/__init__.py +24 -0
- crawlo/downloader/aiohttp_downloader.py +8 -0
- crawlo/downloader/cffi_downloader.py +5 -0
- crawlo/downloader/hybrid_downloader.py +214 -0
- crawlo/downloader/playwright_downloader.py +403 -0
- crawlo/downloader/selenium_downloader.py +473 -0
- crawlo/extension/__init__.py +17 -10
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +27 -18
- crawlo/extension/log_stats.py +62 -24
- crawlo/extension/logging_extension.py +18 -9
- crawlo/extension/memory_monitor.py +105 -0
- crawlo/extension/performance_profiler.py +134 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/aioredis_filter.py +50 -12
- crawlo/middleware/proxy.py +26 -2
- crawlo/mode_manager.py +24 -19
- crawlo/network/request.py +30 -3
- crawlo/network/response.py +114 -25
- crawlo/pipelines/mongo_pipeline.py +81 -66
- crawlo/pipelines/mysql_pipeline.py +165 -43
- crawlo/pipelines/redis_dedup_pipeline.py +7 -3
- crawlo/queue/queue_manager.py +15 -2
- crawlo/queue/redis_priority_queue.py +144 -76
- crawlo/settings/default_settings.py +93 -121
- crawlo/subscriber.py +62 -37
- crawlo/templates/project/items.py.tmpl +1 -1
- crawlo/templates/project/middlewares.py.tmpl +73 -49
- crawlo/templates/project/pipelines.py.tmpl +51 -295
- crawlo/templates/project/settings.py.tmpl +93 -17
- crawlo/templates/project/settings_distributed.py.tmpl +120 -0
- crawlo/templates/project/settings_gentle.py.tmpl +95 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
- crawlo/templates/project/settings_simple.py.tmpl +69 -0
- crawlo/templates/spider/spider.py.tmpl +2 -38
- crawlo/tools/__init__.py +183 -0
- crawlo/tools/anti_crawler.py +269 -0
- crawlo/tools/authenticated_proxy.py +241 -0
- crawlo/tools/data_validator.py +181 -0
- crawlo/tools/date_tools.py +36 -0
- crawlo/tools/distributed_coordinator.py +387 -0
- crawlo/tools/retry_mechanism.py +221 -0
- crawlo/tools/scenario_adapter.py +263 -0
- crawlo/utils/__init__.py +29 -1
- crawlo/utils/batch_processor.py +261 -0
- crawlo/utils/date_tools.py +58 -1
- crawlo/utils/enhanced_error_handler.py +360 -0
- crawlo/utils/env_config.py +106 -0
- crawlo/utils/error_handler.py +126 -0
- crawlo/utils/performance_monitor.py +285 -0
- crawlo/utils/redis_connection_pool.py +335 -0
- crawlo/utils/redis_key_validator.py +200 -0
- crawlo-1.1.5.dist-info/METADATA +401 -0
- crawlo-1.1.5.dist-info/RECORD +185 -0
- tests/advanced_tools_example.py +276 -0
- tests/authenticated_proxy_example.py +237 -0
- tests/cleaners_example.py +161 -0
- tests/config_validation_demo.py +103 -0
- tests/date_tools_example.py +181 -0
- tests/dynamic_loading_example.py +524 -0
- tests/dynamic_loading_test.py +105 -0
- tests/env_config_example.py +134 -0
- tests/error_handling_example.py +172 -0
- tests/redis_key_validation_demo.py +131 -0
- tests/response_improvements_example.py +145 -0
- tests/test_advanced_tools.py +149 -0
- tests/test_all_redis_key_configs.py +146 -0
- tests/test_authenticated_proxy.py +142 -0
- tests/test_cleaners.py +55 -0
- tests/test_comprehensive.py +147 -0
- tests/test_config_validator.py +194 -0
- tests/test_date_tools.py +124 -0
- tests/test_dynamic_downloaders_proxy.py +125 -0
- tests/test_dynamic_proxy.py +93 -0
- tests/test_dynamic_proxy_config.py +147 -0
- tests/test_dynamic_proxy_real.py +110 -0
- tests/test_edge_cases.py +304 -0
- tests/test_enhanced_error_handler.py +271 -0
- tests/test_env_config.py +122 -0
- tests/test_error_handler_compatibility.py +113 -0
- tests/test_framework_env_usage.py +104 -0
- tests/test_integration.py +357 -0
- tests/test_item_dedup_redis_key.py +123 -0
- tests/test_parsel.py +30 -0
- tests/test_performance.py +328 -0
- tests/test_queue_manager_redis_key.py +177 -0
- tests/test_redis_connection_pool.py +295 -0
- tests/test_redis_key_naming.py +182 -0
- tests/test_redis_key_validator.py +124 -0
- tests/test_response_improvements.py +153 -0
- tests/test_simple_response.py +62 -0
- tests/test_telecom_spider_redis_key.py +206 -0
- tests/test_template_content.py +88 -0
- tests/test_template_redis_key.py +135 -0
- tests/test_tools.py +154 -0
- tests/tools_example.py +258 -0
- crawlo/core/enhanced_engine.py +0 -190
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
- {examples → tests}/controlled_spider_example.py +0 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
数据项去重Redis Key测试脚本
|
|
5
|
+
用于验证RedisDedupPipeline和示例项目中的Redis去重管道是否使用统一的Redis key命名规范
|
|
6
|
+
"""
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
import traceback
|
|
10
|
+
|
|
11
|
+
# 添加项目根目录到路径
|
|
12
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
13
|
+
|
|
14
|
+
from crawlo.pipelines.redis_dedup_pipeline import RedisDedupPipeline
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MockSettings:
    """Settings stand-in that answers a fixed set of configuration keys.

    Only ``PROJECT_NAME`` varies per instance; every other recognised key
    returns a hard-coded value. Unrecognised keys return the caller's default.
    """

    def __init__(self, project_name="test_project"):
        self.project_name = project_name

    def get(self, key, default=None):
        """Return the canned value for ``key``, or ``default`` if unknown."""
        # Built per call so PROJECT_NAME reflects the current attribute value.
        canned = (
            ('PROJECT_NAME', self.project_name),
            ('REDIS_HOST', 'localhost'),
            ('REDIS_PORT', 6379),
            ('REDIS_DB', 2),
            # A known key whose value is None: it wins over ``default``.
            ('REDIS_PASSWORD', None),
            ('LOG_LEVEL', 'INFO'),
        )
        for name, value in canned:
            if key == name:
                return value
        return default

    def getint(self, key, default=0):
        """Return the canned integer for ``key``, or ``default`` if unknown."""
        for name, value in (('REDIS_PORT', 6379), ('REDIS_DB', 2)):
            if key == name:
                return value
        return default
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class MockCrawler:
    """Crawler stand-in exposing only the ``settings`` attribute."""

    def __init__(self, project_name="test_project"):
        fake_settings = MockSettings(project_name)
        self.settings = fake_settings
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
async def test_item_dedup_redis_key():
    """Check the Redis key that RedisDedupPipeline derives from a project name.

    For each sample project a pipeline is built through ``from_crawler`` and
    its ``redis_key`` must equal ``crawlo:<project>:item:fingerprint``.

    Returns:
        True when every case matches, False when an assertion fails or any
        other exception is raised (the traceback is printed).
    """
    print("🔍 测试数据项去重Redis key命名规范...")

    # (project name, label shown in the progress output)
    scenarios = (
        ("books_distributed", "书籍分布式项目"),
        ("api_data_collection", "API数据采集项目"),
        ("test_project", "测试项目"),
    )

    try:
        for index, (project, label) in enumerate(scenarios, start=1):
            wanted = f"crawlo:{project}:item:fingerprint"
            print(f" {index}. 测试 {label}...")

            # Build the pipeline the same way the framework would.
            pipeline = RedisDedupPipeline.from_crawler(MockCrawler(project))

            # The key must follow the unified naming convention.
            assert pipeline.redis_key == wanted, \
                f"Redis key不匹配: {pipeline.redis_key} != {wanted}"

            print(f" ✅ Redis key: {pipeline.redis_key}")

        print("✅ 数据项去重Redis key命名规范测试通过!")
        return True

    except Exception as e:
        print(f"❌ 数据项去重Redis key命名规范测试失败: {e}")
        traceback.print_exc()
        return False
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def main():
    """Run the item-dedup Redis key check and report the outcome.

    Returns:
        0 when the check passes, 1 when it fails or raises.
    """
    # Imported locally because this script has no file-level asyncio import.
    import asyncio

    print("🚀 开始数据项去重Redis key命名规范测试...")
    print("=" * 50)

    try:
        # The check is a coroutine function, so it needs an event loop.
        # Calling it bare only creates a coroutine object, which is always
        # truthy: the check body would never run and the failure branch
        # below could never be reached.
        success = asyncio.run(test_item_dedup_redis_key())

        print("=" * 50)
        if success:
            print("🎉 所有测试通过!数据项去重使用统一的Redis key命名规范")
        else:
            print("❌ 测试失败,请检查实现")
            return 1

    except Exception as e:
        print("=" * 50)
        print(f"❌ 测试过程中发生异常: {e}")
        traceback.print_exc()
        return 1

    return 0
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
tests/test_parsel.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding:UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
parsel 库测试
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
# 添加项目根目录到路径
|
|
10
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
11
|
+
|
|
12
|
+
# Smoke test: import parsel, parse a tiny document, run one CSS query.
# Any failure (including a missing parsel install) is printed with a traceback.
try:
    from parsel import Selector, SelectorList
    print("parsel 导入成功")

    # Exercise the basic parse -> select -> extract path.
    sample_markup = "<html><body><h1>测试标题</h1></body></html>"
    document = Selector(text=sample_markup)
    print("Selector 创建成功")

    headings = document.css('h1')
    print("CSS 选择器执行成功")

    first_match = headings.get()
    print(f"获取文本: {first_match}")

except Exception as e:
    import traceback
    print(f"错误: {e}")
    traceback.print_exc()
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
性能测试
|
|
5
|
+
测试系统性能和瓶颈
|
|
6
|
+
"""
|
|
7
|
+
import asyncio
|
|
8
|
+
import sys
|
|
9
|
+
import os
|
|
10
|
+
import time
|
|
11
|
+
import psutil
|
|
12
|
+
import traceback
|
|
13
|
+
from typing import List
|
|
14
|
+
|
|
15
|
+
# 添加项目根目录到Python路径
|
|
16
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
17
|
+
|
|
18
|
+
from crawlo.queue.redis_priority_queue import RedisPriorityQueue
|
|
19
|
+
from crawlo.network.request import Request
|
|
20
|
+
from crawlo.utils.redis_connection_pool import OptimizedRedisConnectionPool, get_redis_pool, close_all_pools
|
|
21
|
+
from crawlo.utils.batch_processor import RedisBatchProcessor, BatchProcessor
|
|
22
|
+
from crawlo.utils.performance_monitor import PerformanceMonitor, PerformanceTimer
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
async def test_redis_queue_performance():
    """Time sequential enqueue and dequeue on a Redis-backed priority queue.

    Puts 1000 requests onto the queue one at a time, then gets and acks them
    one at a time, printing the elapsed time and rate of each phase. Uses
    the Redis instance at redis://127.0.0.1:6379/15.

    Returns:
        True when both phases complete without raising (a slow run only
        prints a warning), False when any step raises.
    """
    print("🔍 测试 Redis 队列性能...")

    # Tracks whether the error path still owes the queue a close() call.
    needs_close = False
    try:
        queue = RedisPriorityQueue(
            redis_url="redis://127.0.0.1:6379/15",
            queue_name="test:performance:queue"
        )
        await queue.connect()
        needs_close = True

        # 1. Enqueue phase
        print(" 📊 测试批量入队性能...")
        start_time = time.time()
        request_count = 1000

        for i in range(request_count):
            request = Request(url=f"https://example{i}.com", priority=i % 10)
            await queue.put(request)

        end_time = time.time()
        duration = end_time - start_time
        # Guarded like every other rate in this file: a coarse clock can
        # report a zero duration.
        rate = request_count / duration if duration > 0 else 0

        print(f" 入队 {request_count} 个请求耗时: {duration:.2f}秒")
        print(f" 入队速率: {rate:.1f} 请求/秒")

        # 2. Dequeue phase
        print(" 📊 测试批量出队性能...")
        start_time = time.time()

        retrieved_count = 0
        while retrieved_count < request_count:
            request = await queue.get(timeout=1.0)
            if request:
                await queue.ack(request)
                retrieved_count += 1
            else:
                # Nothing arrived within the timeout: stop early.
                break

        end_time = time.time()
        duration = end_time - start_time
        rate = retrieved_count / duration if duration > 0 else 0

        print(f" 出队 {retrieved_count} 个请求耗时: {duration:.2f}秒")
        print(f" 出队速率: {rate:.1f} 请求/秒")

        # The close below is the single release attempt on the success path.
        needs_close = False
        await queue.close()

        # Threshold check. ``duration`` here is the dequeue phase only,
        # and a slow run is reported but still counts as a pass.
        if duration < 5.0:
            print(" ✅ Redis 队列性能测试通过")
            return True
        else:
            print(" ⚠️ Redis 队列性能较低")
            return True

    except Exception as e:
        print(f" ❌ Redis 队列性能测试失败: {e}")
        traceback.print_exc()
        if needs_close:
            # Best-effort release so a failed run does not leak the
            # connection; a failure here must not mask the original error.
            try:
                await queue.close()
            except Exception:
                traceback.print_exc()
        return False
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
async def test_redis_connection_pool_performance():
    """Time pool lookup, repeated connection reuse and concurrent acquisition.

    Three phases run against redis://127.0.0.1:6379/15: 100 ``get_redis_pool``
    calls with varying URLs, 1000 get-connection-and-ping rounds on one URL,
    and 50 concurrent get-connection-and-ping workers.

    Returns:
        True when all phases run without raising (a slow run only prints a
        warning), False when any step raises outside the gathered workers.
    """
    print("🔍 测试 Redis 连接池性能...")

    try:
        # 1. Pool lookup
        print(" 📊 测试连接获取性能...")
        began = time.time()
        connection_count = 100

        pools = [
            get_redis_pool(f"redis://127.0.0.1:6379/15?db={i % 16}")
            for i in range(connection_count)
        ]

        duration = time.time() - began

        print(f" 获取 {connection_count} 个连接耗时: {duration:.2f}秒")

        # 2. Reuse: ask for the same pool over and over and ping through it.
        print(" 📊 测试连接复用性能...")
        began = time.time()

        for _ in range(connection_count * 10):
            pool = get_redis_pool("redis://127.0.0.1:6379/15")
            redis_client = await pool.get_connection()
            await redis_client.ping()

        duration = time.time() - began

        print(f" 复用 {connection_count * 10} 次连接耗时: {duration:.2f}秒")

        # 3. Concurrent acquisition
        print(" 📊 测试并发连接获取...")

        async def get_connection_worker(worker_id: int):
            worker_pool = get_redis_pool("redis://127.0.0.1:6379/15")
            worker_client = await worker_pool.get_connection()
            await worker_client.ping()
            return True

        began = time.time()
        pending = [get_connection_worker(i) for i in range(50)]
        # Worker exceptions are returned as results, not raised.
        results = await asyncio.gather(*pending, return_exceptions=True)
        finished = time.time()

        success_count = len([outcome for outcome in results if outcome is True])
        duration = finished - began

        print(f" 并发获取 50 个连接耗时: {duration:.2f}秒")
        print(f" 成功获取: {success_count}/50")

        # Threshold: under 2 seconds with at least 45 successes. Missing it
        # is reported but still counts as a pass.
        fast_enough = duration < 2.0 and success_count >= 45
        if not fast_enough:
            print(" ⚠️ Redis 连接池性能较低")
        else:
            print(" ✅ Redis 连接池性能测试通过")
        return True

    except Exception as e:
        print(f" ❌ Redis 连接池性能测试失败: {e}")
        traceback.print_exc()
        return False
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
async def test_batch_processor_performance():
    """Time Redis batch set/get and the generic batch processor.

    Writes and reads back 1000 ``perf_test_key_*`` keys through
    ``RedisBatchProcessor``, then pushes 1000 integers through
    ``BatchProcessor``. The Redis keys are deleted afterwards whether or not
    a phase failed.

    Returns:
        True when all phases and the cleanup run without raising (a slow
        run only prints a warning), False otherwise.
    """
    print("🔍 测试批处理器性能...")

    try:
        # Connection pool and Redis batch processor
        pool = get_redis_pool("redis://127.0.0.1:6379/15")
        redis_client = await pool.get_connection()
        batch_processor = RedisBatchProcessor(redis_client, batch_size=100)

        items_count = 1000
        # Built once; shared by the read phase and the cleanup.
        keys = [f"perf_test_key_{i}" for i in range(items_count)]

        try:
            # 1. Redis batch set
            print(" 📊 测试 Redis 批量设置性能...")
            items = [{"key": f"perf_test_key_{i}", "value": f"perf_test_value_{i}"} for i in range(items_count)]

            start_time = time.time()
            count = await batch_processor.batch_set(items)
            end_time = time.time()

            duration = end_time - start_time
            rate = count / duration if duration > 0 else 0

            print(f" 批量设置 {count} 个键值对耗时: {duration:.2f}秒")
            print(f" 设置速率: {rate:.1f} 键值对/秒")

            # 2. Redis batch get
            print(" 📊 测试 Redis 批量获取性能...")

            start_time = time.time()
            result = await batch_processor.batch_get(keys)
            end_time = time.time()

            duration = end_time - start_time
            rate = len(result) / duration if duration > 0 else 0

            print(f" 批量获取 {len(result)} 个键值对耗时: {duration:.2f}秒")
            print(f" 获取速率: {rate:.1f} 键值对/秒")

            # 3. Generic batch processor
            print(" 📊 测试通用批处理器性能...")

            async def process_item(item: int) -> int:
                # Simulate a small amount of work per item.
                await asyncio.sleep(0.001)
                return item * 2

            batch_processor_general = BatchProcessor(batch_size=50, max_concurrent_batches=10)
            items_to_process = list(range(1000))

            start_time = time.time()
            results = await batch_processor_general.process_in_batches(items_to_process, process_item)
            end_time = time.time()

            duration = end_time - start_time
            rate = len(results) / duration if duration > 0 else 0

            print(f" 批量处理 {len(results)} 个项目耗时: {duration:.2f}秒")
            print(f" 处理速率: {rate:.1f} 项目/秒")
        finally:
            # Remove the benchmark keys even when a phase above raised, so a
            # failed run does not leave 1000 stray keys in the test database.
            # A failure here is still caught by the outer handler.
            await redis_client.delete(*keys)

        # Threshold check. ``duration`` here is the generic-processor phase
        # only, and a slow run is reported but still counts as a pass.
        if duration < 10.0:
            print(" ✅ 批处理器性能测试通过")
            return True
        else:
            print(" ⚠️ 批处理器性能较低")
            return True

    except Exception as e:
        print(f" ❌ 批处理器性能测试失败: {e}")
        traceback.print_exc()
        return False
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
async def test_performance_monitor_overhead():
    """Measure the cost of reading system metrics and of an empty timer.

    Calls ``PerformanceMonitor.get_system_metrics`` 100 times and enters an
    empty ``PerformanceTimer`` context 1000 times, printing the averages.

    Returns:
        True when both measurements run without raising (a high overhead
        only prints a warning), False otherwise.
    """
    print("🔍 测试性能监控器开销...")

    try:
        monitor = PerformanceMonitor("test_monitor")

        # 1. Metrics collection
        print(" 📊 测试指标获取开销...")
        # perf_counter is used because these intervals are tiny:
        # time.time() is a wall clock that can be coarse and can jump.
        start_time = time.perf_counter()

        for _ in range(100):
            metrics = monitor.get_system_metrics()
            assert isinstance(metrics, dict), "应该返回字典"

        end_time = time.perf_counter()
        duration = end_time - start_time

        print(f" 获取 100 次系统指标耗时: {duration:.2f}秒")
        print(f" 平均每次耗时: {duration * 1000 / 100:.2f}毫秒")

        # 2. Timer context overhead
        print(" 📊 测试计时器开销...")

        total_timer_time = 0
        timer_count = 1000

        for i in range(timer_count):
            start = time.perf_counter()
            with PerformanceTimer(f"test_timer_{i}"):
                pass  # empty body: only the timer's own cost is measured
            end = time.perf_counter()
            total_timer_time += (end - start)

        avg_timer_time = total_timer_time / timer_count * 1000  # seconds -> ms

        print(f" 平均计时器开销: {avg_timer_time:.2f}毫秒")

        # Threshold: under 1 ms on average. Missing it is reported but
        # still counts as a pass.
        if avg_timer_time < 1.0:
            print(" ✅ 性能监控器开销测试通过")
            return True
        else:
            print(" ⚠️ 性能监控器开销较高")
            return True

    except Exception as e:
        print(f" ❌ 性能监控器开销测试失败: {e}")
        traceback.print_exc()
        return False
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
async def main():
    """Run every performance check in order and summarise the results.

    Returns:
        0 when every check returned a truthy value, otherwise 1.
    """
    print("🚀 开始性能测试...")
    print("=" * 50)

    suite = (
        test_redis_queue_performance,
        test_redis_connection_pool_performance,
        test_batch_processor_performance,
        test_performance_monitor_overhead,
    )
    total = len(suite)
    passed = 0

    for check in suite:
        try:
            verdict = await check()
            if not verdict:
                print(f"❌ {check.__name__} 失败")
            else:
                passed += 1
                print(f"✅ {check.__name__} 通过")
        except Exception as e:
            # A check that raises is reported and counted as not passed.
            print(f"❌ {check.__name__} 异常: {e}")
        print()

    # Close every connection pool the checks opened.
    await close_all_pools()

    print("=" * 50)
    print(f"📊 性能测试结果: {passed}/{total} 通过")

    if passed != total:
        print("❌ 部分性能测试失败,请检查实现")
        return 1

    print("🎉 所有性能测试通过!")
    return 0
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    exit_code = asyncio.run(main())
    # sys.exit rather than the bare exit(): the latter is injected by the
    # site module for interactive use and is absent under ``python -S``.
    sys.exit(exit_code)
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
QueueManager Redis Key 测试脚本
|
|
5
|
+
用于验证QueueManager在创建RedisPriorityQueue时是否正确传递module_name参数
|
|
6
|
+
"""
|
|
7
|
+
import asyncio
|
|
8
|
+
import sys
|
|
9
|
+
import os
|
|
10
|
+
import traceback
|
|
11
|
+
|
|
12
|
+
# 添加项目根目录到路径
|
|
13
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
14
|
+
|
|
15
|
+
from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MockSettings:
    """Settings stand-in backed by a handful of instance attributes.

    ``get``, ``get_bool`` and ``get_int`` each recognise a fixed set of keys
    and return the caller's default for anything else.
    """

    def __init__(self, project_name="test_project"):
        self.project_name = project_name
        self.REDIS_URL = "redis://127.0.0.1:6379/15"  # test database
        self.REDIS_TTL = 0
        self.CLEANUP_FP = 0
        self.FILTER_DEBUG = True
        self.LOG_LEVEL = "INFO"
        self.DECODE_RESPONSES = True

    def get(self, key, default=None):
        """Return the setting for ``key``, or ``default`` if unknown."""
        if key == 'PROJECT_NAME':
            return self.project_name
        # These keys map straight onto the attribute of the same name.
        for name in ('REDIS_URL', 'FILTER_DEBUG', 'LOG_LEVEL', 'DECODE_RESPONSES'):
            if key == name:
                return getattr(self, name)
        return default

    def get_bool(self, key, default=False):
        """Return the stored flag for ``key``, or ``default`` if unknown."""
        for name in ('FILTER_DEBUG', 'DECODE_RESPONSES', 'CLEANUP_FP'):
            if key == name:
                return getattr(self, name)
        return default

    def get_int(self, key, default=0):
        """Return the integer setting for ``key``, or ``default`` if unknown."""
        if key == 'REDIS_TTL':
            return self.REDIS_TTL
        fixed = (
            ('REDIS_PORT', 6379),
            ('REDIS_DB', 0),
            ('SCHEDULER_MAX_QUEUE_SIZE', 1000),
            ('QUEUE_MAX_RETRIES', 3),
            ('QUEUE_TIMEOUT', 300),
        )
        for name, value in fixed:
            if key == name:
                return value
        return default
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def test_queue_manager_redis_key():
    """Check the names QueueManager gives a Redis queue for several inputs.

    For each configured queue name a queue is created through
    ``QueueManager._create_queue`` and its ``module_name`` plus its three
    queue keys (``requests``, ``processing``, ``failed``) are compared with
    the expected ``crawlo:<module>:queue:<kind>`` form.

    Returns:
        True when every case matches, False when an assertion fails or any
        other exception is raised (the traceback is printed).
    """
    print("🔍 测试QueueManager创建Redis队列时的key命名...")

    # (configured queue name, expected module name, label for the output)
    scenarios = (
        ("crawlo:books_distributed:queue:requests", "books_distributed", "标准项目名称"),
        ("crawlo:api_data_collection:queue:requests", "api_data_collection", "API数据采集项目"),
        ("crawlo:test_project:queue:requests", "test_project", "测试项目"),
        ("simple_queue_name", "simple_queue_name", "简单队列名称"),
        ("", "default", "空队列名称"),
    )

    try:
        for index, (configured_name, wanted_module, label) in enumerate(scenarios, start=1):
            print(f" {index}. 测试 {label}...")

            config = QueueConfig(
                queue_type=QueueType.REDIS,
                redis_url="redis://127.0.0.1:6379/15",
                queue_name=configured_name,
                max_queue_size=1000,
                max_retries=3,
                timeout=300
            )
            queue_manager = QueueManager(config)
            queue = await queue_manager._create_queue(QueueType.REDIS)

            # The module name must be derived from the configured name.
            assert hasattr(queue, 'module_name'), "RedisPriorityQueue缺少module_name属性"
            assert queue.module_name == wanted_module, \
                f"module_name不匹配: {queue.module_name} != {wanted_module}"

            # All three keys share the crawlo:<module>:queue: prefix.
            prefix = f"crawlo:{queue.module_name}:queue:"
            wanted_requests = prefix + "requests"
            wanted_processing = prefix + "processing"
            wanted_failed = prefix + "failed"

            assert queue.queue_name == wanted_requests, \
                f"队列名称不匹配: {queue.queue_name} != {wanted_requests}"
            assert queue.processing_queue == wanted_processing, \
                f"处理中队列名称不匹配: {queue.processing_queue} != {wanted_processing}"
            assert queue.failed_queue == wanted_failed, \
                f"失败队列名称不匹配: {queue.failed_queue} != {wanted_failed}"

            print(f" ✅ module_name: {queue.module_name}")
            print(f" ✅ 队列名称: {queue.queue_name}")
            print(f" ✅ 处理中队列名称: {queue.processing_queue}")
            print(f" ✅ 失败队列名称: {queue.failed_queue}")

        print("✅ QueueManager Redis key命名测试通过!")
        return True

    except Exception as e:
        print(f"❌ QueueManager Redis key命名测试失败: {e}")
        traceback.print_exc()
        return False
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
async def main():
    """Run the QueueManager Redis key check and report the outcome.

    Returns:
        0 when the check passes, 1 when it fails or raises.
    """
    divider = "=" * 50
    print("🚀 开始QueueManager Redis key命名测试...")
    print(divider)

    try:
        passed = await test_queue_manager_redis_key()

        print(divider)
        if not passed:
            print("❌ 测试失败,请检查实现")
            return 1
        print("🎉 所有测试通过!QueueManager正确传递module_name参数")

    except Exception as e:
        print(divider)
        print(f"❌ 测试过程中发生异常: {e}")
        traceback.print_exc()
        return 1

    return 0
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
|