crawlo 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +93 -93
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +438 -439
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +291 -257
- crawlo/crawler.py +650 -650
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +233 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +63 -63
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +45 -37
- crawlo/logging/async_handler.py +181 -0
- crawlo/logging/config.py +196 -96
- crawlo/logging/factory.py +171 -128
- crawlo/logging/manager.py +111 -111
- crawlo/logging/monitor.py +153 -0
- crawlo/logging/sampler.py +167 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +219 -219
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +100 -84
- crawlo/pipelines/redis_dedup_pipeline.py +156 -156
- crawlo/project.py +349 -338
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +526 -522
- crawlo/queue/redis_priority_queue.py +370 -367
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -170
- crawlo/templates/project/settings_distributed.py.tmpl +169 -169
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/fingerprint.py +122 -122
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.4.3.dist-info/METADATA +190 -0
- crawlo-1.4.3.dist-info/RECORD +326 -0
- examples/__init__.py +7 -7
- examples/test_project/__init__.py +7 -7
- examples/test_project/run.py +34 -34
- examples/test_project/test_project/__init__.py +3 -3
- examples/test_project/test_project/items.py +17 -17
- examples/test_project/test_project/middlewares.py +118 -118
- examples/test_project/test_project/pipelines.py +96 -96
- examples/test_project/test_project/settings.py +169 -169
- examples/test_project/test_project/spiders/__init__.py +9 -9
- examples/test_project/test_project/spiders/of_week_dis.py +143 -143
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +125 -0
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +375 -0
- tests/test_logging_final.py +185 -0
- tests/test_logging_integration.py +313 -0
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +142 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +212 -0
- tests/test_priority_consistency.py +152 -0
- tests/test_priority_consistency_fixed.py +250 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +131 -0
- tests/test_random_headers_default.py +323 -0
- tests/test_random_headers_necessity.py +309 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +130 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +334 -242
- tests/test_retry_middleware_realistic.py +274 -0
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +177 -0
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.4.1.dist-info/METADATA +0 -1199
- crawlo-1.4.1.dist-info/RECORD +0 -309
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
|
@@ -1,84 +1,84 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
测试增强后的 get_component_logger 函数
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import os
|
|
8
|
-
import sys
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
|
|
11
|
-
# 添加项目根目录到Python路径
|
|
12
|
-
project_root = Path(__file__).parent.parent
|
|
13
|
-
sys.path.insert(0, str(project_root))
|
|
14
|
-
|
|
15
|
-
from crawlo.utils.log import get_component_logger
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class MockComponent:
|
|
19
|
-
"""模拟组件类"""
|
|
20
|
-
pass
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class MockSettings:
|
|
24
|
-
"""模拟设置类"""
|
|
25
|
-
def __init__(self):
|
|
26
|
-
self.LOG_LEVEL = 'DEBUG'
|
|
27
|
-
self.LOG_LEVEL_MockComponent = 'WARNING'
|
|
28
|
-
|
|
29
|
-
def get(self, key, default=None):
|
|
30
|
-
return getattr(self, key, default)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def test_get_component_logger():
|
|
34
|
-
"""测试 get_component_logger 函数"""
|
|
35
|
-
print("=== 测试 get_component_logger 函数 ===")
|
|
36
|
-
|
|
37
|
-
# 1. 测试基本用法
|
|
38
|
-
print("1. 测试基本用法...")
|
|
39
|
-
logger1 = get_component_logger(MockComponent)
|
|
40
|
-
print(f" Logger名称: {logger1.name}")
|
|
41
|
-
print(f" Logger级别: {logger1.level}")
|
|
42
|
-
|
|
43
|
-
# 2. 测试带settings的用法
|
|
44
|
-
print("2. 测试带settings的用法...")
|
|
45
|
-
settings = MockSettings()
|
|
46
|
-
logger2 = get_component_logger(MockComponent, settings)
|
|
47
|
-
print(f" Logger名称: {logger2.name}")
|
|
48
|
-
print(f" Logger级别: {logger2.level}")
|
|
49
|
-
|
|
50
|
-
# 3. 测试带level参数的用法
|
|
51
|
-
print("3. 测试带level参数的用法...")
|
|
52
|
-
logger3 = get_component_logger(MockComponent, level='ERROR')
|
|
53
|
-
print(f" Logger名称: {logger3.name}")
|
|
54
|
-
print(f" Logger级别: {logger3.level}")
|
|
55
|
-
|
|
56
|
-
# 4. 测试日志输出
|
|
57
|
-
print("4. 测试日志输出...")
|
|
58
|
-
logger1.info("这是info级别的测试消息")
|
|
59
|
-
logger1.warning("这是warning级别的测试消息")
|
|
60
|
-
logger1.error("这是error级别的测试消息")
|
|
61
|
-
|
|
62
|
-
print("\n=== 测试完成 ===")
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def main():
|
|
66
|
-
"""主函数"""
|
|
67
|
-
print("开始测试增强后的 get_component_logger 函数...")
|
|
68
|
-
|
|
69
|
-
try:
|
|
70
|
-
test_get_component_logger()
|
|
71
|
-
|
|
72
|
-
print("\n=== 所有测试完成 ===")
|
|
73
|
-
|
|
74
|
-
except Exception as e:
|
|
75
|
-
print(f"\n测试过程中出现错误: {e}")
|
|
76
|
-
import traceback
|
|
77
|
-
traceback.print_exc()
|
|
78
|
-
return 1
|
|
79
|
-
|
|
80
|
-
return 0
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
if __name__ == '__main__':
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
测试增强后的 get_component_logger 函数
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
# 添加项目根目录到Python路径
|
|
12
|
+
project_root = Path(__file__).parent.parent
|
|
13
|
+
sys.path.insert(0, str(project_root))
|
|
14
|
+
|
|
15
|
+
from crawlo.utils.log import get_component_logger
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MockComponent:
|
|
19
|
+
"""模拟组件类"""
|
|
20
|
+
pass
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MockSettings:
|
|
24
|
+
"""模拟设置类"""
|
|
25
|
+
def __init__(self):
|
|
26
|
+
self.LOG_LEVEL = 'DEBUG'
|
|
27
|
+
self.LOG_LEVEL_MockComponent = 'WARNING'
|
|
28
|
+
|
|
29
|
+
def get(self, key, default=None):
|
|
30
|
+
return getattr(self, key, default)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_get_component_logger():
|
|
34
|
+
"""测试 get_component_logger 函数"""
|
|
35
|
+
print("=== 测试 get_component_logger 函数 ===")
|
|
36
|
+
|
|
37
|
+
# 1. 测试基本用法
|
|
38
|
+
print("1. 测试基本用法...")
|
|
39
|
+
logger1 = get_component_logger(MockComponent)
|
|
40
|
+
print(f" Logger名称: {logger1.name}")
|
|
41
|
+
print(f" Logger级别: {logger1.level}")
|
|
42
|
+
|
|
43
|
+
# 2. 测试带settings的用法
|
|
44
|
+
print("2. 测试带settings的用法...")
|
|
45
|
+
settings = MockSettings()
|
|
46
|
+
logger2 = get_component_logger(MockComponent, settings)
|
|
47
|
+
print(f" Logger名称: {logger2.name}")
|
|
48
|
+
print(f" Logger级别: {logger2.level}")
|
|
49
|
+
|
|
50
|
+
# 3. 测试带level参数的用法
|
|
51
|
+
print("3. 测试带level参数的用法...")
|
|
52
|
+
logger3 = get_component_logger(MockComponent, level='ERROR')
|
|
53
|
+
print(f" Logger名称: {logger3.name}")
|
|
54
|
+
print(f" Logger级别: {logger3.level}")
|
|
55
|
+
|
|
56
|
+
# 4. 测试日志输出
|
|
57
|
+
print("4. 测试日志输出...")
|
|
58
|
+
logger1.info("这是info级别的测试消息")
|
|
59
|
+
logger1.warning("这是warning级别的测试消息")
|
|
60
|
+
logger1.error("这是error级别的测试消息")
|
|
61
|
+
|
|
62
|
+
print("\n=== 测试完成 ===")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def main():
|
|
66
|
+
"""主函数"""
|
|
67
|
+
print("开始测试增强后的 get_component_logger 函数...")
|
|
68
|
+
|
|
69
|
+
try:
|
|
70
|
+
test_get_component_logger()
|
|
71
|
+
|
|
72
|
+
print("\n=== 所有测试完成 ===")
|
|
73
|
+
|
|
74
|
+
except Exception as e:
|
|
75
|
+
print(f"\n测试过程中出现错误: {e}")
|
|
76
|
+
import traceback
|
|
77
|
+
traceback.print_exc()
|
|
78
|
+
return 1
|
|
79
|
+
|
|
80
|
+
return 0
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
if __name__ == '__main__':
|
|
84
84
|
sys.exit(main())
|
tests/test_hash_performance.py
CHANGED
|
@@ -1,100 +1,100 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding:UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
SHA256 vs MD5 性能对比测试
|
|
5
|
-
=====================
|
|
6
|
-
测试在爬虫场景中两种哈希算法的性能差异
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import hashlib
|
|
10
|
-
import time
|
|
11
|
-
from collections import namedtuple
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
# 创建测试数据
|
|
15
|
-
TestItem = namedtuple('TestItem', ['title', 'url', 'content', 'price', 'tags'])
|
|
16
|
-
|
|
17
|
-
def create_test_items(count=10000):
|
|
18
|
-
"""创建测试数据项"""
|
|
19
|
-
items = []
|
|
20
|
-
for i in range(count):
|
|
21
|
-
item = TestItem(
|
|
22
|
-
title=f"Test Title {i}",
|
|
23
|
-
url=f"https://example.com/page/{i}",
|
|
24
|
-
content=f"This is test content number {i} with some additional text to make it longer",
|
|
25
|
-
price=99.99 + i,
|
|
26
|
-
tags=[f"tag{j}" for j in range(5)]
|
|
27
|
-
)
|
|
28
|
-
items.append(item)
|
|
29
|
-
return items
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def md5_fingerprint(data):
|
|
33
|
-
"""使用MD5生成指纹"""
|
|
34
|
-
if hasattr(data, '_asdict'):
|
|
35
|
-
data_dict = data._asdict()
|
|
36
|
-
else:
|
|
37
|
-
data_dict = {'__data__': str(data)}
|
|
38
|
-
|
|
39
|
-
sorted_items = sorted(data_dict.items())
|
|
40
|
-
fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
|
|
41
|
-
return hashlib.md5(fingerprint_string.encode('utf-8')).hexdigest()
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def sha256_fingerprint(data):
|
|
45
|
-
"""使用SHA256生成指纹"""
|
|
46
|
-
if hasattr(data, '_asdict'):
|
|
47
|
-
data_dict = data._asdict()
|
|
48
|
-
else:
|
|
49
|
-
data_dict = {'__data__': str(data)}
|
|
50
|
-
|
|
51
|
-
sorted_items = sorted(data_dict.items())
|
|
52
|
-
fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
|
|
53
|
-
return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def performance_test():
|
|
57
|
-
"""性能测试"""
|
|
58
|
-
print("开始哈希算法性能测试...")
|
|
59
|
-
print("=" * 50)
|
|
60
|
-
|
|
61
|
-
# 创建测试数据
|
|
62
|
-
test_items = create_test_items(10000)
|
|
63
|
-
|
|
64
|
-
# 测试MD5性能
|
|
65
|
-
start_time = time.time()
|
|
66
|
-
md5_results = []
|
|
67
|
-
for item in test_items:
|
|
68
|
-
fingerprint = md5_fingerprint(item)
|
|
69
|
-
md5_results.append(fingerprint)
|
|
70
|
-
md5_time = time.time() - start_time
|
|
71
|
-
|
|
72
|
-
# 测试SHA256性能
|
|
73
|
-
start_time = time.time()
|
|
74
|
-
sha256_results = []
|
|
75
|
-
for item in test_items:
|
|
76
|
-
fingerprint = sha256_fingerprint(item)
|
|
77
|
-
sha256_results.append(fingerprint)
|
|
78
|
-
sha256_time = time.time() - start_time
|
|
79
|
-
|
|
80
|
-
# 输出结果
|
|
81
|
-
print(f"测试数据量: {len(test_items)} 条")
|
|
82
|
-
print(f"MD5 耗时: {md5_time:.4f} 秒")
|
|
83
|
-
print(f"SHA256 耗时: {sha256_time:.4f} 秒")
|
|
84
|
-
print(f"性能差异: {((sha256_time - md5_time) / md5_time * 100):.2f}%")
|
|
85
|
-
|
|
86
|
-
# 验证结果一致性
|
|
87
|
-
print("\n验证指纹长度:")
|
|
88
|
-
print(f"MD5 指纹长度: {len(md5_results[0])} 字符")
|
|
89
|
-
print(f"SHA256 指纹长度: {len(sha256_results[0])} 字符")
|
|
90
|
-
|
|
91
|
-
# 检查是否有重复指纹(理论上不应该有)
|
|
92
|
-
md5_unique = len(set(md5_results))
|
|
93
|
-
sha256_unique = len(set(sha256_results))
|
|
94
|
-
print(f"\n唯一指纹数量:")
|
|
95
|
-
print(f"MD5: {md5_unique}/{len(test_items)} ({md5_unique/len(test_items)*100:.2f}%)")
|
|
96
|
-
print(f"SHA256: {sha256_unique}/{len(test_items)} ({sha256_unique/len(test_items)*100:.2f}%)")
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
if __name__ == '__main__':
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding:UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
SHA256 vs MD5 性能对比测试
|
|
5
|
+
=====================
|
|
6
|
+
测试在爬虫场景中两种哈希算法的性能差异
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import time
|
|
11
|
+
from collections import namedtuple
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# 创建测试数据
|
|
15
|
+
TestItem = namedtuple('TestItem', ['title', 'url', 'content', 'price', 'tags'])
|
|
16
|
+
|
|
17
|
+
def create_test_items(count=10000):
|
|
18
|
+
"""创建测试数据项"""
|
|
19
|
+
items = []
|
|
20
|
+
for i in range(count):
|
|
21
|
+
item = TestItem(
|
|
22
|
+
title=f"Test Title {i}",
|
|
23
|
+
url=f"https://example.com/page/{i}",
|
|
24
|
+
content=f"This is test content number {i} with some additional text to make it longer",
|
|
25
|
+
price=99.99 + i,
|
|
26
|
+
tags=[f"tag{j}" for j in range(5)]
|
|
27
|
+
)
|
|
28
|
+
items.append(item)
|
|
29
|
+
return items
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def md5_fingerprint(data):
|
|
33
|
+
"""使用MD5生成指纹"""
|
|
34
|
+
if hasattr(data, '_asdict'):
|
|
35
|
+
data_dict = data._asdict()
|
|
36
|
+
else:
|
|
37
|
+
data_dict = {'__data__': str(data)}
|
|
38
|
+
|
|
39
|
+
sorted_items = sorted(data_dict.items())
|
|
40
|
+
fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
|
|
41
|
+
return hashlib.md5(fingerprint_string.encode('utf-8')).hexdigest()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def sha256_fingerprint(data):
|
|
45
|
+
"""使用SHA256生成指纹"""
|
|
46
|
+
if hasattr(data, '_asdict'):
|
|
47
|
+
data_dict = data._asdict()
|
|
48
|
+
else:
|
|
49
|
+
data_dict = {'__data__': str(data)}
|
|
50
|
+
|
|
51
|
+
sorted_items = sorted(data_dict.items())
|
|
52
|
+
fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
|
|
53
|
+
return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def performance_test():
|
|
57
|
+
"""性能测试"""
|
|
58
|
+
print("开始哈希算法性能测试...")
|
|
59
|
+
print("=" * 50)
|
|
60
|
+
|
|
61
|
+
# 创建测试数据
|
|
62
|
+
test_items = create_test_items(10000)
|
|
63
|
+
|
|
64
|
+
# 测试MD5性能
|
|
65
|
+
start_time = time.time()
|
|
66
|
+
md5_results = []
|
|
67
|
+
for item in test_items:
|
|
68
|
+
fingerprint = md5_fingerprint(item)
|
|
69
|
+
md5_results.append(fingerprint)
|
|
70
|
+
md5_time = time.time() - start_time
|
|
71
|
+
|
|
72
|
+
# 测试SHA256性能
|
|
73
|
+
start_time = time.time()
|
|
74
|
+
sha256_results = []
|
|
75
|
+
for item in test_items:
|
|
76
|
+
fingerprint = sha256_fingerprint(item)
|
|
77
|
+
sha256_results.append(fingerprint)
|
|
78
|
+
sha256_time = time.time() - start_time
|
|
79
|
+
|
|
80
|
+
# 输出结果
|
|
81
|
+
print(f"测试数据量: {len(test_items)} 条")
|
|
82
|
+
print(f"MD5 耗时: {md5_time:.4f} 秒")
|
|
83
|
+
print(f"SHA256 耗时: {sha256_time:.4f} 秒")
|
|
84
|
+
print(f"性能差异: {((sha256_time - md5_time) / md5_time * 100):.2f}%")
|
|
85
|
+
|
|
86
|
+
# 验证结果一致性
|
|
87
|
+
print("\n验证指纹长度:")
|
|
88
|
+
print(f"MD5 指纹长度: {len(md5_results[0])} 字符")
|
|
89
|
+
print(f"SHA256 指纹长度: {len(sha256_results[0])} 字符")
|
|
90
|
+
|
|
91
|
+
# 检查是否有重复指纹(理论上不应该有)
|
|
92
|
+
md5_unique = len(set(md5_results))
|
|
93
|
+
sha256_unique = len(set(sha256_results))
|
|
94
|
+
print(f"\n唯一指纹数量:")
|
|
95
|
+
print(f"MD5: {md5_unique}/{len(test_items)} ({md5_unique/len(test_items)*100:.2f}%)")
|
|
96
|
+
print(f"SHA256: {sha256_unique}/{len(test_items)} ({sha256_unique/len(test_items)*100:.2f}%)")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
if __name__ == '__main__':
|
|
100
100
|
performance_test()
|