crawlo 1.4.7__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
|
@@ -1,177 +1,177 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
专门测试User-Agent随机性功能
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import sys
|
|
8
|
-
import os
|
|
9
|
-
import random
|
|
10
|
-
from unittest.mock import Mock, patch
|
|
11
|
-
|
|
12
|
-
# 添加项目根目录到Python路径
|
|
13
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
14
|
-
|
|
15
|
-
from crawlo.middleware.default_header import DefaultHeaderMiddleware
|
|
16
|
-
from crawlo.settings.setting_manager import SettingManager
|
|
17
|
-
from crawlo.data.user_agents import get_random_user_agent
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class MockLogger:
|
|
21
|
-
"""Mock Logger 类,用于测试日志输出"""
|
|
22
|
-
def __init__(self, name, level=None):
|
|
23
|
-
self.name = name
|
|
24
|
-
self.level = level
|
|
25
|
-
self.logs = []
|
|
26
|
-
|
|
27
|
-
def debug(self, msg):
|
|
28
|
-
self.logs.append(('debug', msg))
|
|
29
|
-
|
|
30
|
-
def info(self, msg):
|
|
31
|
-
self.logs.append(('info', msg))
|
|
32
|
-
|
|
33
|
-
def warning(self, msg):
|
|
34
|
-
self.logs.append(('warning', msg))
|
|
35
|
-
|
|
36
|
-
def error(self, msg):
|
|
37
|
-
self.logs.append(('error', msg))
|
|
38
|
-
|
|
39
|
-
def isEnabledFor(self, level):
|
|
40
|
-
return True
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def test_user_agent_randomness():
|
|
44
|
-
"""测试User-Agent的随机性"""
|
|
45
|
-
print("=== 测试User-Agent的随机性 ===")
|
|
46
|
-
|
|
47
|
-
# 收集20次不同中间件实例生成的User-Agent
|
|
48
|
-
ua_values = []
|
|
49
|
-
|
|
50
|
-
for i in range(20):
|
|
51
|
-
# 每次都创建新的设置和中间件实例
|
|
52
|
-
settings = SettingManager()
|
|
53
|
-
settings.set('DEFAULT_REQUEST_HEADERS', {
|
|
54
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
55
|
-
})
|
|
56
|
-
settings.set('RANDOM_USER_AGENT_ENABLED', True)
|
|
57
|
-
settings.set('LOG_LEVEL', 'DEBUG')
|
|
58
|
-
settings.set('RANDOMNESS', True)
|
|
59
|
-
|
|
60
|
-
crawler = Mock()
|
|
61
|
-
crawler.settings = settings
|
|
62
|
-
|
|
63
|
-
logger = MockLogger('DefaultHeaderMiddleware')
|
|
64
|
-
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
65
|
-
try:
|
|
66
|
-
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
67
|
-
|
|
68
|
-
# 处理请求
|
|
69
|
-
request = Mock()
|
|
70
|
-
request.headers = {}
|
|
71
|
-
request.url = f'https://example.com/test{i}'
|
|
72
|
-
|
|
73
|
-
middleware.process_request(request, Mock())
|
|
74
|
-
|
|
75
|
-
if 'User-Agent' in request.headers:
|
|
76
|
-
ua_values.append(request.headers['User-Agent'])
|
|
77
|
-
|
|
78
|
-
except Exception as e:
|
|
79
|
-
print(f" 测试失败: {e}")
|
|
80
|
-
|
|
81
|
-
# 分析随机性
|
|
82
|
-
unique_uas = set(ua_values)
|
|
83
|
-
print(f" 生成了 {len(ua_values)} 个User-Agent")
|
|
84
|
-
print(f" 其中有 {len(unique_uas)} 个不同的User-Agent")
|
|
85
|
-
print(f" 示例: {list(unique_uas)[:5]}")
|
|
86
|
-
|
|
87
|
-
if len(unique_uas) > 1:
|
|
88
|
-
print(" ✅ User-Agent具有良好的随机性")
|
|
89
|
-
return True
|
|
90
|
-
else:
|
|
91
|
-
print(" ❌ User-Agent缺乏随机性")
|
|
92
|
-
return False
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def test_direct_function_randomness():
|
|
96
|
-
"""测试直接使用函数的随机性"""
|
|
97
|
-
print("\n=== 测试直接使用函数的随机性 ===")
|
|
98
|
-
|
|
99
|
-
# 收集20次调用的结果
|
|
100
|
-
ua_values = []
|
|
101
|
-
|
|
102
|
-
for i in range(20):
|
|
103
|
-
ua = get_random_user_agent()
|
|
104
|
-
ua_values.append(ua)
|
|
105
|
-
|
|
106
|
-
# 分析随机性
|
|
107
|
-
unique_uas = set(ua_values)
|
|
108
|
-
print(f" 生成了 {len(ua_values)} 个User-Agent")
|
|
109
|
-
print(f" 其中有 {len(unique_uas)} 个不同的User-Agent")
|
|
110
|
-
print(f" 示例: {list(unique_uas)[:5]}")
|
|
111
|
-
|
|
112
|
-
if len(unique_uas) > 1:
|
|
113
|
-
print(" ✅ 直接调用函数具有良好的随机性")
|
|
114
|
-
return True
|
|
115
|
-
else:
|
|
116
|
-
print(" ❌ 直接调用函数缺乏随机性")
|
|
117
|
-
return False
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def compare_approaches():
|
|
121
|
-
"""比较不同方法的优缺点"""
|
|
122
|
-
print("\n=== 比较不同方法的优缺点 ===")
|
|
123
|
-
|
|
124
|
-
print("方法1: 使用RANDOM_USER_AGENT_ENABLED")
|
|
125
|
-
print(" 优点:")
|
|
126
|
-
print(" ✓ 内置大量真实User-Agent")
|
|
127
|
-
print(" ✓ 支持设备类型分类")
|
|
128
|
-
print(" ✓ 配置简单")
|
|
129
|
-
print(" ✓ 专门优化")
|
|
130
|
-
print(" 缺点:")
|
|
131
|
-
print(" ✗ 仅限User-Agent")
|
|
132
|
-
|
|
133
|
-
print("\n方法2: 使用RANDOM_HEADERS")
|
|
134
|
-
print(" 优点:")
|
|
135
|
-
print(" ✓ 可以为任意头部添加随机值")
|
|
136
|
-
print(" ✓ 更加灵活")
|
|
137
|
-
print(" ✓ 适用于多种场景")
|
|
138
|
-
print(" 缺点:")
|
|
139
|
-
print(" ✗ 需要用户提供值列表")
|
|
140
|
-
print(" ✗ 配置相对复杂")
|
|
141
|
-
|
|
142
|
-
print("\n方法3: 直接使用get_random_user_agent()")
|
|
143
|
-
print(" 优点:")
|
|
144
|
-
print(" ✓ 最直接")
|
|
145
|
-
print(" ✓ 可编程控制")
|
|
146
|
-
print(" ✓ 无需中间件")
|
|
147
|
-
print(" 缺点:")
|
|
148
|
-
print(" ✗ 需要手动实现")
|
|
149
|
-
print(" ✗ 不如中间件方便")
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
def main():
|
|
153
|
-
print("开始测试User-Agent随机性功能...")
|
|
154
|
-
|
|
155
|
-
try:
|
|
156
|
-
# 运行所有测试
|
|
157
|
-
test1_result = test_user_agent_randomness()
|
|
158
|
-
test2_result = test_direct_function_randomness()
|
|
159
|
-
compare_approaches()
|
|
160
|
-
|
|
161
|
-
if test1_result and test2_result:
|
|
162
|
-
print("\n🎉 User-Agent随机性测试通过!")
|
|
163
|
-
print("\n结论:")
|
|
164
|
-
print(" 1. 现有的User-Agent功能具有良好的随机性")
|
|
165
|
-
print(" 2. 可以满足大多数反爬虫需求")
|
|
166
|
-
print(" 3. RANDOM_HEADERS参数提供了额外的灵活性,但不是必需的")
|
|
167
|
-
else:
|
|
168
|
-
print("\n❌ User-Agent随机性测试失败")
|
|
169
|
-
|
|
170
|
-
except Exception as e:
|
|
171
|
-
print(f"\n❌ 测试过程中发生错误: {e}")
|
|
172
|
-
import traceback
|
|
173
|
-
traceback.print_exc()
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
if __name__ == "__main__":
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
专门测试User-Agent随机性功能
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
import random
|
|
10
|
+
from unittest.mock import Mock, patch
|
|
11
|
+
|
|
12
|
+
# 添加项目根目录到Python路径
|
|
13
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
14
|
+
|
|
15
|
+
from crawlo.middleware.default_header import DefaultHeaderMiddleware
|
|
16
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
17
|
+
from crawlo.data.user_agents import get_random_user_agent
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MockLogger:
|
|
21
|
+
"""Mock Logger 类,用于测试日志输出"""
|
|
22
|
+
def __init__(self, name, level=None):
|
|
23
|
+
self.name = name
|
|
24
|
+
self.level = level
|
|
25
|
+
self.logs = []
|
|
26
|
+
|
|
27
|
+
def debug(self, msg):
|
|
28
|
+
self.logs.append(('debug', msg))
|
|
29
|
+
|
|
30
|
+
def info(self, msg):
|
|
31
|
+
self.logs.append(('info', msg))
|
|
32
|
+
|
|
33
|
+
def warning(self, msg):
|
|
34
|
+
self.logs.append(('warning', msg))
|
|
35
|
+
|
|
36
|
+
def error(self, msg):
|
|
37
|
+
self.logs.append(('error', msg))
|
|
38
|
+
|
|
39
|
+
def isEnabledFor(self, level):
|
|
40
|
+
return True
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_user_agent_randomness():
|
|
44
|
+
"""测试User-Agent的随机性"""
|
|
45
|
+
print("=== 测试User-Agent的随机性 ===")
|
|
46
|
+
|
|
47
|
+
# 收集20次不同中间件实例生成的User-Agent
|
|
48
|
+
ua_values = []
|
|
49
|
+
|
|
50
|
+
for i in range(20):
|
|
51
|
+
# 每次都创建新的设置和中间件实例
|
|
52
|
+
settings = SettingManager()
|
|
53
|
+
settings.set('DEFAULT_REQUEST_HEADERS', {
|
|
54
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
55
|
+
})
|
|
56
|
+
settings.set('RANDOM_USER_AGENT_ENABLED', True)
|
|
57
|
+
settings.set('LOG_LEVEL', 'DEBUG')
|
|
58
|
+
settings.set('RANDOMNESS', True)
|
|
59
|
+
|
|
60
|
+
crawler = Mock()
|
|
61
|
+
crawler.settings = settings
|
|
62
|
+
|
|
63
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
64
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
65
|
+
try:
|
|
66
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
67
|
+
|
|
68
|
+
# 处理请求
|
|
69
|
+
request = Mock()
|
|
70
|
+
request.headers = {}
|
|
71
|
+
request.url = f'https://example.com/test{i}'
|
|
72
|
+
|
|
73
|
+
middleware.process_request(request, Mock())
|
|
74
|
+
|
|
75
|
+
if 'User-Agent' in request.headers:
|
|
76
|
+
ua_values.append(request.headers['User-Agent'])
|
|
77
|
+
|
|
78
|
+
except Exception as e:
|
|
79
|
+
print(f" 测试失败: {e}")
|
|
80
|
+
|
|
81
|
+
# 分析随机性
|
|
82
|
+
unique_uas = set(ua_values)
|
|
83
|
+
print(f" 生成了 {len(ua_values)} 个User-Agent")
|
|
84
|
+
print(f" 其中有 {len(unique_uas)} 个不同的User-Agent")
|
|
85
|
+
print(f" 示例: {list(unique_uas)[:5]}")
|
|
86
|
+
|
|
87
|
+
if len(unique_uas) > 1:
|
|
88
|
+
print(" ✅ User-Agent具有良好的随机性")
|
|
89
|
+
return True
|
|
90
|
+
else:
|
|
91
|
+
print(" ❌ User-Agent缺乏随机性")
|
|
92
|
+
return False
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_direct_function_randomness():
|
|
96
|
+
"""测试直接使用函数的随机性"""
|
|
97
|
+
print("\n=== 测试直接使用函数的随机性 ===")
|
|
98
|
+
|
|
99
|
+
# 收集20次调用的结果
|
|
100
|
+
ua_values = []
|
|
101
|
+
|
|
102
|
+
for i in range(20):
|
|
103
|
+
ua = get_random_user_agent()
|
|
104
|
+
ua_values.append(ua)
|
|
105
|
+
|
|
106
|
+
# 分析随机性
|
|
107
|
+
unique_uas = set(ua_values)
|
|
108
|
+
print(f" 生成了 {len(ua_values)} 个User-Agent")
|
|
109
|
+
print(f" 其中有 {len(unique_uas)} 个不同的User-Agent")
|
|
110
|
+
print(f" 示例: {list(unique_uas)[:5]}")
|
|
111
|
+
|
|
112
|
+
if len(unique_uas) > 1:
|
|
113
|
+
print(" ✅ 直接调用函数具有良好的随机性")
|
|
114
|
+
return True
|
|
115
|
+
else:
|
|
116
|
+
print(" ❌ 直接调用函数缺乏随机性")
|
|
117
|
+
return False
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def compare_approaches():
|
|
121
|
+
"""比较不同方法的优缺点"""
|
|
122
|
+
print("\n=== 比较不同方法的优缺点 ===")
|
|
123
|
+
|
|
124
|
+
print("方法1: 使用RANDOM_USER_AGENT_ENABLED")
|
|
125
|
+
print(" 优点:")
|
|
126
|
+
print(" ✓ 内置大量真实User-Agent")
|
|
127
|
+
print(" ✓ 支持设备类型分类")
|
|
128
|
+
print(" ✓ 配置简单")
|
|
129
|
+
print(" ✓ 专门优化")
|
|
130
|
+
print(" 缺点:")
|
|
131
|
+
print(" ✗ 仅限User-Agent")
|
|
132
|
+
|
|
133
|
+
print("\n方法2: 使用RANDOM_HEADERS")
|
|
134
|
+
print(" 优点:")
|
|
135
|
+
print(" ✓ 可以为任意头部添加随机值")
|
|
136
|
+
print(" ✓ 更加灵活")
|
|
137
|
+
print(" ✓ 适用于多种场景")
|
|
138
|
+
print(" 缺点:")
|
|
139
|
+
print(" ✗ 需要用户提供值列表")
|
|
140
|
+
print(" ✗ 配置相对复杂")
|
|
141
|
+
|
|
142
|
+
print("\n方法3: 直接使用get_random_user_agent()")
|
|
143
|
+
print(" 优点:")
|
|
144
|
+
print(" ✓ 最直接")
|
|
145
|
+
print(" ✓ 可编程控制")
|
|
146
|
+
print(" ✓ 无需中间件")
|
|
147
|
+
print(" 缺点:")
|
|
148
|
+
print(" ✗ 需要手动实现")
|
|
149
|
+
print(" ✗ 不如中间件方便")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def main():
|
|
153
|
+
print("开始测试User-Agent随机性功能...")
|
|
154
|
+
|
|
155
|
+
try:
|
|
156
|
+
# 运行所有测试
|
|
157
|
+
test1_result = test_user_agent_randomness()
|
|
158
|
+
test2_result = test_direct_function_randomness()
|
|
159
|
+
compare_approaches()
|
|
160
|
+
|
|
161
|
+
if test1_result and test2_result:
|
|
162
|
+
print("\n🎉 User-Agent随机性测试通过!")
|
|
163
|
+
print("\n结论:")
|
|
164
|
+
print(" 1. 现有的User-Agent功能具有良好的随机性")
|
|
165
|
+
print(" 2. 可以满足大多数反爬虫需求")
|
|
166
|
+
print(" 3. RANDOM_HEADERS参数提供了额外的灵活性,但不是必需的")
|
|
167
|
+
else:
|
|
168
|
+
print("\n❌ User-Agent随机性测试失败")
|
|
169
|
+
|
|
170
|
+
except Exception as e:
|
|
171
|
+
print(f"\n❌ 测试过程中发生错误: {e}")
|
|
172
|
+
import traceback
|
|
173
|
+
traceback.print_exc()
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
if __name__ == "__main__":
|
|
177
177
|
main()
|
tests/test_user_agents.py
CHANGED
|
@@ -1,97 +1,97 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
测试User-Agent列表的功能
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import sys
|
|
8
|
-
import os
|
|
9
|
-
|
|
10
|
-
# 添加项目根目录到Python路径
|
|
11
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
-
|
|
13
|
-
from crawlo.data.user_agents import (
|
|
14
|
-
DESKTOP_USER_AGENTS,
|
|
15
|
-
MOBILE_USER_AGENTS,
|
|
16
|
-
BOT_USER_AGENTS,
|
|
17
|
-
CHROME_USER_AGENTS,
|
|
18
|
-
FIREFOX_USER_AGENTS,
|
|
19
|
-
SAFARI_USER_AGENTS,
|
|
20
|
-
EDGE_USER_AGENTS,
|
|
21
|
-
OPERA_USER_AGENTS,
|
|
22
|
-
ALL_USER_AGENTS,
|
|
23
|
-
USER_AGENTS_BY_TYPE,
|
|
24
|
-
get_user_agents,
|
|
25
|
-
get_random_user_agent
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def test_user_agent_counts():
|
|
30
|
-
"""测试User-Agent数量"""
|
|
31
|
-
print("=== User-Agent数量测试 ===")
|
|
32
|
-
print(f"桌面浏览器User-Agent数量: {len(DESKTOP_USER_AGENTS)}")
|
|
33
|
-
print(f"移动设备User-Agent数量: {len(MOBILE_USER_AGENTS)}")
|
|
34
|
-
print(f"爬虫User-Agent数量: {len(BOT_USER_AGENTS)}")
|
|
35
|
-
print(f"Chrome User-Agent数量: {len(CHROME_USER_AGENTS)}")
|
|
36
|
-
print(f"Firefox User-Agent数量: {len(FIREFOX_USER_AGENTS)}")
|
|
37
|
-
print(f"Safari User-Agent数量: {len(SAFARI_USER_AGENTS)}")
|
|
38
|
-
print(f"Edge User-Agent数量: {len(EDGE_USER_AGENTS)}")
|
|
39
|
-
print(f"Opera User-Agent数量: {len(OPERA_USER_AGENTS)}")
|
|
40
|
-
print(f"所有User-Agent数量: {len(ALL_USER_AGENTS)}")
|
|
41
|
-
print()
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def test_get_user_agents():
|
|
45
|
-
"""测试get_user_agents函数"""
|
|
46
|
-
print("=== get_user_agents函数测试 ===")
|
|
47
|
-
for device_type in ["desktop", "mobile", "bot", "all", "chrome", "firefox", "safari", "edge", "opera"]:
|
|
48
|
-
user_agents = get_user_agents(device_type)
|
|
49
|
-
print(f"{device_type}类型User-Agent数量: {len(user_agents)}")
|
|
50
|
-
print()
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def test_get_random_user_agent():
|
|
54
|
-
"""测试get_random_user_agent函数"""
|
|
55
|
-
print("=== get_random_user_agent函数测试 ===")
|
|
56
|
-
for device_type in ["desktop", "mobile", "all", "chrome", "firefox"]:
|
|
57
|
-
ua = get_random_user_agent(device_type)
|
|
58
|
-
print(f"{device_type}类型随机User-Agent: {ua[:100]}...")
|
|
59
|
-
print()
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def test_user_agents_content():
|
|
63
|
-
"""测试User-Agent内容"""
|
|
64
|
-
print("=== User-Agent内容测试 ===")
|
|
65
|
-
|
|
66
|
-
# 检查是否包含最新的浏览器版本
|
|
67
|
-
chrome_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Chrome/136" in ua)
|
|
68
|
-
firefox_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Firefox/136" in ua)
|
|
69
|
-
safari_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Version/18" in ua and "Safari" in ua)
|
|
70
|
-
|
|
71
|
-
print(f"包含Chrome 136的User-Agent数量: {chrome_ua_count}")
|
|
72
|
-
print(f"包含Firefox 136的User-Agent数量: {firefox_ua_count}")
|
|
73
|
-
print(f"包含Safari 18的User-Agent数量: {safari_ua_count}")
|
|
74
|
-
|
|
75
|
-
# 检查是否包含移动设备User-Agent
|
|
76
|
-
ios_ua_count = sum(1 for ua in ALL_USER_AGENTS if "iPhone" in ua or "iPad" in ua)
|
|
77
|
-
android_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Android" in ua)
|
|
78
|
-
|
|
79
|
-
print(f"包含iOS设备的User-Agent数量: {ios_ua_count}")
|
|
80
|
-
print(f"包含Android设备的User-Agent数量: {android_ua_count}")
|
|
81
|
-
print()
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def main():
|
|
85
|
-
"""主测试函数"""
|
|
86
|
-
print("开始测试User-Agent列表...\n")
|
|
87
|
-
|
|
88
|
-
test_user_agent_counts()
|
|
89
|
-
test_get_user_agents()
|
|
90
|
-
test_get_random_user_agent()
|
|
91
|
-
test_user_agents_content()
|
|
92
|
-
|
|
93
|
-
print("所有测试完成!")
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
if __name__ == "__main__":
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
测试User-Agent列表的功能
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
# 添加项目根目录到Python路径
|
|
11
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
+
|
|
13
|
+
from crawlo.data.user_agents import (
|
|
14
|
+
DESKTOP_USER_AGENTS,
|
|
15
|
+
MOBILE_USER_AGENTS,
|
|
16
|
+
BOT_USER_AGENTS,
|
|
17
|
+
CHROME_USER_AGENTS,
|
|
18
|
+
FIREFOX_USER_AGENTS,
|
|
19
|
+
SAFARI_USER_AGENTS,
|
|
20
|
+
EDGE_USER_AGENTS,
|
|
21
|
+
OPERA_USER_AGENTS,
|
|
22
|
+
ALL_USER_AGENTS,
|
|
23
|
+
USER_AGENTS_BY_TYPE,
|
|
24
|
+
get_user_agents,
|
|
25
|
+
get_random_user_agent
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_user_agent_counts():
|
|
30
|
+
"""测试User-Agent数量"""
|
|
31
|
+
print("=== User-Agent数量测试 ===")
|
|
32
|
+
print(f"桌面浏览器User-Agent数量: {len(DESKTOP_USER_AGENTS)}")
|
|
33
|
+
print(f"移动设备User-Agent数量: {len(MOBILE_USER_AGENTS)}")
|
|
34
|
+
print(f"爬虫User-Agent数量: {len(BOT_USER_AGENTS)}")
|
|
35
|
+
print(f"Chrome User-Agent数量: {len(CHROME_USER_AGENTS)}")
|
|
36
|
+
print(f"Firefox User-Agent数量: {len(FIREFOX_USER_AGENTS)}")
|
|
37
|
+
print(f"Safari User-Agent数量: {len(SAFARI_USER_AGENTS)}")
|
|
38
|
+
print(f"Edge User-Agent数量: {len(EDGE_USER_AGENTS)}")
|
|
39
|
+
print(f"Opera User-Agent数量: {len(OPERA_USER_AGENTS)}")
|
|
40
|
+
print(f"所有User-Agent数量: {len(ALL_USER_AGENTS)}")
|
|
41
|
+
print()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_get_user_agents():
|
|
45
|
+
"""测试get_user_agents函数"""
|
|
46
|
+
print("=== get_user_agents函数测试 ===")
|
|
47
|
+
for device_type in ["desktop", "mobile", "bot", "all", "chrome", "firefox", "safari", "edge", "opera"]:
|
|
48
|
+
user_agents = get_user_agents(device_type)
|
|
49
|
+
print(f"{device_type}类型User-Agent数量: {len(user_agents)}")
|
|
50
|
+
print()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_get_random_user_agent():
|
|
54
|
+
"""测试get_random_user_agent函数"""
|
|
55
|
+
print("=== get_random_user_agent函数测试 ===")
|
|
56
|
+
for device_type in ["desktop", "mobile", "all", "chrome", "firefox"]:
|
|
57
|
+
ua = get_random_user_agent(device_type)
|
|
58
|
+
print(f"{device_type}类型随机User-Agent: {ua[:100]}...")
|
|
59
|
+
print()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_user_agents_content():
|
|
63
|
+
"""测试User-Agent内容"""
|
|
64
|
+
print("=== User-Agent内容测试 ===")
|
|
65
|
+
|
|
66
|
+
# 检查是否包含最新的浏览器版本
|
|
67
|
+
chrome_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Chrome/136" in ua)
|
|
68
|
+
firefox_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Firefox/136" in ua)
|
|
69
|
+
safari_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Version/18" in ua and "Safari" in ua)
|
|
70
|
+
|
|
71
|
+
print(f"包含Chrome 136的User-Agent数量: {chrome_ua_count}")
|
|
72
|
+
print(f"包含Firefox 136的User-Agent数量: {firefox_ua_count}")
|
|
73
|
+
print(f"包含Safari 18的User-Agent数量: {safari_ua_count}")
|
|
74
|
+
|
|
75
|
+
# 检查是否包含移动设备User-Agent
|
|
76
|
+
ios_ua_count = sum(1 for ua in ALL_USER_AGENTS if "iPhone" in ua or "iPad" in ua)
|
|
77
|
+
android_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Android" in ua)
|
|
78
|
+
|
|
79
|
+
print(f"包含iOS设备的User-Agent数量: {ios_ua_count}")
|
|
80
|
+
print(f"包含Android设备的User-Agent数量: {android_ua_count}")
|
|
81
|
+
print()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def main():
|
|
85
|
+
"""主测试函数"""
|
|
86
|
+
print("开始测试User-Agent列表...\n")
|
|
87
|
+
|
|
88
|
+
test_user_agent_counts()
|
|
89
|
+
test_get_user_agents()
|
|
90
|
+
test_get_random_user_agent()
|
|
91
|
+
test_user_agents_content()
|
|
92
|
+
|
|
93
|
+
print("所有测试完成!")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
if __name__ == "__main__":
|
|
97
97
|
main()
|