crawlo-1.4.7-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
tests/test_middleware_debug.py
CHANGED
@@ -1,142 +1,142 @@
All 141 lines of the file are removed and re-added without any textual change; only the final line 142 (`    main()`) appears as unchanged context. The content on both sides of the diff is:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Debug the middleware User-Agent randomness issue.
"""

import sys
import os
import random
from unittest.mock import Mock, patch

# Add the project root directory to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.middleware.default_header import DefaultHeaderMiddleware
from crawlo.settings.setting_manager import SettingManager
from crawlo.data.user_agents import get_user_agents


class MockLogger:
    """Mock logger class for testing log output."""
    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        self.logs = []

    def debug(self, msg):
        self.logs.append(('debug', msg))
        print(f"DEBUG: {msg}")

    def info(self, msg):
        self.logs.append(('info', msg))

    def warning(self, msg):
        self.logs.append(('warning', msg))

    def error(self, msg):
        self.logs.append(('error', msg))

    def isEnabledFor(self, level):
        return True


def debug_middleware_initialization():
    """Debug the middleware initialization process."""
    print("=== Debugging middleware initialization ===")

    settings = SettingManager()
    settings.set('DEFAULT_REQUEST_HEADERS', {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    })
    settings.set('RANDOM_USER_AGENT_ENABLED', True)
    settings.set('LOG_LEVEL', 'DEBUG')
    settings.set('RANDOMNESS', True)

    crawler = Mock()
    crawler.settings = settings

    logger = MockLogger('DefaultHeaderMiddleware')
    with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
        middleware = DefaultHeaderMiddleware.create_instance(crawler)

    print(f"Random User-Agent enabled: {middleware.random_user_agent_enabled}")
    print(f"Number of User-Agents: {len(middleware.user_agents)}")
    print(f"User-Agent device type: {middleware.user_agent_device_type}")

    # Inspect the first few User-Agents
    print("First 5 User-Agents:")
    for i, ua in enumerate(middleware.user_agents[:5]):
        print(f"  {i+1}. {ua[:50]}...")

    # Exercise the _get_random_user_agent method
    print("\nTesting _get_random_user_agent:")
    for i in range(10):
        ua = middleware._get_random_user_agent()
        print(f"  {i+1}. {ua[:50]}...")


def test_multiple_middleware_instances():
    """Test randomness across multiple middleware instances."""
    print("\n=== Testing randomness across multiple middleware instances ===")

    ua_values = []

    for i in range(10):
        settings = SettingManager()
        settings.set('DEFAULT_REQUEST_HEADERS', {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        })
        settings.set('RANDOM_USER_AGENT_ENABLED', True)
        settings.set('LOG_LEVEL', 'DEBUG')
        settings.set('RANDOMNESS', True)

        crawler = Mock()
        crawler.settings = settings

        logger = MockLogger('DefaultHeaderMiddleware')
        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
            middleware = DefaultHeaderMiddleware.create_instance(crawler)

        # Get a random User-Agent
        ua = middleware._get_random_user_agent()
        if ua:
            ua_values.append(ua)
            print(f"  Instance {i+1}: {ua[:50]}...")

    unique_uas = set(ua_values)
    print(f"\nGenerated {len(ua_values)} User-Agents, {len(unique_uas)} of them unique")


def check_user_agents_module():
    """Check the user_agents module."""
    print("\n=== Checking the user_agents module ===")

    # Fetch User-Agents for different device types
    device_types = ["all", "desktop", "mobile", "chrome", "firefox"]

    for device_type in device_types:
        uas = get_user_agents(device_type)
        print(f"Number of {device_type} User-Agents: {len(uas)}")
        if uas:
            print(f"  Example: {uas[0][:50]}...")


def main():
    print("Starting to debug the middleware User-Agent randomness issue...")

    try:
        debug_middleware_initialization()
        test_multiple_middleware_instances()
        check_user_agents_module()

        print("\nDebugging finished!")

    except Exception as e:
        print(f"\nAn error occurred during debugging: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
tests/test_mode_consistency.py
CHANGED
@@ -1,52 +1,52 @@
All 51 lines of the file are removed and re-added without any textual change; only the final line 52 (`    asyncio.run(test_mode_consistency())`) appears as unchanged context. The content on both sides of the diff is:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test the run-mode consistency hints.
"""
import asyncio
import sys
import os

# Add the project root directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.crawler import CrawlerProcess
from crawlo.spider import Spider
from crawlo import Request


class TestSpider(Spider):
    name = "test_mode_spider"

    def start_requests(self):
        yield Request("https://httpbin.org/get")

    def parse(self, response):
        yield {"url": response.url, "status": response.status_code}  # fix: use status_code instead of status


async def test_mode_consistency():
    """Test the run-mode consistency hints."""
    print("Testing run-mode consistency hints...")

    try:
        # Create the crawler process
        process = CrawlerProcess()

        # Add the spider
        await process.crawl(TestSpider)

        print("Mode consistency test finished")

    except Exception as e:
        print(f"Test failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    # Set the log level
    import logging
    logging.basicConfig(level=logging.INFO)

    asyncio.run(test_mode_consistency())
tests/test_multi_directory.py
CHANGED
@@ -1,68 +1,68 @@
All 67 lines of the file are removed and re-added without any textual change; only the final line 68 (`        sys.exit(1)`) appears as unchanged context. The content on both sides of the diff is:

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Test support for multiple spider directories.
"""
import sys
import os

# Add the project root directory to the Python path
sys.path.insert(0, os.path.dirname(__file__))

# Add ofweek_standalone to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))

def test_multiple_spider_directories():
    """Test support for multiple spider directories."""
    print("Testing support for multiple spider directories...")

    # Import the settings
    import examples.ofweek_standalone.ofweek_standalone.settings as settings_module

    # Create the settings manager
    from crawlo.settings.setting_manager import SettingManager
    settings = SettingManager()
    settings.set_settings(settings_module)

    # Check the SPIDER_MODULES setting
    spider_modules = settings.get('SPIDER_MODULES')
    print(f"SPIDER_MODULES setting: {spider_modules}")

    # Create a CrawlerProcess instance
    from crawlo.crawler import CrawlerProcess
    process = CrawlerProcess(settings=settings)

    # Check which spiders were registered
    spider_names = process.get_spider_names()
    print(f"Registered spiders: {spider_names}")

    # Verify that the expected spiders are registered
    expected_spiders = ['of_week_standalone', 'test_spider']
    registered_spiders = []

    for spider_name in expected_spiders:
        if spider_name in spider_names:
            print(f"✅ Success: spider '{spider_name}' is registered")
            registered_spiders.append(spider_name)
        else:
            print(f"❌ Failure: spider '{spider_name}' was not found")

    if len(registered_spiders) == len(expected_spiders):
        print(f"🎉 All spiders were registered successfully!")
        return True
    else:
        print(f"⚠️ Some spiders were not registered: {set(expected_spiders) - set(registered_spiders)}")
        return False


if __name__ == '__main__':
    print("Starting the test of multiple spider directory support...\n")

    success = test_multiple_spider_directories()

    if success:
        print("\n🎉 Test passed!")
        sys.exit(0)
    else:
        print("\n❌ Test failed!")
        sys.exit(1)