crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.7-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
@@ -1,273 +1,273 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Test the compatibility of the proxy middleware with the three main downloaders in the Crawlo framework:
- aiohttp_downloader
- httpx_downloader
- curl_cffi_downloader
"""

import asyncio
import sys
import os

# Add the project root directory to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.downloader.aiohttp_downloader import AioHttpDownloader
from crawlo.downloader.httpx_downloader import HttpXDownloader
from crawlo.downloader.cffi_downloader import CurlCffiDownloader
from crawlo.middleware.proxy import ProxyMiddleware
from crawlo.network.request import Request
from crawlo.settings.setting_manager import SettingManager


class MockSpider:
    """Mock spider class"""
    def __init__(self, crawler):
        self.crawler = crawler


class MockCrawler:
    """Mock crawler instance"""
    def __init__(self, settings):
        self.settings = settings
        self.spider = MockSpider(self)  # add a spider attribute


def create_test_settings(proxy_url=None, proxy_list=None):
    """Create test settings"""
    settings = SettingManager()
    settings.set("LOG_LEVEL", "DEBUG")
    settings.set("DOWNLOAD_TIMEOUT", 30)
    settings.set("CONNECTION_POOL_LIMIT", 100)
    settings.set("CONNECTION_POOL_LIMIT_PER_HOST", 20)
    settings.set("DOWNLOAD_MAXSIZE", 10 * 1024 * 1024)
    settings.set("VERIFY_SSL", True)

    # Proxy-related settings
    if proxy_url:
        # Advanced proxy configuration (for ProxyMiddleware)
        # The middleware is enabled automatically once a proxy API URL is configured
        settings.set("PROXY_API_URL", proxy_url)
    elif proxy_list:
        # Proxy configuration (for ProxyMiddleware)
        # The middleware is enabled automatically once a proxy list is configured
        settings.set("PROXY_LIST", proxy_list)

    return settings


async def test_aiohttp_with_proxy(proxy_url, target_url):
    """Test the aiohttp downloader with a proxy"""
    print(f"\n=== Testing the aiohttp downloader with a proxy ===")
    print(f"Proxy URL: {proxy_url}")
    print(f"Target URL: {target_url}")

    try:
        # Create settings
        settings = create_test_settings(proxy_url=proxy_url)
        crawler = MockCrawler(settings)

        # Create the downloader
        downloader = AioHttpDownloader(crawler)
        downloader.open()

        # Create the proxy middleware
        from crawlo.middleware.proxy import ProxyMiddleware
        proxy_middleware = ProxyMiddleware(settings, "DEBUG")

        # Create the request
        request = Request(url=target_url)

        # Create the mock spider
        spider = MockSpider(crawler)

        # Run the request through the proxy middleware
        await proxy_middleware.process_request(request, spider)

        if request.proxy:
            print(f"✓ Proxy set successfully: {request.proxy}")
        else:
            print("Proxy not set")

        # Attempt the download
        try:
            response = await downloader.download(request)
            if response and response.status_code:
                print(f"✓ Download succeeded, status code: {response.status_code}")
                # Only check the status code to avoid encoding issues
                return True
            else:
                print("✗ Download failed, empty response")
                return False
        except Exception as e:
            print(f"✗ Error during download: {e}")
            return False

    except Exception as e:
        print(f"✗ Error while testing aiohttp: {e}")
        return False
    finally:
        # Clean up resources
        try:
            await downloader.close()
            await proxy_middleware.close()
        except:
            pass


async def test_httpx_with_proxy_async(proxy_list, target_url):
    """Test the httpx downloader with a proxy"""
    print(f"\n=== Testing the httpx downloader with a proxy ===")
    print(f"Proxy list: {proxy_list}")
    print(f"Target URL: {target_url}")

    try:
        # Create settings
        settings = create_test_settings(proxy_list=proxy_list)
        crawler = MockCrawler(settings)

        # Create the downloader
        downloader = HttpXDownloader(crawler)
        downloader.open()

        # Create the proxy middleware
        from crawlo.middleware.simple_proxy import SimpleProxyMiddleware
        proxy_middleware = SimpleProxyMiddleware(settings, "DEBUG")

        # Create the request
        request = Request(url=target_url)

        # Create the mock spider
        spider = MockSpider(crawler)

        # Run the request through the proxy middleware
        await proxy_middleware.process_request(request, spider)

        if request.proxy:
            print(f"✓ Proxy set successfully: {request.proxy}")
        else:
            print("Proxy not set")

        # Attempt the download
        try:
            response = await downloader.download(request)
            if response and response.status_code:
                print(f"✓ Download succeeded, status code: {response.status_code}")
                # Only check the status code to avoid encoding issues
                return True
            else:
                print("✗ Download failed, empty response")
                return False
        except Exception as e:
            print(f"✗ Error during download: {e}")
            return False

    except Exception as e:
        print(f"✗ Error while testing httpx: {e}")
        return False
    finally:
        # Clean up resources
        try:
            await downloader.close()
        except:
            pass


async def test_curl_cffi_with_proxy_async(proxy_url, target_url):
    """Test the curl-cffi downloader with a proxy"""
    print(f"\n=== Testing the curl-cffi downloader with a proxy ===")
    print(f"Proxy URL: {proxy_url}")
    print(f"Target URL: {target_url}")

    try:
        # Create settings
        settings = create_test_settings(proxy_url=proxy_url)
        crawler = MockCrawler(settings)

        # Create the downloader
        downloader = CurlCffiDownloader(crawler)
        downloader.open()

        # Create the proxy middleware
        proxy_middleware = ProxyMiddleware(settings, "DEBUG")

        # Create the request
        request = Request(url=target_url)

        # Create the mock spider
        spider = MockSpider(crawler)

        # Run the request through the proxy middleware
        await proxy_middleware.process_request(request, spider)

        if request.proxy:
            print(f"✓ Proxy set successfully: {request.proxy}")
        else:
            print("Proxy not set")

        # Attempt the download
        try:
            response = await downloader.download(request)
            if response and response.status_code:
                print(f"✓ Download succeeded, status code: {response.status_code}")
                # Only check the status code to avoid encoding issues
                return True
            else:
                print("✗ Download failed, empty response")
                return False
        except Exception as e:
            print(f"✗ Error during download: {e}")
            return False

    except Exception as e:
        print(f"✗ Error while testing curl-cffi: {e}")
        return False
    finally:
        # Clean up resources
        try:
            await downloader.close()
            await proxy_middleware.close()
        except:
            pass


async def main():
    """Main test function"""
    print("Starting compatibility tests of the proxy middleware with the three downloaders...")

    # Use a test proxy URL (a public test proxy is used here)
    # Note: in real use, replace this with a valid proxy URL
    test_proxy_url = "http://test.proxy.api:8080/proxy/getitem/"
    test_proxy_list = ["http://proxy1:8080", "http://proxy2:8080"]
    test_target_url = "https://httpbin.org/ip"  # a test site that returns IP information

    print(f"Test proxy API: {test_proxy_url}")
    print(f"Test proxy list: {test_proxy_list}")
    print(f"Test target URL: {test_target_url}")

    # Test the aiohttp downloader (advanced proxy)
    aiohttp_result = await test_aiohttp_with_proxy(test_proxy_url, test_target_url)

    # Test the httpx downloader (simple proxy)
    httpx_result = await test_httpx_with_proxy_async(test_proxy_list, test_target_url)

    # Test the curl-cffi downloader (advanced proxy)
    curl_cffi_result = await test_curl_cffi_with_proxy_async(test_proxy_url, test_target_url)

    # Summarize the results
    print("\n" + "="*50)
    print("Test result summary:")
    print(f"aiohttp downloader (advanced proxy): {'✓ passed' if aiohttp_result else '✗ failed'}")
    print(f"httpx downloader (simple proxy): {'✓ passed' if httpx_result else '✗ failed'}")
    print(f"curl-cffi downloader (advanced proxy): {'✓ passed' if curl_cffi_result else '✗ failed'}")

    overall_result = all([aiohttp_result, httpx_result, curl_cffi_result])
    print(f"\nOverall result: {'✓ all downloaders work with the proxy middleware' if overall_result else '✗ some downloaders are not compatible'}")

    return overall_result


if __name__ == "__main__":
    asyncio.run(main())
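As the comments in the file above note, the proxy middleware is driven entirely by settings: configuring either PROXY_API_URL or PROXY_LIST is enough to activate it. The following is a minimal sketch of that wiring outside the test harness, assuming the same constructor and process_request usage shown above; the proxy API URL, the commented-out proxy list, and the stand-in crawler/spider classes are placeholders for illustration, not part of the package.

import asyncio

from crawlo.middleware.proxy import ProxyMiddleware
from crawlo.network.request import Request
from crawlo.settings.setting_manager import SettingManager


class _Spider:
    """Minimal spider stand-in, mirroring MockSpider above."""
    def __init__(self, crawler):
        self.crawler = crawler


class _Crawler:
    """Minimal crawler stand-in, mirroring MockCrawler above."""
    def __init__(self, settings):
        self.settings = settings
        self.spider = _Spider(self)


async def main():
    settings = SettingManager()
    # Either key activates the middleware; both values below are placeholders.
    settings.set("PROXY_API_URL", "http://proxy.example.com/api/get")
    # settings.set("PROXY_LIST", ["http://proxy1:8080", "http://proxy2:8080"])

    crawler = _Crawler(settings)
    middleware = ProxyMiddleware(settings, "DEBUG")
    request = Request(url="https://httpbin.org/ip")

    # In a real crawl the middleware manager calls this hook; here it is invoked directly.
    await middleware.process_request(request, crawler.spider)
    print(request.proxy)  # the proxy assigned by the middleware, if any


if __name__ == "__main__":
    asyncio.run(main())

In a real project these keys would normally live in the project's settings module, and the middleware chain would apply process_request to each outgoing request automatically.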