crawlo-1.3.4-py3-none-any.whl → crawlo-1.3.6-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
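A diff like the one below can be reproduced locally from the two published wheels using only the standard library, since a wheel is just a zip archive. The sketch below is illustrative, not part of the release; the wheel filenames are assumptions, and the files can be fetched first with e.g. `pip download crawlo==1.3.4 --no-deps`.

```python
import difflib
import zipfile

OLD = "crawlo-1.3.4-py3-none-any.whl"  # assumed local filenames
NEW = "crawlo-1.3.6-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old, zipfile.ZipFile(NEW) as new:
    old_names, new_names = set(old.namelist()), set(new.namelist())
    # Files present in only one of the two wheels (e.g. each RECORD file)
    for name in sorted(new_names - old_names):
        print(f"added:   {name}")
    for name in sorted(old_names - new_names):
        print(f"removed: {name}")
    # Unified diff for every file the wheels have in common
    for name in sorted(old_names & new_names):
        a = old.read(name).decode("utf-8", errors="replace").splitlines()
        b = new.read(name).decode("utf-8", errors="replace").splitlines()
        diff = list(difflib.unified_diff(a, b, f"old/{name}", f"new/{name}", lineterm=""))
        if diff:
            print("\n".join(diff))
```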
- crawlo/__init__.py +87 -87
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +45 -45
- crawlo/core/engine.py +439 -439
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +257 -257
- crawlo/crawler.py +638 -638
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +228 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -291
- crawlo/initialization/__init__.py +39 -39
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +37 -37
- crawlo/logging/config.py +96 -96
- crawlo/logging/factory.py +128 -128
- crawlo/logging/manager.py +111 -111
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +212 -212
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -318
- crawlo/pipelines/pipeline_manager.py +76 -76
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +327 -327
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +503 -503
- crawlo/queue/redis_priority_queue.py +326 -326
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +321 -321
- crawlo/settings/setting_manager.py +214 -214
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +167 -167
- crawlo/templates/project/settings_distributed.py.tmpl +166 -166
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +80 -44
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.4.dist-info → crawlo-1.3.6.dist-info}/METADATA +1126 -1126
- crawlo-1.3.6.dist-info/RECORD +290 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +127 -0
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +234 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +261 -0
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +143 -0
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +112 -0
- tests/log_generation_timing_test.py +154 -0
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +138 -0
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +42 -0
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +178 -0
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +84 -0
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_system.py +283 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_queue_type.py +107 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.3.4.dist-info/RECORD +0 -278
- {crawlo-1.3.4.dist-info → crawlo-1.3.6.dist-info}/WHEEL +0 -0
- {crawlo-1.3.4.dist-info → crawlo-1.3.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.4.dist-info → crawlo-1.3.6.dist-info}/top_level.txt +0 -0
tests/distributed_test.py
CHANGED
@@ -1,67 +1,67 @@

The entire file is rewritten: lines 1-66 are removed and re-added with identical content (consistent with a whitespace or line-ending change), and only the final `main()` call on line 67 is carried over unchanged. File content:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Distributed-mode test script
"""

import sys
import os
import asyncio
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.spider import Spider
from crawlo import Request


class DistributedTestSpider(Spider):
    """Distributed test spider"""
    name = 'distributed_test_spider'

    def start_requests(self):
        """Issue the test requests"""
        # Generate a few test requests
        for i in range(5):
            yield Request(f'https://httpbin.org/get?page={i}', callback=self.parse)

    def parse(self, response):
        """Parse the response"""
        print(f"Fetched response: {response.url}")
        print(f"Status code: {response.status_code}")
        return []


async def test_distributed_mode():
    """Test distributed mode"""
    print("Starting distributed-mode test...")

    # Initialize the framework in distributed mode
    from crawlo.initialization import initialize_framework
    custom_settings = {
        'RUN_MODE': 'distributed',
        'QUEUE_TYPE': 'redis',
        'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
        'REDIS_HOST': '127.0.0.1',
        'REDIS_PORT': 6379,
        'REDIS_DB': 15,  # use the test database
        'PROJECT_NAME': 'distributed_test'
    }
    settings = initialize_framework(custom_settings)

    # Create the crawler process
    from crawlo.crawler import CrawlerProcess
    process = CrawlerProcess(settings=settings)

    # Run the spider
    await process.crawl(DistributedTestSpider)

    print("Distributed-mode test finished!")


def main():
    """Main entry point"""
    print("Starting distributed-mode test...")
    asyncio.run(test_distributed_mode())


if __name__ == "__main__":
    main()
```
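The test above needs a reachable Redis instance on localhost DB 15. A quick preflight check can save a confusing failure; this is a sketch assuming the `redis` client package, which is separate from crawlo itself:

```python
# Preflight sketch: confirm the Redis instance the distributed test
# expects (127.0.0.1:6379, DB 15) is actually reachable.
import redis  # pip install redis

r = redis.Redis(host="127.0.0.1", port=6379, db=15)
try:
    r.ping()
    print("Redis reachable; keys in test DB 15:", r.dbsize())
except redis.exceptions.ConnectionError as exc:
    print(f"Redis not reachable, the distributed test would fail: {exc}")
```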
tests/distributed_test_debug.py
CHANGED
@@ -1,77 +1,77 @@

As above, lines 1-76 are removed and re-added with identical content, and only the final `main()` call on line 77 is carried over unchanged. File content:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Distributed-mode debug test script
"""

import sys
import os
import asyncio
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.spider import Spider
from crawlo import Request


class DistributedTestSpider(Spider):
    """Distributed test spider"""
    name = 'distributed_test_spider'

    def start_requests(self):
        """Issue the test requests"""
        # Generate a few test requests
        for i in range(3):
            yield Request(f'https://httpbin.org/get?page={i}', callback=self.parse)

    def parse(self, response):
        """Parse the response"""
        print(f"Fetched response: {response.url}")
        print(f"Status code: {response.status_code}")
        return []


async def test_distributed_mode():
    """Test distributed mode"""
    print("Starting distributed-mode test...")

    # Initialize the framework in distributed mode
    from crawlo.initialization import initialize_framework
    custom_settings = {
        'RUN_MODE': 'distributed',
        'QUEUE_TYPE': 'redis',
        'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
        'REDIS_HOST': '127.0.0.1',
        'REDIS_PORT': 6379,
        'REDIS_DB': 15,  # use the test database
        'PROJECT_NAME': 'distributed_test'
    }

    print("Custom settings:")
    for key, value in custom_settings.items():
        print(f"  {key}: {value}")

    settings = initialize_framework(custom_settings)

    print("Settings after initialization:")
    print(f"  RUN_MODE: {settings.get('RUN_MODE')}")
    print(f"  QUEUE_TYPE: {settings.get('QUEUE_TYPE')}")
    print(f"  FILTER_CLASS: {settings.get('FILTER_CLASS')}")

    # Create the crawler process
    from crawlo.crawler import CrawlerProcess
    process = CrawlerProcess(settings=settings)

    # Run the spider
    await process.crawl(DistributedTestSpider)

    print("Distributed-mode test finished!")


def main():
    """Main entry point"""
    print("Starting distributed-mode debug test...")
    asyncio.run(test_distributed_mode())


if __name__ == "__main__":
    main()
```
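Both scripts point at Redis DB 15. If the configured AioRedisFilter persists its dedup fingerprints there (an assumption based on the filter's name and the `FILTER_CLASS` setting; the key layout is not shown in this diff), a second run could see all of its requests filtered out as duplicates. A sketch of flushing the test database between runs:

```python
# Cleanup sketch; assumption: stale dedup state from a previous run
# lives in DB 15. FLUSHDB is destructive, so this is only safe because
# DB 15 is used exclusively for these tests.
import redis

redis.Redis(host="127.0.0.1", port=6379, db=15).flushdb()
```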