crawlo-1.3.3-py3-none-any.whl → crawlo-1.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +87 -63
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -323
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +46 -2
- crawlo/core/engine.py +439 -365
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +257 -256
- crawlo/crawler.py +639 -1167
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +228 -226
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +61 -52
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +28 -0
- crawlo/factories/base.py +69 -0
- crawlo/factories/crawler.py +104 -0
- crawlo/factories/registry.py +85 -0
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -234
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -0
- crawlo/initialization/__init__.py +40 -0
- crawlo/initialization/built_in.py +426 -0
- crawlo/initialization/context.py +142 -0
- crawlo/initialization/core.py +194 -0
- crawlo/initialization/phases.py +149 -0
- crawlo/initialization/registry.py +146 -0
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -22
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +38 -0
- crawlo/logging/config.py +97 -0
- crawlo/logging/factory.py +129 -0
- crawlo/logging/manager.py +112 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +212 -187
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +318 -318
- crawlo/pipelines/pipeline_manager.py +76 -75
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +327 -325
- crawlo/queue/pqueue.py +43 -37
- crawlo/queue/queue_manager.py +503 -379
- crawlo/queue/redis_priority_queue.py +326 -306
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +321 -225
- crawlo/settings/setting_manager.py +214 -198
- crawlo/spider/__init__.py +657 -639
- crawlo/stats_collector.py +73 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +139 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +168 -267
- crawlo/templates/project/settings_distributed.py.tmpl +167 -180
- crawlo/templates/project/settings_gentle.py.tmpl +167 -61
- crawlo/templates/project/settings_high_performance.py.tmpl +168 -131
- crawlo/templates/project/settings_minimal.py.tmpl +66 -35
- crawlo/templates/project/settings_simple.py.tmpl +165 -102
- crawlo/templates/project/spiders/__init__.py.tmpl +10 -6
- crawlo/templates/run.py.tmpl +34 -38
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +10 -0
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +365 -0
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +26 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -124
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +44 -200
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -351
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/METADATA +1126 -1020
- crawlo-1.3.4.dist-info/RECORD +278 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +107 -107
- tests/baidu_performance_test.py +109 -0
- tests/baidu_test.py +60 -0
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +213 -0
- tests/comprehensive_test.py +82 -0
- tests/comprehensive_testing_summary.md +187 -0
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +70 -0
- tests/debug_framework_logger.py +85 -0
- tests/debug_log_levels.py +64 -0
- tests/debug_pipelines.py +66 -66
- tests/distributed_test.py +67 -0
- tests/distributed_test_debug.py +77 -0
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_command_test_report.md +0 -0
- tests/final_comprehensive_test.py +152 -0
- tests/final_validation_test.py +183 -0
- tests/framework_performance_test.py +203 -0
- tests/optimized_performance_test.py +212 -0
- tests/performance_comparison.py +246 -0
- tests/queue_blocking_test.py +114 -0
- tests/queue_test.py +90 -0
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +139 -0
- tests/scrapy_comparison/scrapy_test.py +134 -0
- tests/simple_command_test.py +120 -0
- tests/simple_crawlo_test.py +128 -0
- tests/simple_log_test.py +58 -0
- tests/simple_optimization_test.py +129 -0
- tests/simple_spider_test.py +50 -0
- tests/simple_test.py +48 -0
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +231 -0
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +179 -0
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +175 -0
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +80 -0
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +246 -0
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +253 -0
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +67 -0
- tests/test_framework_startup.py +65 -0
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +113 -0
- tests/test_large_scale_helper.py +236 -0
- tests/test_mode_change.py +73 -0
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +116 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +42 -0
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +139 -0
- tests/verify_debug.py +52 -0
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +112 -0
- crawlo-1.3.3.dist-info/RECORD +0 -219
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
- {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
- {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
tests/debug_pipelines.py CHANGED

@@ -1,67 +1,67 @@

The hunk removes and re-adds lines 1-66 with identical visible content (a whitespace- or line-ending-only change, as far as the diff shows); line 67, the call to debug_pipelines(), is unchanged context. Translated from the Chinese original, the file reads:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Debug pipeline configuration.
Inspect the actual result of merging the pipeline settings.
"""

import sys
import os

# Add the project root directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.settings.setting_manager import SettingManager

def debug_pipelines():
    """Debug the pipeline configuration."""
    print("Debugging pipeline configuration merge...")
    print("=" * 50)

    # User-defined configuration
    user_config = {
        'PIPELINES': [
            'myproject.pipelines.CustomPipeline',
        ]
    }

    settings = SettingManager(user_config)

    # Fetch the merged pipeline list
    pipelines = settings.get('PIPELINES')

    print("Merged pipeline list:")
    for i, pipeline in enumerate(pipelines):
        print(f"  {i}: {pipeline}")

    print()
    print("Default dedup pipeline:")
    dedup_pipeline = settings.get('DEFAULT_DEDUP_PIPELINE')
    print(f"  {dedup_pipeline}")

    print()
    print("Framework default pipelines:")
    default_pipelines = settings.get('PIPELINES', [])  # PIPELINES already contains the defaults
    # Strip the dedup pipeline from the merged list to get the framework defaults
    if dedup_pipeline:
        default_pipelines_without_dedup = [p for p in default_pipelines if p != dedup_pipeline]
        for i, pipeline in enumerate(default_pipelines_without_dedup):
            print(f"  {i}: {pipeline}")
    else:
        for i, pipeline in enumerate(default_pipelines):
            print(f"  {i}: {pipeline}")

    print()
    print("Custom pipelines:")
    custom_pipelines = settings.get('PIPELINES')
    # Strip the default pipelines from the merged list to get the custom ones
    default_pipelines_list = [
        'crawlo.pipelines.console_pipeline.ConsolePipeline',
        'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
    ]
    custom_pipelines_list = [p for p in custom_pipelines if p not in default_pipelines_list]
    for i, pipeline in enumerate(custom_pipelines_list):
        print(f"  {i}: {pipeline}")

if __name__ == "__main__":
    debug_pipelines()
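The script above leans on SettingManager merging the user-supplied PIPELINES entry into the framework defaults rather than replacing them. The same check, written as assertions instead of prints, is a minimal sketch that assumes only the merge behavior the script inspects:

from crawlo.settings.setting_manager import SettingManager

# User config contributes one extra pipeline; the merged PIPELINES list is
# expected to contain it alongside the framework defaults.
settings = SettingManager({'PIPELINES': ['myproject.pipelines.CustomPipeline']})
merged = settings.get('PIPELINES')
assert 'myproject.pipelines.CustomPipeline' in merged  # user entry survives the merge

# When a default dedup pipeline is configured, it should also appear in the
# merged list (the debug script filters it out to isolate the other defaults).
dedup = settings.get('DEFAULT_DEDUP_PIPELINE')
if dedup:
    assert dedup in merged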
tests/distributed_test.py ADDED

@@ -0,0 +1,67 @@

New file, translated from the Chinese original:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Distributed mode test script.
"""

import sys
import os
import asyncio
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.spider import Spider
from crawlo import Request


class DistributedTestSpider(Spider):
    """Distributed test spider."""
    name = 'distributed_test_spider'

    def start_requests(self):
        """Issue the test requests."""
        # Generate a few test requests
        for i in range(5):
            yield Request(f'https://httpbin.org/get?page={i}', callback=self.parse)

    def parse(self, response):
        """Parse the response."""
        print(f"Fetched response: {response.url}")
        print(f"Status code: {response.status_code}")
        return []


async def test_distributed_mode():
    """Test distributed mode."""
    print("Starting distributed mode test...")

    # Initialize the framework in distributed mode
    from crawlo.initialization import initialize_framework
    custom_settings = {
        'RUN_MODE': 'distributed',
        'QUEUE_TYPE': 'redis',
        'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
        'REDIS_HOST': '127.0.0.1',
        'REDIS_PORT': 6379,
        'REDIS_DB': 15,  # use a test database
        'PROJECT_NAME': 'distributed_test'
    }
    settings = initialize_framework(custom_settings)

    # Create the crawler process
    from crawlo.crawler import CrawlerProcess
    process = CrawlerProcess(settings=settings)

    # Run the spider
    await process.crawl(DistributedTestSpider)

    print("Distributed mode test finished!")


def main():
    """Entry point."""
    print("Starting distributed mode test...")
    asyncio.run(test_distributed_mode())


if __name__ == "__main__":
    main()
tests/distributed_test_debug.py ADDED

@@ -0,0 +1,77 @@

New file, translated from the Chinese original; it is a variant of the script above that issues three requests instead of five and prints the settings before and after framework initialization:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Distributed mode debug test script.
"""

import sys
import os
import asyncio
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.spider import Spider
from crawlo import Request


class DistributedTestSpider(Spider):
    """Distributed test spider."""
    name = 'distributed_test_spider'

    def start_requests(self):
        """Issue the test requests."""
        # Generate a few test requests
        for i in range(3):
            yield Request(f'https://httpbin.org/get?page={i}', callback=self.parse)

    def parse(self, response):
        """Parse the response."""
        print(f"Fetched response: {response.url}")
        print(f"Status code: {response.status_code}")
        return []


async def test_distributed_mode():
    """Test distributed mode."""
    print("Starting distributed mode test...")

    # Initialize the framework in distributed mode
    from crawlo.initialization import initialize_framework
    custom_settings = {
        'RUN_MODE': 'distributed',
        'QUEUE_TYPE': 'redis',
        'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
        'REDIS_HOST': '127.0.0.1',
        'REDIS_PORT': 6379,
        'REDIS_DB': 15,  # use a test database
        'PROJECT_NAME': 'distributed_test'
    }

    print("Custom settings:")
    for key, value in custom_settings.items():
        print(f"  {key}: {value}")

    settings = initialize_framework(custom_settings)

    print("Settings after initialization:")
    print(f"  RUN_MODE: {settings.get('RUN_MODE')}")
    print(f"  QUEUE_TYPE: {settings.get('QUEUE_TYPE')}")
    print(f"  FILTER_CLASS: {settings.get('FILTER_CLASS')}")

    # Create the crawler process
    from crawlo.crawler import CrawlerProcess
    process = CrawlerProcess(settings=settings)

    # Run the spider
    await process.crawl(DistributedTestSpider)

    print("Distributed mode test finished!")


def main():
    """Entry point."""
    print("Starting distributed mode debug test...")
    asyncio.run(test_distributed_mode())


if __name__ == "__main__":
    main()
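Both new scripts are self-contained entry points: they prepend the repository root to sys.path, so from a source checkout they should run directly as python tests/distributed_test.py or python tests/distributed_test_debug.py. Per their settings they expect a Redis server on 127.0.0.1:6379 and deliberately write into DB 15 as a scratch database, so avoid pointing them at an instance whose DB 15 holds real keys.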