crawlo 1.4.7-py3-none-any.whl → 1.4.8-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Potentially problematic release: this version of crawlo has been flagged as potentially problematic; see the registry's advisory for details.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
@@ -1,81 +1,81 @@ tests/test_multiple_spider_modules.py

All 80 body lines were deleted and re-added with identical text, so this hunk amounts to a whitespace/line-ending-only change; line 81 (sys.exit(1)) is the sole unchanged context line. The file, with its Chinese comments and messages translated:

    #!/usr/bin/env python3
    # -*- coding: UTF-8 -*-
    """
    Test support for multiple SPIDER_MODULES directories.
    """
    import sys
    import os
    import asyncio

    # Add the project root directory to the Python path
    sys.path.insert(0, os.path.dirname(__file__))

    # Add ofweek_standalone to the Python path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))

    from crawlo.crawler import CrawlerProcess
    from crawlo.spider import get_spider_names


    def test_multiple_spider_modules():
        """Test support for multiple SPIDER_MODULES directories."""
        print("Testing support for multiple SPIDER_MODULES directories...")

        # Simulate a SPIDER_MODULES configuration that spans multiple directories
        spider_modules = ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']

        # Create a CrawlerProcess instance
        process = CrawlerProcess(spider_modules=spider_modules)

        # Check whether any spiders were registered
        spider_names = process.get_spider_names()
        print(f"Registered spiders: {spider_names}")

        # Verify that the expected spider is registered
        expected_spider = 'of_week_standalone'
        if expected_spider in spider_names:
            print(f"✅ Success: spider '{expected_spider}' is registered")
            return True
        else:
            print(f"❌ Failure: spider '{expected_spider}' not found")
            return False


    def test_settings_with_multiple_spider_modules():
        """Test configuring multiple SPIDER_MODULES directories via settings."""
        print("\nTesting multiple SPIDER_MODULES directories configured via settings...")

        # Create a mock settings object
        class MockSettings:
            def get(self, key, default=None):
                if key == 'SPIDER_MODULES':
                    return ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
                return default

        settings = MockSettings()

        # Create a CrawlerProcess instance
        process = CrawlerProcess(settings=settings)

        # Check whether any spiders were registered
        spider_names = process.get_spider_names()
        print(f"Registered spiders: {spider_names}")

        return True


    if __name__ == '__main__':
        print("Starting tests for multiple SPIDER_MODULES directory support...\n")

        # Test passing multiple spider_modules explicitly
        success1 = test_multiple_spider_modules()

        # Test reading the multiple-spider_modules configuration from settings
        success2 = test_settings_with_multiple_spider_modules()

        if success1 and success2:
            print("\n🎉 All tests passed!")
            sys.exit(0)
        else:
            print("\n❌ Some tests failed!")
            sys.exit(1)
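For orientation, a minimal sketch of what such a configuration might look like in a project's settings.py. Only the SPIDER_MODULES key and its list-of-module-paths shape come from the test above; the module names below are hypothetical:

    # Hypothetical settings.py fragment: the test suggests SPIDER_MODULES
    # accepts a list of importable module paths, each scanned for spiders.
    SPIDER_MODULES = [
        'myproject.spiders',        # primary spider package (hypothetical name)
        'myproject.extra_spiders',  # a second package may be listed as well
    ]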
@@ -1,165 +1,165 @@ tests/test_mysql_pipeline_config.py

As above, lines 1-164 were deleted and re-added with identical text (a whitespace/line-ending-only change); line 165 (unittest.main()) is the unchanged context line. The file, with comments translated:

    # -*- coding: utf-8 -*-
    import asyncio
    import unittest
    from unittest.mock import Mock, patch

    from crawlo.pipelines.mysql_pipeline import BaseMySQLPipeline, AsyncmyMySQLPipeline, AiomysqlMySQLPipeline


    class TestMySQLPipelineConfig(unittest.TestCase):
        """Tests for the MySQL pipeline configuration."""

        def setUp(self):
            """Set up the test environment."""
            self.mock_crawler = Mock()
            self.mock_crawler.settings = Mock()
            self.mock_crawler.settings.get = Mock(return_value=None)
            self.mock_crawler.settings.get_int = Mock(return_value=100)
            self.mock_crawler.settings.get_bool = Mock(return_value=False)
            self.mock_crawler.subscriber = Mock()
            self.mock_crawler.subscriber.subscribe = Mock()

            # Mock spider object
            self.mock_spider = Mock()
            self.mock_spider.name = "test_spider"
            self.mock_spider.custom_settings = {}
            self.mock_spider.mysql_table = None
            self.mock_crawler.spider = self.mock_spider

        def test_default_config_values(self):
            """Test the default configuration values."""
            # Set default return values (mirroring the defaults in the settings file)
            self.mock_crawler.settings.get_bool = Mock(side_effect=lambda key, default: {
                'MYSQL_AUTO_UPDATE': False,
                'MYSQL_INSERT_IGNORE': False,
                'MYSQL_USE_BATCH': False
            }.get(key, default))

            self.mock_crawler.settings.get = Mock(side_effect=lambda key, default=None: {
                'MYSQL_UPDATE_COLUMNS': ()
            }.get(key, default))

            pipeline = AsyncmyMySQLPipeline(self.mock_crawler)

            # Verify the default configuration values
            self.assertEqual(pipeline.auto_update, False)
            self.assertEqual(pipeline.insert_ignore, False)
            self.assertEqual(pipeline.update_columns, ())

        def test_custom_config_values(self):
            """Test custom configuration values."""
            # Set custom configuration values
            self.mock_crawler.settings.get_bool = Mock(side_effect=lambda key, default: {
                'MYSQL_AUTO_UPDATE': True,
                'MYSQL_INSERT_IGNORE': True
            }.get(key, default))

            self.mock_crawler.settings.get = Mock(side_effect=lambda key, default=None: {
                'MYSQL_UPDATE_COLUMNS': ('updated_at', 'view_count')
            }.get(key, default))

            pipeline = AsyncmyMySQLPipeline(self.mock_crawler)

            # Verify the custom configuration values
            self.assertEqual(pipeline.auto_update, True)
            self.assertEqual(pipeline.insert_ignore, True)
            self.assertEqual(pipeline.update_columns, ('updated_at', 'view_count'))

        def test_sql_generation_with_config(self):
            """Test SQL generation driven by the configuration."""
            # Set the configuration
            self.mock_crawler.settings.get_bool = Mock(side_effect=lambda key, default: {
                'MYSQL_AUTO_UPDATE': True,
                'MYSQL_INSERT_IGNORE': False
            }.get(key, default))

            self.mock_crawler.settings.get = Mock(side_effect=lambda key, default=None: {
                'MYSQL_UPDATE_COLUMNS': ()
            }.get(key, default))

            pipeline = AsyncmyMySQLPipeline(self.mock_crawler)

            # Test data
            item_dict = {"id": 1, "name": "test"}

            async def test_async():
                with patch('crawlo.pipelines.mysql_pipeline.SQLBuilder.make_insert') as mock_make_insert:
                    mock_make_insert.return_value = "TEST SQL"

                    # Call the _make_insert_sql method
                    result = await pipeline._make_insert_sql(item_dict)

                    # Verify SQLBuilder.make_insert was called with the right arguments
                    mock_make_insert.assert_called_once()
                    call_args = mock_make_insert.call_args
                    self.assertEqual(call_args[1]['auto_update'], True)
                    self.assertEqual(call_args[1]['insert_ignore'], False)
                    self.assertEqual(call_args[1]['update_columns'], ())

            asyncio.run(test_async())

        def test_sql_generation_with_kwargs_override(self):
            """Test SQL generation with kwargs overriding the configuration."""
            # Set the configuration
            self.mock_crawler.settings.get_bool = Mock(side_effect=lambda key, default: {
                'MYSQL_AUTO_UPDATE': False,
                'MYSQL_INSERT_IGNORE': False
            }.get(key, default))

            self.mock_crawler.settings.get = Mock(side_effect=lambda key, default=None: {
                'MYSQL_UPDATE_COLUMNS': ()
            }.get(key, default))

            pipeline = AsyncmyMySQLPipeline(self.mock_crawler)

            # Test data
            item_dict = {"id": 1, "name": "test"}

            async def test_async():
                with patch('crawlo.pipelines.mysql_pipeline.SQLBuilder.make_insert') as mock_make_insert:
                    mock_make_insert.return_value = "TEST SQL"

                    # Call _make_insert_sql with explicit kwargs
                    result = await pipeline._make_insert_sql(
                        item_dict,
                        auto_update=True,
                        insert_ignore=True
                    )

                    # Verify that the kwargs override the configuration
                    mock_make_insert.assert_called_once()
                    call_args = mock_make_insert.call_args
                    self.assertEqual(call_args[1]['auto_update'], True)
                    self.assertEqual(call_args[1]['insert_ignore'], True)
                    self.assertEqual(call_args[1]['update_columns'], ())

            asyncio.run(test_async())

        def test_batch_config_passing(self):
            """Test that batch-mode configuration is passed through."""
            # Set the configuration
            self.mock_crawler.settings.get_bool = Mock(side_effect=lambda key, default: {
                'MYSQL_AUTO_UPDATE': True,
                'MYSQL_INSERT_IGNORE': False,
                'MYSQL_USE_BATCH': True
            }.get(key, default))

            self.mock_crawler.settings.get = Mock(side_effect=lambda key, default=None: {
                'MYSQL_UPDATE_COLUMNS': ('updated_at',)
            }.get(key, default))

            self.mock_crawler.settings.get_int = Mock(side_effect=lambda key, default=100: {
                'MYSQL_BATCH_SIZE': 2
            }.get(key, default))

            pipeline = AsyncmyMySQLPipeline(self.mock_crawler)

            # Verify the configuration was applied correctly
            self.assertEqual(pipeline.auto_update, True)
            self.assertEqual(pipeline.update_columns, ('updated_at',))
            self.assertEqual(pipeline.use_batch, True)
            self.assertEqual(pipeline.batch_size, 2)


    if __name__ == "__main__":
        unittest.main()
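The keys this test drives through the mocked settings object imply the following knobs. A minimal sketch of a settings fragment, assuming the names behave as the test implies; the key names come from the test, while the values and behavioural comments are assumptions:

    # Hypothetical settings fragment; key names are taken from the test above,
    # values and comments are assumptions rather than documented API.
    MYSQL_AUTO_UPDATE = True                # update existing rows on key conflict (assumed)
    MYSQL_INSERT_IGNORE = False             # skip duplicate rows on insert (assumed)
    MYSQL_UPDATE_COLUMNS = ('updated_at',)  # columns rewritten when auto-update fires
    MYSQL_USE_BATCH = True                  # buffer items and flush in batches
    MYSQL_BATCH_SIZE = 2                    # flush threshold used in the test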