crawlo 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +93 -93
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +438 -439
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +291 -257
- crawlo/crawler.py +650 -650
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +233 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +63 -63
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +45 -37
- crawlo/logging/async_handler.py +181 -0
- crawlo/logging/config.py +196 -96
- crawlo/logging/factory.py +171 -128
- crawlo/logging/manager.py +111 -111
- crawlo/logging/monitor.py +153 -0
- crawlo/logging/sampler.py +167 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +219 -219
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +100 -84
- crawlo/pipelines/redis_dedup_pipeline.py +156 -156
- crawlo/project.py +349 -338
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +526 -522
- crawlo/queue/redis_priority_queue.py +370 -367
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -170
- crawlo/templates/project/settings_distributed.py.tmpl +169 -169
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/fingerprint.py +122 -122
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.4.3.dist-info/METADATA +190 -0
- crawlo-1.4.3.dist-info/RECORD +326 -0
- examples/__init__.py +7 -7
- examples/test_project/__init__.py +7 -7
- examples/test_project/run.py +34 -34
- examples/test_project/test_project/__init__.py +3 -3
- examples/test_project/test_project/items.py +17 -17
- examples/test_project/test_project/middlewares.py +118 -118
- examples/test_project/test_project/pipelines.py +96 -96
- examples/test_project/test_project/settings.py +169 -169
- examples/test_project/test_project/spiders/__init__.py +9 -9
- examples/test_project/test_project/spiders/of_week_dis.py +143 -143
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +125 -0
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +375 -0
- tests/test_logging_final.py +185 -0
- tests/test_logging_integration.py +313 -0
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +142 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +212 -0
- tests/test_priority_consistency.py +152 -0
- tests/test_priority_consistency_fixed.py +250 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +131 -0
- tests/test_random_headers_default.py +323 -0
- tests/test_random_headers_necessity.py +309 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +130 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +334 -242
- tests/test_retry_middleware_realistic.py +274 -0
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +177 -0
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.4.1.dist-info/METADATA +0 -1199
- crawlo-1.4.1.dist-info/RECORD +0 -309
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
调试中间件User-Agent随机性问题
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
import random
|
|
10
|
+
from unittest.mock import Mock, patch
|
|
11
|
+
|
|
12
|
+
# 添加项目根目录到Python路径
|
|
13
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
14
|
+
|
|
15
|
+
from crawlo.middleware.default_header import DefaultHeaderMiddleware
|
|
16
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
17
|
+
from crawlo.data.user_agents import get_user_agents
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MockLogger:
    """In-memory stand-in for a logger, used to inspect middleware log output.

    Every call is stored in ``self.logs`` as a ``(level_name, message)``
    tuple. ``debug`` messages are additionally echoed to stdout.

    The log methods accept the same call shape as ``logging.Logger``
    (``msg, *args, **kwargs``) so code that logs with lazy %-style
    arguments does not fail against this mock.
    """

    def __init__(self, name, level=None):
        """Store the logger name/level and start with an empty record list."""
        self.name = name
        self.level = level
        self.logs = []

    def _record(self, level_name, msg, args):
        """Append ``(level_name, message)`` to ``self.logs`` and return the message.

        When positional args are supplied the message is %-formatted the way
        ``logging`` does it (a single non-empty dict argument is used for
        ``%(key)s`` style placeholders). Without args the message object is
        stored untouched.
        """
        if args:
            if len(args) == 1 and isinstance(args[0], dict) and args[0]:
                args = args[0]
            msg = str(msg) % args
        self.logs.append((level_name, msg))
        return msg

    def debug(self, msg, *args, **kwargs):
        """Record a debug message and echo it to stdout."""
        # kwargs (exc_info, extra, ...) are accepted for compatibility and ignored
        text = self._record('debug', msg, args)
        print(f"DEBUG: {text}")

    def info(self, msg, *args, **kwargs):
        """Record an info message."""
        self._record('info', msg, args)

    def warning(self, msg, *args, **kwargs):
        """Record a warning message."""
        self._record('warning', msg, args)

    def error(self, msg, *args, **kwargs):
        """Record an error message."""
        self._record('error', msg, args)

    def isEnabledFor(self, level):
        """Report every level as enabled so no log call is skipped."""
        return True
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def debug_middleware_initialization():
    """Build one DefaultHeaderMiddleware and print its User-Agent configuration.

    Creates a SettingManager with random User-Agent selection enabled,
    instantiates the middleware through ``create_instance`` with its
    ``get_logger`` patched to return a MockLogger, then prints the
    middleware's User-Agent attributes and ten results of
    ``_get_random_user_agent()``.
    """
    print("=== 调试中间件初始化过程 ===")

    settings = SettingManager()
    settings.set('DEFAULT_REQUEST_HEADERS', {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    })
    settings.set('RANDOM_USER_AGENT_ENABLED', True)
    settings.set('LOG_LEVEL', 'DEBUG')
    settings.set('RANDOMNESS', True)

    crawler = Mock()
    crawler.settings = settings

    logger = MockLogger('DefaultHeaderMiddleware')
    # Patch only around construction so the middleware picks up the mock logger
    with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
        middleware = DefaultHeaderMiddleware.create_instance(crawler)

    print(f"随机User-Agent启用: {middleware.random_user_agent_enabled}")
    print(f"User-Agent列表数量: {len(middleware.user_agents)}")
    print(f"User-Agent设备类型: {middleware.user_agent_device_type}")

    # Show the first few configured User-Agents
    print("前5个User-Agent:")
    for i, ua in enumerate(middleware.user_agents[:5]):
        print(f" {i+1}. {ua[:50]}...")

    # Exercise _get_random_user_agent repeatedly
    print("\n测试_get_random_user_agent方法:")
    for i in range(10):
        ua = middleware._get_random_user_agent()
        if ua:
            print(f" {i+1}. {ua[:50]}...")
        else:
            # Same guard as test_multiple_middleware_instances: a falsy result
            # cannot be sliced, so show it verbatim instead of crashing.
            print(f" {i+1}. {ua!r}")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_multiple_middleware_instances():
    """Create ten independent middleware instances and compare their User-Agents.

    Each iteration builds fresh settings and a fresh middleware, asks it for
    one random User-Agent, and finally prints how many of the collected
    values are distinct.
    """
    print("\n=== 测试多个中间件实例的随机性 ===")

    collected = []

    for instance_no in range(1, 11):
        settings = SettingManager()
        # Applied in the same order as individual settings.set(...) calls
        for key, value in (
            ('DEFAULT_REQUEST_HEADERS', {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            }),
            ('RANDOM_USER_AGENT_ENABLED', True),
            ('LOG_LEVEL', 'DEBUG'),
            ('RANDOMNESS', True),
        ):
            settings.set(key, value)

        crawler = Mock()
        crawler.settings = settings

        mock_logger = MockLogger('DefaultHeaderMiddleware')
        with patch('crawlo.middleware.default_header.get_logger', return_value=mock_logger):
            middleware = DefaultHeaderMiddleware.create_instance(crawler)

        # Only truthy results are collected and printed
        picked = middleware._get_random_user_agent()
        if not picked:
            continue
        collected.append(picked)
        print(f" 实例{instance_no}: {picked[:50]}...")

    distinct_count = len(set(collected))
    print(f"\n生成了 {len(collected)} 个User-Agent,其中 {distinct_count} 个不同")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def check_user_agents_module():
    """Print, per device type, how many User-Agents get_user_agents returns.

    For every non-empty result the first entry (truncated to 50 characters)
    is printed as an example.
    """
    print("\n=== 检查user_agents模块 ===")

    # Device types to query, in display order
    for kind in ("all", "desktop", "mobile", "chrome", "firefox"):
        agents = get_user_agents(kind)
        print(f"{kind}类型User-Agent数量: {len(agents)}")
        if not agents:
            continue
        print(f" 示例: {agents[0][:50]}...")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def main():
    """Run the three debugging routines in order and report any failure.

    Any exception raised by a routine is printed together with its traceback
    instead of propagating.
    """
    print("开始调试中间件User-Agent随机性问题...")

    try:
        for step in (
            debug_middleware_initialization,
            test_multiple_middleware_instances,
            check_user_agents_module,
        ):
            step()

        print("\n调试完成!")

    except Exception as exc:
        print(f"\n调试过程中发生错误: {exc}")
        import traceback
        traceback.print_exc()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
if __name__ == "__main__":
|
|
142
|
+
main()
|
tests/test_mode_change.py
CHANGED
|
@@ -1,73 +1,73 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
测试运行模式日志级别修改
|
|
5
|
-
"""
|
|
6
|
-
import sys
|
|
7
|
-
import os
|
|
8
|
-
sys.path.insert(0, '/')
|
|
9
|
-
|
|
10
|
-
def test_mode_log_level():
|
|
11
|
-
print("=== 测试运行模式日志级别修改 ===")
|
|
12
|
-
|
|
13
|
-
# 删除旧的日志文件
|
|
14
|
-
test_log_file = '/Users/oscar/projects/Crawlo/test_mode_change.log'
|
|
15
|
-
if os.path.exists(test_log_file):
|
|
16
|
-
os.remove(test_log_file)
|
|
17
|
-
|
|
18
|
-
# 准备测试设置
|
|
19
|
-
test_settings = {
|
|
20
|
-
'PROJECT_NAME': 'test_mode_change',
|
|
21
|
-
'LOG_LEVEL': 'INFO',
|
|
22
|
-
'LOG_FILE': test_log_file,
|
|
23
|
-
'RUN_MODE': 'standalone'
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
try:
|
|
27
|
-
# 初始化框架
|
|
28
|
-
from crawlo.initialization import initialize_framework
|
|
29
|
-
settings = initialize_framework(test_settings)
|
|
30
|
-
|
|
31
|
-
print(f"设置初始化完成: {settings.get('PROJECT_NAME')}")
|
|
32
|
-
|
|
33
|
-
# 检查日志文件是否包含运行模式信息
|
|
34
|
-
if os.path.exists(test_log_file):
|
|
35
|
-
with open(test_log_file, 'r', encoding='utf-8') as f:
|
|
36
|
-
content = f.read()
|
|
37
|
-
print(f"日志文件内容长度: {len(content)} 字符")
|
|
38
|
-
|
|
39
|
-
# 检查是否还有INFO级别的运行模式信息
|
|
40
|
-
info_lines = [line for line in content.split('\n') if 'INFO' in line and '使用单机模式' in line]
|
|
41
|
-
debug_lines = [line for line in content.split('\n') if 'DEBUG' in line and '使用单机模式' in line]
|
|
42
|
-
|
|
43
|
-
if info_lines:
|
|
44
|
-
print("❌ 仍然发现INFO级别的运行模式信息:")
|
|
45
|
-
for line in info_lines:
|
|
46
|
-
print(f" {line}")
|
|
47
|
-
else:
|
|
48
|
-
print("✅ 没有发现INFO级别的运行模式信息")
|
|
49
|
-
|
|
50
|
-
if debug_lines:
|
|
51
|
-
print("✅ 发现DEBUG级别的运行模式信息:")
|
|
52
|
-
for line in debug_lines:
|
|
53
|
-
print(f" {line}")
|
|
54
|
-
else:
|
|
55
|
-
print("❌ 没有发现DEBUG级别的运行模式信息")
|
|
56
|
-
|
|
57
|
-
print("\n所有日志内容:")
|
|
58
|
-
lines = content.split('\n')
|
|
59
|
-
for i, line in enumerate(lines, 1):
|
|
60
|
-
if line.strip():
|
|
61
|
-
print(f"{i:3d}: {line}")
|
|
62
|
-
else:
|
|
63
|
-
print("❌ 日志文件未创建")
|
|
64
|
-
|
|
65
|
-
except Exception as e:
|
|
66
|
-
print(f"错误: {e}")
|
|
67
|
-
import traceback
|
|
68
|
-
traceback.print_exc()
|
|
69
|
-
|
|
70
|
-
print("=== 测试完成 ===")
|
|
71
|
-
|
|
72
|
-
if __name__ == "__main__":
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
测试运行模式日志级别修改
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
sys.path.insert(0, '/')
|
|
9
|
+
|
|
10
|
+
def test_mode_log_level():
    """Initialize the framework in standalone mode and inspect the log file.

    Removes any previous log file, calls ``initialize_framework`` with
    LOG_LEVEL 'INFO' and a LOG_FILE path, then reads the log back and prints
    which lines mentioning the standalone-mode message carry 'INFO' and
    which carry 'DEBUG', followed by the full non-empty log content.
    Any exception is printed with its traceback rather than raised.
    """
    # Local import: the log file goes to the system temp directory instead of
    # a path that only exists on one developer's machine.
    import tempfile

    print("=== 测试运行模式日志级别修改 ===")

    # Remove a stale log file from a previous run
    test_log_file = os.path.join(tempfile.gettempdir(), 'test_mode_change.log')
    if os.path.exists(test_log_file):
        os.remove(test_log_file)

    # Settings handed to the framework initializer
    test_settings = {
        'PROJECT_NAME': 'test_mode_change',
        'LOG_LEVEL': 'INFO',
        'LOG_FILE': test_log_file,
        'RUN_MODE': 'standalone'
    }

    try:
        # Initialize the framework
        from crawlo.initialization import initialize_framework
        settings = initialize_framework(test_settings)

        print(f"设置初始化完成: {settings.get('PROJECT_NAME')}")

        # Check whether the log file contains the run-mode message
        if os.path.exists(test_log_file):
            with open(test_log_file, 'r', encoding='utf-8') as f:
                content = f.read()
            print(f"日志文件内容长度: {len(content)} 字符")

            # Split once and reuse for every scan below
            lines = content.split('\n')

            # Run-mode lines still logged at INFO vs. logged at DEBUG
            info_lines = [line for line in lines if 'INFO' in line and '使用单机模式' in line]
            debug_lines = [line for line in lines if 'DEBUG' in line and '使用单机模式' in line]

            if info_lines:
                print("❌ 仍然发现INFO级别的运行模式信息:")
                for line in info_lines:
                    print(f" {line}")
            else:
                print("✅ 没有发现INFO级别的运行模式信息")

            if debug_lines:
                print("✅ 发现DEBUG级别的运行模式信息:")
                for line in debug_lines:
                    print(f" {line}")
            else:
                print("❌ 没有发现DEBUG级别的运行模式信息")

            print("\n所有日志内容:")
            for i, line in enumerate(lines, 1):
                if line.strip():
                    print(f"{i:3d}: {line}")
        else:
            print("❌ 日志文件未创建")

    except Exception as e:
        print(f"错误: {e}")
        import traceback
        traceback.print_exc()

    print("=== 测试完成 ===")
|
|
71
|
+
|
|
72
|
+
if __name__ == "__main__":
|
|
73
73
|
test_mode_log_level()
|
tests/test_mode_consistency.py
CHANGED
|
@@ -1,52 +1,52 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
测试模式一致性提示
|
|
5
|
-
"""
|
|
6
|
-
import asyncio
|
|
7
|
-
import sys
|
|
8
|
-
import os
|
|
9
|
-
|
|
10
|
-
# 添加项目根目录到路径
|
|
11
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
-
|
|
13
|
-
from crawlo.crawler import CrawlerProcess
|
|
14
|
-
from crawlo.spider import Spider
|
|
15
|
-
from crawlo import Request
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class TestSpider(Spider):
|
|
19
|
-
name = "test_mode_spider"
|
|
20
|
-
|
|
21
|
-
def start_requests(self):
|
|
22
|
-
yield Request("https://httpbin.org/get")
|
|
23
|
-
|
|
24
|
-
def parse(self, response):
|
|
25
|
-
yield {"url": response.url, "status": response.status_code} # 修复:使用status_code而不是status
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
async def test_mode_consistency():
|
|
29
|
-
"""测试模式一致性提示"""
|
|
30
|
-
print("测试模式一致性提示...")
|
|
31
|
-
|
|
32
|
-
try:
|
|
33
|
-
# 创建爬虫进程
|
|
34
|
-
process = CrawlerProcess()
|
|
35
|
-
|
|
36
|
-
# 添加爬虫
|
|
37
|
-
await process.crawl(TestSpider)
|
|
38
|
-
|
|
39
|
-
print("模式一致性测试完成")
|
|
40
|
-
|
|
41
|
-
except Exception as e:
|
|
42
|
-
print(f"测试失败: {e}")
|
|
43
|
-
import traceback
|
|
44
|
-
traceback.print_exc()
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
if __name__ == "__main__":
|
|
48
|
-
# 设置日志级别
|
|
49
|
-
import logging
|
|
50
|
-
logging.basicConfig(level=logging.INFO)
|
|
51
|
-
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
测试模式一致性提示
|
|
5
|
+
"""
|
|
6
|
+
import asyncio
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
# 添加项目根目录到路径
|
|
11
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
+
|
|
13
|
+
from crawlo.crawler import CrawlerProcess
|
|
14
|
+
from crawlo.spider import Spider
|
|
15
|
+
from crawlo import Request
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TestSpider(Spider):
    """Minimal spider that requests one URL and yields its URL and status."""

    name = "test_mode_spider"

    def start_requests(self):
        """Yield the single seed request."""
        yield Request("https://httpbin.org/get")

    def parse(self, response):
        """Yield a dict with the response URL and status code."""
        # Fix note from the original: read status_code, not status
        item = {"url": response.url, "status": response.status_code}
        yield item
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def test_mode_consistency():
    """Crawl TestSpider through a new CrawlerProcess and report the outcome.

    Any exception is printed with its traceback instead of propagating.
    """
    print("测试模式一致性提示...")

    try:
        # Create the crawler process and run the spider in one step
        await CrawlerProcess().crawl(TestSpider)

        print("模式一致性测试完成")

    except Exception as exc:
        print(f"测试失败: {exc}")
        import traceback
        traceback.print_exc()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
if __name__ == "__main__":
|
|
48
|
+
# 设置日志级别
|
|
49
|
+
import logging
|
|
50
|
+
logging.basicConfig(level=logging.INFO)
|
|
51
|
+
|
|
52
52
|
asyncio.run(test_mode_consistency())
|