crawlo-1.3.5-py3-none-any.whl → crawlo-1.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +87 -87
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +45 -45
- crawlo/core/engine.py +439 -439
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +257 -257
- crawlo/crawler.py +638 -638
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +228 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -291
- crawlo/initialization/__init__.py +39 -39
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +37 -37
- crawlo/logging/config.py +96 -96
- crawlo/logging/factory.py +128 -128
- crawlo/logging/manager.py +111 -111
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +212 -212
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +76 -76
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +327 -327
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +503 -503
- crawlo/queue/redis_priority_queue.py +326 -326
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +321 -321
- crawlo/settings/setting_manager.py +214 -214
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +167 -167
- crawlo/templates/project/settings_distributed.py.tmpl +166 -166
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.5.dist-info → crawlo-1.3.6.dist-info}/METADATA +1126 -1126
- crawlo-1.3.6.dist-info/RECORD +290 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +42 -0
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_system.py +282 -282
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_queue_type.py +107 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.3.5.dist-info/RECORD +0 -288
- {crawlo-1.3.5.dist-info → crawlo-1.3.6.dist-info}/WHEEL +0 -0
- {crawlo-1.3.5.dist-info → crawlo-1.3.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.5.dist-info → crawlo-1.3.6.dist-info}/top_level.txt +0 -0
crawlo/utils/log.py
CHANGED
|
@@ -1,80 +1,80 @@
|
|
|
1
|
-
# ==================== 向后兼容的日志接口 ====================
|
|
2
|
-
# 主要功能已迁移到 crawlo.logging 模块
|
|
3
|
-
# 本文件提供向后兼容接口,同时支持新的日志系统功能
|
|
4
|
-
|
|
5
|
-
import logging
|
|
6
|
-
from typing import Optional, Any
|
|
7
|
-
|
|
8
|
-
# 向后兼容:导入新的日志系统
|
|
9
|
-
try:
|
|
10
|
-
from crawlo.logging import get_logger as new_get_logger, configure_logging
|
|
11
|
-
|
|
12
|
-
_NEW_LOGGING_AVAILABLE = True
|
|
13
|
-
except ImportError:
|
|
14
|
-
_NEW_LOGGING_AVAILABLE = False
|
|
15
|
-
new_get_logger = None
|
|
16
|
-
configure_logging = None
|
|
17
|
-
|
|
18
|
-
LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
# 向后兼容的日志函数
|
|
22
|
-
def get_logger(name: str = 'default', level: Optional[int] = None):
|
|
23
|
-
"""获取Logger实例 - 向后兼容函数"""
|
|
24
|
-
if _NEW_LOGGING_AVAILABLE and new_get_logger:
|
|
25
|
-
# 使用新的日志系统
|
|
26
|
-
return new_get_logger(name)
|
|
27
|
-
else:
|
|
28
|
-
# 降级到基本的Python logging
|
|
29
|
-
logger = logging.getLogger(name)
|
|
30
|
-
if not logger.handlers:
|
|
31
|
-
handler = logging.StreamHandler()
|
|
32
|
-
formatter = logging.Formatter(LOG_FORMAT)
|
|
33
|
-
handler.setFormatter(formatter)
|
|
34
|
-
logger.addHandler(handler)
|
|
35
|
-
logger.setLevel(level or logging.INFO)
|
|
36
|
-
return logger
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def get_component_logger(component_class: Any, settings: Optional[Any] = None, level: Optional[str] = None):
|
|
40
|
-
"""
|
|
41
|
-
获取组件Logger - 推荐的组件日志创建方式
|
|
42
|
-
|
|
43
|
-
Args:
|
|
44
|
-
component_class: 组件类
|
|
45
|
-
settings: 配置对象,用于读取日志级别配置
|
|
46
|
-
level: 日志级别(优先级低于settings中的配置)
|
|
47
|
-
|
|
48
|
-
Returns:
|
|
49
|
-
logging.Logger: 配置好的Logger实例
|
|
50
|
-
"""
|
|
51
|
-
# 获取组件名称
|
|
52
|
-
if hasattr(component_class, '__name__'):
|
|
53
|
-
component_name = component_class.__name__
|
|
54
|
-
else:
|
|
55
|
-
component_name = str(component_class)
|
|
56
|
-
|
|
57
|
-
# 如果新日志系统可用,使用新系统
|
|
58
|
-
if _NEW_LOGGING_AVAILABLE and new_get_logger:
|
|
59
|
-
return new_get_logger(component_name)
|
|
60
|
-
|
|
61
|
-
# 否则使用向后兼容方式
|
|
62
|
-
# 从settings中获取日志级别(如果提供)
|
|
63
|
-
if settings is not None:
|
|
64
|
-
# 尝试从settings获取组件特定的日志级别
|
|
65
|
-
if hasattr(settings, 'get'):
|
|
66
|
-
# 检查是否有组件特定的日志级别配置
|
|
67
|
-
component_level = settings.get(f'LOG_LEVEL_{component_name}')
|
|
68
|
-
if component_level is not None:
|
|
69
|
-
level = component_level
|
|
70
|
-
else:
|
|
71
|
-
# 检查通用日志级别
|
|
72
|
-
general_level = settings.get('LOG_LEVEL')
|
|
73
|
-
if general_level is not None:
|
|
74
|
-
level = general_level
|
|
75
|
-
|
|
76
|
-
# 转换日志级别
|
|
77
|
-
if isinstance(level, str):
|
|
78
|
-
level = getattr(logging, level.upper(), logging.INFO)
|
|
79
|
-
|
|
1
|
+
# ==================== 向后兼容的日志接口 ====================
|
|
2
|
+
# 主要功能已迁移到 crawlo.logging 模块
|
|
3
|
+
# 本文件提供向后兼容接口,同时支持新的日志系统功能
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Optional, Any
|
|
7
|
+
|
|
8
|
+
# 向后兼容:导入新的日志系统
|
|
9
|
+
try:
|
|
10
|
+
from crawlo.logging import get_logger as new_get_logger, configure_logging
|
|
11
|
+
|
|
12
|
+
_NEW_LOGGING_AVAILABLE = True
|
|
13
|
+
except ImportError:
|
|
14
|
+
_NEW_LOGGING_AVAILABLE = False
|
|
15
|
+
new_get_logger = None
|
|
16
|
+
configure_logging = None
|
|
17
|
+
|
|
18
|
+
LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# 向后兼容的日志函数
|
|
22
|
+
def get_logger(name: str = 'default', level: Optional[int] = None):
    """Return a logger for ``name`` (backward-compatible entry point).

    When the ``crawlo.logging`` system was imported successfully, the call is
    delegated to it and ``level`` is not used on that path. Otherwise a
    standard-library logger is returned; a stream handler formatted with
    ``LOG_FORMAT`` is attached the first time the name is requested.

    Args:
        name: Logger name.
        level: Numeric level for the fallback logger. ``None`` (or ``0``)
            means "not specified".

    Returns:
        The object returned by the new logging system, or a
        ``logging.Logger`` on the fallback path.
    """
    if _NEW_LOGGING_AVAILABLE and new_get_logger:
        # Preferred path: the new logging system owns all configuration.
        return new_get_logger(name)

    # Fallback path: plain standard-library logging.
    logger = logging.getLogger(name)
    first_setup = not logger.handlers
    if first_setup:
        # Attach exactly one handler; repeated calls must not stack handlers.
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logger.addHandler(handler)

    if level:
        # An explicit level is honoured on every call, not only the first.
        logger.setLevel(level)
    elif first_setup:
        # Default to INFO only when the logger is first configured, so a
        # later level-less call does not overwrite an earlier explicit level.
        logger.setLevel(logging.INFO)
    return logger
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_component_logger(component_class: Any, settings: Optional[Any] = None, level: Optional[str] = None):
    """Return a logger named after a component.

    The logger name is ``component_class.__name__`` when that attribute
    exists, the string itself when a plain string is passed, and otherwise
    the name of the object's type.

    When the ``crawlo.logging`` system was imported successfully, the call is
    delegated to it and ``settings`` / ``level`` are not consulted on that
    path. Otherwise the level is resolved in this order and handed to
    ``get_logger``:

    1. ``settings.get('LOG_LEVEL_<component name>')``
    2. ``settings.get('LOG_LEVEL')``
    3. the ``level`` argument

    Args:
        component_class: Component class (or instance / name) to log for.
        settings: Optional object exposing ``get(key)``; objects without a
            ``get`` attribute are ignored.
        level: Fallback level, used only when ``settings`` supplies none.
            A string is looked up case-insensitively as a ``logging`` level
            name; unknown names fall back to ``logging.INFO``.

    Returns:
        The logger produced by the new logging system or by ``get_logger``.
    """
    # Derive a stable logger name. Using str() on an arbitrary instance would
    # embed its memory address, creating a new cached logger per instance and
    # a settings key that can never match.
    if hasattr(component_class, '__name__'):
        component_name = component_class.__name__
    elif isinstance(component_class, str):
        component_name = component_class
    else:
        component_name = type(component_class).__name__

    if _NEW_LOGGING_AVAILABLE and new_get_logger:
        return new_get_logger(component_name)

    # Backward-compatible path: settings take precedence over ``level``.
    if settings is not None and hasattr(settings, 'get'):
        component_level = settings.get(f'LOG_LEVEL_{component_name}')
        if component_level is not None:
            level = component_level
        else:
            general_level = settings.get('LOG_LEVEL')
            if general_level is not None:
                level = general_level

    # Translate a level name into its numeric value. Only integer attributes
    # of the logging module are real levels; anything else (e.g. a format
    # string constant) falls back to INFO.
    if isinstance(level, str):
        resolved = getattr(logging, level.strip().upper(), None)
        level = resolved if isinstance(resolved, int) else logging.INFO

    return get_logger(component_name, level)
|