crawlo 1.4.7__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/misc.py
CHANGED
|
@@ -1,82 +1,82 @@
|
|
|
1
|
-
import importlib
|
|
2
|
-
import pkgutil
|
|
3
|
-
from typing import Iterator, Any, Type
|
|
4
|
-
|
|
5
|
-
from crawlo.spider import Spider
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def walk_modules(module_path: str) -> Iterator[Any]:
|
|
9
|
-
"""
|
|
10
|
-
加载模块并递归遍历其所有子模块
|
|
11
|
-
|
|
12
|
-
Args:
|
|
13
|
-
module_path: 模块路径
|
|
14
|
-
|
|
15
|
-
Yields:
|
|
16
|
-
导入的模块对象
|
|
17
|
-
|
|
18
|
-
Raises:
|
|
19
|
-
ImportError: 如果模块无法导入
|
|
20
|
-
"""
|
|
21
|
-
# 导入模块
|
|
22
|
-
module = importlib.import_module(module_path)
|
|
23
|
-
yield module
|
|
24
|
-
|
|
25
|
-
# 如果是包,则递归导入子模块
|
|
26
|
-
if hasattr(module, '__path__'):
|
|
27
|
-
for loader, submodule_name, is_pkg in pkgutil.walk_packages(module.__path__):
|
|
28
|
-
try:
|
|
29
|
-
submodule_path = f"{module_path}.{submodule_name}"
|
|
30
|
-
submodule = importlib.import_module(submodule_path)
|
|
31
|
-
yield submodule
|
|
32
|
-
|
|
33
|
-
# 如果子模块也是包,递归遍历
|
|
34
|
-
if is_pkg:
|
|
35
|
-
yield from walk_modules(submodule_path)
|
|
36
|
-
except ImportError:
|
|
37
|
-
# 跳过无法导入的子模块
|
|
38
|
-
continue
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def iter_spider_classes(module) -> Iterator[Type[Spider]]:
|
|
42
|
-
"""
|
|
43
|
-
遍历模块中的所有Spider子类
|
|
44
|
-
|
|
45
|
-
Args:
|
|
46
|
-
module: 要遍历的模块
|
|
47
|
-
|
|
48
|
-
Yields:
|
|
49
|
-
Spider子类
|
|
50
|
-
"""
|
|
51
|
-
for attr_name in dir(module):
|
|
52
|
-
attr_value = getattr(module, attr_name)
|
|
53
|
-
if (isinstance(attr_value, type) and
|
|
54
|
-
issubclass(attr_value, Spider) and
|
|
55
|
-
attr_value != Spider and
|
|
56
|
-
hasattr(attr_value, 'name')):
|
|
57
|
-
yield attr_value
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def load_object(path: str):
|
|
61
|
-
"""
|
|
62
|
-
从路径加载对象
|
|
63
|
-
|
|
64
|
-
Args:
|
|
65
|
-
path: 对象路径,格式为 module.submodule:object_name 或 module.submodule.object_name
|
|
66
|
-
|
|
67
|
-
Returns:
|
|
68
|
-
加载的对象
|
|
69
|
-
"""
|
|
70
|
-
try:
|
|
71
|
-
# 处理 module.submodule:object_name 格式
|
|
72
|
-
if ':' in path:
|
|
73
|
-
module_path, obj_name = path.split(':', 1)
|
|
74
|
-
module = importlib.import_module(module_path)
|
|
75
|
-
return getattr(module, obj_name)
|
|
76
|
-
else:
|
|
77
|
-
# 处理 module.submodule.object_name 格式
|
|
78
|
-
module_path, obj_name = path.rsplit('.', 1)
|
|
79
|
-
module = importlib.import_module(module_path)
|
|
80
|
-
return getattr(module, obj_name)
|
|
81
|
-
except (ImportError, AttributeError) as e:
|
|
1
|
+
import importlib
|
|
2
|
+
import pkgutil
|
|
3
|
+
from typing import Iterator, Any, Type
|
|
4
|
+
|
|
5
|
+
from crawlo.spider import Spider
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def walk_modules(module_path: str) -> Iterator[Any]:
    """
    Import a module and recursively iterate over all of its submodules.

    Args:
        module_path: Dotted path of the module to import.

    Yields:
        The imported module objects: the module itself first, then each
        importable submodule, depth-first. Each module is yielded exactly
        once.

    Raises:
        ImportError: If the top-level module itself cannot be imported
            (unimportable *sub*modules are skipped silently).
    """
    module = importlib.import_module(module_path)
    yield module

    # Only packages (objects with __path__) can contain submodules.
    if hasattr(module, '__path__'):
        # BUG FIX: the previous implementation used pkgutil.walk_packages,
        # which already recurses into subpackages by itself, *and* recursed
        # manually via walk_modules — nested submodules were therefore
        # imported and yielded more than once. iter_modules walks a single
        # level, leaving all recursion to the explicit call below.
        for _, submodule_name, is_pkg in pkgutil.iter_modules(module.__path__):
            submodule_path = f"{module_path}.{submodule_name}"
            try:
                if is_pkg:
                    # The recursive call yields the subpackage itself first,
                    # so do not import/yield it here as well.
                    yield from walk_modules(submodule_path)
                else:
                    yield importlib.import_module(submodule_path)
            except ImportError:
                # Skip submodules that cannot be imported.
                continue
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def iter_spider_classes(module) -> Iterator[Type[Spider]]:
    """
    Iterate over every concrete Spider subclass defined in *module*.

    Args:
        module: The module object to scan.

    Yields:
        Spider subclasses found among the module's attributes. The Spider
        base class itself is excluded, and only classes that carry a
        ``name`` attribute are considered.
    """
    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        # Guard clauses: must be a class, a proper Spider subclass
        # (not the base itself), and expose a spider name.
        if not isinstance(candidate, type):
            continue
        if not issubclass(candidate, Spider) or candidate is Spider:
            continue
        if hasattr(candidate, 'name'):
            yield candidate
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_object(path: str):
    """
    Load an object from a dotted path.

    Args:
        path: Object path, in either "module.submodule:object_name" or
              "module.submodule.object_name" form.

    Returns:
        The loaded object.

    Raises:
        ImportError: If the module cannot be imported, the attribute does
            not exist, or the path contains no module/object separator.
    """
    try:
        # The "module:object" form takes precedence over the dotted form.
        if ':' in path:
            module_path, obj_name = path.split(':', 1)
        else:
            module_path, obj_name = path.rsplit('.', 1)
        module = importlib.import_module(module_path)
        return getattr(module, obj_name)
    except ValueError:
        # BUG FIX: rsplit on a path with no separator (e.g. "builtins")
        # yields a single element; the unpacking ValueError used to escape
        # uncaught instead of becoming the documented ImportError.
        raise ImportError(
            f"Could not load object from path '{path}': "
            f"no module/object separator found"
        )
    except (ImportError, AttributeError) as e:
        raise ImportError(f"Could not load object from path '{path}': {e}")
|
|
@@ -1,157 +1,157 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
MongoDB 连接池管理器
|
|
5
|
-
==================
|
|
6
|
-
|
|
7
|
-
提供单例模式的MongoDB连接池,确保多个爬虫共享同一个连接池,
|
|
8
|
-
避免重复创建连接池导致的资源浪费。
|
|
9
|
-
|
|
10
|
-
特点:
|
|
11
|
-
1. 单例模式 - 全局唯一的连接池实例
|
|
12
|
-
2. 线程安全 - 使用异步锁保护初始化过程
|
|
13
|
-
3. 配置隔离 - 支持不同的数据库配置创建不同的连接池
|
|
14
|
-
4. 自动清理 - 支持资源清理和重置
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
import asyncio
|
|
18
|
-
from typing import Dict, Optional, Any
|
|
19
|
-
from motor.motor_asyncio import AsyncIOMotorClient
|
|
20
|
-
from crawlo.logging import get_logger
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class MongoConnectionPoolManager:
|
|
24
|
-
"""MongoDB 连接池管理器(单例模式)"""
|
|
25
|
-
|
|
26
|
-
_instances: Dict[str, 'MongoConnectionPoolManager'] = {}
|
|
27
|
-
_lock = asyncio.Lock()
|
|
28
|
-
|
|
29
|
-
def __init__(self, pool_key: str):
|
|
30
|
-
"""
|
|
31
|
-
初始化连接池管理器
|
|
32
|
-
|
|
33
|
-
Args:
|
|
34
|
-
pool_key: 连接池唯一标识
|
|
35
|
-
"""
|
|
36
|
-
self.pool_key = pool_key
|
|
37
|
-
self.client: Optional[AsyncIOMotorClient] = None
|
|
38
|
-
self._client_lock = asyncio.Lock()
|
|
39
|
-
self._client_initialized = False
|
|
40
|
-
self._config: Dict[str, Any] = {}
|
|
41
|
-
self.logger = get_logger(f'MongoPool.{pool_key}')
|
|
42
|
-
|
|
43
|
-
@classmethod
|
|
44
|
-
async def get_client(
|
|
45
|
-
cls,
|
|
46
|
-
mongo_uri: str = 'mongodb://localhost:27017',
|
|
47
|
-
db_name: str = 'crawlo',
|
|
48
|
-
max_pool_size: int = 100,
|
|
49
|
-
min_pool_size: int = 10,
|
|
50
|
-
connect_timeout_ms: int = 5000,
|
|
51
|
-
socket_timeout_ms: int = 30000,
|
|
52
|
-
**kwargs
|
|
53
|
-
) -> AsyncIOMotorClient:
|
|
54
|
-
"""
|
|
55
|
-
获取 MongoDB 客户端实例(单例模式)
|
|
56
|
-
|
|
57
|
-
Args:
|
|
58
|
-
mongo_uri: MongoDB 连接 URI
|
|
59
|
-
db_name: 数据库名
|
|
60
|
-
max_pool_size: 最大连接池大小
|
|
61
|
-
min_pool_size: 最小连接池大小
|
|
62
|
-
connect_timeout_ms: 连接超时(毫秒)
|
|
63
|
-
socket_timeout_ms: Socket 超时(毫秒)
|
|
64
|
-
**kwargs: 其他连接参数
|
|
65
|
-
|
|
66
|
-
Returns:
|
|
67
|
-
MongoDB 客户端实例
|
|
68
|
-
"""
|
|
69
|
-
# 生成连接池唯一标识
|
|
70
|
-
pool_key = f"{mongo_uri}:{db_name}"
|
|
71
|
-
|
|
72
|
-
async with cls._lock:
|
|
73
|
-
if pool_key not in cls._instances:
|
|
74
|
-
instance = cls(pool_key)
|
|
75
|
-
instance._config = {
|
|
76
|
-
'mongo_uri': mongo_uri,
|
|
77
|
-
'db_name': db_name,
|
|
78
|
-
'max_pool_size': max_pool_size,
|
|
79
|
-
'min_pool_size': min_pool_size,
|
|
80
|
-
'connect_timeout_ms': connect_timeout_ms,
|
|
81
|
-
'socket_timeout_ms': socket_timeout_ms,
|
|
82
|
-
**kwargs
|
|
83
|
-
}
|
|
84
|
-
cls._instances[pool_key] = instance
|
|
85
|
-
instance.logger.info(
|
|
86
|
-
f"创建新的 MongoDB 连接池管理器: {pool_key} "
|
|
87
|
-
f"(minPoolSize={min_pool_size}, maxPoolSize={max_pool_size})"
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
instance = cls._instances[pool_key]
|
|
91
|
-
await instance._ensure_client()
|
|
92
|
-
return instance.client
|
|
93
|
-
|
|
94
|
-
async def _ensure_client(self):
|
|
95
|
-
"""确保客户端已初始化(线程安全)"""
|
|
96
|
-
if self._client_initialized and self.client:
|
|
97
|
-
return
|
|
98
|
-
|
|
99
|
-
async with self._client_lock:
|
|
100
|
-
if not self._client_initialized:
|
|
101
|
-
try:
|
|
102
|
-
self.client = AsyncIOMotorClient(
|
|
103
|
-
self._config['mongo_uri'],
|
|
104
|
-
maxPoolSize=self._config['max_pool_size'],
|
|
105
|
-
minPoolSize=self._config['min_pool_size'],
|
|
106
|
-
connectTimeoutMS=self._config['connect_timeout_ms'],
|
|
107
|
-
socketTimeoutMS=self._config['socket_timeout_ms']
|
|
108
|
-
)
|
|
109
|
-
|
|
110
|
-
self._client_initialized = True
|
|
111
|
-
self.logger.info(
|
|
112
|
-
f"MongoDB 客户端初始化成功: {self.pool_key} "
|
|
113
|
-
f"(minPoolSize={self._config['min_pool_size']}, "
|
|
114
|
-
f"maxPoolSize={self._config['max_pool_size']})"
|
|
115
|
-
)
|
|
116
|
-
except Exception as e:
|
|
117
|
-
self.logger.error(f"MongoDB 客户端初始化失败: {e}")
|
|
118
|
-
self._client_initialized = False
|
|
119
|
-
self.client = None
|
|
120
|
-
raise
|
|
121
|
-
|
|
122
|
-
@classmethod
|
|
123
|
-
async def close_all_clients(cls):
|
|
124
|
-
"""关闭所有 MongoDB 客户端"""
|
|
125
|
-
logger = get_logger('MongoPool')
|
|
126
|
-
logger.info(f"开始关闭所有 MongoDB 客户端,共 {len(cls._instances)} 个")
|
|
127
|
-
|
|
128
|
-
for pool_key, instance in cls._instances.items():
|
|
129
|
-
try:
|
|
130
|
-
if instance.client:
|
|
131
|
-
logger.info(f"关闭 MongoDB 客户端: {pool_key}")
|
|
132
|
-
instance.client.close()
|
|
133
|
-
logger.info(f"MongoDB 客户端已关闭: {pool_key}")
|
|
134
|
-
except Exception as e:
|
|
135
|
-
logger.error(f"关闭 MongoDB 客户端 {pool_key} 时发生错误: {e}")
|
|
136
|
-
|
|
137
|
-
cls._instances.clear()
|
|
138
|
-
logger.info("所有 MongoDB 客户端已关闭")
|
|
139
|
-
|
|
140
|
-
@classmethod
|
|
141
|
-
def get_pool_stats(cls) -> Dict[str, Any]:
|
|
142
|
-
"""获取所有连接池的统计信息"""
|
|
143
|
-
stats = {
|
|
144
|
-
'total_pools': len(cls._instances),
|
|
145
|
-
'pools': {}
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
for pool_key, instance in cls._instances.items():
|
|
149
|
-
if instance.client:
|
|
150
|
-
stats['pools'][pool_key] = {
|
|
151
|
-
'uri': instance._config.get('mongo_uri', 'unknown'),
|
|
152
|
-
'db_name': instance._config.get('db_name', 'unknown'),
|
|
153
|
-
'min_pool_size': instance._config.get('min_pool_size', 'unknown'),
|
|
154
|
-
'max_pool_size': instance._config.get('max_pool_size', 'unknown')
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
return stats
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
MongoDB 连接池管理器
|
|
5
|
+
==================
|
|
6
|
+
|
|
7
|
+
提供单例模式的MongoDB连接池,确保多个爬虫共享同一个连接池,
|
|
8
|
+
避免重复创建连接池导致的资源浪费。
|
|
9
|
+
|
|
10
|
+
特点:
|
|
11
|
+
1. 单例模式 - 全局唯一的连接池实例
|
|
12
|
+
2. 线程安全 - 使用异步锁保护初始化过程
|
|
13
|
+
3. 配置隔离 - 支持不同的数据库配置创建不同的连接池
|
|
14
|
+
4. 自动清理 - 支持资源清理和重置
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
from typing import Dict, Optional, Any
|
|
19
|
+
from motor.motor_asyncio import AsyncIOMotorClient
|
|
20
|
+
from crawlo.logging import get_logger
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MongoConnectionPoolManager:
    """
    MongoDB connection pool manager (singleton per configuration).

    Ensures that multiple spiders share one connection pool per
    "uri:database" configuration instead of each creating their own,
    avoiding wasted connections.

    Features:
        1. Singleton   - one globally unique pool per configuration key
        2. Thread-safe - async locks guard instance creation and client init
        3. Isolation   - different configurations get separate pools
        4. Cleanup     - pools can be closed and the registry reset
    """

    # Registry of managers, keyed by "<mongo_uri>:<db_name>".
    _instances: Dict[str, 'MongoConnectionPoolManager'] = {}
    # Guards creation/lookup of entries in _instances.
    _lock = asyncio.Lock()

    # Configuration keys consumed explicitly by _ensure_client(); any other
    # key stored in _config is forwarded verbatim to AsyncIOMotorClient.
    _RESERVED_CONFIG_KEYS = frozenset({
        'mongo_uri', 'db_name', 'max_pool_size', 'min_pool_size',
        'connect_timeout_ms', 'socket_timeout_ms',
    })

    def __init__(self, pool_key: str):
        """
        Initialize the pool manager.

        Args:
            pool_key: Unique pool identifier ("<mongo_uri>:<db_name>").
        """
        self.pool_key = pool_key
        self.client: Optional[AsyncIOMotorClient] = None
        # Guards lazy client creation in _ensure_client().
        self._client_lock = asyncio.Lock()
        self._client_initialized = False
        self._config: Dict[str, Any] = {}
        self.logger = get_logger(f'MongoPool.{pool_key}')

    @classmethod
    async def get_client(
        cls,
        mongo_uri: str = 'mongodb://localhost:27017',
        db_name: str = 'crawlo',
        max_pool_size: int = 100,
        min_pool_size: int = 10,
        connect_timeout_ms: int = 5000,
        socket_timeout_ms: int = 30000,
        **kwargs
    ) -> AsyncIOMotorClient:
        """
        Get (or lazily create) the shared MongoDB client for a configuration.

        Args:
            mongo_uri: MongoDB connection URI.
            db_name: Database name (part of the pool key only).
            max_pool_size: Maximum connection pool size.
            min_pool_size: Minimum connection pool size.
            connect_timeout_ms: Connect timeout in milliseconds.
            socket_timeout_ms: Socket timeout in milliseconds.
            **kwargs: Extra keyword arguments forwarded to AsyncIOMotorClient.

        Returns:
            The shared AsyncIOMotorClient instance for this configuration.
        """
        # Pool identity is the URI + database pair.
        pool_key = f"{mongo_uri}:{db_name}"

        async with cls._lock:
            if pool_key not in cls._instances:
                instance = cls(pool_key)
                instance._config = {
                    'mongo_uri': mongo_uri,
                    'db_name': db_name,
                    'max_pool_size': max_pool_size,
                    'min_pool_size': min_pool_size,
                    'connect_timeout_ms': connect_timeout_ms,
                    'socket_timeout_ms': socket_timeout_ms,
                    **kwargs
                }
                cls._instances[pool_key] = instance
                instance.logger.info(
                    f"创建新的 MongoDB 连接池管理器: {pool_key} "
                    f"(minPoolSize={min_pool_size}, maxPoolSize={max_pool_size})"
                )

            instance = cls._instances[pool_key]
            await instance._ensure_client()
            return instance.client

    async def _ensure_client(self):
        """Lazily create the Motor client (safe under concurrent calls)."""
        # Fast path: already initialized, no lock needed.
        if self._client_initialized and self.client:
            return

        async with self._client_lock:
            # Re-check after acquiring the lock (double-checked init).
            if not self._client_initialized:
                try:
                    # BUG FIX: extra keyword arguments accepted by
                    # get_client() were stored in _config but never
                    # forwarded to the client; forward them here.
                    extra = {
                        k: v for k, v in self._config.items()
                        if k not in self._RESERVED_CONFIG_KEYS
                    }
                    self.client = AsyncIOMotorClient(
                        self._config['mongo_uri'],
                        maxPoolSize=self._config['max_pool_size'],
                        minPoolSize=self._config['min_pool_size'],
                        connectTimeoutMS=self._config['connect_timeout_ms'],
                        socketTimeoutMS=self._config['socket_timeout_ms'],
                        **extra
                    )

                    self._client_initialized = True
                    self.logger.info(
                        f"MongoDB 客户端初始化成功: {self.pool_key} "
                        f"(minPoolSize={self._config['min_pool_size']}, "
                        f"maxPoolSize={self._config['max_pool_size']})"
                    )
                except Exception as e:
                    # Reset state so a later call may retry initialization.
                    self.logger.error(f"MongoDB 客户端初始化失败: {e}")
                    self._client_initialized = False
                    self.client = None
                    raise

    @classmethod
    async def close_all_clients(cls):
        """Close every registered MongoDB client and clear the registry."""
        logger = get_logger('MongoPool')
        logger.info(f"开始关闭所有 MongoDB 客户端,共 {len(cls._instances)} 个")

        # Iterate over a snapshot so a concurrent get_client() registering
        # a new pool cannot invalidate the iterator mid-loop.
        for pool_key, instance in list(cls._instances.items()):
            try:
                if instance.client:
                    logger.info(f"关闭 MongoDB 客户端: {pool_key}")
                    instance.client.close()
                    logger.info(f"MongoDB 客户端已关闭: {pool_key}")
            except Exception as e:
                logger.error(f"关闭 MongoDB 客户端 {pool_key} 时发生错误: {e}")

        cls._instances.clear()
        logger.info("所有 MongoDB 客户端已关闭")

    @classmethod
    def get_pool_stats(cls) -> Dict[str, Any]:
        """Return basic statistics for every registered pool."""
        stats: Dict[str, Any] = {
            'total_pools': len(cls._instances),
            'pools': {}
        }

        for pool_key, instance in cls._instances.items():
            # Only pools whose client was actually created are reported.
            if instance.client:
                stats['pools'][pool_key] = {
                    'uri': instance._config.get('mongo_uri', 'unknown'),
                    'db_name': instance._config.get('db_name', 'unknown'),
                    'min_pool_size': instance._config.get('min_pool_size', 'unknown'),
                    'max_pool_size': instance._config.get('max_pool_size', 'unknown')
                }

        return stats
|