crawlo 1.4.5__py3-none-any.whl → 1.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo has been flagged as potentially problematic by the registry.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -245
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +470 -326
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +285 -270
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +82 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +110 -157
- crawlo/templates/project/settings_distributed.py.tmpl +156 -161
- crawlo/templates/project/settings_gentle.py.tmpl +170 -171
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -172
- crawlo/templates/project/settings_minimal.py.tmpl +99 -77
- crawlo/templates/project/settings_simple.py.tmpl +168 -169
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -30
- crawlo/templates/spider/spider.py.tmpl +33 -144
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -244
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +77 -0
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +140 -0
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +57 -0
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -268
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -121
- tests/test_proxy_middleware_enhanced.py +212 -216
- tests/test_proxy_middleware_integration.py +142 -137
- tests/test_proxy_middleware_refactored.py +207 -184
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +110 -0
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/middleware/simple_proxy.py +0 -65
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.5.dist-info/METADATA +0 -329
- crawlo-1.4.5.dist-info/RECORD +0 -347
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
crawlo/utils/resource_manager.py
NEW

@@ -0,0 +1,337 @@

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Resource Manager - unified management of all cleanable resources
========================================

Features:
- Unified resource registration and cleanup
- Supports asynchronous resource cleanup
- Resource leak detection
- Guaranteed cleanup order (LIFO)
"""
import asyncio
import time
import traceback
from typing import Any, Callable, List, Tuple, Optional, Dict
from enum import Enum

from crawlo.logging import get_logger


class ResourceType(Enum):
    """Resource type enumeration"""
    DOWNLOADER = "downloader"
    REDIS_POOL = "redis_pool"
    QUEUE = "queue"
    FILTER = "filter"
    PIPELINE = "pipeline"
    MIDDLEWARE = "middleware"
    EXTENSION = "extension"
    SESSION = "session"
    BROWSER = "browser"
    OTHER = "other"


class ResourceStatus(Enum):
    """Resource status"""
    ACTIVE = "active"
    CLOSING = "closing"
    CLOSED = "closed"
    ERROR = "error"


class ManagedResource:
    """A managed resource"""

    def __init__(self,
                 resource: Any,
                 cleanup_func: Callable,
                 resource_type: ResourceType = ResourceType.OTHER,
                 name: Optional[str] = None):
        self.resource = resource
        self.cleanup_func = cleanup_func
        self.resource_type = resource_type
        self.name = name or f"{resource_type.value}_{id(resource)}"
        self.status = ResourceStatus.ACTIVE
        self.created_at = time.time()
        self.closed_at: Optional[float] = None

    async def cleanup(self) -> bool:
        """Clean up the resource"""
        if self.status == ResourceStatus.CLOSED:
            return True

        self.status = ResourceStatus.CLOSING
        try:
            # Check whether cleanup_func is a coroutine function
            if asyncio.iscoroutinefunction(self.cleanup_func):
                await self.cleanup_func(self.resource)
            else:
                # Synchronous function: call it directly
                result = self.cleanup_func(self.resource)
                # If it returned a coroutine, await it
                if asyncio.iscoroutine(result):
                    await result

            self.status = ResourceStatus.CLOSED
            self.closed_at = time.time()
            return True
        except Exception as e:
            self.status = ResourceStatus.ERROR
            raise e

    def get_lifetime(self) -> float:
        """Get the resource lifetime in seconds"""
        end_time = self.closed_at or time.time()
        return end_time - self.created_at


class ResourceManager:
    """
    Resource Manager - unified management of all cleanable resources

    Features:
    1. Automatically tracks registered resources
    2. Guaranteed cleanup order (LIFO - last in, first out)
    3. Fault-tolerant cleanup (one failure does not affect the rest)
    4. Resource leak detection
    5. Statistics and monitoring
    """

    def __init__(self, name: str = "default"):
        self.name = name
        self._resources: List[ManagedResource] = []
        self._lock = asyncio.Lock()
        self._cleanup_errors: List[Tuple[str, Exception]] = []
        self._logger = get_logger(f"ResourceManager.{name}")

        # Statistics
        self._stats = {
            'total_registered': 0,
            'total_cleaned': 0,
            'total_errors': 0,
            'active_resources': 0,
        }

    def register(self,
                 resource: Any,
                 cleanup_func: Callable,
                 resource_type: ResourceType = ResourceType.OTHER,
                 name: Optional[str] = None) -> ManagedResource:
        """
        Register a resource that needs cleanup

        Args:
            resource: the resource object
            cleanup_func: cleanup function (sync or async)
            resource_type: resource type
            name: resource name (used in logs)

        Returns:
            The managed resource object
        """
        managed = ManagedResource(resource, cleanup_func, resource_type, name)
        self._resources.append(managed)
        self._stats['total_registered'] += 1
        self._stats['active_resources'] += 1

        self._logger.debug(f"Resource registered: {managed.name} ({resource_type.value})")
        return managed

    async def cleanup_all(self, reverse: bool = True) -> Dict[str, Any]:
        """
        Clean up all registered resources

        Args:
            reverse: clean up in reverse order (LIFO, recommended)

        Returns:
            Cleanup result statistics
        """
        async with self._lock:
            if not self._resources:
                self._logger.debug("No resources to cleanup")
                return self._get_cleanup_stats()

            self._logger.info(f"Starting cleanup of {len(self._resources)} resources...")

            # Reverse cleanup (resources created last are cleaned first)
            resources = reversed(self._resources) if reverse else self._resources

            cleanup_start = time.time()
            success_count = 0
            error_count = 0

            for managed in resources:
                try:
                    self._logger.debug(f"Cleaning up: {managed.name}")
                    await managed.cleanup()
                    success_count += 1
                    self._stats['total_cleaned'] += 1
                    self._stats['active_resources'] -= 1
                except Exception as e:
                    error_count += 1
                    self._stats['total_errors'] += 1
                    self._cleanup_errors.append((managed.name, e))
                    self._logger.error(
                        f"Failed to cleanup {managed.name}: {e}",
                        exc_info=True
                    )
                    # Keep cleaning the remaining resources; do not abort

            cleanup_duration = time.time() - cleanup_start

            # Clear the resource list
            self._resources.clear()

            result = {
                'success': success_count,
                'errors': error_count,
                'duration': cleanup_duration,
                'total_resources': success_count + error_count,
            }

            if error_count > 0:
                self._logger.warning(
                    f"Cleanup completed with errors: {success_count} success, "
                    f"{error_count} errors in {cleanup_duration:.2f}s"
                )
            else:
                self._logger.info(
                    f"Cleanup completed successfully: {success_count} resources "
                    f"in {cleanup_duration:.2f}s"
                )

            return result

    async def cleanup_by_type(self, resource_type: ResourceType) -> int:
        """
        Clean up resources by type

        Args:
            resource_type: resource type

        Returns:
            Number of resources cleaned
        """
        async with self._lock:
            to_cleanup = [r for r in self._resources if r.resource_type == resource_type]

            if not to_cleanup:
                return 0

            cleaned = 0
            for managed in reversed(to_cleanup):
                try:
                    await managed.cleanup()
                    self._resources.remove(managed)
                    cleaned += 1
                    self._stats['total_cleaned'] += 1
                    self._stats['active_resources'] -= 1
                except Exception as e:
                    self._logger.error(f"Failed to cleanup {managed.name}: {e}")
                    self._stats['total_errors'] += 1

            return cleaned

    def get_active_resources(self) -> List[ManagedResource]:
        """Get all active resources"""
        return [r for r in self._resources if r.status == ResourceStatus.ACTIVE]

    def get_resources_by_type(self, resource_type: ResourceType) -> List[ManagedResource]:
        """Get resources by type"""
        return [r for r in self._resources if r.resource_type == resource_type]

    def detect_leaks(self, max_lifetime: float = 3600) -> List[ManagedResource]:
        """
        Detect potential resource leaks

        Args:
            max_lifetime: maximum lifetime in seconds; an active resource older
                than this is treated as a potential leak

        Returns:
            List of potentially leaked resources
        """
        current_time = time.time()
        leaks = []

        for managed in self._resources:
            if managed.status == ResourceStatus.ACTIVE:
                lifetime = current_time - managed.created_at
                if lifetime > max_lifetime:
                    leaks.append(managed)
                    self._logger.warning(
                        f"Potential leak detected: {managed.name} "
                        f"(lifetime: {lifetime:.2f}s)"
                    )

        return leaks

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics"""
        return {
            **self._stats,
            'cleanup_errors': len(self._cleanup_errors),
            'active_by_type': self._get_active_by_type(),
        }

    def _get_active_by_type(self) -> Dict[str, int]:
        """Count active resources by type"""
        result = {}
        for managed in self._resources:
            if managed.status == ResourceStatus.ACTIVE:
                type_name = managed.resource_type.value
                result[type_name] = result.get(type_name, 0) + 1
        return result

    def _get_cleanup_stats(self) -> Dict[str, Any]:
        """Get empty cleanup statistics"""
        return {
            'success': 0,
            'errors': 0,
            'duration': 0.0,
            'total_resources': 0,
        }

    async def __aenter__(self):
        """Context manager entry"""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: clean up automatically"""
        await self.cleanup_all()
        return False


# Global resource manager registry
_global_managers: Dict[str, ResourceManager] = {}


def get_resource_manager(name: str = "default") -> ResourceManager:
    """
    Get a resource manager instance (singleton per name)

    Args:
        name: manager name

    Returns:
        The resource manager instance
    """
    if name not in _global_managers:
        _global_managers[name] = ResourceManager(name)
    return _global_managers[name]


async def cleanup_all_managers():
    """Clean up all resource managers"""
    logger = get_logger("ResourceManager")

    for name, manager in _global_managers.items():
        try:
            logger.info(f"Cleaning up resource manager: {name}")
            await manager.cleanup_all()
        except Exception as e:
            logger.error(f"Failed to cleanup manager {name}: {e}")

    _global_managers.clear()
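Taken together, the intended lifecycle is: register resources as they are created, optionally probe for leaks while running, then clean everything up in LIFO order on shutdown. A minimal usage sketch follows (the manager name "demo" and the aiohttp session are illustrative, not part of the diff; the ResourceManager API is as defined above):

import asyncio
import aiohttp

from crawlo.utils.resource_manager import ResourceType, get_resource_manager


async def main():
    manager = get_resource_manager("demo")

    # Register an HTTP session. session.close() is a plain call returning a
    # coroutine, which ManagedResource.cleanup() detects and awaits.
    session = aiohttp.ClientSession()
    manager.register(session, lambda s: s.close(),
                     ResourceType.SESSION, name="http_session")

    # Flag resources that have stayed active for more than an hour.
    leaks = manager.detect_leaks(max_lifetime=3600)
    print(manager.get_stats(), len(leaks))

    # LIFO cleanup of everything registered above.
    await manager.cleanup_all()


asyncio.run(main())

The same class also works as an async context manager (async with ResourceManager("demo") as rm: ...), since __aexit__ calls cleanup_all() on exit.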
crawlo/utils/selector_helper.py
CHANGED
@@ -1,138 +1,138 @@

(The removed and re-added sides of this hunk are line-for-line identical; the file was rewritten without content changes, so its content is shown once below.)

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Selector helper module
==================
Provides helper functions for working with parsel selectors, covering text and attribute extraction.

This module contains the following main functions:
- extract_text: extract and join text from a list of elements
- extract_texts: extract a list of texts from a list of elements
- extract_attr: extract a single element's attribute value from a list of elements
- extract_attrs: extract a list of attribute values from a list of elements
- is_xpath: determine whether a query string is XPath

All functions use a concise, intuitive naming style that is easy to remember and use.
"""

from typing import List, Any, Optional
from parsel import Selector, SelectorList


def extract_text(elements: SelectorList, join_str: str = " ") -> str:
    """
    Extract and join text from a list of elements

    :param elements: SelectorList of elements
    :param join_str: separator used to join the text
    :return: the joined text

    Example:
        title_elements = selector.css('title')
        title_text = extract_text(title_elements)
    """
    texts = []
    for element in elements:
        # Collect all text nodes of the element
        if hasattr(element, 'xpath'):
            element_texts = element.xpath('.//text()').getall()
        else:
            element_texts = [str(element)]
        # Strip and keep non-empty text
        for text in element_texts:
            cleaned = text.strip()
            if cleaned:
                texts.append(cleaned)
    return join_str.join(texts)


def extract_texts(elements: SelectorList, join_str: str = " ") -> List[str]:
    """
    Extract a list of texts from a list of elements

    :param elements: SelectorList of elements
    :param join_str: separator used to join text within a single node
    :return: list of plain texts (one entry per node)

    Example:
        li_elements = selector.css('.list li')
        li_texts = extract_texts(li_elements)
    """
    result = []
    for element in elements:
        # Extract the text of each element
        if hasattr(element, 'xpath'):
            texts = element.xpath('.//text()').getall()
        else:
            texts = [str(element)]

        # Strip the texts and join them
        clean_texts = [text.strip() for text in texts if text.strip()]
        if clean_texts:
            result.append(join_str.join(clean_texts))

    return result


def extract_attr(elements: SelectorList, attr_name: str, default: Any = None) -> Any:
    """
    Extract a single element's attribute value from a list of elements

    :param elements: SelectorList of elements
    :param attr_name: attribute name
    :param default: default return value
    :return: the attribute value, or the default

    Example:
        link_elements = selector.css('.link')
        link_href = extract_attr(link_elements, 'href')
    """
    # Use parsel's attrib property to get the first matching element's attribute
    if hasattr(elements, 'attrib'):
        return elements.attrib.get(attr_name, default)
    # If elements is a SelectorList, take the first element's attribute
    elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
        return elements[0].attrib.get(attr_name, default)
    return default


def extract_attrs(elements: SelectorList, attr_name: str) -> List[Any]:
    """
    Extract a list of attribute values from a list of elements

    :param elements: SelectorList of elements
    :param attr_name: attribute name
    :return: list of attribute values

    Example:
        all_links = selector.css('a')
        all_hrefs = extract_attrs(all_links, 'href')
    """
    result = []
    for element in elements:
        # Use parsel's attrib property to get the element's attribute value
        if hasattr(element, 'attrib'):
            attr_value = element.attrib.get(attr_name)
            if attr_value is not None:
                result.append(attr_value)

    return result


def is_xpath(query: str) -> bool:
    """
    Determine whether a query string is XPath

    :param query: the query string
    :return: whether it is XPath
    """
    return query.startswith(('/', '//', './'))


__all__ = [
    "extract_text",
    "extract_texts",
    "extract_attr",
    "extract_attrs",
    "is_xpath"
]
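For reference, a short sketch of how these helpers compose with parsel (the HTML snippet is invented for illustration):

from parsel import Selector

from crawlo.utils.selector_helper import extract_text, extract_attrs, is_xpath

html = '<ul class="list"><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>'
sel = Selector(text=html)

print(extract_text(sel.css('.list li')))          # "First Second"
print(extract_attrs(sel.css('a'), 'href'))        # ['/a', '/b']
print(is_xpath('//ul/li'), is_xpath('.list li'))  # True False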
@@ -0,0 +1,70 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
单例模式工具模块
|
|
5
|
+
================
|
|
6
|
+
|
|
7
|
+
提供同步和异步两种单例实现方式,适用于不同的使用场景。
|
|
8
|
+
|
|
9
|
+
使用场景:
|
|
10
|
+
1. 同步单例:用于框架初始化、配置管理等同步代码
|
|
11
|
+
2. 异步单例:用于数据库连接池、网络资源等异步代码
|
|
12
|
+
|
|
13
|
+
示例:
|
|
14
|
+
# 同步单例
|
|
15
|
+
@singleton
|
|
16
|
+
class CoreInitializer:
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
# 异步单例(在连接池管理器中使用)
|
|
20
|
+
class MySQLConnectionPoolManager:
|
|
21
|
+
_instances: Dict[str, 'MySQLConnectionPoolManager'] = {}
|
|
22
|
+
_lock = asyncio.Lock()
|
|
23
|
+
|
|
24
|
+
@classmethod
|
|
25
|
+
async def get_pool(cls, ...):
|
|
26
|
+
async with cls._lock:
|
|
27
|
+
if pool_key not in cls._instances:
|
|
28
|
+
cls._instances[pool_key] = cls(pool_key)
|
|
29
|
+
return cls._instances[pool_key].pool
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
import threading
|
|
33
|
+
from typing import Any, Dict, Type
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class SingletonMeta(type):
|
|
37
|
+
"""单例元类"""
|
|
38
|
+
_instances: Dict[Type, Any] = {}
|
|
39
|
+
_lock = threading.Lock()
|
|
40
|
+
|
|
41
|
+
def __call__(cls, *args, **kwargs):
|
|
42
|
+
if cls not in cls._instances:
|
|
43
|
+
with cls._lock:
|
|
44
|
+
if cls not in cls._instances:
|
|
45
|
+
instance = super().__call__(*args, **kwargs)
|
|
46
|
+
cls._instances[cls] = instance
|
|
47
|
+
return cls._instances[cls]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def singleton(cls):
|
|
51
|
+
"""
|
|
52
|
+
单例装饰器
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
cls: 要装饰的类
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
装饰后的类,确保只有一个实例
|
|
59
|
+
"""
|
|
60
|
+
instances = {}
|
|
61
|
+
lock = threading.Lock()
|
|
62
|
+
|
|
63
|
+
def get_instance(*args, **kwargs):
|
|
64
|
+
if cls not in instances:
|
|
65
|
+
with lock:
|
|
66
|
+
if cls not in instances:
|
|
67
|
+
instances[cls] = cls(*args, **kwargs)
|
|
68
|
+
return instances[cls]
|
|
69
|
+
|
|
70
|
+
return get_instance
|
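A small sketch showing both synchronous variants in use (Config and Registry are illustrative names, not classes from the package):

from crawlo.utils.singleton import SingletonMeta, singleton


# Decorator form: repeated calls return the same object.
@singleton
class Config:
    def __init__(self):
        self.values = {}


# Metaclass form: double-checked locking inside SingletonMeta.__call__.
class Registry(metaclass=SingletonMeta):
    def __init__(self):
        self.items = []


assert Config() is Config()
assert Registry() is Registry()

One trade-off worth noting: the decorator rebinds the class name to a factory function, so isinstance checks against Config no longer work after decoration, while the metaclass form keeps Registry a real class.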