crawlo 1.4.5__py3-none-any.whl → 1.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (375) hide show
  1. crawlo/__init__.py +90 -89
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -341
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -438
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -291
  19. crawlo/crawler.py +698 -657
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -276
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -245
  25. crawlo/downloader/httpx_downloader.py +265 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -402
  28. crawlo/downloader/selenium_downloader.py +486 -472
  29. crawlo/event.py +45 -11
  30. crawlo/exceptions.py +215 -82
  31. crawlo/extension/__init__.py +65 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +135 -0
  44. crawlo/filters/__init__.py +170 -153
  45. crawlo/filters/aioredis_filter.py +348 -264
  46. crawlo/filters/memory_filter.py +261 -276
  47. crawlo/framework.py +306 -292
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -434
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -194
  52. crawlo/initialization/phases.py +230 -149
  53. crawlo/initialization/registry.py +143 -145
  54. crawlo/initialization/utils.py +49 -0
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -46
  61. crawlo/logging/config.py +277 -197
  62. crawlo/logging/factory.py +175 -171
  63. crawlo/logging/manager.py +104 -112
  64. crawlo/middleware/__init__.py +87 -24
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -386
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -253
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +375 -379
  77. crawlo/network/response.py +569 -664
  78. crawlo/pipelines/__init__.py +53 -22
  79. crawlo/pipelines/base_pipeline.py +452 -0
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -132
  87. crawlo/pipelines/mysql_pipeline.py +470 -326
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +10 -0
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -525
  94. crawlo/queue/redis_priority_queue.py +519 -370
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +285 -270
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +657 -657
  99. crawlo/stats_collector.py +82 -73
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +2 -4
  104. crawlo/templates/project/items.py.tmpl +13 -17
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -36
  107. crawlo/templates/project/settings.py.tmpl +110 -157
  108. crawlo/templates/project/settings_distributed.py.tmpl +156 -161
  109. crawlo/templates/project/settings_gentle.py.tmpl +170 -171
  110. crawlo/templates/project/settings_high_performance.py.tmpl +171 -172
  111. crawlo/templates/project/settings_minimal.py.tmpl +99 -77
  112. crawlo/templates/project/settings_simple.py.tmpl +168 -169
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -30
  115. crawlo/templates/spider/spider.py.tmpl +33 -144
  116. crawlo/templates/spiders_init.py.tmpl +5 -10
  117. crawlo/tools/__init__.py +86 -189
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +50 -50
  123. crawlo/utils/batch_processor.py +276 -259
  124. crawlo/utils/config_manager.py +442 -0
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -244
  127. crawlo/utils/error_handler.py +410 -410
  128. crawlo/utils/fingerprint.py +121 -121
  129. crawlo/utils/func_tools.py +82 -82
  130. crawlo/utils/large_scale_helper.py +344 -344
  131. crawlo/utils/leak_detector.py +335 -0
  132. crawlo/utils/log.py +79 -79
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -0
  135. crawlo/utils/mysql_connection_pool.py +197 -0
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +91 -0
  139. crawlo/utils/redis_connection_pool.py +578 -388
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -256
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -0
  144. crawlo/utils/selector_helper.py +137 -137
  145. crawlo/utils/singleton.py +70 -0
  146. crawlo/utils/spider_loader.py +201 -201
  147. crawlo/utils/text_helper.py +94 -94
  148. crawlo/utils/{url.py → url_utils.py} +39 -39
  149. crawlo-1.4.7.dist-info/METADATA +689 -0
  150. crawlo-1.4.7.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -275
  154. tests/authenticated_proxy_example.py +110 -106
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -0
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +77 -0
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -0
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/scrapy.cfg +11 -11
  192. tests/optimized_performance_test.py +211 -211
  193. tests/performance_comparison.py +244 -244
  194. tests/queue_blocking_test.py +113 -113
  195. tests/queue_test.py +89 -89
  196. tests/redis_key_validation_demo.py +130 -130
  197. tests/request_params_example.py +150 -150
  198. tests/response_improvements_example.py +144 -144
  199. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  200. tests/scrapy_comparison/scrapy_test.py +133 -133
  201. tests/simple_cli_test.py +55 -0
  202. tests/simple_command_test.py +119 -119
  203. tests/simple_crawlo_test.py +126 -126
  204. tests/simple_follow_test.py +38 -38
  205. tests/simple_log_test2.py +137 -137
  206. tests/simple_optimization_test.py +128 -128
  207. tests/simple_queue_type_test.py +41 -41
  208. tests/simple_response_selector_test.py +94 -94
  209. tests/simple_selector_helper_test.py +154 -154
  210. tests/simple_selector_test.py +207 -207
  211. tests/simple_spider_test.py +49 -49
  212. tests/simple_url_test.py +73 -73
  213. tests/simulate_mysql_update_test.py +140 -0
  214. tests/spider_log_timing_test.py +177 -177
  215. tests/test_advanced_tools.py +148 -148
  216. tests/test_all_commands.py +230 -230
  217. tests/test_all_pipeline_fingerprints.py +133 -133
  218. tests/test_all_redis_key_configs.py +145 -145
  219. tests/test_asyncmy_usage.py +57 -0
  220. tests/test_batch_processor.py +178 -178
  221. tests/test_cleaners.py +54 -54
  222. tests/test_cli_arguments.py +119 -0
  223. tests/test_component_factory.py +174 -174
  224. tests/test_config_consistency.py +80 -80
  225. tests/test_config_merge.py +152 -152
  226. tests/test_config_validator.py +182 -182
  227. tests/test_controlled_spider_mixin.py +79 -79
  228. tests/test_crawler_process_import.py +38 -38
  229. tests/test_crawler_process_spider_modules.py +47 -47
  230. tests/test_crawlo_proxy_integration.py +114 -108
  231. tests/test_date_tools.py +123 -123
  232. tests/test_dedup_fix.py +220 -220
  233. tests/test_dedup_pipeline_consistency.py +124 -124
  234. tests/test_default_header_middleware.py +313 -313
  235. tests/test_distributed.py +65 -65
  236. tests/test_double_crawlo_fix.py +204 -204
  237. tests/test_double_crawlo_fix_simple.py +124 -124
  238. tests/test_download_delay_middleware.py +221 -221
  239. tests/test_downloader_proxy_compatibility.py +272 -268
  240. tests/test_edge_cases.py +305 -305
  241. tests/test_encoding_core.py +56 -56
  242. tests/test_encoding_detection.py +126 -126
  243. tests/test_enhanced_error_handler.py +270 -270
  244. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  245. tests/test_error_handler_compatibility.py +112 -112
  246. tests/test_factories.py +252 -252
  247. tests/test_factory_compatibility.py +196 -196
  248. tests/test_final_validation.py +153 -153
  249. tests/test_fingerprint_consistency.py +135 -135
  250. tests/test_fingerprint_simple.py +51 -51
  251. tests/test_get_component_logger.py +83 -83
  252. tests/test_hash_performance.py +99 -99
  253. tests/test_integration.py +169 -169
  254. tests/test_item_dedup_redis_key.py +122 -122
  255. tests/test_large_scale_helper.py +235 -235
  256. tests/test_logging_enhancements.py +374 -374
  257. tests/test_logging_final.py +184 -184
  258. tests/test_logging_integration.py +312 -312
  259. tests/test_logging_system.py +282 -282
  260. tests/test_middleware_debug.py +141 -141
  261. tests/test_mode_consistency.py +51 -51
  262. tests/test_multi_directory.py +67 -67
  263. tests/test_multiple_spider_modules.py +80 -80
  264. tests/test_mysql_pipeline_config.py +165 -0
  265. tests/test_mysql_pipeline_error.py +99 -0
  266. tests/test_mysql_pipeline_init_log.py +83 -0
  267. tests/test_mysql_pipeline_integration.py +133 -0
  268. tests/test_mysql_pipeline_refactor.py +144 -0
  269. tests/test_mysql_pipeline_refactor_simple.py +86 -0
  270. tests/test_mysql_pipeline_robustness.py +196 -0
  271. tests/test_mysql_pipeline_types.py +89 -0
  272. tests/test_mysql_update_columns.py +94 -0
  273. tests/test_offsite_middleware.py +244 -244
  274. tests/test_offsite_middleware_simple.py +203 -203
  275. tests/test_optimized_selector_naming.py +100 -100
  276. tests/test_parsel.py +29 -29
  277. tests/test_performance.py +327 -327
  278. tests/test_performance_monitor.py +115 -115
  279. tests/test_pipeline_fingerprint_consistency.py +86 -86
  280. tests/test_priority_behavior.py +211 -211
  281. tests/test_priority_consistency.py +151 -151
  282. tests/test_priority_consistency_fixed.py +249 -249
  283. tests/test_proxy_health_check.py +32 -32
  284. tests/test_proxy_middleware.py +217 -121
  285. tests/test_proxy_middleware_enhanced.py +212 -216
  286. tests/test_proxy_middleware_integration.py +142 -137
  287. tests/test_proxy_middleware_refactored.py +207 -184
  288. tests/test_proxy_only.py +84 -0
  289. tests/test_proxy_providers.py +56 -56
  290. tests/test_proxy_stats.py +19 -19
  291. tests/test_proxy_strategies.py +59 -59
  292. tests/test_proxy_with_downloader.py +153 -0
  293. tests/test_queue_empty_check.py +41 -41
  294. tests/test_queue_manager_double_crawlo.py +173 -173
  295. tests/test_queue_manager_redis_key.py +179 -179
  296. tests/test_queue_naming.py +154 -154
  297. tests/test_queue_type.py +106 -106
  298. tests/test_queue_type_redis_config_consistency.py +130 -130
  299. tests/test_random_headers_default.py +322 -322
  300. tests/test_random_headers_necessity.py +308 -308
  301. tests/test_random_user_agent.py +72 -72
  302. tests/test_redis_config.py +28 -28
  303. tests/test_redis_connection_pool.py +294 -294
  304. tests/test_redis_key_naming.py +181 -181
  305. tests/test_redis_key_validator.py +123 -123
  306. tests/test_redis_queue.py +224 -224
  307. tests/test_redis_queue_name_fix.py +175 -175
  308. tests/test_redis_queue_type_fallback.py +129 -129
  309. tests/test_request_ignore_middleware.py +182 -182
  310. tests/test_request_params.py +111 -111
  311. tests/test_request_serialization.py +70 -70
  312. tests/test_response_code_middleware.py +349 -349
  313. tests/test_response_filter_middleware.py +427 -427
  314. tests/test_response_follow.py +104 -104
  315. tests/test_response_improvements.py +152 -152
  316. tests/test_response_selector_methods.py +92 -92
  317. tests/test_response_url_methods.py +70 -70
  318. tests/test_response_urljoin.py +86 -86
  319. tests/test_retry_middleware.py +333 -333
  320. tests/test_retry_middleware_realistic.py +273 -273
  321. tests/test_scheduler.py +252 -252
  322. tests/test_scheduler_config_update.py +133 -133
  323. tests/test_scrapy_style_encoding.py +112 -112
  324. tests/test_selector_helper.py +100 -100
  325. tests/test_selector_optimizations.py +146 -146
  326. tests/test_simple_response.py +61 -61
  327. tests/test_spider_loader.py +49 -49
  328. tests/test_spider_loader_comprehensive.py +69 -69
  329. tests/test_spider_modules.py +84 -84
  330. tests/test_spiders/test_spider.py +9 -9
  331. tests/test_telecom_spider_redis_key.py +205 -205
  332. tests/test_template_content.py +87 -87
  333. tests/test_template_redis_key.py +134 -134
  334. tests/test_tools.py +159 -159
  335. tests/test_user_agent_randomness.py +176 -176
  336. tests/test_user_agents.py +96 -96
  337. tests/untested_features_report.md +138 -138
  338. tests/verify_debug.py +51 -51
  339. tests/verify_distributed.py +117 -117
  340. tests/verify_log_fix.py +111 -111
  341. tests/verify_mysql_warnings.py +110 -0
  342. crawlo/logging/async_handler.py +0 -181
  343. crawlo/logging/monitor.py +0 -153
  344. crawlo/logging/sampler.py +0 -167
  345. crawlo/middleware/simple_proxy.py +0 -65
  346. crawlo/tools/authenticated_proxy.py +0 -241
  347. crawlo/tools/data_formatter.py +0 -226
  348. crawlo/tools/data_validator.py +0 -181
  349. crawlo/tools/encoding_converter.py +0 -127
  350. crawlo/tools/network_diagnostic.py +0 -365
  351. crawlo/tools/request_tools.py +0 -83
  352. crawlo/tools/retry_mechanism.py +0 -224
  353. crawlo/utils/env_config.py +0 -143
  354. crawlo/utils/large_scale_config.py +0 -287
  355. crawlo/utils/system.py +0 -11
  356. crawlo/utils/tools.py +0 -5
  357. crawlo-1.4.5.dist-info/METADATA +0 -329
  358. crawlo-1.4.5.dist-info/RECORD +0 -347
  359. tests/env_config_example.py +0 -134
  360. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  361. tests/test_authenticated_proxy.py +0 -142
  362. tests/test_comprehensive.py +0 -147
  363. tests/test_dynamic_downloaders_proxy.py +0 -125
  364. tests/test_dynamic_proxy.py +0 -93
  365. tests/test_dynamic_proxy_config.py +0 -147
  366. tests/test_dynamic_proxy_real.py +0 -110
  367. tests/test_env_config.py +0 -122
  368. tests/test_framework_env_usage.py +0 -104
  369. tests/test_large_scale_config.py +0 -113
  370. tests/test_proxy_api.py +0 -265
  371. tests/test_real_scenario_proxy.py +0 -196
  372. tests/tools_example.py +0 -261
  373. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
  374. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
  375. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,337 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 资源管理器 - 统一管理所有可清理资源
5
+ ========================================
6
+
7
+ 功能特性:
8
+ - 统一注册和清理资源
9
+ - 支持异步资源清理
10
+ - 资源泄露检测
11
+ - 清理顺序保证(LIFO)
12
+ """
13
+ import asyncio
14
+ import time
15
+ import traceback
16
+ from typing import Any, Callable, List, Tuple, Optional, Dict
17
+ from enum import Enum
18
+
19
+ from crawlo.logging import get_logger
20
+
21
+
22
class ResourceType(Enum):
    """Categories of cleanable resources tracked by the resource manager.

    The string values are stable identifiers used in default resource names
    and in per-type statistics, so they must not be renamed casually.
    """
    DOWNLOADER = "downloader"
    REDIS_POOL = "redis_pool"
    QUEUE = "queue"
    FILTER = "filter"
    PIPELINE = "pipeline"
    MIDDLEWARE = "middleware"
    EXTENSION = "extension"
    SESSION = "session"
    BROWSER = "browser"
    OTHER = "other"
34
+
35
+
36
class ResourceStatus(Enum):
    """Lifecycle states of a managed resource.

    ACTIVE -> CLOSING -> CLOSED is the normal path; a cleanup failure
    leaves the resource in ERROR.
    """
    ACTIVE = "active"
    CLOSING = "closing"
    CLOSED = "closed"
    ERROR = "error"
42
+
43
+
44
class ManagedResource:
    """Pairs a resource object with the callable that releases it.

    Attributes:
        resource: the wrapped resource object.
        cleanup_func: sync or async callable invoked as ``cleanup_func(resource)``.
        resource_type: category used for naming and per-type statistics.
        name: human-readable identifier for logs; defaults to
            ``"<type>_<id(resource)>"``.
        status: current ResourceStatus.
        created_at / closed_at: epoch timestamps (closed_at is None until
            cleanup succeeds).
    """

    def __init__(self,
                 resource: Any,
                 cleanup_func: Callable,
                 resource_type: ResourceType = ResourceType.OTHER,
                 name: Optional[str] = None):
        self.resource = resource
        self.cleanup_func = cleanup_func
        self.resource_type = resource_type
        self.name = name or f"{resource_type.value}_{id(resource)}"
        self.status = ResourceStatus.ACTIVE
        self.created_at = time.time()
        self.closed_at: Optional[float] = None

    async def cleanup(self) -> bool:
        """Release the underlying resource.

        Idempotent: returns True immediately if already CLOSED. On failure
        the status becomes ERROR and the exception propagates to the caller.

        Returns:
            True when the resource is (now or already) closed.

        Raises:
            Whatever ``cleanup_func`` raises.
        """
        if self.status == ResourceStatus.CLOSED:
            return True

        self.status = ResourceStatus.CLOSING
        try:
            if asyncio.iscoroutinefunction(self.cleanup_func):
                await self.cleanup_func(self.resource)
            else:
                # Synchronous callable — but it may still hand back a
                # coroutine (e.g. a plain function returning obj.close()),
                # in which case we await it.
                result = self.cleanup_func(self.resource)
                if asyncio.iscoroutine(result):
                    await result

            self.status = ResourceStatus.CLOSED
            self.closed_at = time.time()
            return True
        except Exception:
            self.status = ResourceStatus.ERROR
            # Bare raise preserves the original traceback (fix: was `raise e`).
            raise

    def get_lifetime(self) -> float:
        """Return the resource's lifetime in seconds.

        Uses ``closed_at`` when the resource has been closed, otherwise the
        current time (i.e. lifetime-so-far for active resources).
        """
        end_time = self.closed_at or time.time()
        return end_time - self.created_at
88
+
89
+
90
class ResourceManager:
    """Central registry that tracks and releases cleanable resources.

    Features:
        1. Automatic tracking of registered resources.
        2. Guaranteed cleanup order (LIFO — last registered, first cleaned).
        3. Fault-tolerant cleanup (one failure does not stop the rest).
        4. Resource-leak detection.
        5. Statistics and monitoring.

    NOTE(review): ``register`` mutates the resource list without taking
    ``_lock`` (it is synchronous); this is safe only if registration happens
    from a single event loop / thread — confirm against callers.
    """

    def __init__(self, name: str = "default"):
        self.name = name
        self._resources: List[ManagedResource] = []
        # Serializes cleanup passes so cleanup_all/cleanup_by_type don't race.
        self._lock = asyncio.Lock()
        self._cleanup_errors: List[Tuple[str, Exception]] = []
        self._logger = get_logger(f"ResourceManager.{name}")

        # Running counters exposed through get_stats().
        self._stats = {
            'total_registered': 0,
            'total_cleaned': 0,
            'total_errors': 0,
            'active_resources': 0,
        }

    def register(self,
                 resource: Any,
                 cleanup_func: Callable,
                 resource_type: ResourceType = ResourceType.OTHER,
                 name: Optional[str] = None) -> ManagedResource:
        """Register a resource that must be cleaned up later.

        Args:
            resource: the resource object.
            cleanup_func: sync or async callable invoked with the resource.
            resource_type: category for naming/statistics.
            name: optional human-readable name for logs.

        Returns:
            The ManagedResource wrapper that tracks the registration.
        """
        managed = ManagedResource(resource, cleanup_func, resource_type, name)
        self._resources.append(managed)
        self._stats['total_registered'] += 1
        self._stats['active_resources'] += 1

        self._logger.debug(f"Resource registered: {managed.name} ({resource_type.value})")
        return managed

    async def cleanup_all(self, reverse: bool = True) -> Dict[str, Any]:
        """Clean up every registered resource.

        Args:
            reverse: clean in LIFO order when True (recommended, default).

        Returns:
            Dict with ``success``, ``errors``, ``duration`` and
            ``total_resources`` counts for this pass.
        """
        async with self._lock:
            if not self._resources:
                self._logger.debug("No resources to cleanup")
                return self._get_cleanup_stats()

            self._logger.info(f"Starting cleanup of {len(self._resources)} resources...")

            ordered = reversed(self._resources) if reverse else self._resources

            cleanup_start = time.time()
            success_count = 0
            error_count = 0

            for managed in ordered:
                try:
                    self._logger.debug(f"Cleaning up: {managed.name}")
                    await managed.cleanup()
                    success_count += 1
                    self._stats['total_cleaned'] += 1
                except Exception as exc:
                    error_count += 1
                    self._stats['total_errors'] += 1
                    self._cleanup_errors.append((managed.name, exc))
                    self._logger.error(
                        f"Failed to cleanup {managed.name}: {exc}",
                        exc_info=True
                    )
                    # Keep going — one failure must not block the rest.
                # BUG FIX: every resource leaves the active set after this
                # pass (the list is cleared below), so the active counter is
                # decremented for failures too; previously errored resources
                # stayed counted as active forever.
                self._stats['active_resources'] -= 1

            cleanup_duration = time.time() - cleanup_start

            # Drop all references — successful and failed alike.
            self._resources.clear()

            result = {
                'success': success_count,
                'errors': error_count,
                'duration': cleanup_duration,
                'total_resources': success_count + error_count,
            }

            if error_count > 0:
                self._logger.warning(
                    f"Cleanup completed with errors: {success_count} success, "
                    f"{error_count} errors in {cleanup_duration:.2f}s"
                )
            else:
                self._logger.info(
                    f"Cleanup completed successfully: {success_count} resources "
                    f"in {cleanup_duration:.2f}s"
                )

            return result

    async def cleanup_by_type(self, resource_type: ResourceType) -> int:
        """Clean up only the resources of the given type (LIFO order).

        Resources whose cleanup fails remain registered (and counted active).

        Args:
            resource_type: the category to clean.

        Returns:
            Number of resources successfully cleaned.
        """
        async with self._lock:
            to_cleanup = [r for r in self._resources if r.resource_type == resource_type]

            if not to_cleanup:
                return 0

            cleaned = 0
            for managed in reversed(to_cleanup):
                try:
                    await managed.cleanup()
                    self._resources.remove(managed)
                    cleaned += 1
                    self._stats['total_cleaned'] += 1
                    self._stats['active_resources'] -= 1
                except Exception as exc:
                    self._logger.error(f"Failed to cleanup {managed.name}: {exc}")
                    self._stats['total_errors'] += 1

            return cleaned

    def get_active_resources(self) -> List[ManagedResource]:
        """Return all resources currently in the ACTIVE state."""
        return [r for r in self._resources if r.status == ResourceStatus.ACTIVE]

    def get_resources_by_type(self, resource_type: ResourceType) -> List[ManagedResource]:
        """Return all registered resources of the given type."""
        return [r for r in self._resources if r.resource_type == resource_type]

    def detect_leaks(self, max_lifetime: float = 3600) -> List[ManagedResource]:
        """Report resources that look leaked.

        Args:
            max_lifetime: seconds an ACTIVE resource may live before it is
                considered a potential leak.

        Returns:
            The suspected-leak resources (each is also logged as a warning).
        """
        current_time = time.time()
        leaks = []

        for managed in self._resources:
            if managed.status == ResourceStatus.ACTIVE:
                lifetime = current_time - managed.created_at
                if lifetime > max_lifetime:
                    leaks.append(managed)
                    self._logger.warning(
                        f"Potential leak detected: {managed.name} "
                        f"(lifetime: {lifetime:.2f}s)"
                    )

        return leaks

    def get_stats(self) -> Dict[str, Any]:
        """Return counters plus error count and a per-type active breakdown."""
        return {
            **self._stats,
            'cleanup_errors': len(self._cleanup_errors),
            'active_by_type': self._get_active_by_type(),
        }

    def _get_active_by_type(self) -> Dict[str, int]:
        """Count ACTIVE resources grouped by resource-type value."""
        result = {}
        for managed in self._resources:
            if managed.status == ResourceStatus.ACTIVE:
                type_name = managed.resource_type.value
                result[type_name] = result.get(type_name, 0) + 1
        return result

    def _get_cleanup_stats(self) -> Dict[str, Any]:
        """Empty cleanup result, returned when there is nothing to clean."""
        return {
            'success': 0,
            'errors': 0,
            'duration': 0.0,
            'total_resources': 0,
        }

    async def __aenter__(self):
        """Enter as an async context manager."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Exit the context: clean up everything; never swallow exceptions."""
        await self.cleanup_all()
        return False
305
+
306
+
307
# Registry of named ResourceManager singletons.
_global_managers: Dict[str, ResourceManager] = {}


def get_resource_manager(name: str = "default") -> ResourceManager:
    """Return the ResourceManager registered under *name* (created on first use).

    Args:
        name: manager name; each distinct name gets its own singleton.

    Returns:
        The shared ResourceManager instance for that name.

    NOTE(review): lookup-then-insert is not thread-safe; fine if managers are
    created during single-threaded startup — confirm against callers.
    """
    if name not in _global_managers:
        _global_managers[name] = ResourceManager(name)
    return _global_managers[name]


async def cleanup_all_managers():
    """Run ``cleanup_all()`` on every registered manager, then empty the registry.

    A failing manager is logged and does not stop the remaining ones.
    """
    logger = get_logger("ResourceManager")

    # Iterate over a snapshot so cleanup callbacks may register or remove
    # managers without breaking this loop (fix: was iterating the live dict).
    for name, manager in list(_global_managers.items()):
        try:
            logger.info(f"Cleaning up resource manager: {name}")
            await manager.cleanup_all()
        except Exception as e:
            logger.error(f"Failed to cleanup manager {name}: {e}")

    _global_managers.clear()
@@ -1,138 +1,138 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- 选择器辅助工具模块
5
- ==================
6
- 提供用于处理parsel选择器的辅助函数,用于提取文本和属性等操作。
7
-
8
- 该模块包含以下主要函数:
9
- - extract_text: 从元素列表中提取文本并拼接
10
- - extract_texts: 从元素列表中提取多个文本列表
11
- - extract_attr: 从元素列表中提取单个元素的属性值
12
- - extract_attrs: 从元素列表中提取多个元素的属性值列表
13
- - is_xpath: 判断查询语句是否为XPath
14
-
15
- 所有方法都采用了简洁直观的命名风格,便于记忆和使用。
16
- """
17
-
18
- from typing import List, Any, Optional
19
- from parsel import Selector, SelectorList
20
-
21
-
22
- def extract_text(elements: SelectorList, join_str: str = " ") -> str:
23
- """
24
- 从元素列表中提取文本并拼接
25
-
26
- :param elements: SelectorList元素列表
27
- :param join_str: 文本拼接分隔符
28
- :return: 拼接后的文本
29
-
30
- 示例:
31
- title_elements = selector.css('title')
32
- title_text = extract_text(title_elements)
33
- """
34
- texts = []
35
- for element in elements:
36
- # 获取元素的所有文本节点
37
- if hasattr(element, 'xpath'):
38
- element_texts = element.xpath('.//text()').getall()
39
- else:
40
- element_texts = [str(element)]
41
- # 清理并添加非空文本
42
- for text in element_texts:
43
- cleaned = text.strip()
44
- if cleaned:
45
- texts.append(cleaned)
46
- return join_str.join(texts)
47
-
48
-
49
- def extract_texts(elements: SelectorList, join_str: str = " ") -> List[str]:
50
- """
51
- 从元素列表中提取多个文本列表
52
-
53
- :param elements: SelectorList元素列表
54
- :param join_str: 单个节点内文本拼接分隔符
55
- :return: 纯文本列表(每个元素对应一个节点的文本)
56
-
57
- 示例:
58
- li_elements = selector.css('.list li')
59
- li_texts = extract_texts(li_elements)
60
- """
61
- result = []
62
- for element in elements:
63
- # 对每个元素提取文本
64
- if hasattr(element, 'xpath'):
65
- texts = element.xpath('.//text()').getall()
66
- else:
67
- texts = [str(element)]
68
-
69
- # 清理文本并拼接
70
- clean_texts = [text.strip() for text in texts if text.strip()]
71
- if clean_texts:
72
- result.append(join_str.join(clean_texts))
73
-
74
- return result
75
-
76
-
77
- def extract_attr(elements: SelectorList, attr_name: str, default: Any = None) -> Any:
78
- """
79
- 从元素列表中提取单个元素的属性值
80
-
81
- :param elements: SelectorList元素列表
82
- :param attr_name: 属性名称
83
- :param default: 默认返回值
84
- :return: 属性值或默认值
85
-
86
- 示例:
87
- link_elements = selector.css('.link')
88
- link_href = extract_attr(link_elements, 'href')
89
- """
90
- # 使用parsel的attrib属性获取第一个匹配元素的属性值
91
- if hasattr(elements, 'attrib'):
92
- return elements.attrib.get(attr_name, default)
93
- # 如果elements是SelectorList,获取第一个元素的属性
94
- elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
95
- return elements[0].attrib.get(attr_name, default)
96
- return default
97
-
98
-
99
- def extract_attrs(elements: SelectorList, attr_name: str) -> List[Any]:
100
- """
101
- 从元素列表中提取多个元素的属性值列表
102
-
103
- :param elements: SelectorList元素列表
104
- :param attr_name: 属性名称
105
- :return: 属性值列表
106
-
107
- 示例:
108
- all_links = selector.css('a')
109
- all_hrefs = extract_attrs(all_links, 'href')
110
- """
111
- result = []
112
- for element in elements:
113
- # 使用parsel的attrib属性获取元素的属性值
114
- if hasattr(element, 'attrib'):
115
- attr_value = element.attrib.get(attr_name)
116
- if attr_value is not None:
117
- result.append(attr_value)
118
-
119
- return result
120
-
121
-
122
- def is_xpath(query: str) -> bool:
123
- """
124
- 判断查询语句是否为XPath
125
-
126
- :param query: 查询语句
127
- :return: 是否为XPath
128
- """
129
- return query.startswith(('/', '//', './'))
130
-
131
-
132
- __all__ = [
133
- "extract_text",
134
- "extract_texts",
135
- "extract_attr",
136
- "extract_attrs",
137
- "is_xpath"
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ 选择器辅助工具模块
5
+ ==================
6
+ 提供用于处理parsel选择器的辅助函数,用于提取文本和属性等操作。
7
+
8
+ 该模块包含以下主要函数:
9
+ - extract_text: 从元素列表中提取文本并拼接
10
+ - extract_texts: 从元素列表中提取多个文本列表
11
+ - extract_attr: 从元素列表中提取单个元素的属性值
12
+ - extract_attrs: 从元素列表中提取多个元素的属性值列表
13
+ - is_xpath: 判断查询语句是否为XPath
14
+
15
+ 所有方法都采用了简洁直观的命名风格,便于记忆和使用。
16
+ """
17
+
18
+ from typing import List, Any, Optional
19
+ from parsel import Selector, SelectorList
20
+
21
+
22
def extract_text(elements: "SelectorList", join_str: str = " ") -> str:
    """Extract and concatenate the text content of every node in *elements*.

    Each element that supports ``.xpath`` contributes all of its descendant
    text nodes; anything else contributes ``str(element)``. Fragments are
    stripped and empty ones dropped before joining.

    :param elements: parsel SelectorList (any iterable of selector-like items)
    :param join_str: separator placed between text fragments
    :return: the concatenated text

    Example:
        title_elements = selector.css('title')
        title_text = extract_text(title_elements)
    """
    texts = []
    for element in elements:
        # Selector-like objects expose .xpath; fall back to str() otherwise.
        if hasattr(element, 'xpath'):
            fragments = element.xpath('.//text()').getall()
        else:
            fragments = [str(element)]
        for fragment in fragments:
            cleaned = fragment.strip()
            if cleaned:
                texts.append(cleaned)
    return join_str.join(texts)
47
+
48
+
49
def extract_texts(elements: "SelectorList", join_str: str = " ") -> List[str]:
    """Extract one joined text string per node in *elements*.

    Unlike :func:`extract_text`, which flattens everything into a single
    string, this returns a list with one entry per node; nodes whose text is
    entirely blank are skipped.

    :param elements: parsel SelectorList (any iterable of selector-like items)
    :param join_str: separator used within a single node's text fragments
    :return: list of per-node text strings

    Example:
        li_elements = selector.css('.list li')
        li_texts = extract_texts(li_elements)
    """
    result = []
    for element in elements:
        # Selector-like objects expose .xpath; fall back to str() otherwise.
        if hasattr(element, 'xpath'):
            fragments = element.xpath('.//text()').getall()
        else:
            fragments = [str(element)]

        clean_fragments = [f.strip() for f in fragments if f.strip()]
        if clean_fragments:
            result.append(join_str.join(clean_fragments))

    return result
75
+
76
+
77
def extract_attr(elements: "SelectorList", attr_name: str, default: Any = None) -> Any:
    """Return the attribute value of the first matched element.

    Works both on a SelectorList (whose ``.attrib`` exposes the first
    element's attributes) and on a plain sequence of selector-like objects.

    :param elements: parsel SelectorList (or sequence of objects with .attrib)
    :param attr_name: attribute name to read
    :param default: value returned when the attribute is absent
    :return: the attribute value, or *default*

    Example:
        link_elements = selector.css('.link')
        link_href = extract_attr(link_elements, 'href')
    """
    # parsel's SelectorList.attrib is the first element's attribute mapping.
    if hasattr(elements, 'attrib'):
        return elements.attrib.get(attr_name, default)
    # Plain sequence: read the first element's attributes, if any.
    elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
        return elements[0].attrib.get(attr_name, default)
    return default
97
+
98
+
99
def extract_attrs(elements: "SelectorList", attr_name: str) -> List[Any]:
    """Collect the attribute value of every element that has it.

    Elements without an ``.attrib`` mapping, or whose attribute is missing,
    are silently skipped.

    :param elements: parsel SelectorList (any iterable of selector-like items)
    :param attr_name: attribute name to read
    :return: list of attribute values (order preserved)

    Example:
        all_links = selector.css('a')
        all_hrefs = extract_attrs(all_links, 'href')
    """
    result = []
    for element in elements:
        if hasattr(element, 'attrib'):
            attr_value = element.attrib.get(attr_name)
            if attr_value is not None:
                result.append(attr_value)

    return result
120
+
121
+
122
def is_xpath(query: str) -> bool:
    """Heuristically decide whether *query* is an XPath expression.

    Recognizes queries beginning with '/', '//' or './'. Note that other
    valid XPath forms (e.g. '(//a)[1]', '..', 'name(...)') are NOT detected;
    callers treat anything else as a CSS selector.

    :param query: the query string
    :return: True if the query looks like XPath
    """
    # '//...' already matches the '/' prefix, so two prefixes suffice
    # (previously the tuple redundantly listed '//' as well).
    return query.startswith(('/', './'))
130
+
131
+
132
# Public API of the selector helper module.
__all__ = [
    "extract_text",
    "extract_texts",
    "extract_attr",
    "extract_attrs",
    "is_xpath",
]
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 单例模式工具模块
5
+ ================
6
+
7
+ 提供同步和异步两种单例实现方式,适用于不同的使用场景。
8
+
9
+ 使用场景:
10
+ 1. 同步单例:用于框架初始化、配置管理等同步代码
11
+ 2. 异步单例:用于数据库连接池、网络资源等异步代码
12
+
13
+ 示例:
14
+ # 同步单例
15
+ @singleton
16
+ class CoreInitializer:
17
+ pass
18
+
19
+ # 异步单例(在连接池管理器中使用)
20
+ class MySQLConnectionPoolManager:
21
+ _instances: Dict[str, 'MySQLConnectionPoolManager'] = {}
22
+ _lock = asyncio.Lock()
23
+
24
+ @classmethod
25
+ async def get_pool(cls, ...):
26
+ async with cls._lock:
27
+ if pool_key not in cls._instances:
28
+ cls._instances[pool_key] = cls(pool_key)
29
+ return cls._instances[pool_key].pool
30
+ """
31
+
32
+ import threading
33
+ from typing import Any, Dict, Type
34
+
35
+
36
class SingletonMeta(type):
    """Metaclass implementing the singleton pattern.

    Each class using this metaclass gets exactly one instance; constructor
    arguments are honored only on the first instantiation. Uses
    double-checked locking so the common (already-created) path avoids the
    lock entirely.
    """
    _instances: Dict[Type, Any] = {}
    _lock = threading.Lock()

    def __call__(cls, *args, **kwargs):
        # Fast path: no lock once the instance exists. Re-check under the
        # lock to guard against a concurrent first call.
        if cls not in cls._instances:
            with cls._lock:
                if cls not in cls._instances:
                    cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]
48
+
49
+
50
def singleton(cls):
    """Class decorator enforcing a single shared instance.

    The first call constructs the instance with the given arguments; every
    later call returns that same instance (later arguments are ignored).
    Thread-safe via double-checked locking.

    Args:
        cls: the class to decorate.

    Returns:
        A factory function that always yields the single instance.

    NOTE(review): the decorated name becomes a function, so
    ``isinstance(x, Decorated)`` / ``issubclass`` checks against the
    decorated name no longer work — use SingletonMeta when that matters.
    """
    instances = {}  # keyed by cls; at most one entry per decorated class
    lock = threading.Lock()

    def get_instance(*args, **kwargs):
        # Fast path without the lock; re-check inside to avoid a race on
        # the very first construction.
        if cls not in instances:
            with lock:
                if cls not in instances:
                    instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return get_instance