crawlo 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (326)
  1. crawlo/__init__.py +93 -93
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -341
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +438 -439
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +291 -257
  19. crawlo/crawler.py +650 -650
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +233 -228
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +63 -63
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +61 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +103 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -257
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -292
  47. crawlo/initialization/__init__.py +44 -44
  48. crawlo/initialization/built_in.py +425 -425
  49. crawlo/initialization/context.py +141 -141
  50. crawlo/initialization/core.py +193 -193
  51. crawlo/initialization/phases.py +148 -148
  52. crawlo/initialization/registry.py +145 -145
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -23
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +45 -37
  58. crawlo/logging/async_handler.py +181 -0
  59. crawlo/logging/config.py +196 -96
  60. crawlo/logging/factory.py +171 -128
  61. crawlo/logging/manager.py +111 -111
  62. crawlo/logging/monitor.py +153 -0
  63. crawlo/logging/sampler.py +167 -0
  64. crawlo/middleware/__init__.py +21 -21
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +135 -135
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +386 -386
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/middleware/simple_proxy.py +65 -65
  75. crawlo/mode_manager.py +219 -219
  76. crawlo/network/__init__.py +21 -21
  77. crawlo/network/request.py +379 -379
  78. crawlo/network/response.py +359 -359
  79. crawlo/pipelines/__init__.py +21 -21
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +131 -131
  87. crawlo/pipelines/mysql_pipeline.py +325 -325
  88. crawlo/pipelines/pipeline_manager.py +100 -84
  89. crawlo/pipelines/redis_dedup_pipeline.py +156 -156
  90. crawlo/project.py +349 -338
  91. crawlo/queue/pqueue.py +42 -42
  92. crawlo/queue/queue_manager.py +526 -522
  93. crawlo/queue/redis_priority_queue.py +370 -367
  94. crawlo/settings/__init__.py +7 -7
  95. crawlo/settings/default_settings.py +284 -284
  96. crawlo/settings/setting_manager.py +219 -219
  97. crawlo/spider/__init__.py +657 -657
  98. crawlo/stats_collector.py +73 -73
  99. crawlo/subscriber.py +129 -129
  100. crawlo/task_manager.py +138 -138
  101. crawlo/templates/crawlo.cfg.tmpl +10 -10
  102. crawlo/templates/project/__init__.py.tmpl +3 -3
  103. crawlo/templates/project/items.py.tmpl +17 -17
  104. crawlo/templates/project/middlewares.py.tmpl +118 -118
  105. crawlo/templates/project/pipelines.py.tmpl +96 -96
  106. crawlo/templates/project/settings.py.tmpl +170 -170
  107. crawlo/templates/project/settings_distributed.py.tmpl +169 -169
  108. crawlo/templates/project/settings_gentle.py.tmpl +166 -166
  109. crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
  110. crawlo/templates/project/settings_minimal.py.tmpl +65 -65
  111. crawlo/templates/project/settings_simple.py.tmpl +164 -164
  112. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  113. crawlo/templates/run.py.tmpl +34 -34
  114. crawlo/templates/spider/spider.py.tmpl +143 -143
  115. crawlo/templates/spiders_init.py.tmpl +9 -9
  116. crawlo/tools/__init__.py +200 -200
  117. crawlo/tools/anti_crawler.py +268 -268
  118. crawlo/tools/authenticated_proxy.py +240 -240
  119. crawlo/tools/data_formatter.py +225 -225
  120. crawlo/tools/data_validator.py +180 -180
  121. crawlo/tools/date_tools.py +289 -289
  122. crawlo/tools/distributed_coordinator.py +384 -384
  123. crawlo/tools/encoding_converter.py +127 -127
  124. crawlo/tools/network_diagnostic.py +364 -364
  125. crawlo/tools/request_tools.py +82 -82
  126. crawlo/tools/retry_mechanism.py +224 -224
  127. crawlo/tools/scenario_adapter.py +262 -262
  128. crawlo/tools/text_cleaner.py +232 -232
  129. crawlo/utils/__init__.py +34 -34
  130. crawlo/utils/batch_processor.py +259 -259
  131. crawlo/utils/class_loader.py +25 -25
  132. crawlo/utils/controlled_spider_mixin.py +439 -439
  133. crawlo/utils/db_helper.py +343 -343
  134. crawlo/utils/enhanced_error_handler.py +356 -356
  135. crawlo/utils/env_config.py +142 -142
  136. crawlo/utils/error_handler.py +165 -165
  137. crawlo/utils/fingerprint.py +122 -122
  138. crawlo/utils/func_tools.py +82 -82
  139. crawlo/utils/large_scale_config.py +286 -286
  140. crawlo/utils/large_scale_helper.py +344 -344
  141. crawlo/utils/log.py +79 -79
  142. crawlo/utils/performance_monitor.py +285 -285
  143. crawlo/utils/queue_helper.py +175 -175
  144. crawlo/utils/redis_connection_pool.py +388 -388
  145. crawlo/utils/redis_key_validator.py +198 -198
  146. crawlo/utils/request.py +267 -267
  147. crawlo/utils/request_serializer.py +225 -225
  148. crawlo/utils/spider_loader.py +61 -61
  149. crawlo/utils/system.py +11 -11
  150. crawlo/utils/tools.py +4 -4
  151. crawlo/utils/url.py +39 -39
  152. crawlo-1.4.3.dist-info/METADATA +190 -0
  153. crawlo-1.4.3.dist-info/RECORD +326 -0
  154. examples/__init__.py +7 -7
  155. examples/test_project/__init__.py +7 -7
  156. examples/test_project/run.py +34 -34
  157. examples/test_project/test_project/__init__.py +3 -3
  158. examples/test_project/test_project/items.py +17 -17
  159. examples/test_project/test_project/middlewares.py +118 -118
  160. examples/test_project/test_project/pipelines.py +96 -96
  161. examples/test_project/test_project/settings.py +169 -169
  162. examples/test_project/test_project/spiders/__init__.py +9 -9
  163. examples/test_project/test_project/spiders/of_week_dis.py +143 -143
  164. tests/__init__.py +7 -7
  165. tests/advanced_tools_example.py +275 -275
  166. tests/authenticated_proxy_example.py +106 -106
  167. tests/baidu_performance_test.py +108 -108
  168. tests/baidu_test.py +59 -59
  169. tests/cleaners_example.py +160 -160
  170. tests/comprehensive_framework_test.py +212 -212
  171. tests/comprehensive_test.py +81 -81
  172. tests/comprehensive_testing_summary.md +186 -186
  173. tests/config_validation_demo.py +142 -142
  174. tests/controlled_spider_example.py +205 -205
  175. tests/date_tools_example.py +180 -180
  176. tests/debug_configure.py +69 -69
  177. tests/debug_framework_logger.py +84 -84
  178. tests/debug_log_config.py +126 -126
  179. tests/debug_log_levels.py +63 -63
  180. tests/debug_pipelines.py +66 -66
  181. tests/detailed_log_test.py +233 -233
  182. tests/distributed_test.py +66 -66
  183. tests/distributed_test_debug.py +76 -76
  184. tests/dynamic_loading_example.py +523 -523
  185. tests/dynamic_loading_test.py +104 -104
  186. tests/env_config_example.py +133 -133
  187. tests/error_handling_example.py +171 -171
  188. tests/final_comprehensive_test.py +151 -151
  189. tests/final_log_test.py +260 -260
  190. tests/final_validation_test.py +182 -182
  191. tests/fix_log_test.py +142 -142
  192. tests/framework_performance_test.py +202 -202
  193. tests/log_buffering_test.py +111 -111
  194. tests/log_generation_timing_test.py +153 -153
  195. tests/optimized_performance_test.py +211 -211
  196. tests/performance_comparison.py +245 -245
  197. tests/queue_blocking_test.py +113 -113
  198. tests/queue_test.py +89 -89
  199. tests/redis_key_validation_demo.py +130 -130
  200. tests/request_params_example.py +150 -150
  201. tests/response_improvements_example.py +144 -144
  202. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  203. tests/scrapy_comparison/scrapy_test.py +133 -133
  204. tests/simple_command_test.py +119 -119
  205. tests/simple_crawlo_test.py +127 -127
  206. tests/simple_log_test.py +57 -57
  207. tests/simple_log_test2.py +137 -137
  208. tests/simple_optimization_test.py +128 -128
  209. tests/simple_queue_type_test.py +41 -41
  210. tests/simple_spider_test.py +49 -49
  211. tests/simple_test.py +47 -47
  212. tests/spider_log_timing_test.py +177 -177
  213. tests/test_advanced_tools.py +148 -148
  214. tests/test_all_commands.py +230 -230
  215. tests/test_all_pipeline_fingerprints.py +133 -133
  216. tests/test_all_redis_key_configs.py +145 -145
  217. tests/test_authenticated_proxy.py +141 -141
  218. tests/test_batch_processor.py +178 -178
  219. tests/test_cleaners.py +54 -54
  220. tests/test_component_factory.py +174 -174
  221. tests/test_comprehensive.py +146 -146
  222. tests/test_config_consistency.py +80 -80
  223. tests/test_config_merge.py +152 -152
  224. tests/test_config_validator.py +182 -182
  225. tests/test_controlled_spider_mixin.py +79 -79
  226. tests/test_crawlo_proxy_integration.py +108 -108
  227. tests/test_date_tools.py +123 -123
  228. tests/test_dedup_fix.py +220 -220
  229. tests/test_dedup_pipeline_consistency.py +125 -0
  230. tests/test_default_header_middleware.py +313 -313
  231. tests/test_distributed.py +65 -65
  232. tests/test_double_crawlo_fix.py +204 -204
  233. tests/test_double_crawlo_fix_simple.py +124 -124
  234. tests/test_download_delay_middleware.py +221 -221
  235. tests/test_downloader_proxy_compatibility.py +268 -268
  236. tests/test_dynamic_downloaders_proxy.py +124 -124
  237. tests/test_dynamic_proxy.py +92 -92
  238. tests/test_dynamic_proxy_config.py +146 -146
  239. tests/test_dynamic_proxy_real.py +109 -109
  240. tests/test_edge_cases.py +303 -303
  241. tests/test_enhanced_error_handler.py +270 -270
  242. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  243. tests/test_env_config.py +121 -121
  244. tests/test_error_handler_compatibility.py +112 -112
  245. tests/test_factories.py +252 -252
  246. tests/test_final_validation.py +153 -153
  247. tests/test_fingerprint_consistency.py +135 -135
  248. tests/test_fingerprint_simple.py +51 -51
  249. tests/test_framework_env_usage.py +103 -103
  250. tests/test_framework_logger.py +66 -66
  251. tests/test_framework_startup.py +64 -64
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_config.py +112 -112
  257. tests/test_large_scale_helper.py +235 -235
  258. tests/test_logging_enhancements.py +375 -0
  259. tests/test_logging_final.py +185 -0
  260. tests/test_logging_integration.py +313 -0
  261. tests/test_logging_system.py +282 -282
  262. tests/test_middleware_debug.py +142 -0
  263. tests/test_mode_change.py +72 -72
  264. tests/test_mode_consistency.py +51 -51
  265. tests/test_offsite_middleware.py +244 -244
  266. tests/test_offsite_middleware_simple.py +203 -203
  267. tests/test_parsel.py +29 -29
  268. tests/test_performance.py +327 -327
  269. tests/test_performance_monitor.py +115 -115
  270. tests/test_pipeline_fingerprint_consistency.py +86 -86
  271. tests/test_priority_behavior.py +212 -0
  272. tests/test_priority_consistency.py +152 -0
  273. tests/test_priority_consistency_fixed.py +250 -0
  274. tests/test_proxy_api.py +264 -264
  275. tests/test_proxy_health_check.py +32 -32
  276. tests/test_proxy_middleware.py +121 -121
  277. tests/test_proxy_middleware_enhanced.py +216 -216
  278. tests/test_proxy_middleware_integration.py +136 -136
  279. tests/test_proxy_middleware_refactored.py +184 -184
  280. tests/test_proxy_providers.py +56 -56
  281. tests/test_proxy_stats.py +19 -19
  282. tests/test_proxy_strategies.py +59 -59
  283. tests/test_queue_empty_check.py +41 -41
  284. tests/test_queue_manager_double_crawlo.py +173 -173
  285. tests/test_queue_manager_redis_key.py +179 -179
  286. tests/test_queue_naming.py +154 -154
  287. tests/test_queue_type.py +106 -106
  288. tests/test_queue_type_redis_config_consistency.py +131 -0
  289. tests/test_random_headers_default.py +323 -0
  290. tests/test_random_headers_necessity.py +309 -0
  291. tests/test_random_user_agent.py +72 -72
  292. tests/test_real_scenario_proxy.py +195 -195
  293. tests/test_redis_config.py +28 -28
  294. tests/test_redis_connection_pool.py +294 -294
  295. tests/test_redis_key_naming.py +181 -181
  296. tests/test_redis_key_validator.py +123 -123
  297. tests/test_redis_queue.py +224 -224
  298. tests/test_redis_queue_name_fix.py +175 -175
  299. tests/test_redis_queue_type_fallback.py +130 -0
  300. tests/test_request_ignore_middleware.py +182 -182
  301. tests/test_request_params.py +111 -111
  302. tests/test_request_serialization.py +70 -70
  303. tests/test_response_code_middleware.py +349 -349
  304. tests/test_response_filter_middleware.py +427 -427
  305. tests/test_response_improvements.py +152 -152
  306. tests/test_retry_middleware.py +334 -242
  307. tests/test_retry_middleware_realistic.py +274 -0
  308. tests/test_scheduler.py +252 -252
  309. tests/test_scheduler_config_update.py +133 -133
  310. tests/test_simple_response.py +61 -61
  311. tests/test_telecom_spider_redis_key.py +205 -205
  312. tests/test_template_content.py +87 -87
  313. tests/test_template_redis_key.py +134 -134
  314. tests/test_tools.py +159 -159
  315. tests/test_user_agent_randomness.py +177 -0
  316. tests/test_user_agents.py +96 -96
  317. tests/tools_example.py +260 -260
  318. tests/untested_features_report.md +138 -138
  319. tests/verify_debug.py +51 -51
  320. tests/verify_distributed.py +117 -117
  321. tests/verify_log_fix.py +111 -111
  322. crawlo-1.4.1.dist-info/METADATA +0 -1199
  323. crawlo-1.4.1.dist-info/RECORD +0 -309
  324. {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
  325. {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
  326. {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
crawlo/queue/queue_manager.py
@@ -1,522 +1,526 @@
- #!/usr/bin/python
- # -*- coding: UTF-8 -*-
- """
- Unified queue manager
- Provides a simple, consistent queue interface and automatically handles the differences between queue types
- """
- import asyncio
- import traceback
- from enum import Enum
- from typing import Optional, Dict, Any, Union, TYPE_CHECKING
- import time
- import random
-
- if TYPE_CHECKING:
-     from crawlo import Request
-
- from crawlo.queue.pqueue import SpiderPriorityQueue
- from crawlo.utils.error_handler import ErrorHandler
- from crawlo.utils.log import get_logger
- from crawlo.utils.request_serializer import RequestSerializer
-
- try:
-     # Use the full-featured Redis queue
-     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
-
-     REDIS_AVAILABLE = True
- except ImportError:
-     RedisPriorityQueue = None
-     REDIS_AVAILABLE = False
-
-
- class QueueType(Enum):
-     """Queue type enumeration"""
-     MEMORY = "memory"
-     REDIS = "redis"
-     AUTO = "auto"  # automatic selection
-
-
- class IntelligentScheduler:
-     """Intelligent scheduler"""
-
-     def __init__(self):
-         self.domain_stats = {}  # per-domain statistics
-         self.url_stats = {}  # per-URL statistics
-         self.last_request_time = {}  # last request time per domain
-
-     def calculate_priority(self, request: "Request") -> int:
-         """Compute an intelligent priority for the request"""
-         priority = getattr(request, 'priority', 0)
-
-         # Extract the domain
-         domain = self._extract_domain(request.url)
-
-         # Adjust priority based on domain access frequency
-         if domain in self.domain_stats:
-             domain_access_count = self.domain_stats[domain]['count']
-             last_access_time = self.domain_stats[domain]['last_time']
-
-             # If the domain was accessed recently, lower the priority (avoid hammering a single domain)
-             time_since_last = time.time() - last_access_time
-             if time_since_last < 5:  # accessed within the last 5 seconds
-                 priority -= 2
-             elif time_since_last < 30:  # accessed within the last 30 seconds
-                 priority -= 1
-
-             # If the domain has been accessed too many times, lower the priority further
-             if domain_access_count > 10:
-                 priority -= 1
-
-         # Adjust priority based on URL access history
-         if request.url in self.url_stats:
-             url_access_count = self.url_stats[request.url]
-             if url_access_count > 1:
-                 # Lower the priority of repeated URLs
-                 priority -= url_access_count
-
-         # Adjust priority based on crawl depth
-         depth = getattr(request, 'meta', {}).get('depth', 0)
-         priority -= depth  # the deeper the request, the lower its priority
-
-         return priority
-
-     def update_stats(self, request: "Request"):
-         """Update statistics"""
-         domain = self._extract_domain(request.url)
-
-         # Update domain statistics
-         if domain not in self.domain_stats:
-             self.domain_stats[domain] = {'count': 0, 'last_time': 0}
-
-         self.domain_stats[domain]['count'] += 1
-         self.domain_stats[domain]['last_time'] = time.time()
-
-         # Update URL statistics
-         if request.url not in self.url_stats:
-             self.url_stats[request.url] = 0
-         self.url_stats[request.url] += 1
-
-         # Update the last request time
-         self.last_request_time[domain] = time.time()
-
-     def _extract_domain(self, url: str) -> str:
-         """Extract the domain from a URL"""
-         try:
-             from urllib.parse import urlparse
-             parsed = urlparse(url)
-             return parsed.netloc
-         except:
-             return "unknown"
-
-
- class QueueConfig:
-     """Queue configuration class"""
-
-     def __init__(
-         self,
-         queue_type: Union[QueueType, str] = QueueType.AUTO,
-         redis_url: Optional[str] = None,
-         redis_host: str = "127.0.0.1",
-         redis_port: int = 6379,
-         redis_password: Optional[str] = None,
-         redis_db: int = 0,
-         queue_name: str = "crawlo:requests",
-         max_queue_size: int = 1000,
-         max_retries: int = 3,
-         timeout: int = 300,
-         **kwargs
-     ):
-         self.queue_type = QueueType(queue_type) if isinstance(queue_type, str) else queue_type
-
-         # Redis configuration
-         if redis_url:
-             self.redis_url = redis_url
-         else:
-             if redis_password:
-                 self.redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
-             else:
-                 self.redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
-
-         self.queue_name = queue_name
-         self.max_queue_size = max_queue_size
-         self.max_retries = max_retries
-         self.timeout = timeout
-         self.extra_config = kwargs
-
-     @classmethod
-     def from_settings(cls, settings) -> 'QueueConfig':
-         """Create configuration from settings"""
-         # Get the project name, used to build the default queue name
-         project_name = settings.get('PROJECT_NAME', 'default')
-         default_queue_name = f"crawlo:{project_name}:queue:requests"
-
-         # Use SCHEDULER_QUEUE_NAME if set, otherwise fall back to the project-name-based default
-         scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
-         if scheduler_queue_name is not None:
-             queue_name = scheduler_queue_name
-         else:
-             queue_name = default_queue_name
-
-         return cls(
-             queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
-             redis_url=settings.get('REDIS_URL'),
-             redis_host=settings.get('REDIS_HOST', '127.0.0.1'),
-             redis_port=settings.get_int('REDIS_PORT', 6379),
-             redis_password=settings.get('REDIS_PASSWORD'),
-             redis_db=settings.get_int('REDIS_DB', 0),
-             queue_name=queue_name,
-             max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
-             max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
-             timeout=settings.get_int('QUEUE_TIMEOUT', 300)
-         )
-
-
- class QueueManager:
-     """Unified queue manager"""
-
-     def __init__(self, config: QueueConfig):
-         self.config = config
-         # Lazily initialize logger and error_handler to avoid circular dependencies
-         self._logger = None
-         self._error_handler = None
-         self.request_serializer = RequestSerializer()
-         self._queue = None
-         self._queue_semaphore = None
-         self._queue_type = None
-         self._health_status = "unknown"
-         self._intelligent_scheduler = IntelligentScheduler()  # intelligent scheduler
-
-     @property
-     def logger(self):
-         if self._logger is None:
-             self._logger = get_logger(self.__class__.__name__)
-         return self._logger
-
-     @property
-     def error_handler(self):
-         if self._error_handler is None:
-             self._error_handler = ErrorHandler(self.__class__.__name__)
-         return self._error_handler
-
-     async def initialize(self) -> bool:
-         """Initialize the queue"""
-         try:
-             queue_type = await self._determine_queue_type()
-             self._queue = await self._create_queue(queue_type)
-             self._queue_type = queue_type
-
-             # Probe queue health
-             health_check_result = await self._health_check()
-
-             self.logger.info(f"Queue initialized successfully Type: {queue_type.value}")
-             # Only log detailed configuration in debug mode
-             self.logger.debug(f"Queue configuration: {self._get_queue_info()}")
-
-             # If the health check returns True, the queue type was switched and the configuration needs updating
-             if health_check_result:
-                 return True
-
-             # If the queue type is Redis, check whether the configuration needs updating
-             if queue_type == QueueType.REDIS:
-                 # This check must happen in the scheduler, since the queue manager cannot access crawler.settings
-                 # We should not always return True, only when an update is actually needed
-                 # The scheduler performs the more detailed check
-                 pass
-
-             return False  # by default, no configuration update is needed
-
-         except Exception as e:
-             # Log detailed error information and the stack trace
-             self.logger.error(f"Queue initialization failed: {e}")
-             self.logger.debug(f"Detailed error info:\n{traceback.format_exc()}")
-             self._health_status = "error"
-             return False
-
-     async def put(self, request: "Request", priority: int = 0) -> bool:
-         """Unified enqueue interface"""
-         if not self._queue:
-             raise RuntimeError("Queue not initialized")
-
-         try:
-             # Apply the intelligent scheduling algorithm to compute a priority
-             intelligent_priority = self._intelligent_scheduler.calculate_priority(request)
-             # Combine the original priority with the intelligent priority
-             final_priority = priority + intelligent_priority
-
-             # Update statistics
-             self._intelligent_scheduler.update_stats(request)
-
-             # Serialization handling (Redis queue only)
-             if self._queue_type == QueueType.REDIS:
-                 request = self.request_serializer.prepare_for_serialization(request)
-
-             # Backpressure control (memory queue only)
-             if self._queue_semaphore:
-                 # For large request volumes, block and wait instead of skipping,
-                 # so that no request is ever dropped
-                 await self._queue_semaphore.acquire()
-
-             # Unified enqueue operation
-             if hasattr(self._queue, 'put'):
-                 if self._queue_type == QueueType.REDIS:
-                     success = await self._queue.put(request, final_priority)
-                 else:
-                     # For the memory queue, priority must be handled manually:
-                     # in SpiderPriorityQueue, elements are (priority, item) tuples
-                     await self._queue.put((final_priority, request))
-                     success = True
-             else:
-                 raise RuntimeError(f"Queue type {self._queue_type} does not support put")
-
-             if success:
-                 self.logger.debug(f"Request enqueued successfully: {request.url} with priority {final_priority}")
-
-             return success
-
-         except Exception as e:
-             self.logger.error(f"Failed to enqueue request: {e}")
-             if self._queue_semaphore:
-                 self._queue_semaphore.release()
-             return False
-
-     async def get(self, timeout: float = 5.0) -> Optional["Request"]:
-         """Unified dequeue interface"""
-         if not self._queue:
-             raise RuntimeError("Queue not initialized")
-
-         try:
-             request = await self._queue.get(timeout=timeout)
-
-             # Release the semaphore (memory queue only)
-             if self._queue_semaphore and request:
-                 self._queue_semaphore.release()
-
-             # Deserialization handling (Redis queue only)
-             if request and self._queue_type == QueueType.REDIS:
-                 # A spider instance is needed here; return the raw request for now
-                 # The actual callback restoration is handled in the scheduler
-                 pass
-
-             # For the memory queue, unpack the (priority, request) tuple
-             if request and self._queue_type == QueueType.MEMORY:
-                 if isinstance(request, tuple) and len(request) == 2:
-                     request = request[1]  # take the request object from the tuple
-
-             return request
-
-         except Exception as e:
-             self.logger.error(f"Failed to dequeue request: {e}")
-             return None
-
-     async def size(self) -> int:
-         """Get queue size"""
-         if not self._queue:
-             return 0
-
-         try:
-             if hasattr(self._queue, 'qsize'):
-                 if asyncio.iscoroutinefunction(self._queue.qsize):
-                     return await self._queue.qsize()
-                 else:
-                     return self._queue.qsize()
-             return 0
-         except Exception as e:
-             self.logger.warning(f"Failed to get queue size: {e}")
-             return 0
-
-     def empty(self) -> bool:
-         """Check if queue is empty (synchronous version, for compatibility)"""
-         try:
-             # The memory queue can be checked synchronously
-             if self._queue_type == QueueType.MEMORY:
-                 # Make sure the queue size is checked correctly
-                 if hasattr(self._queue, 'qsize'):
-                     return self._queue.qsize() == 0
-                 else:
-                     # If there is no qsize method, assume the queue is empty
-                     return True
-             # For the Redis queue an async call would be needed, so this is an approximation:
-             # return True so the program can exit cleanly; callers should rely on the more precise async check
-             return True
-         except Exception:
-             return True
-
-     async def async_empty(self) -> bool:
-         """Check if queue is empty (asynchronous version, more accurate)"""
-         try:
-             # Memory queue
-             if self._queue_type == QueueType.MEMORY:
-                 # Make sure the queue size is checked correctly
-                 if hasattr(self._queue, 'qsize'):
-                     if asyncio.iscoroutinefunction(self._queue.qsize):
-                         size = await self._queue.qsize()
-                     else:
-                         size = self._queue.qsize()
-                     return size == 0
-                 else:
-                     # If there is no qsize method, assume the queue is empty
-                     return True
-             # Redis queue: use the async check
-             elif self._queue_type == QueueType.REDIS:
-                 size = await self.size()
-                 return size == 0
-             return True
-         except Exception:
-             return True
-
-     async def close(self) -> None:
-         """Close queue"""
-         if self._queue and hasattr(self._queue, 'close'):
-             try:
-                 await self._queue.close()
-                 # Change INFO level log to DEBUG level to avoid redundant output
-                 self.logger.debug("Queue closed")
-             except Exception as e:
-                 self.logger.warning(f"Error closing queue: {e}")
-
-     def get_status(self) -> Dict[str, Any]:
-         """Get queue status information"""
-         return {
-             "type": self._queue_type.value if self._queue_type else "unknown",
-             "health": self._health_status,
-             "config": self._get_queue_info(),
-             "initialized": self._queue is not None
-         }
-
-     async def _determine_queue_type(self) -> QueueType:
-         """Determine queue type"""
-         if self.config.queue_type == QueueType.AUTO:
-             # Auto selection: prefer Redis when available
-             if REDIS_AVAILABLE and self.config.redis_url:
-                 # Test the Redis connection
-                 try:
-                     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
-                     test_queue = RedisPriorityQueue(self.config.redis_url)
-                     await test_queue.connect()
-                     await test_queue.close()
-                     self.logger.debug("Auto-detection: Redis available, using distributed queue")
-                     return QueueType.REDIS
-                 except Exception as e:
-                     self.logger.debug(f"Auto-detection: Redis unavailable ({e}), using memory queue")
-                     return QueueType.MEMORY
-             else:
-                 self.logger.debug("Auto-detection: Redis not configured, using memory queue")
-                 return QueueType.MEMORY
-
-         elif self.config.queue_type == QueueType.REDIS:
-             if not REDIS_AVAILABLE:
-                 raise RuntimeError("Redis queue unavailable: the redis dependency is not installed")
-             if not self.config.redis_url:
-                 raise RuntimeError("Redis queue unavailable: REDIS_URL is not configured")
-             # Test the Redis connection
-             try:
-                 from crawlo.queue.redis_priority_queue import RedisPriorityQueue
-                 test_queue = RedisPriorityQueue(self.config.redis_url)
-                 await test_queue.connect()
-                 await test_queue.close()
-                 return QueueType.REDIS
-             except Exception as e:
-                 # Redis was explicitly requested but the connection failed, so raise
-                 raise RuntimeError(f"Redis queue unavailable: cannot connect to Redis ({e})")
-
-         elif self.config.queue_type == QueueType.MEMORY:
-             return QueueType.MEMORY
-
-         else:
-             raise ValueError(f"Unsupported queue type: {self.config.queue_type}")
-
-     async def _create_queue(self, queue_type: QueueType):
-         """Create queue instance"""
-         if queue_type == QueueType.REDIS:
-             # Lazily import the Redis queue
-             try:
-                 from crawlo.queue.redis_priority_queue import RedisPriorityQueue
-             except ImportError as e:
-                 raise RuntimeError(f"Redis queue unavailable: failed to import RedisPriorityQueue ({e})")
-
-             # Fixed project-name extraction logic, strictly following the logic in the test files
-             project_name = "default"
-             if ':' in self.config.queue_name:
-                 parts = self.config.queue_name.split(':')
-                 if len(parts) >= 2:
-                     # Handle a possible double "crawlo" prefix
-                     if parts[0] == "crawlo" and parts[1] == "crawlo":
-                         # Double "crawlo" prefix: use "crawlo" as the project name
-                         project_name = "crawlo"
-                     elif parts[0] == "crawlo":
-                         # Normal "crawlo" prefix: use the second segment as the project name
-                         project_name = parts[1]
-                     else:
-                         # No "crawlo" prefix: use the first segment as the project name
-                         project_name = parts[0]
-                 else:
-                     project_name = self.config.queue_name or "default"
-             else:
-                 project_name = self.config.queue_name or "default"
-
-             queue = RedisPriorityQueue(
-                 redis_url=self.config.redis_url,
-                 queue_name=self.config.queue_name,
-                 max_retries=self.config.max_retries,
-                 timeout=self.config.timeout,
-                 module_name=project_name  # pass the project name as module_name
-             )
-             # No need to connect immediately; use lazy connect
-             return queue
-
-         elif queue_type == QueueType.MEMORY:
-             queue = SpiderPriorityQueue()
-             # Set up backpressure control for the memory queue
-             self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
-             return queue
-
-         else:
-             raise ValueError(f"Unsupported queue type: {queue_type}")
-
-     async def _health_check(self) -> bool:
-         """Health check"""
-         try:
-             if self._queue_type == QueueType.REDIS:
-                 # Test the Redis connection
-                 await self._queue.connect()
-                 self._health_status = "healthy"
-             else:
-                 # The memory queue is always healthy
-                 self._health_status = "healthy"
-                 return False  # the memory queue never needs a configuration update
-         except Exception as e:
-             self.logger.warning(f"Queue health check failed: {e}")
-             self._health_status = "unhealthy"
-             # If this is a Redis queue and the health check failed, try switching to the memory queue
-             if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
-                 self.logger.info("Redis queue unavailable, attempting to switch to memory queue...")
-                 try:
-                     await self._queue.close()
-                 except:
-                     pass
-                 self._queue = None
-                 # Recreate as a memory queue
-                 self._queue = await self._create_queue(QueueType.MEMORY)
-                 self._queue_type = QueueType.MEMORY
-                 self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
-                 self._health_status = "healthy"
-                 self.logger.info("Switched to memory queue")
-                 # Signal that the filter and dedup pipeline configuration needs updating
-                 return True
-             return False
-
-     def _get_queue_info(self) -> Dict[str, Any]:
-         """Get queue configuration information"""
-         info = {
-             "queue_name": self.config.queue_name,
-             "max_queue_size": self.config.max_queue_size
-         }
-
-         if self._queue_type == QueueType.REDIS:
-             info.update({
-                 "redis_url": self.config.redis_url,
-                 "max_retries": self.config.max_retries,
-                 "timeout": self.config.timeout
-             })
-
-         return info
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Unified queue manager
+ Provides a simple, consistent queue interface and automatically handles the differences between queue types
+ """
+ import asyncio
+ import traceback
+ from enum import Enum
+ from typing import Optional, Dict, Any, Union, TYPE_CHECKING
+ import time
+ import random
+
+ if TYPE_CHECKING:
+     from crawlo import Request
+
+ from crawlo.queue.pqueue import SpiderPriorityQueue
+ from crawlo.utils.error_handler import ErrorHandler
+ from crawlo.utils.log import get_logger
+ from crawlo.utils.request_serializer import RequestSerializer
+
+ try:
+     # Use the full-featured Redis queue
+     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+
+     REDIS_AVAILABLE = True
+ except ImportError:
+     RedisPriorityQueue = None
+     REDIS_AVAILABLE = False
+
+
+ class QueueType(Enum):
+     """Queue type enumeration"""
+     MEMORY = "memory"
+     REDIS = "redis"
+     AUTO = "auto"  # automatic selection
+
+
+ class IntelligentScheduler:
+     """Intelligent scheduler"""
+
+     def __init__(self):
+         self.domain_stats = {}  # per-domain statistics
+         self.url_stats = {}  # per-URL statistics
+         self.last_request_time = {}  # last request time per domain
+
+     def calculate_priority(self, request: "Request") -> int:
+         """Compute an intelligent priority for the request"""
+         priority = getattr(request, 'priority', 0)
+
+         # Extract the domain
+         domain = self._extract_domain(request.url)
+
+         # Adjust priority based on domain access frequency
+         if domain in self.domain_stats:
+             domain_access_count = self.domain_stats[domain]['count']
+             last_access_time = self.domain_stats[domain]['last_time']
+
+             # If the domain was accessed recently, lower the priority (avoid hammering a single domain)
+             time_since_last = time.time() - last_access_time
+             if time_since_last < 5:  # accessed within the last 5 seconds
+                 priority -= 2
+             elif time_since_last < 30:  # accessed within the last 30 seconds
+                 priority -= 1
+
+             # If the domain has been accessed too many times, lower the priority further
+             if domain_access_count > 10:
+                 priority -= 1
+
+         # Adjust priority based on URL access history
+         if request.url in self.url_stats:
+             url_access_count = self.url_stats[request.url]
+             if url_access_count > 1:
+                 # Lower the priority of repeated URLs
+                 priority -= url_access_count
+
+         # Adjust priority based on crawl depth
+         depth = getattr(request, 'meta', {}).get('depth', 0)
+         priority -= depth  # the deeper the request, the lower its priority
+
+         return priority
+
+     def update_stats(self, request: "Request"):
+         """Update statistics"""
+         domain = self._extract_domain(request.url)
+
+         # Update domain statistics
+         if domain not in self.domain_stats:
+             self.domain_stats[domain] = {'count': 0, 'last_time': 0}
+
+         self.domain_stats[domain]['count'] += 1
+         self.domain_stats[domain]['last_time'] = time.time()
+
+         # Update URL statistics
+         if request.url not in self.url_stats:
+             self.url_stats[request.url] = 0
+         self.url_stats[request.url] += 1
+
+         # Update the last request time
+         self.last_request_time[domain] = time.time()
+
+     def _extract_domain(self, url: str) -> str:
+         """Extract the domain from a URL"""
+         try:
+             from urllib.parse import urlparse
+             parsed = urlparse(url)
+             return parsed.netloc
+         except:
+             return "unknown"
+
+
+ class QueueConfig:
+     """Queue configuration class"""
+
+     def __init__(
+         self,
+         queue_type: Union[QueueType, str] = QueueType.AUTO,
+         redis_url: Optional[str] = None,
+         redis_host: str = "127.0.0.1",
+         redis_port: int = 6379,
+         redis_password: Optional[str] = None,
+         redis_db: int = 0,
+         queue_name: str = "crawlo:requests",
+         max_queue_size: int = 1000,
+         max_retries: int = 3,
+         timeout: int = 300,
+         **kwargs
+     ):
+         self.queue_type = QueueType(queue_type) if isinstance(queue_type, str) else queue_type
+
+         # Redis configuration
+         if redis_url:
+             self.redis_url = redis_url
+         else:
+             if redis_password:
+                 self.redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
+             else:
+                 self.redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
+
+         self.queue_name = queue_name
+         self.max_queue_size = max_queue_size
+         self.max_retries = max_retries
+         self.timeout = timeout
+         self.extra_config = kwargs
+
+     @classmethod
+     def from_settings(cls, settings) -> 'QueueConfig':
+         """Create configuration from settings"""
+         # Get the project name, used to build the default queue name
+         project_name = settings.get('PROJECT_NAME', 'default')
+         default_queue_name = f"crawlo:{project_name}:queue:requests"
+
+         # Use SCHEDULER_QUEUE_NAME if set, otherwise fall back to the project-name-based default
+         scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
+         if scheduler_queue_name is not None:
+             queue_name = scheduler_queue_name
+         else:
+             queue_name = default_queue_name
+
+         return cls(
+             queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
+             redis_url=settings.get('REDIS_URL'),
+             redis_host=settings.get('REDIS_HOST', '127.0.0.1'),
+             redis_port=settings.get_int('REDIS_PORT', 6379),
+             redis_password=settings.get('REDIS_PASSWORD'),
+             redis_db=settings.get_int('REDIS_DB', 0),
+             queue_name=queue_name,
+             max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
+             max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
+             timeout=settings.get_int('QUEUE_TIMEOUT', 300)
+         )
+
+
+ class QueueManager:
+     """Unified queue manager"""
+
+     def __init__(self, config: QueueConfig):
+         self.config = config
+         # Lazily initialize logger and error_handler to avoid circular dependencies
+         self._logger = None
+         self._error_handler = None
+         self.request_serializer = RequestSerializer()
+         self._queue = None
+         self._queue_semaphore = None
+         self._queue_type = None
+         self._health_status = "unknown"
+         self._intelligent_scheduler = IntelligentScheduler()  # intelligent scheduler
+
+     @property
+     def logger(self):
+         if self._logger is None:
+             self._logger = get_logger(self.__class__.__name__)
+         return self._logger
+
+     @property
+     def error_handler(self):
+         if self._error_handler is None:
+             self._error_handler = ErrorHandler(self.__class__.__name__)
+         return self._error_handler
+
+     async def initialize(self) -> bool:
+         """Initialize the queue"""
+         try:
+             queue_type = await self._determine_queue_type()
+             self._queue = await self._create_queue(queue_type)
+             self._queue_type = queue_type
+
+             # Probe queue health
+             health_check_result = await self._health_check()
+
+             self.logger.info(f"Queue initialized successfully Type: {queue_type.value}")
+             # Only log detailed configuration in debug mode
+             self.logger.debug(f"Queue configuration: {self._get_queue_info()}")
+
+             # If the health check returns True, the queue type was switched and the configuration needs updating
+             if health_check_result:
+                 return True
+
+             # If the queue type is Redis, check whether the configuration needs updating
+             if queue_type == QueueType.REDIS:
+                 # This check must happen in the scheduler, since the queue manager cannot access crawler.settings
+                 # We should not always return True, only when an update is actually needed
+                 # The scheduler performs the more detailed check
+                 pass
+
+             return False  # by default, no configuration update is needed
+
+         except Exception as e:
+             # Log detailed error information and the stack trace
+             self.logger.error(f"Queue initialization failed: {e}")
+             self.logger.debug(f"Detailed error info:\n{traceback.format_exc()}")
+             self._health_status = "error"
+             return False
+
+     async def put(self, request: "Request", priority: int = 0) -> bool:
+         """Unified enqueue interface"""
+         if not self._queue:
+             raise RuntimeError("Queue not initialized")
+
+         try:
+             # Apply the intelligent scheduling algorithm to compute a priority
+             intelligent_priority = self._intelligent_scheduler.calculate_priority(request)
+             # Combine the original priority with the intelligent priority
+             final_priority = priority + intelligent_priority
+
+             # Update statistics
+             self._intelligent_scheduler.update_stats(request)
+
+             # Serialization handling (Redis queue only)
+             if self._queue_type == QueueType.REDIS:
+                 request = self.request_serializer.prepare_for_serialization(request)
+
+             # Backpressure control (memory queue only)
+             if self._queue_semaphore:
+                 # For large request volumes, block and wait instead of skipping,
+                 # so that no request is ever dropped
+                 await self._queue_semaphore.acquire()
+
+             # Unified enqueue operation
+             if hasattr(self._queue, 'put'):
+                 if self._queue_type == QueueType.REDIS:
+                     success = await self._queue.put(request, final_priority)
+                 else:
+                     # For the memory queue, priority must be handled manually:
+                     # in SpiderPriorityQueue, elements are (priority, item) tuples
+                     await self._queue.put((final_priority, request))
+                     success = True
+             else:
+                 raise RuntimeError(f"Queue type {self._queue_type} does not support put")
+
+             if success:
+                 self.logger.debug(f"Request enqueued successfully: {request.url} with priority {final_priority}")
+
+             return success
+
+         except Exception as e:
+             self.logger.error(f"Failed to enqueue request: {e}")
+             if self._queue_semaphore:
+                 self._queue_semaphore.release()
+             return False
+
+     async def get(self, timeout: float = 5.0) -> Optional["Request"]:
+         """Unified dequeue interface"""
+         if not self._queue:
+             raise RuntimeError("Queue not initialized")
+
+         try:
+             request = await self._queue.get(timeout=timeout)
+
+             # Release the semaphore (memory queue only)
+             if self._queue_semaphore and request:
+                 self._queue_semaphore.release()
+
+             # Deserialization handling (Redis queue only)
+             if request and self._queue_type == QueueType.REDIS:
+                 # A spider instance is needed here; return the raw request for now
+                 # The actual callback restoration is handled in the scheduler
+                 pass
+
+             # For the memory queue, unpack the (priority, request) tuple
+             if request and self._queue_type == QueueType.MEMORY:
+                 if isinstance(request, tuple) and len(request) == 2:
+                     request = request[1]  # take the request object from the tuple
+
+             return request
+
+         except Exception as e:
+             self.logger.error(f"Failed to dequeue request: {e}")
+             return None
+
+     async def size(self) -> int:
+         """Get queue size"""
+         if not self._queue:
+             return 0
+
+         try:
+             if hasattr(self._queue, 'qsize'):
+                 if asyncio.iscoroutinefunction(self._queue.qsize):
+                     return await self._queue.qsize()
+                 else:
+                     return self._queue.qsize()
+             return 0
+         except Exception as e:
+             self.logger.warning(f"Failed to get queue size: {e}")
+             return 0
+
+     def empty(self) -> bool:
+         """Check if queue is empty (synchronous version, for compatibility)"""
+         try:
+             # The memory queue can be checked synchronously
+             if self._queue_type == QueueType.MEMORY:
+                 # Make sure the queue size is checked correctly
+                 if hasattr(self._queue, 'qsize'):
+                     return self._queue.qsize() == 0
+                 else:
+                     # If there is no qsize method, assume the queue is empty
+                     return True
+             # For the Redis queue an async call would be needed, so this is an approximation:
+             # return True so the program can exit cleanly; callers should rely on the more precise async check
+             return True
+         except Exception:
+             return True
+
+     async def async_empty(self) -> bool:
+         """Check if queue is empty (asynchronous version, more accurate)"""
+         try:
+             # Memory queue
+             if self._queue_type == QueueType.MEMORY:
+                 # Make sure the queue size is checked correctly
+                 if hasattr(self._queue, 'qsize'):
+                     if asyncio.iscoroutinefunction(self._queue.qsize):
+                         size = await self._queue.qsize()
+                     else:
+                         size = self._queue.qsize()
+                     return size == 0
+                 else:
+                     # If there is no qsize method, assume the queue is empty
+                     return True
+             # Redis queue: use the async check
+             elif self._queue_type == QueueType.REDIS:
+                 size = await self.size()
+                 return size == 0
+             return True
+         except Exception:
+             return True
+
+     async def close(self) -> None:
+         """Close queue"""
+         if self._queue and hasattr(self._queue, 'close'):
+             try:
+                 await self._queue.close()
+                 # Change INFO level log to DEBUG level to avoid redundant output
+                 self.logger.debug("Queue closed")
+             except Exception as e:
+                 self.logger.warning(f"Error closing queue: {e}")
+
+     def get_status(self) -> Dict[str, Any]:
+         """Get queue status information"""
+         return {
+             "type": self._queue_type.value if self._queue_type else "unknown",
+             "health": self._health_status,
+             "config": self._get_queue_info(),
+             "initialized": self._queue is not None
+         }
+
+     async def _determine_queue_type(self) -> QueueType:
+         """Determine queue type"""
+         if self.config.queue_type == QueueType.AUTO:
+             # Auto selection: prefer Redis when available
+             if REDIS_AVAILABLE and self.config.redis_url:
+                 # Test the Redis connection
+                 try:
+                     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+                     test_queue = RedisPriorityQueue(self.config.redis_url)
+                     await test_queue.connect()
+                     await test_queue.close()
+                     self.logger.debug("Auto-detection: Redis available, using distributed queue")
+                     return QueueType.REDIS
+                 except Exception as e:
+                     self.logger.debug(f"Auto-detection: Redis unavailable ({e}), using memory queue")
+                     return QueueType.MEMORY
+             else:
+                 self.logger.debug("Auto-detection: Redis not configured, using memory queue")
+                 return QueueType.MEMORY
+
+         elif self.config.queue_type == QueueType.REDIS:
+             # With QUEUE_TYPE = 'redis', behavior is now equivalent to 'auto' mode:
+             # prefer Redis when available, fall back to the memory queue otherwise
+             if REDIS_AVAILABLE and self.config.redis_url:
+                 # Test the Redis connection
+                 try:
+                     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+                     test_queue = RedisPriorityQueue(self.config.redis_url)
+                     await test_queue.connect()
+                     await test_queue.close()
+                     self.logger.debug("Redis mode: Redis available, using distributed queue")
+                     return QueueType.REDIS
+                 except Exception as e:
+                     self.logger.debug(f"Redis mode: Redis unavailable ({e}), falling back to memory queue")
+                     return QueueType.MEMORY
+             else:
+                 self.logger.debug("Redis mode: Redis not configured, falling back to memory queue")
+                 return QueueType.MEMORY
+
+         elif self.config.queue_type == QueueType.MEMORY:
+             return QueueType.MEMORY
+
+         else:
+             raise ValueError(f"Unsupported queue type: {self.config.queue_type}")
+
+     async def _create_queue(self, queue_type: QueueType):
+         """Create queue instance"""
+         if queue_type == QueueType.REDIS:
+             # Lazily import the Redis queue
+             try:
+                 from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+             except ImportError as e:
+                 raise RuntimeError(f"Redis queue unavailable: failed to import RedisPriorityQueue ({e})")
+
+             # Fixed project-name extraction logic, strictly following the logic in the test files
+             project_name = "default"
+             if ':' in self.config.queue_name:
+                 parts = self.config.queue_name.split(':')
+                 if len(parts) >= 2:
+                     # Handle a possible double "crawlo" prefix
+                     if parts[0] == "crawlo" and parts[1] == "crawlo":
+                         # Double "crawlo" prefix: use "crawlo" as the project name
+                         project_name = "crawlo"
+                     elif parts[0] == "crawlo":
+                         # Normal "crawlo" prefix: use the second segment as the project name
+                         project_name = parts[1]
+                     else:
+                         # No "crawlo" prefix: use the first segment as the project name
+                         project_name = parts[0]
+                 else:
+                     project_name = self.config.queue_name or "default"
+             else:
+                 project_name = self.config.queue_name or "default"
+
+             queue = RedisPriorityQueue(
+                 redis_url=self.config.redis_url,
+                 queue_name=self.config.queue_name,
+                 max_retries=self.config.max_retries,
+                 timeout=self.config.timeout,
+                 module_name=project_name  # pass the project name as module_name
+             )
+             # No need to connect immediately; use lazy connect
+             return queue
+
+         elif queue_type == QueueType.MEMORY:
+             queue = SpiderPriorityQueue()
+             # Set up backpressure control for the memory queue
+             self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
+             return queue
+
+         else:
+             raise ValueError(f"Unsupported queue type: {queue_type}")
+
+     async def _health_check(self) -> bool:
+         """Health check"""
+         try:
+             if self._queue_type == QueueType.REDIS:
+                 # Test the Redis connection
+                 await self._queue.connect()
+                 self._health_status = "healthy"
+             else:
+                 # The memory queue is always healthy
+                 self._health_status = "healthy"
+                 return False  # the memory queue never needs a configuration update
+         except Exception as e:
+             self.logger.warning(f"Queue health check failed: {e}")
+             self._health_status = "unhealthy"
+             # If this is a Redis queue and the health check failed, try switching to the memory queue
+             # Fallback is allowed for both AUTO and REDIS modes
+             if self._queue_type == QueueType.REDIS and self.config.queue_type in [QueueType.AUTO, QueueType.REDIS]:
+                 self.logger.info("Redis queue unavailable, attempting to switch to memory queue...")
+                 try:
+                     await self._queue.close()
+                 except:
+                     pass
+                 self._queue = None
+                 # Recreate as a memory queue
+                 self._queue = await self._create_queue(QueueType.MEMORY)
+                 self._queue_type = QueueType.MEMORY
+                 self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
+                 self._health_status = "healthy"
+                 self.logger.info("Switched to memory queue")
+                 # Signal that the filter and dedup pipeline configuration needs updating
+                 return True
+             return False
+
+     def _get_queue_info(self) -> Dict[str, Any]:
+         """Get queue configuration information"""
+         info = {
+             "queue_name": self.config.queue_name,
+             "max_queue_size": self.config.max_queue_size
+         }
+
+         if self._queue_type == QueueType.REDIS:
+             info.update({
+                 "redis_url": self.config.redis_url,
+                 "max_retries": self.config.max_retries,
+                 "timeout": self.config.timeout
+             })
+
+         return info
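
The behavioral changes in this hunk are confined to _determine_queue_type and _health_check: in 1.4.1 an explicit QUEUE_TYPE = 'redis' raised RuntimeError when Redis was missing or unreachable, while in 1.4.3 it behaves like 'auto' and falls back to the memory queue, and _health_check now permits that fallback for both AUTO and REDIS modes. A minimal sketch of the difference follows, assuming no Redis server is reachable at the configured URL (the address below is deliberately dead and not from the package):

import asyncio

from crawlo.queue.queue_manager import QueueConfig, QueueManager, QueueType


async def main():
    config = QueueConfig(
        queue_type=QueueType.REDIS,            # explicitly request the Redis queue
        redis_url="redis://127.0.0.1:6399/0",  # assumed unreachable in this sketch
    )
    manager = QueueManager(config)

    # 1.4.1: _determine_queue_type raises RuntimeError; initialize() catches it,
    #        logs "Queue initialization failed", and leaves health at "error".
    # 1.4.3: _determine_queue_type falls back to the memory queue, so the
    #        manager comes up healthy and reports type "memory".
    await manager.initialize()
    print(manager.get_status())


asyncio.run(main())

On 1.4.3 the printed status is expected to report type "memory" with health "healthy"; deployments that relied on the 1.4.1 hard failure to detect a missing Redis should instead check get_status()["type"] after initialization.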