crawlo 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of crawlo has been flagged as potentially problematic.

Files changed (115)
  1. crawlo/__init__.py +28 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/commands/startproject.py +117 -13
  8. crawlo/config.py +30 -0
  9. crawlo/config_validator.py +253 -0
  10. crawlo/core/engine.py +185 -11
  11. crawlo/core/scheduler.py +49 -78
  12. crawlo/crawler.py +6 -6
  13. crawlo/downloader/__init__.py +24 -0
  14. crawlo/downloader/aiohttp_downloader.py +8 -0
  15. crawlo/downloader/cffi_downloader.py +5 -0
  16. crawlo/downloader/hybrid_downloader.py +214 -0
  17. crawlo/downloader/playwright_downloader.py +403 -0
  18. crawlo/downloader/selenium_downloader.py +473 -0
  19. crawlo/extension/__init__.py +17 -10
  20. crawlo/extension/health_check.py +142 -0
  21. crawlo/extension/log_interval.py +27 -18
  22. crawlo/extension/log_stats.py +62 -24
  23. crawlo/extension/logging_extension.py +18 -9
  24. crawlo/extension/memory_monitor.py +105 -0
  25. crawlo/extension/performance_profiler.py +134 -0
  26. crawlo/extension/request_recorder.py +108 -0
  27. crawlo/filters/aioredis_filter.py +50 -12
  28. crawlo/middleware/proxy.py +26 -2
  29. crawlo/mode_manager.py +24 -19
  30. crawlo/network/request.py +30 -3
  31. crawlo/network/response.py +114 -25
  32. crawlo/pipelines/mongo_pipeline.py +81 -66
  33. crawlo/pipelines/mysql_pipeline.py +165 -43
  34. crawlo/pipelines/redis_dedup_pipeline.py +7 -3
  35. crawlo/queue/queue_manager.py +15 -2
  36. crawlo/queue/redis_priority_queue.py +144 -76
  37. crawlo/settings/default_settings.py +93 -121
  38. crawlo/subscriber.py +62 -37
  39. crawlo/templates/project/items.py.tmpl +1 -1
  40. crawlo/templates/project/middlewares.py.tmpl +73 -49
  41. crawlo/templates/project/pipelines.py.tmpl +51 -295
  42. crawlo/templates/project/settings.py.tmpl +93 -17
  43. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  44. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  45. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  46. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  47. crawlo/templates/spider/spider.py.tmpl +2 -38
  48. crawlo/tools/__init__.py +183 -0
  49. crawlo/tools/anti_crawler.py +269 -0
  50. crawlo/tools/authenticated_proxy.py +241 -0
  51. crawlo/tools/data_validator.py +181 -0
  52. crawlo/tools/date_tools.py +36 -0
  53. crawlo/tools/distributed_coordinator.py +387 -0
  54. crawlo/tools/retry_mechanism.py +221 -0
  55. crawlo/tools/scenario_adapter.py +263 -0
  56. crawlo/utils/__init__.py +29 -1
  57. crawlo/utils/batch_processor.py +261 -0
  58. crawlo/utils/date_tools.py +58 -1
  59. crawlo/utils/enhanced_error_handler.py +360 -0
  60. crawlo/utils/env_config.py +106 -0
  61. crawlo/utils/error_handler.py +126 -0
  62. crawlo/utils/performance_monitor.py +285 -0
  63. crawlo/utils/redis_connection_pool.py +335 -0
  64. crawlo/utils/redis_key_validator.py +200 -0
  65. crawlo-1.1.5.dist-info/METADATA +401 -0
  66. crawlo-1.1.5.dist-info/RECORD +185 -0
  67. tests/advanced_tools_example.py +276 -0
  68. tests/authenticated_proxy_example.py +237 -0
  69. tests/cleaners_example.py +161 -0
  70. tests/config_validation_demo.py +103 -0
  71. tests/date_tools_example.py +181 -0
  72. tests/dynamic_loading_example.py +524 -0
  73. tests/dynamic_loading_test.py +105 -0
  74. tests/env_config_example.py +134 -0
  75. tests/error_handling_example.py +172 -0
  76. tests/redis_key_validation_demo.py +131 -0
  77. tests/response_improvements_example.py +145 -0
  78. tests/test_advanced_tools.py +149 -0
  79. tests/test_all_redis_key_configs.py +146 -0
  80. tests/test_authenticated_proxy.py +142 -0
  81. tests/test_cleaners.py +55 -0
  82. tests/test_comprehensive.py +147 -0
  83. tests/test_config_validator.py +194 -0
  84. tests/test_date_tools.py +124 -0
  85. tests/test_dynamic_downloaders_proxy.py +125 -0
  86. tests/test_dynamic_proxy.py +93 -0
  87. tests/test_dynamic_proxy_config.py +147 -0
  88. tests/test_dynamic_proxy_real.py +110 -0
  89. tests/test_edge_cases.py +304 -0
  90. tests/test_enhanced_error_handler.py +271 -0
  91. tests/test_env_config.py +122 -0
  92. tests/test_error_handler_compatibility.py +113 -0
  93. tests/test_framework_env_usage.py +104 -0
  94. tests/test_integration.py +357 -0
  95. tests/test_item_dedup_redis_key.py +123 -0
  96. tests/test_parsel.py +30 -0
  97. tests/test_performance.py +328 -0
  98. tests/test_queue_manager_redis_key.py +177 -0
  99. tests/test_redis_connection_pool.py +295 -0
  100. tests/test_redis_key_naming.py +182 -0
  101. tests/test_redis_key_validator.py +124 -0
  102. tests/test_response_improvements.py +153 -0
  103. tests/test_simple_response.py +62 -0
  104. tests/test_telecom_spider_redis_key.py +206 -0
  105. tests/test_template_content.py +88 -0
  106. tests/test_template_redis_key.py +135 -0
  107. tests/test_tools.py +154 -0
  108. tests/tools_example.py +258 -0
  109. crawlo/core/enhanced_engine.py +0 -190
  110. crawlo-1.1.3.dist-info/METADATA +0 -635
  111. crawlo-1.1.3.dist-info/RECORD +0 -113
  112. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
  113. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
  114. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
  115. {examples → tests}/controlled_spider_example.py +0 -0
crawlo/core/engine.py CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
 import asyncio
+import time
 from inspect import iscoroutine
 from typing import Optional, Generator, Callable
 
@@ -31,6 +32,20 @@ class Engine(object):
         self.start_requests: Optional[Generator] = None
         self.task_manager: Optional[TaskManager] = TaskManager(self.settings.get_int('CONCURRENCY'))
 
+        # Enhanced control parameters
+        self.max_queue_size = self.settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 200)
+        self.generation_batch_size = self.settings.get_int('REQUEST_GENERATION_BATCH_SIZE', 10)
+        self.generation_interval = self.settings.get_float('REQUEST_GENERATION_INTERVAL', 0.05)
+        self.backpressure_ratio = self.settings.get_float('BACKPRESSURE_RATIO', 0.8)  # apply backpressure once the queue reaches 80%
+
+        # State tracking
+        self._generation_paused = False
+        self._last_generation_time = 0
+        self._generation_stats = {
+            'total_generated': 0,
+            'backpressure_events': 0
+        }
+
         self.logger = get_logger(name=self.__class__.__name__)
 
     def _get_downloader_cls(self):
@@ -64,16 +79,25 @@ class Engine(object):
 
         self.scheduler = Scheduler.create_instance(self.crawler)
         if hasattr(self.scheduler, 'open'):
-            await self.scheduler.open()
+            if asyncio.iscoroutinefunction(self.scheduler.open):
+                await self.scheduler.open()
+            else:
+                self.scheduler.open()
 
         downloader_cls = self._get_downloader_cls()
         self.downloader = downloader_cls(self.crawler)
         if hasattr(self.downloader, 'open'):
-            self.downloader.open()
+            if asyncio.iscoroutinefunction(self.downloader.open):
+                await self.downloader.open()
+            else:
+                self.downloader.open()
 
         self.processor = Processor(self.crawler)
         if hasattr(self.processor, 'open'):
-            self.processor.open()
+            if asyncio.iscoroutinefunction(self.processor.open):
+                await self.processor.open()
+            else:
+                self.processor.open()
 
         self.start_requests = iter(spider.start_requests())
         await self._open_spider()
@@ -81,14 +105,57 @@ class Engine(object):
     async def crawl(self):
         """
         Crawl the spider
+        The enhanced version supports controlled request generation and backpressure
         """
+        generation_task = None
+
+        try:
+            # Start the request-generation task (if controlled generation is enabled)
+            if (self.start_requests and
+                    self.settings.get_bool('ENABLE_CONTROLLED_REQUEST_GENERATION', False)):
+                generation_task = asyncio.create_task(
+                    self._controlled_request_generation()
+                )
+            else:
+                # Handle start requests the traditional way
+                generation_task = asyncio.create_task(
+                    self._traditional_request_generation()
+                )
+
+            # Main crawl loop
+            while self.running:
+                # Fetch and process the next request
+                if request := await self._get_next_request():
+                    await self._crawl(request)
+
+                # Check the exit condition
+                if await self._should_exit():
+                    break
+
+                # Sleep briefly to avoid busy-waiting
+                await asyncio.sleep(0.001)
+
+        finally:
+            # Clean up the generation task
+            if generation_task and not generation_task.done():
+                generation_task.cancel()
+                try:
+                    await generation_task
+                except asyncio.CancelledError:
+                    pass
+
+            await self.close_spider()
+
+    async def _traditional_request_generation(self):
+        """Traditional request generation (kept for backward compatibility)"""
         while self.running:
-            if request := await self._get_next_request():
-                await self._crawl(request)
             try:
                 start_request = next(self.start_requests)
+                # Enqueue the request
+                await self.enqueue_request(start_request)
             except StopIteration:
                 self.start_requests = None
+                break
             except Exception as exp:
                # 1. All dispatched start requests have finished running
                # 2. Whether the scheduler is idle
@@ -98,12 +165,99 @@ class Engine(object):
                 self.running = False
                 if self.start_requests is not None:
                     self.logger.error(f"Error occurred while starting requests: {str(exp)}")
-            else:
-                # Enqueue the request
-                await self.enqueue_request(start_request)
+            await asyncio.sleep(0.001)
 
-        if not self.running:
-            await self.close_spider()
+    async def _controlled_request_generation(self):
+        """Controlled request generation (enhanced feature)"""
+        self.logger.info("🎛️ Starting controlled request generation")
+
+        batch = []
+        total_generated = 0
+
+        try:
+            for request in self.start_requests:
+                batch.append(request)
+
+                # Process in batches
+                if len(batch) >= self.generation_batch_size:
+                    generated = await self._process_generation_batch(batch)
+                    total_generated += generated
+                    batch = []
+
+                    # Backpressure check
+                    if await self._should_pause_generation():
+                        await self._wait_for_capacity()
+
+            # Process any remaining requests
+            if batch:
+                generated = await self._process_generation_batch(batch)
+                total_generated += generated
+
+        except Exception as e:
+            self.logger.error(f"❌ Request generation failed: {e}")
+
+        finally:
+            self.start_requests = None
+            self.logger.info(f"🎉 Request generation finished, total: {total_generated}")
+
+    async def _process_generation_batch(self, batch) -> int:
+        """Process one batch of requests"""
+        generated = 0
+
+        for request in batch:
+            if not self.running:
+                break
+
+            # Wait until the queue has room
+            while await self._is_queue_full() and self.running:
+                await asyncio.sleep(0.1)
+
+            if self.running:
+                await self.enqueue_request(request)
+                generated += 1
+                self._generation_stats['total_generated'] += 1
+
+                # Throttle the generation rate
+                if self.generation_interval > 0:
+                    await asyncio.sleep(self.generation_interval)
+
+        return generated
+
+    async def _should_pause_generation(self) -> bool:
+        """Decide whether generation should pause"""
+        # Check the queue size
+        if await self._is_queue_full():
+            return True
+
+        # Check the task-manager load
+        if self.task_manager:
+            current_tasks = len(self.task_manager.current_task)
+            if hasattr(self.task_manager, 'semaphore'):
+                max_concurrency = getattr(self.task_manager.semaphore, '_initial_value', 8)
+                if current_tasks >= max_concurrency * self.backpressure_ratio:
+                    return True
+
+        return False
+
+    async def _is_queue_full(self) -> bool:
+        """Check whether the queue is full"""
+        if not self.scheduler:
+            return False
+
+        queue_size = len(self.scheduler)
+        return queue_size >= self.max_queue_size * self.backpressure_ratio
+
+    async def _wait_for_capacity(self):
+        """Wait until the system has spare capacity"""
+        self._generation_stats['backpressure_events'] += 1
+        self.logger.debug("⏸️ Backpressure triggered, pausing request generation")
+
+        wait_time = 0.1
+        max_wait = 2.0
+
+        while await self._should_pause_generation() and self.running:
+            await asyncio.sleep(wait_time)
+            wait_time = min(wait_time * 1.1, max_wait)
 
     async def _open_spider(self):
         asyncio.create_task(self.crawler.subscriber.notify(spider_opened))
@@ -164,9 +318,29 @@ class Engine(object):
                 return True
         return False
 
+    async def _should_exit(self) -> bool:
+        """Check whether the engine should exit (enhanced version)"""
+        # No start requests left and every component is idle
+        if (self.start_requests is None and
+                self.scheduler.idle() and
+                self.downloader.idle() and
+                self.task_manager.all_done() and
+                self.processor.idle()):
+            return True
+
+        return False
+
     async def close_spider(self):
         await asyncio.gather(*self.task_manager.current_task)
         await self.scheduler.close()
         await self.downloader.close()
         if self.normal:
-            await self.crawler.close()
+            await self.crawler.close()
+
+    def get_generation_stats(self) -> dict:
+        """Return generation statistics"""
+        return {
+            **self._generation_stats,
+            'queue_size': len(self.scheduler) if self.scheduler else 0,
+            'active_tasks': len(self.task_manager.current_task) if self.task_manager else 0
+        }
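
Every parameter the new engine code reads has a default baked into its `settings.get_*` call, so the controlled-generation path is opt-in. A minimal sketch of the relevant keys in a project's settings module, assuming the stock `settings.py` generated by `crawlo startproject` is where they belong; the values mirror the defaults in the diff above, except the enable flag, which defaults to False:

# settings.py -- illustrative only; the setting names are taken from the engine diff above
ENABLE_CONTROLLED_REQUEST_GENERATION = True   # switch crawl() onto _controlled_request_generation()
SCHEDULER_MAX_QUEUE_SIZE = 200                # queue ceiling checked by _is_queue_full()
REQUEST_GENERATION_BATCH_SIZE = 10            # start requests enqueued per batch
REQUEST_GENERATION_INTERVAL = 0.05            # seconds slept between enqueued requests
BACKPRESSURE_RATIO = 0.8                      # pause generation at 80% of queue/concurrency capacity
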
crawlo/core/scheduler.py CHANGED
@@ -5,6 +5,7 @@ from typing import Optional, Callable
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import set_request
 from crawlo.utils.request_serializer import RequestSerializer
+from crawlo.utils.error_handler import ErrorHandler
 from crawlo.queue.queue_manager import QueueManager, QueueConfig
 from crawlo.project import load_class, common_call
 
@@ -16,6 +17,7 @@ class Scheduler:
         self.request_serializer = RequestSerializer()  # dedicated to serialization handling
 
         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
+        self.error_handler = ErrorHandler(self.__class__.__name__, log_level)
         self.stats = stats
         self.dupe_filter = dupe_filter
         self.priority = priority
@@ -34,6 +36,7 @@ class Scheduler:
 
     async def open(self):
         """Initialize the scheduler and its queue"""
+        self.logger.info("Initializing scheduler...")
         try:
             # Create the queue configuration
             queue_config = QueueConfig.from_settings(self.crawler.settings)
@@ -42,6 +45,7 @@ class Scheduler:
             self.queue_manager = QueueManager(queue_config)
 
             # Initialize the queue
+            self.logger.info("Initializing queue manager...")
             success = await self.queue_manager.initialize()
             if not success:
                 raise RuntimeError("Queue initialization failed")
@@ -50,8 +54,10 @@ class Scheduler:
             status = self.queue_manager.get_status()
             self.logger.info(f'Queue type: {status["type"]}, status: {status["health"]}')
             self.logger.info(f'requesting filter: {self.dupe_filter}')
+            self.logger.info("Scheduler initialization complete")
         except Exception as e:
             self.logger.error(f"❌ Scheduler initialization failed: {e}")
+            self.logger.debug(f"Detailed traceback:\n{traceback.format_exc()}")
             raise
 
     async def next_request(self):
@@ -59,14 +65,22 @@ class Scheduler:
         if not self.queue_manager:
             return None
 
-        request = await self.queue_manager.get()
-
-        # Restore the callback (when dequeued from the Redis queue)
-        if request:
-            spider = getattr(self.crawler, 'spider', None)
-            request = self.request_serializer.restore_after_deserialization(request, spider)
+        try:
+            request = await self.queue_manager.get()
+
+            # Restore the callback (when dequeued from the Redis queue)
+            if request:
+                spider = getattr(self.crawler, 'spider', None)
+                request = self.request_serializer.restore_after_deserialization(request, spider)
 
-        return request
+            return request
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="Failed to get the next request",
+                raise_error=False
+            )
+            return None
 
     async def enqueue_request(self, request):
         """Enqueue a request"""
@@ -80,13 +94,21 @@ class Scheduler:
 
         set_request(request, self.priority)
 
-        # Use the unified queue interface
-        success = await self.queue_manager.put(request, priority=getattr(request, 'priority', 0))
-
-        if success:
-            self.logger.debug(f"✅ Request enqueued: {request.url}")
-
-        return success
+        try:
+            # Use the unified queue interface
+            success = await self.queue_manager.put(request, priority=getattr(request, 'priority', 0))
+
+            if success:
+                self.logger.debug(f"✅ Request enqueued: {request.url}")
+
+            return success
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="Failed to enqueue request",
+                raise_error=False
+            )
+            return False
 
     def idle(self) -> bool:
         """Check whether the queue is empty"""
@@ -94,73 +116,22 @@ class Scheduler:
 
     async def close(self):
         """Close the scheduler"""
-        if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
-            await closed()
-
-        if self.queue_manager:
-            await self.queue_manager.close()
+        try:
+            if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
+                await closed()
+
+            if self.queue_manager:
+                await self.queue_manager.close()
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="Failed to close the scheduler",
+                raise_error=False
+            )
 
     def __len__(self):
         """Return the queue size"""
         if not self.queue_manager:
             return 0
         # Return a synchronous approximation; the real size must be fetched asynchronously
-        return 0 if self.queue_manager.empty() else 1
-
-        # #!/usr/bin/python
-        # # -*- coding:UTF-8 -*-
-        # from typing import Optional, Callable
-        #
-        # from crawlo.utils.log import get_logger
-        # from crawlo.utils.request import set_request
-        # from crawlo.utils.pqueue import SpiderPriorityQueue
-        # from crawlo.project import load_class, common_call
-        #
-        #
-        # class Scheduler:
-        #     def __init__(self, crawler, dupe_filter, stats, log_level, priority):
-        #         self.crawler = crawler
-        #         self.request_queue: Optional[SpiderPriorityQueue] = None
-        #
-        #         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
-        #         self.stats = stats
-        #         self.dupe_filter = dupe_filter
-        #         self.priority = priority
-        #
-        #     @classmethod
-        #     def create_instance(cls, crawler):
-        #         filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
-        #         o = cls(
-        #             crawler=crawler,
-        #             dupe_filter=filter_cls.create_instance(crawler),
-        #             stats=crawler.stats,
-        #             log_level=crawler.settings.get('LOG_LEVEL'),
-        #             priority=crawler.settings.get('DEPTH_PRIORITY')
-        #         )
-        #         return o
-        #
-        #     def open(self):
-        #         self.request_queue = SpiderPriorityQueue()
-        #         self.logger.info(f'requesting filter: {self.dupe_filter}')
-        #
-        #     async def next_request(self):
-        #         request = await self.request_queue.get()
-        #         return request
-        #
-        #     async def enqueue_request(self, request):
-        #         if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
-        #             self.dupe_filter.log_stats(request)
-        #             return False
-        #         set_request(request, self.priority)
-        #         await self.request_queue.put(request)
-        #         return True
-        #
-        #     def idle(self) -> bool:
-        #         return len(self) == 0
-        #
-        #     async def close(self):
-        #         if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
-        #             await closed()
-        #
-        #     def __len__(self):
-        #         return self.request_queue.qsize()
+        return 0 if self.queue_manager.empty() else 1
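
The net effect of the scheduler changes is that `next_request`, `enqueue_request`, and `close` no longer propagate queue-backend exceptions; they log through the new `ErrorHandler` and fall back to `None` / `False`. A rough sketch of that pattern, using only the constructor and the `handle_error(e, context=..., raise_error=False)` signature visible in the diff; everything else here is illustrative:

from crawlo.utils.error_handler import ErrorHandler

handler = ErrorHandler("Scheduler", "INFO")  # component name and log level, as in Scheduler.__init__

try:
    raise RuntimeError("redis connection refused")  # stand-in for a failing queue_manager call
except Exception as e:
    # Log with context and swallow the exception instead of re-raising
    handler.handle_error(e, context="Failed to get the next request", raise_error=False)
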
crawlo/crawler.py CHANGED
@@ -139,7 +139,7 @@ class Crawler:
 
     async def crawl(self):
         """
-        Start the core crawl workflow (enhanced version)
+        Start the core crawl workflow
 
         The workflow includes the following phases:
         1. Initialization: create all components
@@ -391,7 +391,7 @@
 
 class CrawlerProcess:
     """
-    Crawler process manager (enhanced version)
+    Crawler process manager
 
     Supported features:
     - Concurrent scheduling and resource management for multiple spiders
@@ -586,7 +586,7 @@ class CrawlerProcess:
 
     async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
         """
-        Start one or more spiders (enhanced version)
+        Start one or more spiders
 
         Enhancements:
         - Intelligent concurrency control
@@ -856,7 +856,7 @@ class CrawlerProcess:
 
     def _shutdown(self, _signum, _frame):
         """
-        Graceful-shutdown signal handling (enhanced version)
+        Graceful-shutdown signal handling
 
         Provides a smoother shutdown experience and resource cleanup
         """
@@ -881,7 +881,7 @@ class CrawlerProcess:
 
     async def _wait_for_shutdown(self):
         """
-        Wait for all active tasks to finish (enhanced version)
+        Wait for all active tasks to finish
 
         Provides better control of shutdown timing and progress feedback
         """
@@ -935,7 +935,7 @@ class CrawlerProcess:
     @classmethod
     def _get_default_settings(cls) -> SettingManager:
         """
-        Load default settings (enhanced version)
+        Load default settings
 
         Provides better error handling and fallback behavior
         """
crawlo/downloader/__init__.py CHANGED
@@ -209,6 +209,21 @@ try:
 except ImportError:
     HttpXDownloader = None
 
+try:
+    from .selenium_downloader import SeleniumDownloader
+except ImportError:
+    SeleniumDownloader = None
+
+try:
+    from .playwright_downloader import PlaywrightDownloader
+except ImportError:
+    PlaywrightDownloader = None
+
+try:
+    from .hybrid_downloader import HybridDownloader
+except ImportError:
+    HybridDownloader = None
+
 # Export all available classes
 __all__ = [
     'DownloaderBase',
@@ -223,6 +238,12 @@ if CurlCffiDownloader:
     __all__.append('CurlCffiDownloader')
 if HttpXDownloader:
     __all__.append('HttpXDownloader')
+if SeleniumDownloader:
+    __all__.append('SeleniumDownloader')
+if PlaywrightDownloader:
+    __all__.append('PlaywrightDownloader')
+if HybridDownloader:
+    __all__.append('HybridDownloader')
 
 # Convenience mapping of downloader names to classes
 DOWNLOADER_MAP = {
@@ -230,6 +251,9 @@ DOWNLOADER_MAP = {
     'httpx': HttpXDownloader,
     'curl_cffi': CurlCffiDownloader,
     'cffi': CurlCffiDownloader,  # alias
+    'selenium': SeleniumDownloader,
+    'playwright': PlaywrightDownloader,
+    'hybrid': HybridDownloader,
 }
 
 # Filter out unavailable downloaders
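
Because the three new downloaders are imported under `try/except ImportError` and the map is then filtered, which of them is actually usable depends on the optional Selenium/Playwright dependencies being installed. A quick availability check, assuming `DOWNLOADER_MAP` holds only the successfully imported classes after the filtering step shown above:

from crawlo.downloader import DOWNLOADER_MAP

# Keys such as 'selenium', 'playwright', or 'hybrid' appear only when their class imported cleanly
print("available downloaders:", sorted(DOWNLOADER_MAP))
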
crawlo/downloader/aiohttp_downloader.py CHANGED
@@ -162,6 +162,14 @@ class AioHttpDownloader(DownloaderBase):
         except Exception as e:
             raise ValueError(f"Invalid proxy URL: {proxy}") from e
 
+        # Handle proxy auth credentials passed via request.meta
+        meta_proxy_auth = request.meta.get("proxy_auth")
+        if meta_proxy_auth and isinstance(meta_proxy_auth, dict):
+            username = meta_proxy_auth.get("username")
+            password = meta_proxy_auth.get("password")
+            if username and password:
+                kwargs["proxy_auth"] = BasicAuth(username, password)
+
         # === Handle the request body ===
         if hasattr(request, "_json_body") and request._json_body is not None:
             kwargs["json"] = request._json_body
crawlo/downloader/cffi_downloader.py CHANGED
@@ -210,6 +210,11 @@ class CurlCffiDownloader(DownloaderBase):
         else:
             self.logger.error(f"Unsupported proxy type: {type(proxy)}, value: {proxy}")
 
+        # Handle proxy auth information passed via headers or request.meta
+        proxy_auth_header = request.headers.get("Proxy-Authorization") or request.meta.get("proxy_auth_header")
+        if proxy_auth_header:
+            kwargs["headers"]["Proxy-Authorization"] = proxy_auth_header
+
         # Handle the request body
         if hasattr(request, "_json_body") and request._json_body is not None:
             kwargs["json"] = request._json_body