crawlo-1.1.3-py3-none-any.whl → crawlo-1.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +28 -1
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +61 -0
- crawlo/cleaners/data_formatter.py +226 -0
- crawlo/cleaners/encoding_converter.py +126 -0
- crawlo/cleaners/text_cleaner.py +233 -0
- crawlo/commands/startproject.py +117 -13
- crawlo/config.py +30 -0
- crawlo/config_validator.py +253 -0
- crawlo/core/engine.py +185 -11
- crawlo/core/scheduler.py +49 -78
- crawlo/crawler.py +6 -6
- crawlo/downloader/__init__.py +24 -0
- crawlo/downloader/aiohttp_downloader.py +8 -0
- crawlo/downloader/cffi_downloader.py +5 -0
- crawlo/downloader/hybrid_downloader.py +214 -0
- crawlo/downloader/playwright_downloader.py +403 -0
- crawlo/downloader/selenium_downloader.py +473 -0
- crawlo/extension/__init__.py +17 -10
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +27 -18
- crawlo/extension/log_stats.py +62 -24
- crawlo/extension/logging_extension.py +18 -9
- crawlo/extension/memory_monitor.py +105 -0
- crawlo/extension/performance_profiler.py +134 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/aioredis_filter.py +50 -12
- crawlo/middleware/proxy.py +26 -2
- crawlo/mode_manager.py +24 -19
- crawlo/network/request.py +30 -3
- crawlo/network/response.py +114 -25
- crawlo/pipelines/mongo_pipeline.py +81 -66
- crawlo/pipelines/mysql_pipeline.py +165 -43
- crawlo/pipelines/redis_dedup_pipeline.py +7 -3
- crawlo/queue/queue_manager.py +15 -2
- crawlo/queue/redis_priority_queue.py +144 -76
- crawlo/settings/default_settings.py +93 -121
- crawlo/subscriber.py +62 -37
- crawlo/templates/project/items.py.tmpl +1 -1
- crawlo/templates/project/middlewares.py.tmpl +73 -49
- crawlo/templates/project/pipelines.py.tmpl +51 -295
- crawlo/templates/project/settings.py.tmpl +93 -17
- crawlo/templates/project/settings_distributed.py.tmpl +120 -0
- crawlo/templates/project/settings_gentle.py.tmpl +95 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
- crawlo/templates/project/settings_simple.py.tmpl +69 -0
- crawlo/templates/spider/spider.py.tmpl +2 -38
- crawlo/tools/__init__.py +183 -0
- crawlo/tools/anti_crawler.py +269 -0
- crawlo/tools/authenticated_proxy.py +241 -0
- crawlo/tools/data_validator.py +181 -0
- crawlo/tools/date_tools.py +36 -0
- crawlo/tools/distributed_coordinator.py +387 -0
- crawlo/tools/retry_mechanism.py +221 -0
- crawlo/tools/scenario_adapter.py +263 -0
- crawlo/utils/__init__.py +29 -1
- crawlo/utils/batch_processor.py +261 -0
- crawlo/utils/date_tools.py +58 -1
- crawlo/utils/enhanced_error_handler.py +360 -0
- crawlo/utils/env_config.py +106 -0
- crawlo/utils/error_handler.py +126 -0
- crawlo/utils/performance_monitor.py +285 -0
- crawlo/utils/redis_connection_pool.py +335 -0
- crawlo/utils/redis_key_validator.py +200 -0
- crawlo-1.1.5.dist-info/METADATA +401 -0
- crawlo-1.1.5.dist-info/RECORD +185 -0
- tests/advanced_tools_example.py +276 -0
- tests/authenticated_proxy_example.py +237 -0
- tests/cleaners_example.py +161 -0
- tests/config_validation_demo.py +103 -0
- tests/date_tools_example.py +181 -0
- tests/dynamic_loading_example.py +524 -0
- tests/dynamic_loading_test.py +105 -0
- tests/env_config_example.py +134 -0
- tests/error_handling_example.py +172 -0
- tests/redis_key_validation_demo.py +131 -0
- tests/response_improvements_example.py +145 -0
- tests/test_advanced_tools.py +149 -0
- tests/test_all_redis_key_configs.py +146 -0
- tests/test_authenticated_proxy.py +142 -0
- tests/test_cleaners.py +55 -0
- tests/test_comprehensive.py +147 -0
- tests/test_config_validator.py +194 -0
- tests/test_date_tools.py +124 -0
- tests/test_dynamic_downloaders_proxy.py +125 -0
- tests/test_dynamic_proxy.py +93 -0
- tests/test_dynamic_proxy_config.py +147 -0
- tests/test_dynamic_proxy_real.py +110 -0
- tests/test_edge_cases.py +304 -0
- tests/test_enhanced_error_handler.py +271 -0
- tests/test_env_config.py +122 -0
- tests/test_error_handler_compatibility.py +113 -0
- tests/test_framework_env_usage.py +104 -0
- tests/test_integration.py +357 -0
- tests/test_item_dedup_redis_key.py +123 -0
- tests/test_parsel.py +30 -0
- tests/test_performance.py +328 -0
- tests/test_queue_manager_redis_key.py +177 -0
- tests/test_redis_connection_pool.py +295 -0
- tests/test_redis_key_naming.py +182 -0
- tests/test_redis_key_validator.py +124 -0
- tests/test_response_improvements.py +153 -0
- tests/test_simple_response.py +62 -0
- tests/test_telecom_spider_redis_key.py +206 -0
- tests/test_template_content.py +88 -0
- tests/test_template_redis_key.py +135 -0
- tests/test_tools.py +154 -0
- tests/tools_example.py +258 -0
- crawlo/core/enhanced_engine.py +0 -190
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
- {examples → tests}/controlled_spider_example.py +0 -0
crawlo/core/engine.py
CHANGED

@@ -1,6 +1,7 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
 import asyncio
+import time
 from inspect import iscoroutine
 from typing import Optional, Generator, Callable
 
@@ -31,6 +32,20 @@ class Engine(object):
         self.start_requests: Optional[Generator] = None
         self.task_manager: Optional[TaskManager] = TaskManager(self.settings.get_int('CONCURRENCY'))
 
+        # 增强控制参数
+        self.max_queue_size = self.settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 200)
+        self.generation_batch_size = self.settings.get_int('REQUEST_GENERATION_BATCH_SIZE', 10)
+        self.generation_interval = self.settings.get_float('REQUEST_GENERATION_INTERVAL', 0.05)
+        self.backpressure_ratio = self.settings.get_float('BACKPRESSURE_RATIO', 0.8)  # 队列达到80%时启动背压
+
+        # 状态跟踪
+        self._generation_paused = False
+        self._last_generation_time = 0
+        self._generation_stats = {
+            'total_generated': 0,
+            'backpressure_events': 0
+        }
+
         self.logger = get_logger(name=self.__class__.__name__)
 
     def _get_downloader_cls(self):
@@ -64,16 +79,25 @@ class Engine(object):
 
         self.scheduler = Scheduler.create_instance(self.crawler)
         if hasattr(self.scheduler, 'open'):
-            …
+            if asyncio.iscoroutinefunction(self.scheduler.open):
+                await self.scheduler.open()
+            else:
+                self.scheduler.open()
 
         downloader_cls = self._get_downloader_cls()
         self.downloader = downloader_cls(self.crawler)
         if hasattr(self.downloader, 'open'):
-            self.downloader.open
+            if asyncio.iscoroutinefunction(self.downloader.open):
+                await self.downloader.open()
+            else:
+                self.downloader.open()
 
         self.processor = Processor(self.crawler)
         if hasattr(self.processor, 'open'):
-            self.processor.open
+            if asyncio.iscoroutinefunction(self.processor.open):
+                await self.processor.open()
+            else:
+                self.processor.open()
 
         self.start_requests = iter(spider.start_requests())
         await self._open_spider()
@@ -81,14 +105,57 @@ class Engine(object):
     async def crawl(self):
         """
         Crawl the spider
+        增强版本支持智能请求生成和背压控制
         """
+        generation_task = None
+
+        try:
+            # 启动请求生成任务(如果启用了受控生成)
+            if (self.start_requests and
+                    self.settings.get_bool('ENABLE_CONTROLLED_REQUEST_GENERATION', False)):
+                generation_task = asyncio.create_task(
+                    self._controlled_request_generation()
+                )
+            else:
+                # 传统方式处理启动请求
+                generation_task = asyncio.create_task(
+                    self._traditional_request_generation()
+                )
+
+            # 主爬取循环
+            while self.running:
+                # 获取并处理请求
+                if request := await self._get_next_request():
+                    await self._crawl(request)
+
+                # 检查退出条件
+                if await self._should_exit():
+                    break
+
+                # 短暂休息避免忙等
+                await asyncio.sleep(0.001)
+
+        finally:
+            # 清理生成任务
+            if generation_task and not generation_task.done():
+                generation_task.cancel()
+                try:
+                    await generation_task
+                except asyncio.CancelledError:
+                    pass
+
+            await self.close_spider()
+
+    async def _traditional_request_generation(self):
+        """传统的请求生成方式(兼容旧版本)"""
         while self.running:
-            if request := await self._get_next_request():
-                await self._crawl(request)
             try:
                 start_request = next(self.start_requests)
+                # 请求入队
+                await self.enqueue_request(start_request)
             except StopIteration:
                 self.start_requests = None
+                break
             except Exception as exp:
                 # 1、发去请求的request全部运行完毕
                 # 2、调度器是否空闲
@@ -98,12 +165,99 @@ class Engine(object):
                 self.running = False
                 if self.start_requests is not None:
                     self.logger.error(f"启动请求时发生错误: {str(exp)}")
-
-            # 请求入队
-            await self.enqueue_request(start_request)
+            await asyncio.sleep(0.001)
 
-
-
+    async def _controlled_request_generation(self):
+        """受控的请求生成(增强功能)"""
+        self.logger.info("🎛️ 启动受控请求生成")
+
+        batch = []
+        total_generated = 0
+
+        try:
+            for request in self.start_requests:
+                batch.append(request)
+
+                # 批量处理
+                if len(batch) >= self.generation_batch_size:
+                    generated = await self._process_generation_batch(batch)
+                    total_generated += generated
+                    batch = []
+
+                    # 背压检查
+                    if await self._should_pause_generation():
+                        await self._wait_for_capacity()
+
+            # 处理剩余请求
+            if batch:
+                generated = await self._process_generation_batch(batch)
+                total_generated += generated
+
+        except Exception as e:
+            self.logger.error(f"❌ 请求生成失败: {e}")
+
+        finally:
+            self.start_requests = None
+            self.logger.info(f"🎉 请求生成完成,总计: {total_generated}")
+
+    async def _process_generation_batch(self, batch) -> int:
+        """处理一批请求"""
+        generated = 0
+
+        for request in batch:
+            if not self.running:
+                break
+
+            # 等待队列有空间
+            while await self._is_queue_full() and self.running:
+                await asyncio.sleep(0.1)
+
+            if self.running:
+                await self.enqueue_request(request)
+                generated += 1
+                self._generation_stats['total_generated'] += 1
+
+                # 控制生成速度
+                if self.generation_interval > 0:
+                    await asyncio.sleep(self.generation_interval)
+
+        return generated
+
+    async def _should_pause_generation(self) -> bool:
+        """判断是否应该暂停生成"""
+        # 检查队列大小
+        if await self._is_queue_full():
+            return True
+
+        # 检查任务管理器负载
+        if self.task_manager:
+            current_tasks = len(self.task_manager.current_task)
+            if hasattr(self.task_manager, 'semaphore'):
+                max_concurrency = getattr(self.task_manager.semaphore, '_initial_value', 8)
+                if current_tasks >= max_concurrency * self.backpressure_ratio:
+                    return True
+
+        return False
+
+    async def _is_queue_full(self) -> bool:
+        """检查队列是否已满"""
+        if not self.scheduler:
+            return False
+
+        queue_size = len(self.scheduler)
+        return queue_size >= self.max_queue_size * self.backpressure_ratio
+
+    async def _wait_for_capacity(self):
+        """等待系统有足够容量"""
+        self._generation_stats['backpressure_events'] += 1
+        self.logger.debug("⏸️ 触发背压,暂停请求生成")
+
+        wait_time = 0.1
+        max_wait = 2.0
+
+        while await self._should_pause_generation() and self.running:
+            await asyncio.sleep(wait_time)
+            wait_time = min(wait_time * 1.1, max_wait)
 
     async def _open_spider(self):
         asyncio.create_task(self.crawler.subscriber.notify(spider_opened))
@@ -164,9 +318,29 @@ class Engine(object):
             return True
         return False
 
+    async def _should_exit(self) -> bool:
+        """检查是否应该退出(增强版本)"""
+        # 没有启动请求,且所有队列都空闲
+        if (self.start_requests is None and
+                self.scheduler.idle() and
+                self.downloader.idle() and
+                self.task_manager.all_done() and
+                self.processor.idle()):
+            return True
+
+        return False
+
     async def close_spider(self):
         await asyncio.gather(*self.task_manager.current_task)
         await self.scheduler.close()
         await self.downloader.close()
         if self.normal:
-            await self.crawler.close()
+            await self.crawler.close()
+
+    def get_generation_stats(self) -> dict:
+        """获取生成统计"""
+        return {
+            **self._generation_stats,
+            'queue_size': len(self.scheduler) if self.scheduler else 0,
+            'active_tasks': len(self.task_manager.current_task) if self.task_manager else 0
+        }
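All of the new engine knobs are read with explicit fallbacks (`get_int(..., 200)`, `get_float(..., 0.8)`, and so on), so projects that never define them keep the previous behaviour; the controlled-generation path only activates when ENABLE_CONTROLLED_REQUEST_GENERATION is true, and get_generation_stats() then reports total_generated, backpressure_events, queue size and active task count. A minimal settings sketch, using only the key names and default values visible in the hunk above (whether a generated settings template already defines them is not shown here):

    # settings.py — opt in to controlled request generation (values shown are the engine's own fallbacks)
    ENABLE_CONTROLLED_REQUEST_GENERATION = True
    SCHEDULER_MAX_QUEUE_SIZE = 200        # with BACKPRESSURE_RATIO = 0.8, generation pauses near ~160 queued requests
    REQUEST_GENERATION_BATCH_SIZE = 10    # start requests are enqueued in batches of this size
    REQUEST_GENERATION_INTERVAL = 0.05    # seconds slept after each enqueued start request
    BACKPRESSURE_RATIO = 0.8              # queue/task load fraction at which generation backs off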
crawlo/core/scheduler.py
CHANGED

@@ -5,6 +5,7 @@ from typing import Optional, Callable
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import set_request
 from crawlo.utils.request_serializer import RequestSerializer
+from crawlo.utils.error_handler import ErrorHandler
 from crawlo.queue.queue_manager import QueueManager, QueueConfig
 from crawlo.project import load_class, common_call
 
@@ -16,6 +17,7 @@ class Scheduler:
         self.request_serializer = RequestSerializer()  # 专门处理序列化
 
         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
+        self.error_handler = ErrorHandler(self.__class__.__name__, log_level)
         self.stats = stats
         self.dupe_filter = dupe_filter
         self.priority = priority
@@ -34,6 +36,7 @@ class Scheduler:
 
     async def open(self):
         """初始化调度器和队列"""
+        self.logger.info("开始初始化调度器...")
         try:
             # 创建队列配置
             queue_config = QueueConfig.from_settings(self.crawler.settings)
@@ -42,6 +45,7 @@ class Scheduler:
             self.queue_manager = QueueManager(queue_config)
 
             # 初始化队列
+            self.logger.info("开始初始化队列管理器...")
             success = await self.queue_manager.initialize()
             if not success:
                 raise RuntimeError("队列初始化失败")
@@ -50,8 +54,10 @@ class Scheduler:
             status = self.queue_manager.get_status()
             self.logger.info(f'队列类型: {status["type"]}, 状态: {status["health"]}')
             self.logger.info(f'requesting filter: {self.dupe_filter}')
+            self.logger.info("调度器初始化完成")
         except Exception as e:
             self.logger.error(f"❌ 调度器初始化失败: {e}")
+            self.logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
             raise
 
     async def next_request(self):
@@ -59,14 +65,22 @@
         if not self.queue_manager:
             return None
 
-        …
+        try:
+            request = await self.queue_manager.get()
+
+            # 恢复 callback(从 Redis 队列取出时)
+            if request:
+                spider = getattr(self.crawler, 'spider', None)
+                request = self.request_serializer.restore_after_deserialization(request, spider)
 
+            return request
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="获取下一个请求失败",
+                raise_error=False
+            )
+            return None
 
     async def enqueue_request(self, request):
         """将请求加入队列"""
@@ -80,13 +94,21 @@
 
         set_request(request, self.priority)
 
-        …
+        try:
+            # 使用统一的队列接口
+            success = await self.queue_manager.put(request, priority=getattr(request, 'priority', 0))
+
+            if success:
+                self.logger.debug(f"✅ 请求入队成功: {request.url}")
+
+            return success
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="请求入队失败",
+                raise_error=False
+            )
+            return False
 
     def idle(self) -> bool:
         """检查队列是否为空"""
@@ -94,73 +116,22 @@
 
     async def close(self):
         """关闭调度器"""
-        …
+        try:
+            if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
+                await closed()
+
+            if self.queue_manager:
+                await self.queue_manager.close()
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="关闭调度器失败",
+                raise_error=False
+            )
 
     def __len__(self):
         """获取队列大小"""
         if not self.queue_manager:
             return 0
         # 返回同步的近似值,实际大小需要异步获取
-        return 0 if self.queue_manager.empty() else 1
-
-# #!/usr/bin/python
-# # -*- coding:UTF-8 -*-
-# from typing import Optional, Callable
-#
-# from crawlo.utils.log import get_logger
-# from crawlo.utils.request import set_request
-# from crawlo.utils.pqueue import SpiderPriorityQueue
-# from crawlo.project import load_class, common_call
-#
-#
-# class Scheduler:
-#     def __init__(self, crawler, dupe_filter, stats, log_level, priority):
-#         self.crawler = crawler
-#         self.request_queue: Optional[SpiderPriorityQueue] = None
-#
-#         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
-#         self.stats = stats
-#         self.dupe_filter = dupe_filter
-#         self.priority = priority
-#
-#     @classmethod
-#     def create_instance(cls, crawler):
-#         filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
-#         o = cls(
-#             crawler=crawler,
-#             dupe_filter=filter_cls.create_instance(crawler),
-#             stats=crawler.stats,
-#             log_level=crawler.settings.get('LOG_LEVEL'),
-#             priority=crawler.settings.get('DEPTH_PRIORITY')
-#         )
-#         return o
-#
-#     def open(self):
-#         self.request_queue = SpiderPriorityQueue()
-#         self.logger.info(f'requesting filter: {self.dupe_filter}')
-#
-#     async def next_request(self):
-#         request = await self.request_queue.get()
-#         return request
-#
-#     async def enqueue_request(self, request):
-#         if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
-#             self.dupe_filter.log_stats(request)
-#             return False
-#         set_request(request, self.priority)
-#         await self.request_queue.put(request)
-#         return True
-#
-#     def idle(self) -> bool:
-#         return len(self) == 0
-#
-#     async def close(self):
-#         if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
-#             await closed()
-#
-#     def __len__(self):
-#         return self.request_queue.qsize()
+        return 0 if self.queue_manager.empty() else 1
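next_request, enqueue_request and close now route failures through the new ErrorHandler (added in crawlo/utils/error_handler.py, whose body is not shown in this diff) with raise_error=False, so a queue hiccup degrades to a logged error plus a None/False return instead of crashing the engine loop. The snippet below only illustrates the calling convention visible in the hunk — a component name and log level at construction, then handle_error(exc, context=..., raise_error=...) — and is not the actual crawlo implementation:

    import logging

    class IllustrativeErrorHandler:
        """Stand-in with the same call shape the Scheduler uses; crawlo's real ErrorHandler may differ."""

        def __init__(self, name: str, log_level="INFO"):
            self.logger = logging.getLogger(name)
            self.logger.setLevel(log_level)

        def handle_error(self, exc: Exception, context: str = "", raise_error: bool = True):
            # Log the failure together with where it happened; optionally swallow it
            # so the caller can return a fallback value instead of propagating.
            self.logger.error("%s: %r", context or "unhandled error", exc)
            if raise_error:
                raise exc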
crawlo/crawler.py
CHANGED

@@ -139,7 +139,7 @@ class Crawler:
 
     async def crawl(self):
         """
-
+        启动爬虫核心流程
 
         包含以下阶段:
         1. 初始化阶段: 创建所有组件
@@ -391,7 +391,7 @@
 
 class CrawlerProcess:
     """
-
+    爬虫进程管理器
 
     支持功能:
     - 多爬虫并发调度和资源管理
@@ -586,7 +586,7 @@ class CrawlerProcess:
 
     async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
         """
-
+        启动一个或多个爬虫
 
         增强功能:
         - 智能并发控制
@@ -856,7 +856,7 @@ class CrawlerProcess:
 
     def _shutdown(self, _signum, _frame):
         """
-
+        优雅关闭信号处理
 
         提供更好的关闭体验和资源清理
         """
@@ -881,7 +881,7 @@ class CrawlerProcess:
 
     async def _wait_for_shutdown(self):
         """
-
+        等待所有活跃任务完成
 
         提供更好的关闭时间控制和进度反馈
         """
@@ -935,7 +935,7 @@ class CrawlerProcess:
     @classmethod
     def _get_default_settings(cls) -> SettingManager:
         """
-
+        加载默认配置
 
         提供更好的错误处理和降级策略
         """
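The crawler.py changes are documentation-only: previously empty docstring summary lines receive one-line summaries. The signature shown for CrawlerProcess.crawl confirms it accepts a Spider class, a spider name, or a list mixing both; a hedged usage sketch follows (the CrawlerProcess constructor arguments are not part of this diff and are assumed to have workable defaults):

    import asyncio
    from crawlo.crawler import CrawlerProcess

    async def main():
        process = CrawlerProcess()            # constructor arguments assumed/default
        await process.crawl(["my_spider"])    # a Spider subclass, a spider name, or a list of either

    if __name__ == "__main__":
        asyncio.run(main())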
crawlo/downloader/__init__.py
CHANGED

@@ -209,6 +209,21 @@ try:
 except ImportError:
     HttpXDownloader = None
 
+try:
+    from .selenium_downloader import SeleniumDownloader
+except ImportError:
+    SeleniumDownloader = None
+
+try:
+    from .playwright_downloader import PlaywrightDownloader
+except ImportError:
+    PlaywrightDownloader = None
+
+try:
+    from .hybrid_downloader import HybridDownloader
+except ImportError:
+    HybridDownloader = None
+
 # 导出所有可用的类
 __all__ = [
     'DownloaderBase',
@@ -223,6 +238,12 @@ if CurlCffiDownloader:
     __all__.append('CurlCffiDownloader')
 if HttpXDownloader:
     __all__.append('HttpXDownloader')
+if SeleniumDownloader:
+    __all__.append('SeleniumDownloader')
+if PlaywrightDownloader:
+    __all__.append('PlaywrightDownloader')
+if HybridDownloader:
+    __all__.append('HybridDownloader')
 
 # 提供便捷的下载器映射
 DOWNLOADER_MAP = {
@@ -230,6 +251,9 @@ DOWNLOADER_MAP = {
     'httpx': HttpXDownloader,
     'curl_cffi': CurlCffiDownloader,
     'cffi': CurlCffiDownloader,  # 别名
+    'selenium': SeleniumDownloader,
+    'playwright': PlaywrightDownloader,
+    'hybrid': HybridDownloader,
 }
 
 # 过滤掉不可用的下载器
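Because each new downloader is imported inside try/except ImportError and the module then filters unavailable entries, callers can probe DOWNLOADER_MAP to see what the current environment supports. A small sketch (the exact filtering behaviour beyond this hunk is assumed: an unavailable downloader shows up as a missing key or a None value):

    from crawlo.downloader import DOWNLOADER_MAP

    def available_downloaders():
        # Keep only entries whose optional dependency imported successfully.
        return {name: cls for name, cls in DOWNLOADER_MAP.items() if cls is not None}

    if __name__ == "__main__":
        names = sorted(available_downloaders())
        print("usable downloaders:", ", ".join(names))   # e.g. httpx, curl_cffi, selenium, playwright, hybrid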
crawlo/downloader/aiohttp_downloader.py
CHANGED

@@ -162,6 +162,14 @@ class AioHttpDownloader(DownloaderBase):
         except Exception as e:
             raise ValueError(f"Invalid proxy URL: {proxy}") from e
 
+        # 处理通过meta传递的代理认证信息
+        meta_proxy_auth = request.meta.get("proxy_auth")
+        if meta_proxy_auth and isinstance(meta_proxy_auth, dict):
+            username = meta_proxy_auth.get("username")
+            password = meta_proxy_auth.get("password")
+            if username and password:
+                kwargs["proxy_auth"] = BasicAuth(username, password)
+
         # === 处理请求体 ===
         if hasattr(request, "_json_body") and request._json_body is not None:
             kwargs["json"] = request._json_body
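The aiohttp path now also honours credentials placed in request.meta["proxy_auth"] (a dict with "username" and "password"), converting them into a BasicAuth object. A hedged sketch of setting this from spider code — the Request constructor and import path are inferred from the file list above, and how the proxy URL itself is configured is handled elsewhere (e.g. the proxy middleware); only the "proxy_auth" meta key comes from the hunk:

    from crawlo.network.request import Request   # import path inferred from the file list

    def proxied_request(url: str) -> Request:
        request = Request(url=url)                 # constructor signature assumed
        request.meta["proxy_auth"] = {             # read by AioHttpDownloader in the hunk above
            "username": "proxy_user",
            "password": "proxy_pass",
        }
        return request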
crawlo/downloader/cffi_downloader.py
CHANGED

@@ -210,6 +210,11 @@ class CurlCffiDownloader(DownloaderBase):
         else:
             self.logger.error(f"不支持的 proxy 类型: {type(proxy)},值: {proxy}")
 
+        # 处理通过meta传递的代理认证信息
+        proxy_auth_header = request.headers.get("Proxy-Authorization") or request.meta.get("proxy_auth_header")
+        if proxy_auth_header:
+            kwargs["headers"]["Proxy-Authorization"] = proxy_auth_header
+
         # 请求体处理
         if hasattr(request, "_json_body") and request._json_body is not None:
             kwargs["json"] = request._json_body
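Unlike the aiohttp path, the curl_cffi path expects a ready-made Proxy-Authorization value rather than a credentials dict: it copies whatever it finds in request.headers["Proxy-Authorization"] or request.meta["proxy_auth_header"] onto the outgoing headers. A sketch of building such a value with the standard library (only the two lookup keys come from the hunk; attaching it to a request follows the same assumptions as the aiohttp example above):

    import base64

    def basic_proxy_auth_header(username: str, password: str) -> str:
        # RFC 7617 basic credentials: "Basic " + base64("user:pass")
        token = base64.b64encode(f"{username}:{password}".encode("utf-8")).decode("ascii")
        return f"Basic {token}"

    # Then, on a request destined for CurlCffiDownloader (keys taken from the hunk above):
    #   request.headers["Proxy-Authorization"] = basic_proxy_auth_header("proxy_user", "proxy_pass")
    # or
    #   request.meta["proxy_auth_header"] = basic_proxy_auth_header("proxy_user", "proxy_pass")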