crawlo 1.3.2-py3-none-any.whl → 1.3.4-py3-none-any.whl
This diff reflects the changes between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- crawlo/__init__.py +24 -0
- crawlo/__version__.py +1 -1
- crawlo/commands/run.py +58 -32
- crawlo/core/__init__.py +44 -0
- crawlo/core/engine.py +119 -45
- crawlo/core/scheduler.py +4 -3
- crawlo/crawler.py +603 -1133
- crawlo/downloader/aiohttp_downloader.py +4 -2
- crawlo/extension/__init__.py +1 -1
- crawlo/extension/logging_extension.py +23 -7
- crawlo/factories/__init__.py +28 -0
- crawlo/factories/base.py +69 -0
- crawlo/factories/crawler.py +104 -0
- crawlo/factories/registry.py +85 -0
- crawlo/filters/aioredis_filter.py +25 -2
- crawlo/framework.py +292 -0
- crawlo/initialization/__init__.py +40 -0
- crawlo/initialization/built_in.py +426 -0
- crawlo/initialization/context.py +142 -0
- crawlo/initialization/core.py +194 -0
- crawlo/initialization/phases.py +149 -0
- crawlo/initialization/registry.py +146 -0
- crawlo/items/base.py +2 -1
- crawlo/logging/__init__.py +38 -0
- crawlo/logging/config.py +97 -0
- crawlo/logging/factory.py +129 -0
- crawlo/logging/manager.py +112 -0
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/middleware/offsite.py +1 -1
- crawlo/mode_manager.py +26 -1
- crawlo/pipelines/pipeline_manager.py +2 -1
- crawlo/project.py +76 -46
- crawlo/queue/pqueue.py +11 -5
- crawlo/queue/queue_manager.py +143 -19
- crawlo/queue/redis_priority_queue.py +69 -49
- crawlo/settings/default_settings.py +110 -14
- crawlo/settings/setting_manager.py +29 -13
- crawlo/spider/__init__.py +34 -16
- crawlo/stats_collector.py +17 -3
- crawlo/task_manager.py +112 -3
- crawlo/templates/project/settings.py.tmpl +103 -202
- crawlo/templates/project/settings_distributed.py.tmpl +122 -135
- crawlo/templates/project/settings_gentle.py.tmpl +149 -43
- crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
- crawlo/templates/project/settings_minimal.py.tmpl +46 -15
- crawlo/templates/project/settings_simple.py.tmpl +138 -75
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
- crawlo/templates/run.py.tmpl +10 -14
- crawlo/templates/spiders_init.py.tmpl +10 -0
- crawlo/tools/network_diagnostic.py +365 -0
- crawlo/utils/class_loader.py +26 -0
- crawlo/utils/error_handler.py +76 -35
- crawlo/utils/log.py +41 -144
- crawlo/utils/redis_connection_pool.py +43 -6
- crawlo/utils/request_serializer.py +8 -1
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
- tests/authenticated_proxy_example.py +2 -2
- tests/baidu_performance_test.py +109 -0
- tests/baidu_test.py +60 -0
- tests/comprehensive_framework_test.py +213 -0
- tests/comprehensive_test.py +82 -0
- tests/comprehensive_testing_summary.md +187 -0
- tests/debug_configure.py +70 -0
- tests/debug_framework_logger.py +85 -0
- tests/debug_log_levels.py +64 -0
- tests/distributed_test.py +67 -0
- tests/distributed_test_debug.py +77 -0
- tests/final_command_test_report.md +0 -0
- tests/final_comprehensive_test.py +152 -0
- tests/final_validation_test.py +183 -0
- tests/framework_performance_test.py +203 -0
- tests/optimized_performance_test.py +212 -0
- tests/performance_comparison.py +246 -0
- tests/queue_blocking_test.py +114 -0
- tests/queue_test.py +90 -0
- tests/scrapy_comparison/ofweek_scrapy.py +139 -0
- tests/scrapy_comparison/scrapy_test.py +134 -0
- tests/simple_command_test.py +120 -0
- tests/simple_crawlo_test.py +128 -0
- tests/simple_log_test.py +58 -0
- tests/simple_optimization_test.py +129 -0
- tests/simple_spider_test.py +50 -0
- tests/simple_test.py +48 -0
- tests/test_all_commands.py +231 -0
- tests/test_batch_processor.py +179 -0
- tests/test_component_factory.py +175 -0
- tests/test_controlled_spider_mixin.py +80 -0
- tests/test_enhanced_error_handler_comprehensive.py +246 -0
- tests/test_factories.py +253 -0
- tests/test_framework_logger.py +67 -0
- tests/test_framework_startup.py +65 -0
- tests/test_large_scale_config.py +113 -0
- tests/test_large_scale_helper.py +236 -0
- tests/test_mode_change.py +73 -0
- tests/test_mode_consistency.py +1 -1
- tests/test_performance_monitor.py +116 -0
- tests/test_queue_empty_check.py +42 -0
- tests/untested_features_report.md +139 -0
- tests/verify_debug.py +52 -0
- tests/verify_log_fix.py +112 -0
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
crawlo/__init__.py
CHANGED

@@ -3,6 +3,8 @@
 """
 Crawlo - an asynchronous crawler framework
 """
+from typing import TYPE_CHECKING
+
 from crawlo.spider import Spider
 from crawlo.items import Item, Field
 from crawlo.network.request import Request
@@ -24,9 +26,29 @@ from crawlo.utils import (
 )
 from crawlo import tools
 
+# Framework core modules - use TYPE_CHECKING to avoid circular imports
+if TYPE_CHECKING:
+    from crawlo.core.framework_initializer import get_framework_initializer, initialize_framework
+
 # For backward compatibility, import the cleaners-related functionality from tools
 import crawlo.tools as cleaners
 
+# Helper functions with lazy imports
+def get_framework_initializer():
+    """Lazily import get_framework_initializer to avoid circular dependencies"""
+    from crawlo.core.framework_initializer import get_framework_initializer as _get_framework_initializer
+    return _get_framework_initializer()
+
+def initialize_framework(custom_settings=None):
+    """Lazily import initialize_framework to avoid circular dependencies"""
+    from crawlo.core.framework_initializer import initialize_framework as _initialize_framework
+    return _initialize_framework(custom_settings)
+
+# Backward-compatible alias
+def get_bootstrap_manager():
+    """Backward-compatible alias"""
+    return get_framework_initializer()
+
 # Version number: prefer reading from package metadata
 try:
     from importlib.metadata import version
@@ -60,5 +82,7 @@ __all__ = [
     'from_timestamp_with_tz',
     'cleaners',
     'tools',
+    'get_framework_initializer',
+    'get_bootstrap_manager',
     '__version__',
 ]
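The new helpers in `crawlo/__init__.py` break an import cycle by deferring the real import to call time, while the `TYPE_CHECKING` block keeps type checkers informed at no runtime cost. A minimal, self-contained sketch of the same pattern (the `myframework` names are hypothetical, not part of crawlo):

```python
# myframework/__init__.py - hypothetical package illustrating the lazy-import pattern
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers, never executed at runtime,
    # so it cannot participate in an import cycle.
    from myframework.core import Initializer


def get_initializer() -> "Initializer":
    """Defer the submodule import to the first call, after this
    package has finished importing, which breaks the cycle."""
    from myframework.core import Initializer  # local, lazy import
    return Initializer()
```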
crawlo/__version__.py
CHANGED

@@ -1 +1 @@
-__version__ = '1.3.2'
+__version__ = '1.3.4'
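The only change here is the version bump to 1.3.4. The `__init__.py` hunk above pairs this file with a metadata-first lookup (`from importlib.metadata import version` inside a `try` block); a sketch of that common arrangement, as an illustration rather than crawlo's exact code:

```python
# Resolve the package version: prefer the installed distribution's
# metadata, fall back to a hardcoded string when running from a
# source checkout that has not been installed.
try:
    from importlib.metadata import version
    __version__ = version("crawlo")
except Exception:
    __version__ = "1.3.4"  # mirrors crawlo/__version__.py
```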
crawlo/commands/run.py
CHANGED

@@ -21,10 +21,23 @@ from rich.text import Text
 from crawlo.commands.stats import record_stats
 from crawlo.crawler import CrawlerProcess
 from crawlo.project import get_settings, _find_project_root
-#
+# Use the new unified initialization system
+from crawlo.initialization import initialize_framework
+from crawlo.core import get_framework_initializer
 from crawlo.utils.log import get_logger
 
-logger
+# Obtain the logger lazily, after the logging system has been configured
+_logger = None
+
+
+def logger():
+    """Lazily fetch the logger instance, ensuring the logging system is configured first"""
+    global _logger
+    if _logger is None:
+        # With the improved logging system, a logger can safely be created at any time
+        _logger = get_logger(__name__)
+    return _logger
+
 
 console = Console()
 
@@ -35,15 +48,15 @@ def check_redis_connection(settings):
     # Check whether distributed mode is enabled
     run_mode = settings.get('RUN_MODE', 'standalone')
     queue_type = settings.get('QUEUE_TYPE', 'memory')
-
+
     if run_mode == 'distributed' or queue_type == 'redis':
         import redis.asyncio as redis
         redis_url = settings.get('REDIS_URL', 'redis://127.0.0.1:6379/0')
        redis_host = settings.get('REDIS_HOST', '127.0.0.1')
         redis_port = settings.get('REDIS_PORT', 6379)
-
+
         console.print(f"Checking Redis connection: {redis_host}:{redis_port}")
-
+
         # Create a Redis connection for testing
         async def _test_redis():
             try:
@@ -54,11 +67,11 @@ def check_redis_connection(settings):
             except Exception as e:
                 console.print(f"Redis connection failed: {e}")
                 return False
-
+
         # Run the async test
         if not asyncio.run(_test_redis()):
             raise ConnectionError(f"Unable to connect to Redis server {redis_host}:{redis_port}")
-
+
         console.print("Redis connection OK")
         return True
     else:
@@ -78,11 +91,15 @@ def main(args):
     Usage:
         crawlo run <spider_name>|all [--json] [--no-stats]
     """
+    # Make sure the framework is initialized
+    init_manager = get_framework_initializer()
+
     # Add debug information
-    logger.debug("DEBUG: entering the main function")
-
+    logger().debug("DEBUG: entering the main function")
+
     if len(args) < 1:
-        console.print("[bold red]Usage:[/bold red] [blue]crawlo run[/blue] <spider_name>|all [bold yellow][--json] [--no-stats][/bold yellow]")
+        console.print(
+            "[bold red]Usage:[/bold red] [blue]crawlo run[/blue] <spider_name>|all [bold yellow][--json] [--no-stats][/bold yellow]")
         console.print("Examples:")
         console.print("  [blue]crawlo run baidu[/blue]")
         console.print("  [blue]crawlo run all[/blue]")
@@ -153,9 +170,9 @@ def main(args):
         console.print(Panel(msg, title="Import error", border_style="red"))
         return 1
 
-    # 4.
-    settings =
-
+    # 4. Start the framework and load the settings
+    settings = initialize_framework()
+
     # Check the Redis connection (when in distributed mode)
     if not check_redis_connection(settings):
         if show_json:
@@ -163,9 +180,22 @@ def main(args):
             return 1
         else:
             return 1
-
-
+
+    # Get SPIDER_MODULES from the configuration
+    spider_modules = settings.get('SPIDER_MODULES', [f"{project_package}.spiders"])
+    logger().debug(f"SPIDER_MODULES from settings: {spider_modules}")
     process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+    # Manually importing spider modules is no longer needed; the framework handles it internally
+    # Inspect the spiders in the registry
+    from crawlo.spider import get_global_spider_registry
+    registry = get_global_spider_registry()
+    spider_names = list(registry.keys())
+    logger().debug(f"Registered spiders after import: {spider_names}")
+
+    # Debug information
+    logger().debug(f"SPIDER_MODULES: {spider_modules}")
+    logger().debug(f"Available spiders: {process.get_spider_names()}")
 
     # === Case 1: run all spiders ===
     if spider_arg.lower() == "all":
@@ -193,19 +223,14 @@ def main(args):
         # Show the list of spiders about to run
        # Per user request, the detailed spider list is no longer printed
 
-        # Register stats recording (unless --no-stats)
-        if not no_stats:
-            for crawler in process.crawlers:
-                crawler.signals.connect(record_stats, signal="spider_closed")
-
         # Run all spiders in parallel
         with Progress(
-
-
-
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=True,
         ) as progress:
             task = progress.add_task("Running all spiders...", total=None)
-            asyncio.run(process.
+            asyncio.run(process.crawl_multiple(spider_names))
 
         if show_json:
             console.print_json(data={"success": True, "spiders": spider_names})
@@ -267,15 +292,16 @@ def main(args):
         # console.print()
 
         # Register stats recording
-
-
-
+        # Note: CrawlerProcess has no crawlers attribute; registration would need to happen at runtime
+        # if not no_stats:
+        #     for crawler in process.crawlers:
+        #         crawler.signals.connect(record_stats, signal="spider_closed")
 
         # Run the spider
         with Progress(
-
-
-
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=True,
         ) as progress:
             task = progress.add_task(f"Running {spider_name}...", total=None)
             asyncio.run(process.crawl(spider_name))
@@ -298,7 +324,7 @@ def main(args):
         console.print(f"[bold yellow]{msg}[/bold yellow]")
         return 1
     except Exception as e:
-        logger.exception("Exception during 'crawlo run'")
+        logger().exception("Exception during 'crawlo run'")
         msg = f"Unexpected error: {e}"
         if show_json:
             console.print_json(data={"success": False, "error": msg})
@@ -312,4 +338,4 @@ if __name__ == "__main__":
     Supports direct execution:
        python -m crawlo.commands.run spider_name
     """
-    sys.exit(main(sys.argv[1:]))
+    sys.exit(main(sys.argv[1:]))
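The `logger()` accessor above replaces a module-level `logger` created at import time. The point of the indirection is ordering: the logger is built on first call, after the logging system has been configured, and then cached. The same memoization pattern using only the standard library:

```python
import logging

_logger = None  # module-level cache, populated on first use


def logger() -> logging.Logger:
    """Return the module logger, creating it lazily so that any logging
    configuration applied during startup takes effect before creation."""
    global _logger
    if _logger is None:
        _logger = logging.getLogger(__name__)
    return _logger


logging.basicConfig(level=logging.DEBUG)  # configuration happens first
logger().debug("logger created only after configuration")
```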
crawlo/core/__init__.py
CHANGED

@@ -1,2 +1,46 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
+
+# Crawlo core module
+# Provides the framework's core components and initialization facilities
+
+# Use the new initialization system
+from ..initialization import (
+    initialize_framework,
+    is_framework_ready
+)
+
+# Backward-compatible aliases
+def async_initialize_framework(*args, **kwargs):
+    """Async wrapper for framework initialization"""
+    return initialize_framework(*args, **kwargs)
+
+def get_framework_initializer():
+    """Get framework initializer - compatibility function"""
+    from ..initialization.core import CoreInitializer
+    return CoreInitializer()
+
+def get_framework_logger(name='crawlo.core'):
+    """Get framework logger - compatibility function"""
+    from ..logging import get_logger
+    return get_logger(name)
+
+# Backward compatibility
+def bootstrap_framework(*args, **kwargs):
+    """Bootstrap framework - compatibility function"""
+    return initialize_framework(*args, **kwargs)
+
+def get_bootstrap_manager():
+    """Get bootstrap manager - compatibility function"""
+    return get_framework_initializer()
+
+__all__ = [
+    'initialize_framework',
+    'async_initialize_framework',
+    'get_framework_initializer',
+    'is_framework_ready',
+    'get_framework_logger',
+    # Backward compatibility
+    'bootstrap_framework',
+    'get_bootstrap_manager'
+]
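The rewritten module keeps the old entry points (`bootstrap_framework`, `get_bootstrap_manager`) alive as thin wrappers over the new initialization API, so existing call sites keep working. Assuming the signatures shown in this hunk and the `settings = initialize_framework()` usage in `run.py`, old and new spellings should be interchangeable:

```python
# Old and new spellings resolve to the same underlying machinery;
# the legacy names are thin compatibility wrappers.
from crawlo.core import initialize_framework, bootstrap_framework

settings = initialize_framework()        # new-style entry point
settings_compat = bootstrap_framework()  # legacy alias, same behavior
```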
crawlo/core/engine.py
CHANGED

@@ -6,16 +6,16 @@ from inspect import iscoroutine
 from typing import Optional, Generator, Callable
 
 from crawlo import Request, Item
-from crawlo.spider import Spider
-from crawlo.utils.log import get_logger
-from crawlo.exceptions import OutputError
-from crawlo.core.scheduler import Scheduler
 from crawlo.core.processor import Processor
-from crawlo.task_manager import TaskManager
-from crawlo.project import load_class
+from crawlo.core.scheduler import Scheduler
 from crawlo.downloader import DownloaderBase
-from crawlo.utils.func_tools import transform
 from crawlo.event import spider_opened, spider_error, request_scheduled
+from crawlo.exceptions import OutputError
+from crawlo.utils.class_loader import load_class
+from crawlo.spider import Spider
+from crawlo.task_manager import TaskManager
+from crawlo.utils.func_tools import transform
+from crawlo.utils.log import get_logger
 
 
 class Engine(object):
@@ -35,8 +35,8 @@ class Engine(object):
         # Enhanced control parameters
         self.max_queue_size = self.settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 200)
         self.generation_batch_size = self.settings.get_int('REQUEST_GENERATION_BATCH_SIZE', 10)
-        self.generation_interval = self.settings.get_float('REQUEST_GENERATION_INTERVAL', 0.
-        self.backpressure_ratio = self.settings.get_float('BACKPRESSURE_RATIO', 0.
+        self.generation_interval = self.settings.get_float('REQUEST_GENERATION_INTERVAL', 0.01)  # optimized default
+        self.backpressure_ratio = self.settings.get_float('BACKPRESSURE_RATIO', 0.9)  # optimized default
 
         # State tracking
         self._generation_paused = False
@@ -74,9 +74,7 @@ class Engine(object):
         if not version or version == 'None':
             version = '1.0.0'
         # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
-        self.logger.debug(
-            f"Crawlo Started version {version}"
-        )
+        self.logger.debug(f"Crawlo Framework Started {version}")
 
     async def start_spider(self, spider):
         self.spider = spider
@@ -110,7 +108,20 @@ class Engine(object):
         if not hasattr(self.crawler, 'extension') or not self.crawler.extension:
             self.crawler.extension = self.crawler._create_extension()
 
-
+        # Start the engine
+        self.engine_start()
+
+        self.logger.debug("Creating the start_requests iterator")
+        try:
+            # Collect all requests into a list first, so checks do not consume the iterator
+            requests_list = list(spider.start_requests())
+            self.logger.debug(f"Collected {len(requests_list)} requests")
+            self.start_requests = iter(requests_list)
+            self.logger.debug("start_requests iterator created successfully")
+        except Exception as e:
+            self.logger.error(f"Failed to create the start_requests iterator: {e}")
+            import traceback
+            self.logger.error(traceback.format_exc())
         await self._open_spider()
 
     async def crawl(self):
@@ -124,32 +135,46 @@ class Engine(object):
             # Start the request generation task (when controlled generation is enabled)
             if (self.start_requests and
                     self.settings.get_bool('ENABLE_CONTROLLED_REQUEST_GENERATION', False)):
+                self.logger.debug("Creating controlled request generation task")
                 generation_task = asyncio.create_task(
                     self._controlled_request_generation()
                 )
             else:
                 # Handle start requests the traditional way
+                self.logger.debug("Creating traditional request generation task")
                 generation_task = asyncio.create_task(
                     self._traditional_request_generation()
                 )
 
+            self.logger.debug("Request generation task created")
+
             # Main crawl loop
+            loop_count = 0
+            last_exit_check = 0  # when the exit condition was last checked
+            exit_check_interval = 1  # check the exit condition on every iteration, raising the check frequency further
+
             while self.running:
+                loop_count += 1
                 # Fetch and process a request
                 if request := await self._get_next_request():
                     await self._crawl(request)
 
-                #
-                if
-
+                # Tune how often the exit condition is checked
+                if loop_count - last_exit_check >= exit_check_interval:
+                    should_exit = await self._should_exit()
+                    if should_exit:
+                        self.logger.debug("Exit condition met, preparing to leave the loop")
+                        break
+                    last_exit_check = loop_count
 
-                #
-                await asyncio.sleep(0.
+                # Sleep briefly to avoid busy-waiting, but keep the pause short for efficiency
+                await asyncio.sleep(0.000001)  # reduced from 0.00001 to 0.000001
+
+            self.logger.debug(f"Main crawl loop finished after {loop_count} iterations")
 
         finally:
-            #
+            # Make sure the request generation task finishes
             if generation_task and not generation_task.done():
-                generation_task.cancel()
                 try:
                     await generation_task
                 except asyncio.CancelledError:
@@ -159,15 +184,24 @@ class Engine(object):
 
     async def _traditional_request_generation(self):
         """Traditional request generation method (compatible with older versions)"""
+        self.logger.debug("Starting traditional request generation")
+        processed_count = 0
         while self.running:
             try:
                 start_request = next(self.start_requests)
+                self.logger.debug(f"Got request: {getattr(start_request, 'url', 'Unknown URL')}")
                 # Enqueue the request
                 await self.enqueue_request(start_request)
+                processed_count += 1
+                self.logger.debug(f"Requests processed so far: {processed_count}")
             except StopIteration:
+                self.logger.debug("All start requests processed")
                 self.start_requests = None
                 break
             except Exception as exp:
+                self.logger.error(f"Exception while processing a request: {exp}")
+                import traceback
+                self.logger.error(traceback.format_exc())
                 # 1. All requests have been processed
                 # 2. Is scheduler idle
                 # 3. Is downloader idle
@@ -176,11 +210,13 @@ class Engine(object):
                 self.running = False
                 if self.start_requests is not None:
                     self.logger.error(f"Error occurred while starting request: {str(exp)}")
-
+                # Shorter wait to improve efficiency
+                await asyncio.sleep(0.00001)  # reduced from 0.0001 to 0.00001
+        self.logger.debug(f"Traditional request generation finished; {processed_count} requests processed in total")
 
     async def _controlled_request_generation(self):
         """Controlled request generation (enhanced features)"""
-        self.logger.
+        self.logger.debug("Starting controlled request generation")
 
         batch = []
         total_generated = 0
@@ -209,7 +245,7 @@ class Engine(object):
 
         finally:
             self.start_requests = None
-            self.logger.
+            self.logger.debug(f"Request generation completed, total: {total_generated}")
 
     async def _process_generation_batch(self, batch) -> int:
         """Process a batch of requests"""
@@ -221,14 +257,14 @@ class Engine(object):
 
             # Wait for space in the queue
             while await self._is_queue_full() and self.running:
-                await asyncio.sleep(0.
+                await asyncio.sleep(0.01)  # shorter wait
 
             if self.running:
                 await self.enqueue_request(request)
                 generated += 1
                 self._generation_stats['total_generated'] += 1
 
-            #
+            # Throttle the generation rate, but with a smaller interval
             if self.generation_interval > 0:
                 await asyncio.sleep(self.generation_interval)
 
@@ -263,8 +299,8 @@ class Engine(object):
         self._generation_stats['backpressure_events'] += 1
         self.logger.debug("Backpressure triggered, pausing request generation")
 
-        wait_time = 0.
-        max_wait =
+        wait_time = 0.01  # smaller initial wait
+        max_wait = 1.0  # smaller maximum wait
 
         while await self._should_pause_generation() and self.running:
             await asyncio.sleep(wait_time)
@@ -272,16 +308,38 @@ class Engine(object):
 
     async def _open_spider(self):
         asyncio.create_task(self.crawler.subscriber.notify(spider_opened))
-
-        await
+        # Call crawl directly instead of creating a task, to make sure completion is awaited
+        await self.crawl()
 
     async def _crawl(self, request):
-        # TODO implement concurrency
         async def crawl_task():
-
-
-
-
+            start_time = time.time()
+            try:
+                outputs = await self._fetch(request)
+                # Record the response time
+                response_time = time.time() - start_time
+                if self.task_manager:
+                    self.task_manager.record_response_time(response_time)
+
+                # TODO handle output
+                if outputs:
+                    await self._handle_spider_output(outputs)
+            except Exception as e:
+                # Log detailed exception information
+                self.logger.error(
+                    f"Failed to process request: {getattr(request, 'url', 'Unknown URL')} - {type(e).__name__}: {e}"
+                )
+                self.logger.debug(f"Detailed exception info", exc_info=True)
+
+                # Emit stats events
+                if hasattr(self.crawler, 'stats'):
+                    self.crawler.stats.inc_value('downloader/exception_count')
+                    self.crawler.stats.inc_value(f'downloader/exception_type_count/{type(e).__name__}')
+                    if hasattr(request, 'url'):
+                        self.crawler.stats.inc_value(f'downloader/failed_urls_count')
+
+                # Do not re-raise, to avoid unhandled Task exceptions
+                return None
 
         # Create an async task, respecting the concurrency limit
         await self.task_manager.create_task(crawl_task())
@@ -331,31 +389,47 @@ class Engine(object):
 
     async def _should_exit(self) -> bool:
        """Check whether the engine should exit"""
+        self.logger.debug(f"Checking exit condition: start_requests={self.start_requests is not None}")
         # No start requests, and all queues are idle
         if self.start_requests is None:
+            self.logger.debug("start_requests is None, checking the other components")
             # Use the async idle check for a more precise result
             scheduler_idle = await self.scheduler.async_idle() if hasattr(self.scheduler, 'async_idle') else self.scheduler.idle()
+            downloader_idle = self.downloader.idle()
+            task_manager_done = self.task_manager.all_done()
+            processor_idle = self.processor.idle()
+
+            self.logger.debug(f"Component states - Scheduler: {scheduler_idle}, Downloader: {downloader_idle}, TaskManager: {task_manager_done}, Processor: {processor_idle}")
 
             if (scheduler_idle and
-
-
-
-                #
-                await
-
-
-
-
+                    downloader_idle and
+                    task_manager_done and
+                    processor_idle):
+                # Re-check immediately, without waiting
+                scheduler_idle = await self.scheduler.async_idle() if hasattr(self.scheduler, 'async_idle') else self.scheduler.idle()
+                downloader_idle = self.downloader.idle()
+                task_manager_done = self.task_manager.all_done()
+                processor_idle = self.processor.idle()
+
+                self.logger.debug(f"Second check of component states - Scheduler: {scheduler_idle}, Downloader: {downloader_idle}, TaskManager: {task_manager_done}, Processor: {processor_idle}")
+
+                if (scheduler_idle and
+                        downloader_idle and
+                        task_manager_done and
+                        processor_idle):
+                    self.logger.info("All components idle, preparing to exit")
                 return True
+        else:
+            self.logger.debug("start_requests is not None, not exiting")
 
         return False
 
     async def close_spider(self):
+        # No longer calls crawler.close(), to avoid duplicate cleanup
+        # Cleanup should be handled by the crawler's _lifecycle_manager context manager
         await asyncio.gather(*self.task_manager.current_task)
         await self.scheduler.close()
         await self.downloader.close()
-        if self.normal:
-            await self.crawler.close()
 
     def get_generation_stats(self) -> dict:
         """Get generation statistics"""
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/python
|
|
2
2
|
# -*- coding:UTF-8 -*-
|
|
3
|
-
from typing import Optional, Callable
|
|
4
3
|
import traceback
|
|
4
|
+
from typing import Optional, Callable
|
|
5
5
|
|
|
6
6
|
from crawlo.utils.log import get_logger
|
|
7
7
|
from crawlo.utils.request import set_request
|
|
8
|
-
from crawlo.utils.request_serializer import RequestSerializer
|
|
9
8
|
from crawlo.utils.error_handler import ErrorHandler
|
|
9
|
+
from crawlo.utils.class_loader import load_class
|
|
10
|
+
from crawlo.project import common_call
|
|
11
|
+
from crawlo.utils.request_serializer import RequestSerializer
|
|
10
12
|
from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
|
|
11
|
-
from crawlo.project import load_class, common_call
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
class Scheduler:
|