crawlo 1.3.2__py3-none-any.whl → 1.3.4__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.

Potentially problematic release.

Files changed (105)
  1. crawlo/__init__.py +24 -0
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/run.py +58 -32
  4. crawlo/core/__init__.py +44 -0
  5. crawlo/core/engine.py +119 -45
  6. crawlo/core/scheduler.py +4 -3
  7. crawlo/crawler.py +603 -1133
  8. crawlo/downloader/aiohttp_downloader.py +4 -2
  9. crawlo/extension/__init__.py +1 -1
  10. crawlo/extension/logging_extension.py +23 -7
  11. crawlo/factories/__init__.py +28 -0
  12. crawlo/factories/base.py +69 -0
  13. crawlo/factories/crawler.py +104 -0
  14. crawlo/factories/registry.py +85 -0
  15. crawlo/filters/aioredis_filter.py +25 -2
  16. crawlo/framework.py +292 -0
  17. crawlo/initialization/__init__.py +40 -0
  18. crawlo/initialization/built_in.py +426 -0
  19. crawlo/initialization/context.py +142 -0
  20. crawlo/initialization/core.py +194 -0
  21. crawlo/initialization/phases.py +149 -0
  22. crawlo/initialization/registry.py +146 -0
  23. crawlo/items/base.py +2 -1
  24. crawlo/logging/__init__.py +38 -0
  25. crawlo/logging/config.py +97 -0
  26. crawlo/logging/factory.py +129 -0
  27. crawlo/logging/manager.py +112 -0
  28. crawlo/middleware/middleware_manager.py +1 -1
  29. crawlo/middleware/offsite.py +1 -1
  30. crawlo/mode_manager.py +26 -1
  31. crawlo/pipelines/pipeline_manager.py +2 -1
  32. crawlo/project.py +76 -46
  33. crawlo/queue/pqueue.py +11 -5
  34. crawlo/queue/queue_manager.py +143 -19
  35. crawlo/queue/redis_priority_queue.py +69 -49
  36. crawlo/settings/default_settings.py +110 -14
  37. crawlo/settings/setting_manager.py +29 -13
  38. crawlo/spider/__init__.py +34 -16
  39. crawlo/stats_collector.py +17 -3
  40. crawlo/task_manager.py +112 -3
  41. crawlo/templates/project/settings.py.tmpl +103 -202
  42. crawlo/templates/project/settings_distributed.py.tmpl +122 -135
  43. crawlo/templates/project/settings_gentle.py.tmpl +149 -43
  44. crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
  45. crawlo/templates/project/settings_minimal.py.tmpl +46 -15
  46. crawlo/templates/project/settings_simple.py.tmpl +138 -75
  47. crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
  48. crawlo/templates/run.py.tmpl +10 -14
  49. crawlo/templates/spiders_init.py.tmpl +10 -0
  50. crawlo/tools/network_diagnostic.py +365 -0
  51. crawlo/utils/class_loader.py +26 -0
  52. crawlo/utils/error_handler.py +76 -35
  53. crawlo/utils/log.py +41 -144
  54. crawlo/utils/redis_connection_pool.py +43 -6
  55. crawlo/utils/request_serializer.py +8 -1
  56. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
  57. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
  58. tests/authenticated_proxy_example.py +2 -2
  59. tests/baidu_performance_test.py +109 -0
  60. tests/baidu_test.py +60 -0
  61. tests/comprehensive_framework_test.py +213 -0
  62. tests/comprehensive_test.py +82 -0
  63. tests/comprehensive_testing_summary.md +187 -0
  64. tests/debug_configure.py +70 -0
  65. tests/debug_framework_logger.py +85 -0
  66. tests/debug_log_levels.py +64 -0
  67. tests/distributed_test.py +67 -0
  68. tests/distributed_test_debug.py +77 -0
  69. tests/final_command_test_report.md +0 -0
  70. tests/final_comprehensive_test.py +152 -0
  71. tests/final_validation_test.py +183 -0
  72. tests/framework_performance_test.py +203 -0
  73. tests/optimized_performance_test.py +212 -0
  74. tests/performance_comparison.py +246 -0
  75. tests/queue_blocking_test.py +114 -0
  76. tests/queue_test.py +90 -0
  77. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  78. tests/scrapy_comparison/scrapy_test.py +134 -0
  79. tests/simple_command_test.py +120 -0
  80. tests/simple_crawlo_test.py +128 -0
  81. tests/simple_log_test.py +58 -0
  82. tests/simple_optimization_test.py +129 -0
  83. tests/simple_spider_test.py +50 -0
  84. tests/simple_test.py +48 -0
  85. tests/test_all_commands.py +231 -0
  86. tests/test_batch_processor.py +179 -0
  87. tests/test_component_factory.py +175 -0
  88. tests/test_controlled_spider_mixin.py +80 -0
  89. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  90. tests/test_factories.py +253 -0
  91. tests/test_framework_logger.py +67 -0
  92. tests/test_framework_startup.py +65 -0
  93. tests/test_large_scale_config.py +113 -0
  94. tests/test_large_scale_helper.py +236 -0
  95. tests/test_mode_change.py +73 -0
  96. tests/test_mode_consistency.py +1 -1
  97. tests/test_performance_monitor.py +116 -0
  98. tests/test_queue_empty_check.py +42 -0
  99. tests/untested_features_report.md +139 -0
  100. tests/verify_debug.py +52 -0
  101. tests/verify_log_fix.py +112 -0
  102. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  103. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
  104. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
  105. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
crawlo/__init__.py CHANGED
@@ -3,6 +3,8 @@
 """
 Crawlo - an asynchronous crawler framework
 """
+from typing import TYPE_CHECKING
+
 from crawlo.spider import Spider
 from crawlo.items import Item, Field
 from crawlo.network.request import Request
@@ -24,9 +26,29 @@ from crawlo.utils import (
 )
 from crawlo import tools
 
+# Framework core module - use TYPE_CHECKING to avoid circular imports
+if TYPE_CHECKING:
+    from crawlo.core.framework_initializer import get_framework_initializer, initialize_framework
+
 # For backward compatibility, import the cleaners functionality from tools
 import crawlo.tools as cleaners
 
+# Lazily imported helper functions
+def get_framework_initializer():
+    """Lazily import get_framework_initializer to avoid a circular dependency."""
+    from crawlo.core.framework_initializer import get_framework_initializer as _get_framework_initializer
+    return _get_framework_initializer()
+
+def initialize_framework(custom_settings=None):
+    """Lazily import initialize_framework to avoid a circular dependency."""
+    from crawlo.core.framework_initializer import initialize_framework as _initialize_framework
+    return _initialize_framework(custom_settings)
+
+# Backward-compatible alias
+def get_bootstrap_manager():
+    """Backward-compatible alias."""
+    return get_framework_initializer()
+
 # Version number: prefer reading it from the package metadata
 try:
     from importlib.metadata import version
@@ -60,5 +82,7 @@ __all__ = [
     'from_timestamp_with_tz',
     'cleaners',
     'tools',
+    'get_framework_initializer',
+    'get_bootstrap_manager',
     '__version__',
 ]
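
The helpers added above follow a lazy-import idiom: the actual `crawlo.core.framework_initializer` import only happens when a helper is called, so importing `crawlo` itself never completes the circular chain at module-import time. Below is a minimal, runnable sketch of the same idiom using only the standard library; the `get_dumper`/`get_serializer` names are illustrative and not part of crawlo.

    # Sketch of the lazy-import idiom: nothing heavy is imported at module import time.
    def get_dumper():
        """Import json.dumps only when the helper is actually called."""
        from json import dumps as _dumps
        return _dumps

    # Backward-compatible alias: old call sites keep working while delegating to the new helper.
    def get_serializer():
        return get_dumper()

    if __name__ == "__main__":
        dump = get_serializer()
        print(dump({"framework": "crawlo", "lazy": True}))
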
crawlo/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = '1.3.2'
+__version__ = '1.3.4'
crawlo/commands/run.py CHANGED
@@ -21,10 +21,23 @@ from rich.text import Text
 from crawlo.commands.stats import record_stats
 from crawlo.crawler import CrawlerProcess
 from crawlo.project import get_settings, _find_project_root
-# Use the custom logging system
+# Use the new unified initialization system
+from crawlo.initialization import initialize_framework
+from crawlo.core import get_framework_initializer
 from crawlo.utils.log import get_logger
 
-logger = get_logger(__name__)
+# Get the logger lazily, ensuring it is created after the logging system is configured
+_logger = None
+
+
+def logger():
+    """Lazily fetch the logger instance, ensuring it is created after logging is configured."""
+    global _logger
+    if _logger is None:
+        # The improved logging system makes it safe to create the logger at any time
+        _logger = get_logger(__name__)
+    return _logger
+
 
 console = Console()
 
@@ -35,15 +48,15 @@ def check_redis_connection(settings):
     # Check whether this is distributed mode
     run_mode = settings.get('RUN_MODE', 'standalone')
     queue_type = settings.get('QUEUE_TYPE', 'memory')
-
+
     if run_mode == 'distributed' or queue_type == 'redis':
         import redis.asyncio as redis
         redis_url = settings.get('REDIS_URL', 'redis://127.0.0.1:6379/0')
         redis_host = settings.get('REDIS_HOST', '127.0.0.1')
         redis_port = settings.get('REDIS_PORT', 6379)
-
+
         console.print(f"Checking Redis connection: {redis_host}:{redis_port}")
-
+
         # Create a Redis connection to test connectivity
         async def _test_redis():
             try:
@@ -54,11 +67,11 @@ def check_redis_connection(settings):
             except Exception as e:
                 console.print(f"Redis connection failed: {e}")
                 return False
-
+
         # Run the async test
         if not asyncio.run(_test_redis()):
             raise ConnectionError(f"Unable to connect to Redis server {redis_host}:{redis_port}")
-
+
         console.print("Redis connection OK")
         return True
     else:
@@ -78,11 +91,15 @@ def main(args):
    Usage:
        crawlo run <spider_name>|all [--json] [--no-stats]
    """
+    # Make sure the framework has been initialized
+    init_manager = get_framework_initializer()
+
    # Add debug info
-    logger.debug("DEBUG: entering main function")
-
+    logger().debug("DEBUG: entering main function")
+
    if len(args) < 1:
-        console.print("[bold red]Usage:[/bold red] [blue]crawlo run[/blue] <spider_name>|all [bold yellow][--json] [--no-stats][/bold yellow]")
+        console.print(
+            "[bold red]Usage:[/bold red] [blue]crawlo run[/blue] <spider_name>|all [bold yellow][--json] [--no-stats][/bold yellow]")
        console.print("Examples:")
        console.print("  [blue]crawlo run baidu[/blue]")
        console.print("  [blue]crawlo run all[/blue]")
@@ -153,9 +170,9 @@ def main(args):
        console.print(Panel(msg, title="Import error", border_style="red"))
        return 1
 
-    # 4. Load settings and the spider modules
-    settings = get_settings()
-
+    # 4. Start the framework and load settings
+    settings = initialize_framework()
+
    # Check the Redis connection (if running in distributed mode)
    if not check_redis_connection(settings):
        if show_json:
@@ -163,9 +180,22 @@ def main(args):
            return 1
        else:
            return 1
-
-    spider_modules = [f"{project_package}.spiders"]
+
+    # Read SPIDER_MODULES from the configuration
+    spider_modules = settings.get('SPIDER_MODULES', [f"{project_package}.spiders"])
+    logger().debug(f"SPIDER_MODULES from settings: {spider_modules}")
    process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+    # No need to import spider modules manually any more; the framework handles it internally
+    # Inspect the spiders in the registry
+    from crawlo.spider import get_global_spider_registry
+    registry = get_global_spider_registry()
+    spider_names = list(registry.keys())
+    logger().debug(f"Registered spiders after import: {spider_names}")
+
+    # Debug info
+    logger().debug(f"SPIDER_MODULES: {spider_modules}")
+    logger().debug(f"Available spiders: {process.get_spider_names()}")
 
    # === Case 1: run all spiders ===
    if spider_arg.lower() == "all":
@@ -193,19 +223,14 @@ def main(args):
        # Show the list of spiders about to run
        # Per user request, the detailed spider list is no longer printed
 
-        # Register stats recording (unless --no-stats)
-        if not no_stats:
-            for crawler in process.crawlers:
-                crawler.signals.connect(record_stats, signal="spider_closed")
-
        # Run all spiders in parallel
        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            transient=True,
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=True,
        ) as progress:
            task = progress.add_task("Running all spiders...", total=None)
-            asyncio.run(process.crawl(spider_names))
+            asyncio.run(process.crawl_multiple(spider_names))
 
        if show_json:
            console.print_json(data={"success": True, "spiders": spider_names})
@@ -267,15 +292,16 @@ def main(args):
        # console.print()
 
        # Register stats recording
-        if not no_stats:
-            for crawler in process.crawlers:
-                crawler.signals.connect(record_stats, signal="spider_closed")
+        # Note: CrawlerProcess has no crawlers attribute, so registration must happen at runtime
+        # if not no_stats:
+        #     for crawler in process.crawlers:
+        #         crawler.signals.connect(record_stats, signal="spider_closed")
 
        # Run the spider
        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            transient=True,
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=True,
        ) as progress:
            task = progress.add_task(f"Running {spider_name}...", total=None)
            asyncio.run(process.crawl(spider_name))
@@ -298,7 +324,7 @@ def main(args):
            console.print(f"[bold yellow]{msg}[/bold yellow]")
            return 1
    except Exception as e:
-        logger.exception("Exception during 'crawlo run'")
+        logger().exception("Exception during 'crawlo run'")
        msg = f"Unexpected error: {e}"
        if show_json:
            console.print_json(data={"success": False, "error": msg})
@@ -312,4 +338,4 @@ if __name__ == "__main__":
    Supports running directly:
        python -m crawlo.commands.run spider_name
    """
-    sys.exit(main(sys.argv[1:]))
+    sys.exit(main(sys.argv[1:]))
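
The `logger()` accessor above defers logger creation until first use, so the module can be imported before the logging system is configured without freezing a half-configured logger. Here is a standalone sketch of the same pattern on top of the standard library `logging` module; the names are assumptions for illustration, not the crawlo API.

    import logging

    _logger = None

    def logger() -> logging.Logger:
        """Create the module logger on first use, after logging has been configured."""
        global _logger
        if _logger is None:
            _logger = logging.getLogger(__name__)
        return _logger

    def configure_logging(level: int = logging.DEBUG) -> None:
        """Configure handlers and levels before any logger() call binds a logger."""
        logging.basicConfig(level=level, format="%(levelname)s %(name)s: %(message)s")

    if __name__ == "__main__":
        configure_logging()
        logger().debug("logger created after configuration")
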
crawlo/core/__init__.py CHANGED
@@ -1,2 +1,46 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
+
+# Crawlo core module
+# Provides the framework's core components and initialization helpers
+
+# Use the new initialization system
+from ..initialization import (
+    initialize_framework,
+    is_framework_ready
+)
+
+# Backward-compatible aliases
+def async_initialize_framework(*args, **kwargs):
+    """Async wrapper for framework initialization"""
+    return initialize_framework(*args, **kwargs)
+
+def get_framework_initializer():
+    """Get framework initializer - compatibility function"""
+    from ..initialization.core import CoreInitializer
+    return CoreInitializer()
+
+def get_framework_logger(name='crawlo.core'):
+    """Get framework logger - compatibility function"""
+    from ..logging import get_logger
+    return get_logger(name)
+
+# Backward compatibility
+def bootstrap_framework(*args, **kwargs):
+    """Bootstrap framework - compatibility function"""
+    return initialize_framework(*args, **kwargs)
+
+def get_bootstrap_manager():
+    """Get bootstrap manager - compatibility function"""
+    return get_framework_initializer()
+
+__all__ = [
+    'initialize_framework',
+    'async_initialize_framework',
+    'get_framework_initializer',
+    'is_framework_ready',
+    'get_framework_logger',
+    # Backward compatibility
+    'bootstrap_framework',
+    'get_bootstrap_manager'
+]
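
This compatibility layer keeps the old bootstrap-style names importable while routing everything through the new initialization system. A usage sketch follows, assuming crawlo 1.3.4 with the functions shown in this hunk; the settings-like return value is inferred from how run.py uses `initialize_framework()`, and `"my_project"` is a hypothetical logger name.

    # Usage sketch: old and new entry points resolve to the same initialization path.
    from crawlo.core import (
        initialize_framework,   # new unified entry point
        bootstrap_framework,    # backward-compatible alias
        get_framework_logger,
    )

    settings = initialize_framework()        # preferred going forward
    legacy = bootstrap_framework()           # older call sites keep working
    get_framework_logger("my_project").debug("framework ready")
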
crawlo/core/engine.py CHANGED
@@ -6,16 +6,16 @@ from inspect import iscoroutine
 from typing import Optional, Generator, Callable
 
 from crawlo import Request, Item
-from crawlo.spider import Spider
-from crawlo.utils.log import get_logger
-from crawlo.exceptions import OutputError
-from crawlo.core.scheduler import Scheduler
 from crawlo.core.processor import Processor
-from crawlo.task_manager import TaskManager
-from crawlo.project import load_class
+from crawlo.core.scheduler import Scheduler
 from crawlo.downloader import DownloaderBase
-from crawlo.utils.func_tools import transform
 from crawlo.event import spider_opened, spider_error, request_scheduled
+from crawlo.exceptions import OutputError
+from crawlo.utils.class_loader import load_class
+from crawlo.spider import Spider
+from crawlo.task_manager import TaskManager
+from crawlo.utils.func_tools import transform
+from crawlo.utils.log import get_logger
 
 
 class Engine(object):
@@ -35,8 +35,8 @@ class Engine(object):
        # Enhanced control parameters
        self.max_queue_size = self.settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 200)
        self.generation_batch_size = self.settings.get_int('REQUEST_GENERATION_BATCH_SIZE', 10)
-        self.generation_interval = self.settings.get_float('REQUEST_GENERATION_INTERVAL', 0.05)
-        self.backpressure_ratio = self.settings.get_float('BACKPRESSURE_RATIO', 0.8)  # Start backpressure when queue reaches 80%
+        self.generation_interval = self.settings.get_float('REQUEST_GENERATION_INTERVAL', 0.01)  # optimized default
+        self.backpressure_ratio = self.settings.get_float('BACKPRESSURE_RATIO', 0.9)  # optimized default
 
        # State tracking
        self._generation_paused = False
@@ -74,9 +74,7 @@ class Engine(object):
        if not version or version == 'None':
            version = '1.0.0'
        # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
-        self.logger.debug(
-            f"Crawlo Started version {version}"
-        )
+        self.logger.debug(f"Crawlo Framework Started {version}")
 
    async def start_spider(self, spider):
        self.spider = spider
@@ -110,7 +108,20 @@ class Engine(object):
        if not hasattr(self.crawler, 'extension') or not self.crawler.extension:
            self.crawler.extension = self.crawler._create_extension()
 
-        self.start_requests = iter(spider.start_requests())
+        # Start the engine
+        self.engine_start()
+
+        self.logger.debug("Creating the start_requests iterator")
+        try:
+            # Collect all requests into a list first, so checks do not consume the iterator
+            requests_list = list(spider.start_requests())
+            self.logger.debug(f"Collected {len(requests_list)} requests")
+            self.start_requests = iter(requests_list)
+            self.logger.debug("start_requests iterator created successfully")
+        except Exception as e:
+            self.logger.error(f"Failed to create the start_requests iterator: {e}")
+            import traceback
+            self.logger.error(traceback.format_exc())
        await self._open_spider()
 
    async def crawl(self):
@@ -124,32 +135,46 @@ class Engine(object):
            # Start the request-generation task (if controlled generation is enabled)
            if (self.start_requests and
                    self.settings.get_bool('ENABLE_CONTROLLED_REQUEST_GENERATION', False)):
+                self.logger.debug("Creating the controlled request-generation task")
                generation_task = asyncio.create_task(
                    self._controlled_request_generation()
                )
            else:
                # Handle start requests the traditional way
+                self.logger.debug("Creating the traditional request-generation task")
                generation_task = asyncio.create_task(
                    self._traditional_request_generation()
                )
 
+            self.logger.debug("Request-generation task created")
+
            # Main crawl loop
+            loop_count = 0
+            last_exit_check = 0  # iteration at which the exit condition was last checked
+            exit_check_interval = 1  # check the exit condition every iteration for a higher check frequency
+
            while self.running:
+                loop_count += 1
                # Fetch and process a request
                if request := await self._get_next_request():
                    await self._crawl(request)
 
-                # Check the exit condition
-                if await self._should_exit():
-                    break
+                # Throttle how often the exit condition is evaluated
+                if loop_count - last_exit_check >= exit_check_interval:
+                    should_exit = await self._should_exit()
+                    if should_exit:
+                        self.logger.debug("Exit condition met, leaving the loop")
+                        break
+                    last_exit_check = loop_count
 
-                # Sleep briefly to avoid busy waiting
-                await asyncio.sleep(0.001)
+                # Sleep briefly to avoid busy waiting, with a shorter interval for efficiency
+                await asyncio.sleep(0.000001)  # reduced from 0.00001 to 0.000001
+
+            self.logger.debug(f"Main crawl loop finished after {loop_count} iterations")
 
        finally:
-            # Clean up the generation task
+            # Make sure the request-generation task has completed
            if generation_task and not generation_task.done():
-                generation_task.cancel()
                try:
                    await generation_task
                except asyncio.CancelledError:
@@ -159,15 +184,24 @@ class Engine(object):
 
    async def _traditional_request_generation(self):
        """Traditional request generation method (compatible with older versions)"""
+        self.logger.debug("Starting traditional request generation")
+        processed_count = 0
        while self.running:
            try:
                start_request = next(self.start_requests)
+                self.logger.debug(f"Got request: {getattr(start_request, 'url', 'Unknown URL')}")
                # Enqueue the request
                await self.enqueue_request(start_request)
+                processed_count += 1
+                self.logger.debug(f"Requests processed so far: {processed_count}")
            except StopIteration:
+                self.logger.debug("All start requests have been processed")
                self.start_requests = None
                break
            except Exception as exp:
+                self.logger.error(f"Exception while processing a request: {exp}")
+                import traceback
+                self.logger.error(traceback.format_exc())
                # 1. All requests have been processed
                # 2. Is scheduler idle
                # 3. Is downloader idle
@@ -176,11 +210,13 @@ class Engine(object):
                self.running = False
                if self.start_requests is not None:
                    self.logger.error(f"Error occurred while starting request: {str(exp)}")
-            await asyncio.sleep(0.001)
+            # Shorter wait to improve efficiency
+            await asyncio.sleep(0.00001)  # reduced from 0.0001 to 0.00001
+        self.logger.debug(f"Traditional request generation finished, {processed_count} requests processed in total")
 
    async def _controlled_request_generation(self):
        """Controlled request generation (enhanced features)"""
-        self.logger.info("Starting controlled request generation")
+        self.logger.debug("Starting controlled request generation")
 
        batch = []
        total_generated = 0
@@ -209,7 +245,7 @@ class Engine(object):
 
        finally:
            self.start_requests = None
-            self.logger.info(f"Request generation completed, total: {total_generated}")
+            self.logger.debug(f"Request generation completed, total: {total_generated}")
 
    async def _process_generation_batch(self, batch) -> int:
        """Process a batch of requests"""
@@ -221,14 +257,14 @@ class Engine(object):
 
                # Wait for the queue to have space
                while await self._is_queue_full() and self.running:
-                    await asyncio.sleep(0.1)
+                    await asyncio.sleep(0.01)  # shorter wait
 
                if self.running:
                    await self.enqueue_request(request)
                    generated += 1
                    self._generation_stats['total_generated'] += 1
 
-                    # Control the generation rate
+                    # Control the generation rate, but with a smaller interval
                    if self.generation_interval > 0:
                        await asyncio.sleep(self.generation_interval)
 
@@ -263,8 +299,8 @@ class Engine(object):
        self._generation_stats['backpressure_events'] += 1
        self.logger.debug("Backpressure triggered, pausing request generation")
 
-        wait_time = 0.1
-        max_wait = 2.0
+        wait_time = 0.01  # smaller initial wait
+        max_wait = 1.0  # smaller maximum wait
 
        while await self._should_pause_generation() and self.running:
            await asyncio.sleep(wait_time)
@@ -272,16 +308,38 @@ class Engine(object):
 
    async def _open_spider(self):
        asyncio.create_task(self.crawler.subscriber.notify(spider_opened))
-        crawling = asyncio.create_task(self.crawl())
-        await crawling
+        # Call crawl() directly instead of wrapping it in a task, so completion is awaited
+        await self.crawl()
 
    async def _crawl(self, request):
-        # TODO implement concurrency
        async def crawl_task():
-            outputs = await self._fetch(request)
-            # TODO handle output
-            if outputs:
-                await self._handle_spider_output(outputs)
+            start_time = time.time()
+            try:
+                outputs = await self._fetch(request)
+                # Record the response time
+                response_time = time.time() - start_time
+                if self.task_manager:
+                    self.task_manager.record_response_time(response_time)
+
+                # TODO handle output
+                if outputs:
+                    await self._handle_spider_output(outputs)
+            except Exception as e:
+                # Log detailed exception information
+                self.logger.error(
+                    f"Failed to process request: {getattr(request, 'url', 'Unknown URL')} - {type(e).__name__}: {e}"
+                )
+                self.logger.debug(f"Detailed exception info", exc_info=True)
+
+                # Emit stats events
+                if hasattr(self.crawler, 'stats'):
+                    self.crawler.stats.inc_value('downloader/exception_count')
+                    self.crawler.stats.inc_value(f'downloader/exception_type_count/{type(e).__name__}')
+                    if hasattr(request, 'url'):
+                        self.crawler.stats.inc_value(f'downloader/failed_urls_count')
+
+                # Do not re-raise, to avoid unhandled Task exceptions
+                return None
 
        # Create as an async task, respecting the concurrency limit
        await self.task_manager.create_task(crawl_task())
@@ -331,31 +389,47 @@ class Engine(object):
 
    async def _should_exit(self) -> bool:
        """Check whether the engine should exit"""
+        self.logger.debug(f"Checking exit condition: start_requests={self.start_requests is not None}")
        # No start requests remain and all queues are idle
        if self.start_requests is None:
+            self.logger.debug("start_requests is None, checking the other component states")
            # Use the async idle check for a more precise result
            scheduler_idle = await self.scheduler.async_idle() if hasattr(self.scheduler, 'async_idle') else self.scheduler.idle()
+            downloader_idle = self.downloader.idle()
+            task_manager_done = self.task_manager.all_done()
+            processor_idle = self.processor.idle()
+
+            self.logger.debug(f"Component states - Scheduler: {scheduler_idle}, Downloader: {downloader_idle}, TaskManager: {task_manager_done}, Processor: {processor_idle}")
 
            if (scheduler_idle and
-                    self.downloader.idle() and
-                    self.task_manager.all_done() and
-                    self.processor.idle()):
-                # Extra check to make sure all tasks have completed
-                await asyncio.sleep(0.1)  # brief wait to make sure no new tasks were added
-                if (await self.scheduler.async_idle() and
-                        self.downloader.idle() and
-                        self.task_manager.all_done() and
-                        self.processor.idle()):
+                    downloader_idle and
+                    task_manager_done and
+                    processor_idle):
+                # Immediately perform a second check instead of waiting
+                scheduler_idle = await self.scheduler.async_idle() if hasattr(self.scheduler, 'async_idle') else self.scheduler.idle()
+                downloader_idle = self.downloader.idle()
+                task_manager_done = self.task_manager.all_done()
+                processor_idle = self.processor.idle()
+
+                self.logger.debug(f"Second check of component states - Scheduler: {scheduler_idle}, Downloader: {downloader_idle}, TaskManager: {task_manager_done}, Processor: {processor_idle}")
+
+                if (scheduler_idle and
+                        downloader_idle and
+                        task_manager_done and
+                        processor_idle):
+                    self.logger.info("All components are idle, preparing to exit")
                    return True
+        else:
+            self.logger.debug("start_requests is not None, not exiting")
 
        return False
 
    async def close_spider(self):
+        # No longer call crawler.close() here, to avoid duplicate cleanup;
+        # cleanup is handled by the crawler's _lifecycle_manager context manager
        await asyncio.gather(*self.task_manager.current_task)
        await self.scheduler.close()
        await self.downloader.close()
-        if self.normal:
-            await self.crawler.close()
 
    def get_generation_stats(self) -> dict:
        """Get generation statistics"""
crawlo/core/scheduler.py CHANGED
@@ -1,14 +1,15 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-from typing import Optional, Callable
 import traceback
+from typing import Optional, Callable
 
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import set_request
-from crawlo.utils.request_serializer import RequestSerializer
 from crawlo.utils.error_handler import ErrorHandler
+from crawlo.utils.class_loader import load_class
+from crawlo.project import common_call
+from crawlo.utils.request_serializer import RequestSerializer
 from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
-from crawlo.project import load_class, common_call
 
 
 class Scheduler: