crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/__init__.py CHANGED
@@ -7,7 +7,7 @@ Crawlo - 一个异步爬虫框架
  # 为了向后兼容,从tools中导入cleaners相关的功能
  import crawlo.tools as cleaners
  from crawlo import tools
- from crawlo.crawler import CrawlerProcess
+ from crawlo.crawler import Crawler, CrawlerProcess
  from crawlo.downloader import DownloaderBase
  from crawlo.items import Item, Field
  from crawlo.middleware import BaseMiddleware
@@ -82,6 +82,7 @@ __all__ = [
  'from_timestamp_with_tz',
  'cleaners',
  'tools',
+ 'Crawler',
  'CrawlerProcess',
  'get_framework_initializer',
  'get_bootstrap_manager',
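
Both names are now importable from the package root (Crawler is the new export). A quick sanity-check sketch, assuming crawlo 1.4.8 is installed:

    # Crawler is newly re-exported alongside CrawlerProcess in crawlo/__init__.py.
    from crawlo import Crawler, CrawlerProcess

    print(Crawler.__name__, CrawlerProcess.__name__)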
crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = '1.4.6'
+ __version__ = '1.4.8'
crawlo/cli.py CHANGED
@@ -4,12 +4,12 @@
  import sys
  import argparse
  from crawlo.commands import get_commands
- from crawlo.utils.env_config import get_version
+ from crawlo.utils.config_manager import EnvConfigManager


  def main():
  # 获取框架版本号
- VERSION = get_version()
+ VERSION = EnvConfigManager.get_version()

  # 获取所有可用命令
  commands = get_commands()
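
The CLI's version lookup moves from the removed crawlo.utils.env_config helper to the consolidated config manager. A minimal sketch of the replacement call used above:

    from crawlo.utils.config_manager import EnvConfigManager

    # Same call the CLI now makes; returns the framework version string.
    VERSION = EnvConfigManager.get_version()
    print(VERSION)  # e.g. "1.4.8"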
crawlo/commands/check.py CHANGED
@@ -24,7 +24,7 @@ from watchdog.observers import Observer
  from watchdog.events import FileSystemEventHandler

  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  logger = get_logger(__name__)
crawlo/commands/help.py CHANGED
@@ -11,10 +11,10 @@ from rich.table import Table
  from rich.panel import Panel
  from rich.text import Text
  from rich import box
- from crawlo.utils.env_config import get_version
+ from crawlo.utils.config_manager import EnvConfigManager

  # 获取框架版本号
- VERSION = get_version()
+ VERSION = EnvConfigManager.get_version()

  console = Console()

@@ -85,11 +85,13 @@ def show_help():

  # run 命令
  console.print("[bold cyan]run[/bold cyan] - 运行爬虫")
- console.print(" 用法: crawlo run <spider_name>|all [--json] [--no-stats]")
+ console.print(" 用法: crawlo run <spider_name>|all [--json] [--no-stats] [--log-level LEVEL] [--config CONFIG] [--concurrency NUM]")
  console.print(" 示例:")
  console.print(" crawlo run myspider")
  console.print(" crawlo run all")
  console.print(" crawlo run all --json --no-stats")
+ console.print(" crawlo run myspider --log-level DEBUG")
+ console.print(" crawlo run myspider --concurrency 32")


crawlo/commands/list.py CHANGED
@@ -16,7 +16,7 @@ from rich.text import Text
  from rich import box

  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger
  from .utils import validate_project_environment, show_error_panel

  logger = get_logger(__name__)
crawlo/commands/run.py CHANGED
@@ -24,7 +24,7 @@ from crawlo.project import get_settings, _find_project_root
  # 使用新的统一初始化系统
  from crawlo.initialization import initialize_framework
  from crawlo.core import get_framework_initializer
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger

  # 延迟获取logger,确保在日志系统配置之后获取
  _logger = None
@@ -89,7 +89,7 @@ def main(args):
  """
  主函数:运行指定爬虫
  用法:
- crawlo run <spider_name>|all [--json] [--no-stats]
+ crawlo run <spider_name>|all [--json] [--no-stats] [--log-level LEVEL] [--config CONFIG] [--concurrency NUM]
  """
  # 确保框架已初始化
  init_manager = get_framework_initializer()
@@ -99,7 +99,7 @@

  if len(args) < 1:
  console.print(
- "[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
+ "[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats] [--log-level LEVEL] [--config CONFIG] [--concurrency NUM][/bold yellow]")
  console.print("示例:")
  console.print(" [blue]crawlo run baidu[/blue]")
  console.print(" [blue]crawlo run all[/blue]")
@@ -110,6 +110,36 @@
  spider_arg = args[0]
  show_json = "--json" in args
  no_stats = "--no-stats" in args
+
+ # 解析日志级别参数
+ log_level = None
+ if "--log-level" in args:
+ try:
+ log_level_index = args.index("--log-level")
+ if log_level_index + 1 < len(args):
+ log_level = args[log_level_index + 1]
+ except (ValueError, IndexError):
+ pass
+
+ # 解析配置文件参数
+ config_file = None
+ if "--config" in args:
+ try:
+ config_index = args.index("--config")
+ if config_index + 1 < len(args):
+ config_file = args[config_index + 1]
+ except (ValueError, IndexError):
+ pass
+
+ # 解析并发数参数
+ concurrency = None
+ if "--concurrency" in args:
+ try:
+ concurrency_index = args.index("--concurrency")
+ if concurrency_index + 1 < len(args):
+ concurrency = int(args[concurrency_index + 1])
+ except (ValueError, IndexError, TypeError):
+ pass

  try:
  # 1. 查找项目根目录
@@ -171,7 +201,14 @@
  return 1

  # 4. 启动框架并加载 settings
- settings = initialize_framework()
+ # 如果指定了日志级别,则添加到自定义设置中
+ custom_settings = {}
+ if log_level:
+ custom_settings['LOG_LEVEL'] = log_level
+ if concurrency:
+ custom_settings['CONCURRENCY'] = concurrency
+
+ settings = initialize_framework(custom_settings if custom_settings else None)

  # 检查Redis连接(如果是分布式模式)
  if not check_redis_connection(settings):
@@ -183,7 +220,7 @@

  # 从配置中获取SPIDER_MODULES
  spider_modules = settings.get('SPIDER_MODULES', [f"{project_package}.spiders"])
- logger().debug(f"SPIDER_MODULES from settings: {spider_modules}")
+ # 合并重复的调试信息
  process = CrawlerProcess(settings=settings, spider_modules=spider_modules)

  # 不再需要手动导入爬虫模块,框架内部会自动处理
@@ -191,11 +228,11 @@
  from crawlo.spider import get_global_spider_registry
  registry = get_global_spider_registry()
  spider_names = list(registry.keys())
- logger().debug(f"Registered spiders after import: {spider_names}")
-
- # 调试信息
- logger().debug(f"SPIDER_MODULES: {spider_modules}")
- logger().debug(f"Available spiders: {process.get_spider_names()}")
+ # 减少重复的调试日志输出
+ # logger().debug(f"SPIDER_MODULES from settings: {spider_modules}")
+ # logger().debug(f"Registered spiders after import: {spider_names}")
+ # logger().debug(f"SPIDER_MODULES: {spider_modules}")
+ # logger().debug(f"Available spiders: {process.get_spider_names()}")

  # === 情况1:运行所有爬虫 ===
  if spider_arg.lower() == "all":
@@ -260,7 +297,8 @@
  panel_content.append("\n可用爬虫:\n")
  for name in sorted(available):
  cls = process.get_spider_class(name)
- panel_content.append(f" • [cyan]{name}[/cyan] ([green]{cls.__name__}[/green])\n")
+ class_name = cls.__name__ if cls else 'Unknown'
+ panel_content.append(f" • [cyan]{name}[/cyan] ([green]{class_name}[/green])\n")
  else:
  panel_content.append("\n未找到爬虫。请检查爬虫模块。")

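
The new flags are parsed positionally from the raw argument list rather than via argparse, and any parsed values are forwarded to initialize_framework() as LOG_LEVEL/CONCURRENCY overrides. A condensed, standalone sketch of that lookup pattern (the helper name is illustrative):

    from typing import List, Optional

    def parse_flag(args: List[str], flag: str) -> Optional[str]:
        # Mirrors the args.index()-based lookup added in crawlo/commands/run.py:
        # return the value following the flag, or None if it is absent or malformed.
        if flag not in args:
            return None
        idx = args.index(flag)
        return args[idx + 1] if idx + 1 < len(args) else None

    argv = ["myspider", "--log-level", "DEBUG", "--concurrency", "32"]
    log_level = parse_flag(argv, "--log-level")           # "DEBUG"
    concurrency = int(parse_flag(argv, "--concurrency"))  # 32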
crawlo/commands/stats.py CHANGED
@@ -16,7 +16,7 @@ from rich.panel import Panel
  from rich.text import Text
  from rich import box

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  logger = get_logger(__name__)
crawlo/config.py CHANGED
@@ -23,7 +23,7 @@ from typing import Dict, Any, Optional

  from crawlo.config_validator import validate_config
  from crawlo.mode_manager import standalone_mode, distributed_mode, auto_mode, from_env
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class CrawloConfig:
@@ -51,13 +51,21 @@ class CrawloConfig:
  return self.settings.get(key, default)

  def set(self, key: str, value: Any) -> 'CrawloConfig':
- """设置配置项(链式调用)"""
+ """设置配置项(链式调用)
+
+ 注意:设置后会自动验证配置合法性
+ """
  self.settings[key] = value
+ self._validate_settings()  # 自动验证
  return self

  def update(self, settings: Dict[str, Any]) -> 'CrawloConfig':
- """更新配置(链式调用)"""
+ """更新配置(链式调用)
+
+ 注意:更新后会自动验证配置合法性
+ """
  self.settings.update(settings)
+ self._validate_settings()  # 自动验证
  return self

  def set_concurrency(self, concurrency: int) -> 'CrawloConfig':
@@ -95,7 +103,7 @@
  'auto': '自动检测模式'
  }

- queue_type = self.settings.get('QUEUE_TYPE', 'memory')
+ queue_type = self.settings.get('QUEUE_TYPE', 'auto')
  filter_class = self.settings.get('FILTER_CLASS', '').split('.')[-1]
  concurrency = self.settings.get('CONCURRENCY', 8)

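
Because set() and update() now call _validate_settings() internally, a chained configuration is validated as it is built. A hedged sketch of the chained API (the no-argument constructor is assumed here for illustration):

    from crawlo.config import CrawloConfig

    # Each call returns self; as of 1.4.8 every set()/update() also re-validates.
    config = (
        CrawloConfig()
        .set('CONCURRENCY', 16)
        .update({'QUEUE_TYPE': 'auto', 'LOG_LEVEL': 'INFO'})
    )
    print(config.get('CONCURRENCY'))  # -> 16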
crawlo/config_validator.py CHANGED
@@ -7,7 +7,7 @@
  """
  from typing import Dict, Any, List, Tuple

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class ConfigValidator:
crawlo/core/engine.py CHANGED
@@ -9,13 +9,13 @@ from crawlo import Request, Item
  from crawlo.core.processor import Processor
  from crawlo.core.scheduler import Scheduler
  from crawlo.downloader import DownloaderBase
- from crawlo.event import spider_opened, spider_error, request_scheduled
+ from crawlo.event import CrawlerEvent
  from crawlo.exceptions import OutputError
  from crawlo.utils.misc import load_object
  from crawlo.spider import Spider
  from crawlo.task_manager import TaskManager
  from crawlo.utils.func_tools import transform
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class Engine(object):
@@ -94,6 +94,17 @@
  else:
  # DownloaderBase.open() 是同步方法,直接调用而不是await
  self.downloader.open()
+
+ # 注册下载器到资源管理器
+ if hasattr(self.crawler, '_resource_manager') and self.downloader:
+ from crawlo.utils.resource_manager import ResourceType
+ self.crawler._resource_manager.register(
+ self.downloader,
+ lambda d: d.close() if hasattr(d, 'close') else None,
+ ResourceType.DOWNLOADER,
+ name=f"downloader.{downloader_cls.__name__}"
+ )
+ self.logger.debug(f"Downloader registered to resource manager: {downloader_cls.__name__}")

  self.processor = Processor(self.crawler)
  if hasattr(self.processor, 'open'):
@@ -188,11 +199,13 @@
  while self.running:
  try:
  start_request = next(self.start_requests)
- self.logger.debug(f"获取到请求: {getattr(start_request, 'url', 'Unknown URL')}")
+ # 将过于频繁的debug日志合并,减少输出
+ # self.logger.debug(f"获取到请求: {getattr(start_request, 'url', 'Unknown URL')}")
  # 请求入队
  await self.enqueue_request(start_request)
  processed_count += 1
- self.logger.debug(f"已处理请求数量: {processed_count}")
+ # 减少过于频繁的日志输出
+ # self.logger.debug(f"已处理请求数量: {processed_count}")
  except StopIteration:
  self.logger.debug("所有起始请求处理完成")
  self.start_requests = None
@@ -306,7 +319,7 @@
  wait_time = min(wait_time * 1.1, max_wait)

  async def _open_spider(self):
- asyncio.create_task(self.crawler.subscriber.notify(spider_opened))
+ asyncio.create_task(self.crawler.subscriber.notify(CrawlerEvent.SPIDER_OPENED))
  # 直接调用crawl方法而不是创建任务,确保等待完成
  await self.crawl()

@@ -364,7 +377,7 @@
  async def _schedule_request(self, request):
  # TODO 去重
  if await self.scheduler.enqueue_request(request):
- asyncio.create_task(self.crawler.subscriber.notify(request_scheduled, request, self.crawler.spider))
+ asyncio.create_task(self.crawler.subscriber.notify(CrawlerEvent.REQUEST_SCHEDULED, request, self.crawler.spider))

  async def _get_next_request(self):
  return await self.scheduler.next_request()
@@ -375,7 +388,7 @@

  await self.processor.enqueue(spider_output)
  elif isinstance(spider_output, Exception):
  asyncio.create_task(
- self.crawler.subscriber.notify(spider_error, spider_output, self.spider)
+ self.crawler.subscriber.notify(CrawlerEvent.SPIDER_ERROR, spider_output, self.spider)
  )
  raise spider_output
crawlo/core/processor.py CHANGED
@@ -4,7 +4,7 @@ from asyncio import Queue, create_task
  from typing import Union, Optional

  from crawlo import Request, Item
- from crawlo.event import item_discard
+ from crawlo.event import CrawlerEvent
  from crawlo.exceptions import ItemDiscard
  from crawlo.pipelines.pipeline_manager import PipelineManager

crawlo/core/scheduler.py CHANGED
@@ -3,7 +3,7 @@
  import traceback
  from typing import Optional, Callable

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger
  from crawlo.utils.request import set_request
  from crawlo.utils.error_handler import ErrorHandler
  from crawlo.utils.misc import load_object
@@ -13,13 +13,13 @@ from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType


  class Scheduler:
- def __init__(self, crawler, dupe_filter, stats, log_level, priority):
+ def __init__(self, crawler, dupe_filter, stats, priority):
  self.crawler = crawler
  self.queue_manager: Optional[QueueManager] = None
  self.request_serializer = RequestSerializer()

- self.logger = get_logger(name=self.__class__.__name__, level=log_level)
- self.error_handler = ErrorHandler(self.__class__.__name__, log_level)
+ self.logger = get_logger(self.__class__.__name__)
+ self.error_handler = ErrorHandler(self.__class__.__name__)
  self.stats = stats
  self.dupe_filter = dupe_filter
  self.priority = priority
@@ -31,7 +31,6 @@
  crawler=crawler,
  dupe_filter=filter_cls.create_instance(crawler),
  stats=crawler.stats,
- log_level=crawler.settings.get('LOG_LEVEL'),
  priority=crawler.settings.get('DEPTH_PRIORITY')
  )
  return o
crawlo/crawler.py CHANGED
@@ -1,14 +1,19 @@
  #!/usr/bin/python
  # -*- coding: UTF-8 -*-
  """
- 重构后的Crawler系统
- ==================
+ Crawler系统
+ ==========
+
+ 核心组件:
+ - Crawler: 爬虫核心控制器,负责单个爬虫的生命周期管理
+ - CrawlerProcess: 爬虫进程管理器,支持单个/多个爬虫运行

  设计原则:
  1. 单一职责 - 每个类只负责一个明确的功能
  2. 依赖注入 - 通过工厂创建组件,便于测试
  3. 状态管理 - 清晰的状态转换和生命周期
  4. 错误处理 - 优雅的错误处理和恢复机制
+ 5. 资源管理 - 统一的资源注册和清理机制
  """

  import asyncio
@@ -21,6 +26,7 @@ from typing import Optional, Type, Dict, Any, List
  from crawlo.logging import get_logger
  from crawlo.factories import get_component_registry
  from crawlo.initialization import initialize_framework, is_framework_ready
+ from crawlo.utils.resource_manager import ResourceManager, ResourceType


  class CrawlerState(Enum):
@@ -55,15 +61,16 @@ class CrawlerMetrics:
  return (self.success_count / total * 100) if total > 0 else 0.0


- class ModernCrawler:
+ class Crawler:
  """
- 现代化的Crawler实现
+ 爬虫核心控制器

  特点:
  1. 清晰的状态管理
  2. 依赖注入
  3. 组件化架构
  4. 完善的错误处理
+ 5. 统一的资源管理
  """

  def __init__(self, spider_cls: Type, settings=None):
@@ -82,6 +89,9 @@
  # 指标
  self._metrics = CrawlerMetrics()

+ # 资源管理器
+ self._resource_manager = ResourceManager(name=f"crawler.{spider_cls.__name__ if spider_cls else 'unknown'}")
+
  # 日志
  self._logger = get_logger(f'crawler.{spider_cls.__name__ if spider_cls else "unknown"}')

@@ -209,6 +219,14 @@

  # 创建Engine(需要crawler参数)
  self._engine = registry.create('engine', crawler=self)
+ # 注册Engine到资源管理器
+ if self._engine and hasattr(self._engine, 'close'):
+ self._resource_manager.register(
+ self._engine,
+ lambda e: e.close() if hasattr(e, 'close') else None,
+ ResourceType.OTHER,
+ name="engine"
+ )

  # 创建Stats(需要crawler参数)
  self._stats = registry.create('stats', crawler=self)
@@ -291,7 +309,15 @@
  self._state = CrawlerState.CLOSING

  try:
- # 关闭各个组件
+ # 使用资源管理器统一清理
+ self._logger.debug("开始清理Crawler资源...")
+ cleanup_result = await self._resource_manager.cleanup_all()
+ self._logger.debug(
+ f"资源清理完成: {cleanup_result['success']}成功, "
+ f"{cleanup_result['errors']}失败, 耗时{cleanup_result['duration']:.2f}s"
+ )
+
+ # 关闭各个组件(继续兼容旧逻辑)
  if self._engine and hasattr(self._engine, 'close'):
  try:
  await self._engine.close()
@@ -318,7 +344,9 @@

  # 触发spider_closed事件,通知所有订阅者(包括扩展)
  # 传递reason参数,这里使用默认的'finished'作为reason
- await self.subscriber.notify("spider_closed", reason='finished')
+ if self.subscriber:
+ from crawlo.event import CrawlerEvent
+ await self.subscriber.notify(CrawlerEvent.SPIDER_CLOSED, reason='finished')

  if self._stats and hasattr(self._stats, 'close'):
  try:
@@ -348,7 +376,7 @@ class CrawlerProcess:
  # 初始化框架配置
  self._settings = settings or initialize_framework()
  self._max_concurrency = max_concurrency
- self._crawlers: List[ModernCrawler] = []
+ self._crawlers: List[Crawler] = []
  self._semaphore = asyncio.Semaphore(max_concurrency)
  self._logger = get_logger('crawler.process')

@@ -497,7 +525,7 @@
  logger.info(f"Starting spider: {spider_cls.name}")

  merged_settings = self._merge_settings(settings)
- crawler = ModernCrawler(spider_cls, merged_settings)
+ crawler = Crawler(spider_cls, merged_settings)

  async with self._semaphore:
  await crawler.crawl()
@@ -526,7 +554,7 @@
  tasks = []
  for spider_cls in spider_classes:
  merged_settings = self._merge_settings(settings)
- crawler = ModernCrawler(spider_cls, merged_settings)
+ crawler = Crawler(spider_cls, merged_settings)
  self._crawlers.append(crawler)

  task = asyncio.create_task(self._run_with_semaphore(crawler))
@@ -543,12 +571,25 @@
  return results

  finally:
+ # 清理所有crawler,防止资源累积
+ self._logger.debug(f"Cleaning up {len(self._crawlers)} crawler(s)...")
+ for crawler in self._crawlers:
+ try:
+ # 确保每个crawler都被清理
+ if hasattr(crawler, '_resource_manager'):
+ await crawler._resource_manager.cleanup_all()
+ except Exception as e:
+ self._logger.warning(f"Failed to cleanup crawler: {e}")
+
+ # 清空crawlers列表,释放引用
+ self._crawlers.clear()
+
  self._end_time = time.time()
  if self._start_time:
  duration = self._end_time - self._start_time
  self._logger.info(f"Total execution time: {duration:.2f}s")

- async def _run_with_semaphore(self, crawler: ModernCrawler):
+ async def _run_with_semaphore(self, crawler: Crawler):
  """在信号量控制下运行爬虫"""
  async with self._semaphore:
  await crawler.crawl()
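
Putting the rename and the new resource handling together, a single spider can be driven roughly as follows. This is a hedged sketch: MySpider is a placeholder, and the Spider base class plus the name attribute are assumed from imports and usage seen elsewhere in this diff.

    import asyncio
    from crawlo.crawler import Crawler  # formerly ModernCrawler
    from crawlo.spider import Spider

    class MySpider(Spider):
        # Placeholder spider; a real spider defines its start requests and parsing.
        name = "my_spider"

    async def main() -> None:
        crawler = Crawler(MySpider)
        await crawler.crawl()
        # On shutdown, the crawler's ResourceManager cleans up the registered
        # engine/downloader before the legacy per-component close calls run.

    # asyncio.run(main())  # left commented: a real run needs a configured project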
crawlo/downloader/__init__.py CHANGED
@@ -18,7 +18,7 @@ from abc import abstractmethod, ABCMeta
  from typing import Final, Set, Optional, TYPE_CHECKING
  from contextlib import asynccontextmanager

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger
  from crawlo.middleware.middleware_manager import MiddlewareManager

  if TYPE_CHECKING:
@@ -68,12 +68,16 @@ class ActivateRequestManager:

  def get_stats(self) -> dict:
  """获取请求统计信息"""
+ completed = self._completed_requests + self._failed_requests
  return {
  'active_requests': len(self._active),
  'total_requests': self._total_requests,
  'completed_requests': self._completed_requests,
  'failed_requests': self._failed_requests,
- 'success_rate': self._completed_requests / max(1, self._total_requests - len(self._active))
+ 'success_rate': (
+ self._completed_requests / completed * 100
+ if completed > 0 else 100.0  # 无完成请求时返回100%
+ )
  }

  def reset_stats(self):
@@ -104,7 +108,7 @@ class DownloaderBase(metaclass=DownloaderMeta):
  self.crawler = crawler
  self._active = ActivateRequestManager()
  self.middleware: Optional[MiddlewareManager] = None
- self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
+ self.logger = get_logger(self.__class__.__name__)
  self._closed = False
  self._stats_enabled = crawler.settings.get_bool("DOWNLOADER_STATS", True)

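
The success-rate change switches the denominator from "total minus active" to "completed plus failed" and reports a percentage. A small worked example of the new formula:

    # New formula: success_rate = completed / (completed + failed) * 100,
    # reporting 100.0 when nothing has finished yet.
    completed_requests, failed_requests = 45, 5
    completed = completed_requests + failed_requests
    success_rate = completed_requests / completed * 100 if completed > 0 else 100.0
    print(success_rate)  # 90.0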
crawlo/downloader/aiohttp_downloader.py CHANGED
@@ -1,5 +1,6 @@
  #!/usr/bin/python
  # -*- coding: UTF-8 -*-
+ import asyncio
  from yarl import URL
  from typing import Optional
  from aiohttp import (
@@ -13,7 +14,7 @@ from aiohttp import (
  )

  from crawlo.network.response import Response
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger
  from crawlo.downloader import DownloaderBase


@@ -31,7 +32,7 @@ class AioHttpDownloader(DownloaderBase):
  super().__init__(crawler)
  self.session: Optional[ClientSession] = None
  self.max_download_size: int = 0
- self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
+ self.logger = get_logger(self.__class__.__name__)

  def open(self):
  super().open()
@@ -80,9 +81,6 @@
  # 输出下载器配置摘要
  spider_name = getattr(self.crawler.spider, 'name', 'Unknown')
  concurrency = self.crawler.settings.get('CONCURRENCY', 4)
- # self.logger.debug(f"下载器初始化完成 [爬虫: {spider_name}, 类型: {self.__class__.__name__}, 并发数: {concurrency}]") # 注释掉重复的日志
-
- # self.logger.debug("AioHttpDownloader initialized.") # 注释掉重复的日志


  async def download(self, request) -> Optional[Response]:
  """下载请求并返回响应"""
@@ -206,28 +204,30 @@
  # --- 请求追踪日志 ---
  async def _on_request_start(self, session, trace_config_ctx, params):
  """请求开始时的回调。"""
- # proxy = getattr(params, "proxy", None)
- # proxy_info = f" via {proxy}" if proxy else ""
- # self.logger.debug(f"Requesting: {params.method} {params.url}{proxy_info}") # 注释掉过于详细的日志
+ pass

  async def _on_request_end(self, session, trace_config_ctx, params):
  """请求成功结束时的回调。"""
- # response = params.response
- # self.logger.debug(
- # f"Finished: {params.method} {params.url} with status {response.status}"
- # ) # 注释掉过于详细的日志
+ pass

  async def _on_request_exception(self, session, trace_config_ctx, params):
  """请求发生异常时的回调。"""
- # exc = params.exception
- # self.logger.warning(
- # f"Failed: {params.method} {params.url} with exception {type(exc).__name__}: {exc}"
- # ) # 注释掉过于详细的日志
+ pass

  async def close(self) -> None:
  """关闭会话资源"""
  if self.session and not self.session.closed:
- # 恢复关键的下载器关闭信息为INFO级别
  self.logger.info("Closing AioHttpDownloader session...")
- await self.session.close()
+ try:
+ # 关闭 session
+ await self.session.close()
+
+ # 等待一小段时间确保连接完全关闭
+ # 参考: https://docs.aiohttp.org/en/stable/client_advanced.html#graceful-shutdown
+ await asyncio.sleep(0.25)
+ except Exception as e:
+ self.logger.warning(f"Error during session close: {e}")
+ finally:
+ self.session = None
+
  self.logger.debug("AioHttpDownloader closed.")
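
The new close() follows aiohttp's documented graceful-shutdown advice: close the session, then yield briefly so the underlying connections (especially TLS) can finish closing. A standalone sketch of the same pattern outside the downloader:

    import asyncio
    import aiohttp

    async def fetch_status(url: str) -> int:
        session = aiohttp.ClientSession()
        try:
            async with session.get(url) as resp:
                return resp.status
        finally:
            await session.close()
            # Grace period suggested by the aiohttp client docs so transports
            # can close cleanly before the event loop shuts down.
            await asyncio.sleep(0.25)

    # print(asyncio.run(fetch_status("https://example.com")))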