crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/__init__.py
CHANGED
@@ -7,7 +7,7 @@ Crawlo - 一个异步爬虫框架
 # 为了向后兼容,从tools中导入cleaners相关的功能
 import crawlo.tools as cleaners
 from crawlo import tools
-from crawlo.crawler import CrawlerProcess
+from crawlo.crawler import Crawler, CrawlerProcess
 from crawlo.downloader import DownloaderBase
 from crawlo.items import Item, Field
 from crawlo.middleware import BaseMiddleware
@@ -82,6 +82,7 @@ __all__ = [
     'from_timestamp_with_tz',
     'cleaners',
     'tools',
+    'Crawler',
     'CrawlerProcess',
     'get_framework_initializer',
     'get_bootstrap_manager',
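The newly exported Crawler class lets a script drive a single spider directly instead of always going through CrawlerProcess. A minimal sketch of that usage, assuming a project-defined spider class (the name MySpider is a placeholder) and relying only on the Crawler(spider_cls, settings) constructor and crawl() coroutine that appear in the crawler.py hunks later in this diff:

    # Sketch only: MySpider is a placeholder for a spider class defined in your project.
    import asyncio
    from crawlo import Crawler

    async def run_single(spider_cls):
        # settings is optional in the constructor signature shown in this diff.
        crawler = Crawler(spider_cls, settings=None)
        await crawler.crawl()

    # asyncio.run(run_single(MySpider))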
crawlo/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = '1.4.6'
+__version__ = '1.4.8'
crawlo/cli.py
CHANGED
@@ -4,12 +4,12 @@
 import sys
 import argparse
 from crawlo.commands import get_commands
-from crawlo.utils.
+from crawlo.utils.config_manager import EnvConfigManager


 def main():
     # 获取框架版本号
-    VERSION = get_version()
+    VERSION = EnvConfigManager.get_version()

     # 获取所有可用命令
     commands = get_commands()
crawlo/commands/check.py
CHANGED
@@ -24,7 +24,7 @@ from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler

 from crawlo.crawler import CrawlerProcess
-from crawlo.
+from crawlo.logging import get_logger


 logger = get_logger(__name__)
crawlo/commands/help.py
CHANGED
@@ -11,10 +11,10 @@ from rich.table import Table
 from rich.panel import Panel
 from rich.text import Text
 from rich import box
-from crawlo.utils.
+from crawlo.utils.config_manager import EnvConfigManager

 # 获取框架版本号
-VERSION = get_version()
+VERSION = EnvConfigManager.get_version()

 console = Console()

@@ -85,11 +85,13 @@ def show_help():

     # run 命令
     console.print("[bold cyan]run[/bold cyan] - 运行爬虫")
-    console.print(" 用法: crawlo run <spider_name>|all [--json] [--no-stats]")
+    console.print(" 用法: crawlo run <spider_name>|all [--json] [--no-stats] [--log-level LEVEL] [--config CONFIG] [--concurrency NUM]")
     console.print(" 示例:")
     console.print("   crawlo run myspider")
     console.print("   crawlo run all")
     console.print("   crawlo run all --json --no-stats")
+    console.print("   crawlo run myspider --log-level DEBUG")
+    console.print("   crawlo run myspider --concurrency 32")
     console.print()

crawlo/commands/list.py
CHANGED
@@ -16,7 +16,7 @@ from rich.text import Text
 from rich import box

 from crawlo.crawler import CrawlerProcess
-from crawlo.
+from crawlo.logging import get_logger
 from .utils import validate_project_environment, show_error_panel

 logger = get_logger(__name__)
crawlo/commands/run.py
CHANGED
@@ -24,7 +24,7 @@ from crawlo.project import get_settings, _find_project_root
 # 使用新的统一初始化系统
 from crawlo.initialization import initialize_framework
 from crawlo.core import get_framework_initializer
-from crawlo.
+from crawlo.logging import get_logger

 # 延迟获取logger,确保在日志系统配置之后获取
 _logger = None
@@ -89,7 +89,7 @@ def main(args):
     """
     主函数:运行指定爬虫
     用法:
-        crawlo run <spider_name>|all [--json] [--no-stats]
+        crawlo run <spider_name>|all [--json] [--no-stats] [--log-level LEVEL] [--config CONFIG] [--concurrency NUM]
     """
     # 确保框架已初始化
     init_manager = get_framework_initializer()
@@ -99,7 +99,7 @@ def main(args):

     if len(args) < 1:
         console.print(
-            "[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
+            "[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats] [--log-level LEVEL] [--config CONFIG] [--concurrency NUM][/bold yellow]")
         console.print("示例:")
         console.print(" [blue]crawlo run baidu[/blue]")
         console.print(" [blue]crawlo run all[/blue]")
@@ -110,6 +110,36 @@ def main(args):
     spider_arg = args[0]
     show_json = "--json" in args
     no_stats = "--no-stats" in args
+
+    # 解析日志级别参数
+    log_level = None
+    if "--log-level" in args:
+        try:
+            log_level_index = args.index("--log-level")
+            if log_level_index + 1 < len(args):
+                log_level = args[log_level_index + 1]
+        except (ValueError, IndexError):
+            pass
+
+    # 解析配置文件参数
+    config_file = None
+    if "--config" in args:
+        try:
+            config_index = args.index("--config")
+            if config_index + 1 < len(args):
+                config_file = args[config_index + 1]
+        except (ValueError, IndexError):
+            pass
+
+    # 解析并发数参数
+    concurrency = None
+    if "--concurrency" in args:
+        try:
+            concurrency_index = args.index("--concurrency")
+            if concurrency_index + 1 < len(args):
+                concurrency = int(args[concurrency_index + 1])
+        except (ValueError, IndexError, TypeError):
+            pass

     try:
         # 1. 查找项目根目录
@@ -171,7 +201,14 @@ def main(args):
             return 1

         # 4. 启动框架并加载 settings
-
+        # 如果指定了日志级别,则添加到自定义设置中
+        custom_settings = {}
+        if log_level:
+            custom_settings['LOG_LEVEL'] = log_level
+        if concurrency:
+            custom_settings['CONCURRENCY'] = concurrency
+
+        settings = initialize_framework(custom_settings if custom_settings else None)

         # 检查Redis连接(如果是分布式模式)
         if not check_redis_connection(settings):
@@ -183,7 +220,7 @@ def main(args):

         # 从配置中获取SPIDER_MODULES
         spider_modules = settings.get('SPIDER_MODULES', [f"{project_package}.spiders"])
-
+        # 合并重复的调试信息
         process = CrawlerProcess(settings=settings, spider_modules=spider_modules)

         # 不再需要手动导入爬虫模块,框架内部会自动处理
@@ -191,11 +228,11 @@ def main(args):
         from crawlo.spider import get_global_spider_registry
         registry = get_global_spider_registry()
         spider_names = list(registry.keys())
-
-
-        #
-        logger().debug(f"SPIDER_MODULES: {spider_modules}")
-        logger().debug(f"Available spiders: {process.get_spider_names()}")
+        # 减少重复的调试日志输出
+        # logger().debug(f"SPIDER_MODULES from settings: {spider_modules}")
+        # logger().debug(f"Registered spiders after import: {spider_names}")
+        # logger().debug(f"SPIDER_MODULES: {spider_modules}")
+        # logger().debug(f"Available spiders: {process.get_spider_names()}")

         # === 情况1:运行所有爬虫 ===
         if spider_arg.lower() == "all":
@@ -260,7 +297,8 @@ def main(args):
             panel_content.append("\n可用爬虫:\n")
             for name in sorted(available):
                 cls = process.get_spider_class(name)
-
+                class_name = cls.__name__ if cls else 'Unknown'
+                panel_content.append(f" • [cyan]{name}[/cyan] ([green]{class_name}[/green])\n")
         else:
             panel_content.append("\n未找到爬虫。请检查爬虫模块。")

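The new flags are parsed by scanning the raw argument list with list.index() rather than argparse; in the hunks above only LOG_LEVEL and CONCURRENCY are forwarded to initialize_framework() as custom settings, while the parsed --config value is not used further in the code shown. A self-contained sketch of the same index-based pattern (the helper name below is illustrative, not a function in run.py):

    # Illustrative sketch of the index-based flag parsing used in crawlo/commands/run.py.
    from typing import List, Optional

    def parse_value_flag(args: List[str], flag: str) -> Optional[str]:
        # Returns the token following `flag`, or None if the flag is absent
        # or has no value after it.
        if flag not in args:
            return None
        try:
            idx = args.index(flag)
            if idx + 1 < len(args):
                return args[idx + 1]
        except ValueError:
            pass
        return None

    # Example: crawlo run myspider --log-level DEBUG --concurrency 32
    argv = ["myspider", "--log-level", "DEBUG", "--concurrency", "32"]
    log_level = parse_value_flag(argv, "--log-level")          # "DEBUG"
    concurrency_raw = parse_value_flag(argv, "--concurrency")  # "32"
    concurrency = int(concurrency_raw) if concurrency_raw else None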
crawlo/commands/stats.py
CHANGED
crawlo/config.py
CHANGED
@@ -23,7 +23,7 @@ from typing import Dict, Any, Optional

 from crawlo.config_validator import validate_config
 from crawlo.mode_manager import standalone_mode, distributed_mode, auto_mode, from_env
-from crawlo.
+from crawlo.logging import get_logger


 class CrawloConfig:
@@ -51,13 +51,21 @@ class CrawloConfig:
         return self.settings.get(key, default)

     def set(self, key: str, value: Any) -> 'CrawloConfig':
-        """设置配置项(链式调用)"""
+        """设置配置项(链式调用)
+
+        注意:设置后会自动验证配置合法性
+        """
         self.settings[key] = value
+        self._validate_settings()  # 自动验证
         return self

     def update(self, settings: Dict[str, Any]) -> 'CrawloConfig':
-        """更新配置(链式调用)"""
+        """更新配置(链式调用)
+
+        注意:更新后会自动验证配置合法性
+        """
         self.settings.update(settings)
+        self._validate_settings()  # 自动验证
         return self

     def set_concurrency(self, concurrency: int) -> 'CrawloConfig':
@@ -95,7 +103,7 @@ class CrawloConfig:
            'auto': '自动检测模式'
        }

-        queue_type = self.settings.get('QUEUE_TYPE', '
+        queue_type = self.settings.get('QUEUE_TYPE', 'auto')
         filter_class = self.settings.get('FILTER_CLASS', '').split('.')[-1]
         concurrency = self.settings.get('CONCURRENCY', 8)

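With this change, set() and update() re-run _validate_settings() on every mutation, so an invalid value now fails at the call site while the chainable style is preserved. A small sketch of the validate-on-write pattern, using a made-up ValidatedConfig class (not the real CrawloConfig) and a single illustrative rule:

    # Sketch of the validate-on-write pattern; not the actual CrawloConfig code.
    from typing import Any, Dict, Optional

    class ValidatedConfig:
        def __init__(self, settings: Optional[Dict[str, Any]] = None):
            self.settings = dict(settings or {})
            self._validate()

        def set(self, key: str, value: Any) -> "ValidatedConfig":
            self.settings[key] = value
            self._validate()          # fail fast on the mutating call
            return self               # keep the chainable style

        def update(self, settings: Dict[str, Any]) -> "ValidatedConfig":
            self.settings.update(settings)
            self._validate()
            return self

        def _validate(self) -> None:
            # Illustrative rule only; the real validator lives in crawlo.config_validator.
            concurrency = self.settings.get("CONCURRENCY", 8)
            if not isinstance(concurrency, int) or concurrency <= 0:
                raise ValueError(f"CONCURRENCY must be a positive int, got {concurrency!r}")

    cfg = ValidatedConfig().set("CONCURRENCY", 16).update({"QUEUE_TYPE": "auto"})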
crawlo/config_validator.py
CHANGED
crawlo/core/engine.py
CHANGED
@@ -9,13 +9,13 @@ from crawlo import Request, Item
 from crawlo.core.processor import Processor
 from crawlo.core.scheduler import Scheduler
 from crawlo.downloader import DownloaderBase
-from crawlo.event import
+from crawlo.event import CrawlerEvent
 from crawlo.exceptions import OutputError
 from crawlo.utils.misc import load_object
 from crawlo.spider import Spider
 from crawlo.task_manager import TaskManager
 from crawlo.utils.func_tools import transform
-from crawlo.
+from crawlo.logging import get_logger


 class Engine(object):
@@ -94,6 +94,17 @@ class Engine(object):
         else:
             # DownloaderBase.open() 是同步方法,直接调用而不是await
             self.downloader.open()
+
+        # 注册下载器到资源管理器
+        if hasattr(self.crawler, '_resource_manager') and self.downloader:
+            from crawlo.utils.resource_manager import ResourceType
+            self.crawler._resource_manager.register(
+                self.downloader,
+                lambda d: d.close() if hasattr(d, 'close') else None,
+                ResourceType.DOWNLOADER,
+                name=f"downloader.{downloader_cls.__name__}"
+            )
+            self.logger.debug(f"Downloader registered to resource manager: {downloader_cls.__name__}")

         self.processor = Processor(self.crawler)
         if hasattr(self.processor, 'open'):
@@ -188,11 +199,13 @@ class Engine(object):
         while self.running:
             try:
                 start_request = next(self.start_requests)
-
+                # 将过于频繁的debug日志合并,减少输出
+                # self.logger.debug(f"获取到请求: {getattr(start_request, 'url', 'Unknown URL')}")
                 # 请求入队
                 await self.enqueue_request(start_request)
                 processed_count += 1
-
+                # 减少过于频繁的日志输出
+                # self.logger.debug(f"已处理请求数量: {processed_count}")
             except StopIteration:
                 self.logger.debug("所有起始请求处理完成")
                 self.start_requests = None
@@ -306,7 +319,7 @@ class Engine(object):
             wait_time = min(wait_time * 1.1, max_wait)

     async def _open_spider(self):
-        asyncio.create_task(self.crawler.subscriber.notify(
+        asyncio.create_task(self.crawler.subscriber.notify(CrawlerEvent.SPIDER_OPENED))
         # 直接调用crawl方法而不是创建任务,确保等待完成
         await self.crawl()

@@ -364,7 +377,7 @@ class Engine(object):
     async def _schedule_request(self, request):
         # TODO 去重
         if await self.scheduler.enqueue_request(request):
-            asyncio.create_task(self.crawler.subscriber.notify(
+            asyncio.create_task(self.crawler.subscriber.notify(CrawlerEvent.REQUEST_SCHEDULED, request, self.crawler.spider))

     async def _get_next_request(self):
         return await self.scheduler.next_request()
@@ -375,7 +388,7 @@ class Engine(object):
             await self.processor.enqueue(spider_output)
         elif isinstance(spider_output, Exception):
             asyncio.create_task(
-                self.crawler.subscriber.notify(
+                self.crawler.subscriber.notify(CrawlerEvent.SPIDER_ERROR, spider_output, self.spider)
             )
             raise spider_output
         else:
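The engine now publishes typed CrawlerEvent members (SPIDER_OPENED, REQUEST_SCHEDULED, SPIDER_ERROR) through crawler.subscriber.notify(...). The sketch below illustrates the general enum-keyed async notify pattern those calls imply; it is not crawlo's actual Subscriber or CrawlerEvent implementation, and the handler-registration API is invented for the example:

    # Illustration of an enum-keyed async event bus; not crawlo's real event.py.
    import asyncio
    from collections import defaultdict
    from enum import Enum, auto

    class CrawlerEvent(Enum):
        SPIDER_OPENED = auto()
        SPIDER_CLOSED = auto()
        REQUEST_SCHEDULED = auto()
        SPIDER_ERROR = auto()

    class Subscriber:
        def __init__(self):
            self._handlers = defaultdict(list)

        def subscribe(self, event: CrawlerEvent, handler):
            self._handlers[event].append(handler)

        async def notify(self, event: CrawlerEvent, *args, **kwargs):
            # Fan the event out to every registered async handler.
            for handler in self._handlers[event]:
                await handler(*args, **kwargs)

    async def on_spider_opened():
        print("spider opened")

    async def main():
        bus = Subscriber()
        bus.subscribe(CrawlerEvent.SPIDER_OPENED, on_spider_opened)
        await bus.notify(CrawlerEvent.SPIDER_OPENED)

    # asyncio.run(main())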
crawlo/core/processor.py
CHANGED
@@ -4,7 +4,7 @@ from asyncio import Queue, create_task
 from typing import Union, Optional

 from crawlo import Request, Item
-from crawlo.event import
+from crawlo.event import CrawlerEvent
 from crawlo.exceptions import ItemDiscard
 from crawlo.pipelines.pipeline_manager import PipelineManager

crawlo/core/scheduler.py
CHANGED
@@ -3,7 +3,7 @@
 import traceback
 from typing import Optional, Callable

-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.utils.request import set_request
 from crawlo.utils.error_handler import ErrorHandler
 from crawlo.utils.misc import load_object
@@ -13,13 +13,13 @@ from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType


 class Scheduler:
-    def __init__(self, crawler, dupe_filter, stats,
+    def __init__(self, crawler, dupe_filter, stats, priority):
         self.crawler = crawler
         self.queue_manager: Optional[QueueManager] = None
         self.request_serializer = RequestSerializer()

-        self.logger = get_logger(
-        self.error_handler = ErrorHandler(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
+        self.error_handler = ErrorHandler(self.__class__.__name__)
         self.stats = stats
         self.dupe_filter = dupe_filter
         self.priority = priority
@@ -31,7 +31,6 @@ class Scheduler:
             crawler=crawler,
             dupe_filter=filter_cls.create_instance(crawler),
             stats=crawler.stats,
-            log_level=crawler.settings.get('LOG_LEVEL'),
             priority=crawler.settings.get('DEPTH_PRIORITY')
         )
         return o
crawlo/crawler.py
CHANGED
@@ -1,14 +1,19 @@
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
 """
-
-
+Crawler系统
+==========
+
+核心组件:
+- Crawler: 爬虫核心控制器,负责单个爬虫的生命周期管理
+- CrawlerProcess: 爬虫进程管理器,支持单个/多个爬虫运行

 设计原则:
 1. 单一职责 - 每个类只负责一个明确的功能
 2. 依赖注入 - 通过工厂创建组件,便于测试
 3. 状态管理 - 清晰的状态转换和生命周期
 4. 错误处理 - 优雅的错误处理和恢复机制
+5. 资源管理 - 统一的资源注册和清理机制
 """

 import asyncio
@@ -21,6 +26,7 @@ from typing import Optional, Type, Dict, Any, List
 from crawlo.logging import get_logger
 from crawlo.factories import get_component_registry
 from crawlo.initialization import initialize_framework, is_framework_ready
+from crawlo.utils.resource_manager import ResourceManager, ResourceType


 class CrawlerState(Enum):
@@ -55,15 +61,16 @@ class CrawlerMetrics:
         return (self.success_count / total * 100) if total > 0 else 0.0


-class ModernCrawler:
+class Crawler:
     """
-
+    爬虫核心控制器

     特点:
     1. 清晰的状态管理
     2. 依赖注入
     3. 组件化架构
     4. 完善的错误处理
+    5. 统一的资源管理
     """

     def __init__(self, spider_cls: Type, settings=None):
@@ -82,6 +89,9 @@ class ModernCrawler:
         # 指标
         self._metrics = CrawlerMetrics()

+        # 资源管理器
+        self._resource_manager = ResourceManager(name=f"crawler.{spider_cls.__name__ if spider_cls else 'unknown'}")
+
         # 日志
         self._logger = get_logger(f'crawler.{spider_cls.__name__ if spider_cls else "unknown"}')

@@ -209,6 +219,14 @@ class ModernCrawler:

         # 创建Engine(需要crawler参数)
         self._engine = registry.create('engine', crawler=self)
+        # 注册Engine到资源管理器
+        if self._engine and hasattr(self._engine, 'close'):
+            self._resource_manager.register(
+                self._engine,
+                lambda e: e.close() if hasattr(e, 'close') else None,
+                ResourceType.OTHER,
+                name="engine"
+            )

         # 创建Stats(需要crawler参数)
         self._stats = registry.create('stats', crawler=self)
@@ -291,7 +309,15 @@ class ModernCrawler:
         self._state = CrawlerState.CLOSING

         try:
-            #
+            # 使用资源管理器统一清理
+            self._logger.debug("开始清理Crawler资源...")
+            cleanup_result = await self._resource_manager.cleanup_all()
+            self._logger.debug(
+                f"资源清理完成: {cleanup_result['success']}成功, "
+                f"{cleanup_result['errors']}失败, 耗时{cleanup_result['duration']:.2f}s"
+            )
+
+            # 关闭各个组件(继续兼容旧逻辑)
             if self._engine and hasattr(self._engine, 'close'):
                 try:
                     await self._engine.close()
@@ -318,7 +344,9 @@ class ModernCrawler:

             # 触发spider_closed事件,通知所有订阅者(包括扩展)
             # 传递reason参数,这里使用默认的'finished'作为reason
-
+            if self.subscriber:
+                from crawlo.event import CrawlerEvent
+                await self.subscriber.notify(CrawlerEvent.SPIDER_CLOSED, reason='finished')

             if self._stats and hasattr(self._stats, 'close'):
                 try:
@@ -348,7 +376,7 @@ class CrawlerProcess:
         # 初始化框架配置
         self._settings = settings or initialize_framework()
         self._max_concurrency = max_concurrency
-        self._crawlers: List[
+        self._crawlers: List[Crawler] = []
         self._semaphore = asyncio.Semaphore(max_concurrency)
         self._logger = get_logger('crawler.process')

@@ -497,7 +525,7 @@ class CrawlerProcess:
         logger.info(f"Starting spider: {spider_cls.name}")

         merged_settings = self._merge_settings(settings)
-        crawler =
+        crawler = Crawler(spider_cls, merged_settings)

         async with self._semaphore:
             await crawler.crawl()
@@ -526,7 +554,7 @@ class CrawlerProcess:
             tasks = []
             for spider_cls in spider_classes:
                 merged_settings = self._merge_settings(settings)
-                crawler =
+                crawler = Crawler(spider_cls, merged_settings)
                 self._crawlers.append(crawler)

                 task = asyncio.create_task(self._run_with_semaphore(crawler))
@@ -543,12 +571,25 @@ class CrawlerProcess:
             return results

         finally:
+            # 清理所有crawler,防止资源累积
+            self._logger.debug(f"Cleaning up {len(self._crawlers)} crawler(s)...")
+            for crawler in self._crawlers:
+                try:
+                    # 确保每个crawler都被清理
+                    if hasattr(crawler, '_resource_manager'):
+                        await crawler._resource_manager.cleanup_all()
+                except Exception as e:
+                    self._logger.warning(f"Failed to cleanup crawler: {e}")
+
+            # 清空crawlers列表,释放引用
+            self._crawlers.clear()
+
             self._end_time = time.time()
             if self._start_time:
                 duration = self._end_time - self._start_time
                 self._logger.info(f"Total execution time: {duration:.2f}s")

-    async def _run_with_semaphore(self, crawler:
+    async def _run_with_semaphore(self, crawler: Crawler):
         """在信号量控制下运行爬虫"""
         async with self._semaphore:
             await crawler.crawl()
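crawlo/utils/resource_manager.py is a new 337-line module whose body is not shown in this excerpt; the hunks above only reveal its call sites: register(resource, cleanup_callable, ResourceType.X, name=...) and an awaitable cleanup_all() returning a dict with 'success', 'errors', and 'duration' keys. A guessed minimal sketch consistent with those call sites (every name other than register, cleanup_all, ResourceType.DOWNLOADER, and ResourceType.OTHER is an assumption):

    # Inferred sketch only; the real crawlo ResourceManager may differ substantially.
    import inspect
    import time
    from enum import Enum, auto

    class ResourceType(Enum):
        DOWNLOADER = auto()
        OTHER = auto()

    class MiniResourceManager:
        def __init__(self, name: str):
            self.name = name
            self._entries = []  # (resource, cleanup_fn, resource_type, name)

        def register(self, resource, cleanup, rtype: ResourceType, name: str = ""):
            self._entries.append((resource, cleanup, rtype, name))

        async def cleanup_all(self) -> dict:
            start = time.monotonic()
            success, errors = 0, 0
            # Clean up in reverse registration order, awaiting coroutine results
            # (e.g. a downloader whose close() is async).
            for resource, cleanup, _rtype, _name in reversed(self._entries):
                try:
                    result = cleanup(resource)
                    if inspect.isawaitable(result):
                        await result
                    success += 1
                except Exception:
                    errors += 1
            self._entries.clear()
            return {"success": success, "errors": errors, "duration": time.monotonic() - start}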
crawlo/downloader/__init__.py
CHANGED
@@ -18,7 +18,7 @@ from abc import abstractmethod, ABCMeta
 from typing import Final, Set, Optional, TYPE_CHECKING
 from contextlib import asynccontextmanager

-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.middleware.middleware_manager import MiddlewareManager

 if TYPE_CHECKING:
@@ -68,12 +68,16 @@ class ActivateRequestManager:

     def get_stats(self) -> dict:
         """获取请求统计信息"""
+        completed = self._completed_requests + self._failed_requests
         return {
             'active_requests': len(self._active),
             'total_requests': self._total_requests,
             'completed_requests': self._completed_requests,
             'failed_requests': self._failed_requests,
-            'success_rate':
+            'success_rate': (
+                self._completed_requests / completed * 100
+                if completed > 0 else 100.0  # 无完成请求时返回100%
+            )
         }

     def reset_stats(self):
@@ -104,7 +108,7 @@ class DownloaderBase(metaclass=DownloaderMeta):
         self.crawler = crawler
         self._active = ActivateRequestManager()
         self.middleware: Optional[MiddlewareManager] = None
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         self._closed = False
         self._stats_enabled = crawler.settings.get_bool("DOWNLOADER_STATS", True)

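The reworked success_rate divides completed requests by completed-plus-failed and short-circuits to 100.0 when nothing has finished yet, avoiding a division by zero on a fresh downloader. The standalone function below mirrors that guarded calculation:

    def success_rate(completed_ok: int, failed: int) -> float:
        # Mirrors the guarded calculation in ActivateRequestManager.get_stats():
        # with no finished requests yet, report 100% instead of dividing by zero.
        finished = completed_ok + failed
        return (completed_ok / finished * 100) if finished > 0 else 100.0

    assert success_rate(0, 0) == 100.0
    assert success_rate(9, 1) == 90.0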
crawlo/downloader/aiohttp_downloader.py
CHANGED
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
+import asyncio
 from yarl import URL
 from typing import Optional
 from aiohttp import (
@@ -13,7 +14,7 @@ from aiohttp import (
 )

 from crawlo.network.response import Response
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.downloader import DownloaderBase


@@ -31,7 +32,7 @@ class AioHttpDownloader(DownloaderBase):
         super().__init__(crawler)
         self.session: Optional[ClientSession] = None
         self.max_download_size: int = 0
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)

     def open(self):
         super().open()
@@ -80,9 +81,6 @@ class AioHttpDownloader(DownloaderBase):
         # 输出下载器配置摘要
         spider_name = getattr(self.crawler.spider, 'name', 'Unknown')
         concurrency = self.crawler.settings.get('CONCURRENCY', 4)
-        # self.logger.debug(f"下载器初始化完成 [爬虫: {spider_name}, 类型: {self.__class__.__name__}, 并发数: {concurrency}]") # 注释掉重复的日志
-
-        # self.logger.debug("AioHttpDownloader initialized.") # 注释掉重复的日志

     async def download(self, request) -> Optional[Response]:
         """下载请求并返回响应"""
@@ -206,28 +204,30 @@ class AioHttpDownloader(DownloaderBase):
     # --- 请求追踪日志 ---
     async def _on_request_start(self, session, trace_config_ctx, params):
         """请求开始时的回调。"""
-
-        # proxy_info = f" via {proxy}" if proxy else ""
-        # self.logger.debug(f"Requesting: {params.method} {params.url}{proxy_info}") # 注释掉过于详细的日志
+        pass

     async def _on_request_end(self, session, trace_config_ctx, params):
         """请求成功结束时的回调。"""
-
-        # self.logger.debug(
-        #     f"Finished: {params.method} {params.url} with status {response.status}"
-        # ) # 注释掉过于详细的日志
+        pass

     async def _on_request_exception(self, session, trace_config_ctx, params):
         """请求发生异常时的回调。"""
-
-        # self.logger.warning(
-        #     f"Failed: {params.method} {params.url} with exception {type(exc).__name__}: {exc}"
-        # ) # 注释掉过于详细的日志
+        pass

     async def close(self) -> None:
         """关闭会话资源"""
         if self.session and not self.session.closed:
-            # 恢复关键的下载器关闭信息为INFO级别
             self.logger.info("Closing AioHttpDownloader session...")
-
+            try:
+                # 关闭 session
+                await self.session.close()
+
+                # 等待一小段时间确保连接完全关闭
+                # 参考: https://docs.aiohttp.org/en/stable/client_advanced.html#graceful-shutdown
+                await asyncio.sleep(0.25)
+            except Exception as e:
+                self.logger.warning(f"Error during session close: {e}")
+            finally:
+                self.session = None
+
         self.logger.debug("AioHttpDownloader closed.")