crawlo-1.3.2-py3-none-any.whl → crawlo-1.3.4-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of crawlo has been flagged as possibly problematic and may warrant closer review.
- crawlo/__init__.py +24 -0
- crawlo/__version__.py +1 -1
- crawlo/commands/run.py +58 -32
- crawlo/core/__init__.py +44 -0
- crawlo/core/engine.py +119 -45
- crawlo/core/scheduler.py +4 -3
- crawlo/crawler.py +603 -1133
- crawlo/downloader/aiohttp_downloader.py +4 -2
- crawlo/extension/__init__.py +1 -1
- crawlo/extension/logging_extension.py +23 -7
- crawlo/factories/__init__.py +28 -0
- crawlo/factories/base.py +69 -0
- crawlo/factories/crawler.py +104 -0
- crawlo/factories/registry.py +85 -0
- crawlo/filters/aioredis_filter.py +25 -2
- crawlo/framework.py +292 -0
- crawlo/initialization/__init__.py +40 -0
- crawlo/initialization/built_in.py +426 -0
- crawlo/initialization/context.py +142 -0
- crawlo/initialization/core.py +194 -0
- crawlo/initialization/phases.py +149 -0
- crawlo/initialization/registry.py +146 -0
- crawlo/items/base.py +2 -1
- crawlo/logging/__init__.py +38 -0
- crawlo/logging/config.py +97 -0
- crawlo/logging/factory.py +129 -0
- crawlo/logging/manager.py +112 -0
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/middleware/offsite.py +1 -1
- crawlo/mode_manager.py +26 -1
- crawlo/pipelines/pipeline_manager.py +2 -1
- crawlo/project.py +76 -46
- crawlo/queue/pqueue.py +11 -5
- crawlo/queue/queue_manager.py +143 -19
- crawlo/queue/redis_priority_queue.py +69 -49
- crawlo/settings/default_settings.py +110 -14
- crawlo/settings/setting_manager.py +29 -13
- crawlo/spider/__init__.py +34 -16
- crawlo/stats_collector.py +17 -3
- crawlo/task_manager.py +112 -3
- crawlo/templates/project/settings.py.tmpl +103 -202
- crawlo/templates/project/settings_distributed.py.tmpl +122 -135
- crawlo/templates/project/settings_gentle.py.tmpl +149 -43
- crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
- crawlo/templates/project/settings_minimal.py.tmpl +46 -15
- crawlo/templates/project/settings_simple.py.tmpl +138 -75
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
- crawlo/templates/run.py.tmpl +10 -14
- crawlo/templates/spiders_init.py.tmpl +10 -0
- crawlo/tools/network_diagnostic.py +365 -0
- crawlo/utils/class_loader.py +26 -0
- crawlo/utils/error_handler.py +76 -35
- crawlo/utils/log.py +41 -144
- crawlo/utils/redis_connection_pool.py +43 -6
- crawlo/utils/request_serializer.py +8 -1
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
- tests/authenticated_proxy_example.py +2 -2
- tests/baidu_performance_test.py +109 -0
- tests/baidu_test.py +60 -0
- tests/comprehensive_framework_test.py +213 -0
- tests/comprehensive_test.py +82 -0
- tests/comprehensive_testing_summary.md +187 -0
- tests/debug_configure.py +70 -0
- tests/debug_framework_logger.py +85 -0
- tests/debug_log_levels.py +64 -0
- tests/distributed_test.py +67 -0
- tests/distributed_test_debug.py +77 -0
- tests/final_command_test_report.md +0 -0
- tests/final_comprehensive_test.py +152 -0
- tests/final_validation_test.py +183 -0
- tests/framework_performance_test.py +203 -0
- tests/optimized_performance_test.py +212 -0
- tests/performance_comparison.py +246 -0
- tests/queue_blocking_test.py +114 -0
- tests/queue_test.py +90 -0
- tests/scrapy_comparison/ofweek_scrapy.py +139 -0
- tests/scrapy_comparison/scrapy_test.py +134 -0
- tests/simple_command_test.py +120 -0
- tests/simple_crawlo_test.py +128 -0
- tests/simple_log_test.py +58 -0
- tests/simple_optimization_test.py +129 -0
- tests/simple_spider_test.py +50 -0
- tests/simple_test.py +48 -0
- tests/test_all_commands.py +231 -0
- tests/test_batch_processor.py +179 -0
- tests/test_component_factory.py +175 -0
- tests/test_controlled_spider_mixin.py +80 -0
- tests/test_enhanced_error_handler_comprehensive.py +246 -0
- tests/test_factories.py +253 -0
- tests/test_framework_logger.py +67 -0
- tests/test_framework_startup.py +65 -0
- tests/test_large_scale_config.py +113 -0
- tests/test_large_scale_helper.py +236 -0
- tests/test_mode_change.py +73 -0
- tests/test_mode_consistency.py +1 -1
- tests/test_performance_monitor.py +116 -0
- tests/test_queue_empty_check.py +42 -0
- tests/untested_features_report.md +139 -0
- tests/verify_debug.py +52 -0
- tests/verify_log_fix.py +112 -0
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
crawlo/settings/default_settings.py
CHANGED

@@ -6,18 +6,32 @@
 # Import the environment variable configuration helpers
 from crawlo.utils.env_config import get_redis_config, get_runtime_config, get_version

+# Framework initialization control
+FRAMEWORK_INIT_ORDER = [
+    'log_system',           # logging system
+    'settings_system',      # settings system
+    'core_components',      # core components
+    'extensions',           # extension components
+    'full_initialization'   # full initialization
+]
+FRAMEWORK_INIT_STATE = 'uninitialized'
+
 # ============================== Basic project settings ==============================

 # Project name (used for logging, Redis keys, etc.)
 PROJECT_NAME = get_runtime_config()['PROJECT_NAME']

+# Make sure the project name is not empty
+if not PROJECT_NAME or PROJECT_NAME == 'None':
+    PROJECT_NAME = 'crawlo'
+
 # Project version - read from the framework's __version__.py file, falling back to a default if it does not exist
 VERSION = get_version()

 # Run mode: standalone/distributed/auto
 RUN_MODE = get_runtime_config()['CRAWLO_MODE']

-# Concurrency settings
+# Concurrency settings - default tuned for better performance
 CONCURRENCY = get_runtime_config()['CONCURRENCY']

 # ============================== Core crawler settings ==============================
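The new FRAMEWORK_INIT_ORDER and FRAMEWORK_INIT_STATE constants describe a phased start-up sequence (logging first, then settings, core components, extensions, full initialization). A minimal sketch of how an ordered phase list like this can drive initialization, assuming nothing about crawlo's actual init code (the phase callables here are placeholders):

    FRAMEWORK_INIT_ORDER = [
        'log_system', 'settings_system', 'core_components',
        'extensions', 'full_initialization',
    ]

    def initialize(phases: dict) -> str:
        """Run each phase in order; the last completed phase becomes the state."""
        state = 'uninitialized'
        for name in FRAMEWORK_INIT_ORDER:
            phases[name]()   # let a failing phase raise before later phases run
            state = name
        return state

    # No-op phases just to exercise the ordering:
    assert initialize({n: (lambda: None) for n in FRAMEWORK_INIT_ORDER}) == 'full_initialization'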
@@ -25,8 +39,8 @@ CONCURRENCY = get_runtime_config()['CONCURRENCY']
 # Default downloader
 DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"

-#
-DOWNLOAD_DELAY =
+# Request delay in seconds - default tuned for better performance
+DOWNLOAD_DELAY = 0.5

 # Random delay settings
 RANDOMNESS = False  # whether to enable random delays

@@ -35,8 +49,15 @@ RANDOM_RANGE = [0.5, 1.5]  # random delay range factor
 # Depth priority (negative = depth-first, positive = breadth-first)
 DEPTH_PRIORITY = 1

-# Maximum scheduler queue size
-SCHEDULER_MAX_QUEUE_SIZE =
+# Maximum scheduler queue size - default tuned for better performance
+SCHEDULER_MAX_QUEUE_SIZE = 5000
+# Backpressure control - default tuned for better performance
+BACKPRESSURE_RATIO = 0.9  # backpressure triggers when the queue reaches 90% of its maximum capacity
+
+# Request generation control
+REQUEST_GENERATION_BATCH_SIZE = 10  # request generation batch size
+REQUEST_GENERATION_INTERVAL = 0.01  # request generation interval in seconds
+ENABLE_CONTROLLED_REQUEST_GENERATION = False  # whether to enable controlled request generation

 # Scheduler queue name (follows the unified naming convention)
 SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
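BACKPRESSURE_RATIO, REQUEST_GENERATION_BATCH_SIZE and REQUEST_GENERATION_INTERVAL together throttle how fast requests are fed into a bounded queue. A rough illustration of the arithmetic with plain asyncio primitives, independent of crawlo's scheduler:

    import asyncio

    SCHEDULER_MAX_QUEUE_SIZE = 5000
    BACKPRESSURE_RATIO = 0.9
    REQUEST_GENERATION_BATCH_SIZE = 10
    REQUEST_GENERATION_INTERVAL = 0.01

    async def feed(requests: list, queue: asyncio.Queue) -> None:
        """Enqueue requests in small batches, pausing while the queue is nearly full."""
        threshold = SCHEDULER_MAX_QUEUE_SIZE * BACKPRESSURE_RATIO  # 4500 pending items
        for i in range(0, len(requests), REQUEST_GENERATION_BATCH_SIZE):
            while queue.qsize() >= threshold:                  # backpressure kicks in
                await asyncio.sleep(REQUEST_GENERATION_INTERVAL)
            for request in requests[i:i + REQUEST_GENERATION_BATCH_SIZE]:
                await queue.put(request)
            await asyncio.sleep(REQUEST_GENERATION_INTERVAL)   # pacing between batches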
@@ -44,13 +65,15 @@ SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
 # Queue type: memory/redis/auto
 QUEUE_TYPE = 'auto'

+# Queue settings
+QUEUE_MAX_RETRIES = 3  # maximum number of retries for queue operations
+QUEUE_TIMEOUT = 300  # queue operation timeout in seconds

 # Use the in-memory filter and dedup pipeline by default so the framework also runs without Redis
 # In auto mode, if Redis is available, the framework automatically switches to the Redis implementations for better deduplication
-DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
+DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
 FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

-
 MYSQL_HOST = '127.0.0.1'
 MYSQL_PORT = 3306
 MYSQL_USER = 'root'

@@ -60,7 +83,6 @@ MYSQL_TABLE = 'crawlo'
 MYSQL_BATCH_SIZE = 100
 MYSQL_USE_BATCH = False  # whether to enable batch inserts

-
 # --- Redis filter settings ---
 # Fetch the Redis settings via the environment variable helper
 redis_config = get_redis_config()
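QUEUE_MAX_RETRIES and QUEUE_TIMEOUT suggest retry-with-timeout semantics around queue operations (useful when the Redis queue is slow or briefly unavailable). A generic sketch of that pattern, not crawlo's actual queue manager:

    import asyncio

    QUEUE_MAX_RETRIES = 3
    QUEUE_TIMEOUT = 300  # seconds

    async def put_with_retry(queue: asyncio.Queue, item) -> bool:
        """Enqueue an item, retrying on timeout up to QUEUE_MAX_RETRIES times."""
        for attempt in range(1, QUEUE_MAX_RETRIES + 1):
            try:
                await asyncio.wait_for(queue.put(item), timeout=QUEUE_TIMEOUT)
                return True
            except asyncio.TimeoutError:
                if attempt == QUEUE_MAX_RETRIES:
                    return False
                await asyncio.sleep(2 ** attempt)  # simple exponential backoff
        return False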
@@ -111,10 +133,6 @@ PIPELINES = [
     'crawlo.pipelines.console_pipeline.ConsolePipeline',
 ]

-# Explicitly add the default dedup pipeline to the head of the pipeline list
-# Note: this is now handled in SettingManager to avoid duplicate insertion
-# PIPELINES.insert(0, DEFAULT_DEDUP_PIPELINE)
-
 # ============================== Default framework extensions ==============================

 # Framework extension list (built-in extensions + user-defined extensions)

@@ -126,9 +144,9 @@ EXTENSIONS = [

 # ============================== Logging and monitoring ==============================

-LOG_LEVEL =
+LOG_LEVEL = None  # log level: DEBUG/INFO/WARNING/ERROR; defaults to None and is set in the project settings
 STATS_DUMP = True  # whether to dump statistics periodically
-LOG_FILE =
+LOG_FILE = None  # log file path; set in the project configuration
 LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
 LOG_ENCODING = 'utf-8'

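Since LOG_LEVEL and LOG_FILE now default to None, projects are expected to pin them in their own settings module; left unset, LOG_FILE is later derived from PROJECT_NAME (see the SettingManager change below). An illustrative project-level override with made-up values:

    # settings.py of a generated project (illustrative values only)
    PROJECT_NAME = 'my_project'
    LOG_LEVEL = 'INFO'                      # DEBUG / INFO / WARNING / ERROR
    LOG_FILE = f'logs/{PROJECT_NAME}.log'   # leave as None to let the framework derive it
    STATS_DUMP = True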
@@ -151,6 +169,8 @@ PROXY_EXTRACTOR = "proxy"
 # Proxy refresh control
 PROXY_REFRESH_INTERVAL = 60  # proxy refresh interval in seconds
 PROXY_API_TIMEOUT = 10  # timeout for requests to the proxy API
+PROXY_POOL_SIZE = 5  # proxy pool size
+PROXY_HEALTH_CHECK_THRESHOLD = 0.5  # proxy health check threshold

 # ============================== Curl-Cffi specific settings ==============================


@@ -183,6 +203,17 @@ HTTPX_FOLLOW_REDIRECTS = True  # whether to follow redirects automatically
 AIOHTTP_AUTO_DECOMPRESS = True  # whether to auto-decompress responses
 AIOHTTP_FORCE_CLOSE = False  # whether to force-close connections

+# Common downloader settings
+DOWNLOAD_TIMEOUT = 30  # download timeout in seconds
+VERIFY_SSL = True  # whether to verify SSL certificates
+CONNECTION_POOL_LIMIT = 100  # connection pool size limit
+CONNECTION_POOL_LIMIT_PER_HOST = 20  # per-host connection pool size limit
+DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # maximum download size in bytes
+DOWNLOAD_STATS = True  # whether to collect download statistics
+DOWNLOAD_WARN_SIZE = 1024 * 1024  # download size warning threshold in bytes
+DOWNLOAD_RETRY_TIMES = 3  # number of download retries
+MAX_RETRY_TIMES = 3  # maximum number of retries
+
 # ============================== Selenium downloader settings ==============================

 # Selenium base settings
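The new generic downloader settings (timeout, SSL verification, connection pool limits) map naturally onto an aiohttp client. A hedged sketch of how such values could be wired up; this is only an illustration, not crawlo's AioHttpDownloader:

    import aiohttp

    DOWNLOAD_TIMEOUT = 30
    VERIFY_SSL = True
    CONNECTION_POOL_LIMIT = 100
    CONNECTION_POOL_LIMIT_PER_HOST = 20

    async def fetch(url: str) -> bytes:
        connector = aiohttp.TCPConnector(
            limit=CONNECTION_POOL_LIMIT,                    # total pooled connections
            limit_per_host=CONNECTION_POOL_LIMIT_PER_HOST,  # per-host cap
            ssl=None if VERIFY_SSL else False,              # False disables certificate checks
        )
        timeout = aiohttp.ClientTimeout(total=DOWNLOAD_TIMEOUT)
        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            async with session.get(url) as response:
                return await response.read()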
@@ -223,3 +254,68 @@ MEMORY_MONITOR_ENABLED = False  # whether to enable memory monitoring
 MEMORY_MONITOR_INTERVAL = 60  # memory monitor check interval in seconds
 MEMORY_WARNING_THRESHOLD = 80.0  # memory usage warning threshold (percent)
 MEMORY_CRITICAL_THRESHOLD = 90.0  # memory usage critical threshold (percent)
+
+# ============================== Performance profiling ==============================
+
+# The performance profiler extension is disabled by default
+PERFORMANCE_PROFILER_ENABLED = False  # whether to enable performance profiling
+PERFORMANCE_PROFILER_OUTPUT_DIR = 'profiling'  # profiling output directory
+PERFORMANCE_PROFILER_INTERVAL = 300  # profiling interval in seconds
+
+# ============================== Health checks ==============================
+
+# The health check extension is enabled by default
+HEALTH_CHECK_ENABLED = True  # whether to enable health checks
+
+# ============================== Log interval ==============================
+
+# Log interval extension settings
+INTERVAL = 60  # log output interval in seconds
+
+# ============================== Custom logging ==============================
+
+# Custom logging extension settings
+LOG_ENABLE_CUSTOM = False  # whether to enable custom logging
+
+# ============================== Default request headers ==============================
+
+# Default request header settings
+DEFAULT_REQUEST_HEADERS = {}  # default request headers
+USER_AGENT = None  # user agent
+USER_AGENTS = []  # user agent list
+RANDOM_HEADERS = {}  # random request headers
+RANDOM_USER_AGENT_ENABLED = False  # whether to enable random user agents
+USER_AGENT_DEVICE_TYPE = "all"  # user agent device type
+
+# ============================== Offsite filtering ==============================
+
+# Offsite filtering settings
+ALLOWED_DOMAINS = []  # list of allowed domains
+
+# ============================== Bloom filter ==============================
+
+# Bloom filter settings
+BLOOM_FILTER_CAPACITY = 1000000  # Bloom filter capacity
+BLOOM_FILTER_ERROR_RATE = 0.001  # Bloom filter error rate
+
+# ============================== CSV pipeline ==============================
+
+# CSV pipeline settings
+CSV_DELIMITER = ','  # CSV delimiter
+CSV_QUOTECHAR = '"'  # CSV quote character
+CSV_INCLUDE_HEADERS = True  # whether to include headers
+CSV_EXTRASACTION = 'ignore'  # how to handle extra fields: ignore, raise
+CSV_FIELDNAMES = None  # list of field names
+CSV_FILE = None  # CSV file path
+CSV_DICT_FILE = None  # CSV dict file path
+CSV_BATCH_SIZE = 100  # CSV batch size
+CSV_BATCH_FILE = None  # CSV batch file path
+
+# ============================== Database dedup pipeline ==============================
+
+# Database dedup pipeline settings
+DB_HOST = 'localhost'  # database host
+DB_PORT = 3306  # database port
+DB_USER = 'root'  # database user
+DB_PASSWORD = ''  # database password
+DB_NAME = 'crawlo'  # database name
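For the Bloom filter settings, the capacity n and error rate p determine the bit-array size m and hash count k via the standard formulas m = -n * ln(p) / (ln 2)^2 and k = (m / n) * ln 2; evaluated for the defaults above:

    import math

    BLOOM_FILTER_CAPACITY = 1_000_000   # n: expected number of fingerprints
    BLOOM_FILTER_ERROR_RATE = 0.001     # p: acceptable false-positive rate

    m = -BLOOM_FILTER_CAPACITY * math.log(BLOOM_FILTER_ERROR_RATE) / math.log(2) ** 2
    k = (m / BLOOM_FILTER_CAPACITY) * math.log(2)

    print(f"{m / 8 / 1024 / 1024:.2f} MiB bit array, {round(k)} hash functions")
    # roughly 1.71 MiB and 10 hash functions for the defaults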
crawlo/settings/setting_manager.py
CHANGED

@@ -15,6 +15,8 @@ class SettingManager(MutableMapping):
         self.set_settings(default_settings)
         # Merge the configuration during initialization
         self._merge_config(values)
+        # Process dynamic configuration
+        self._process_dynamic_config()

     def _merge_config(self, user_config):
         """Merge the default configuration with the user configuration."""

@@ -81,6 +83,33 @@ class SettingManager(MutableMapping):
         if key not in ['MIDDLEWARES', 'PIPELINES', 'EXTENSIONS']:
             self.attributes[key] = value

+    def set_settings(self, module):
+        if isinstance(module, str):
+            module = import_module(module)
+
+        # Collect all configuration items from the module
+        module_settings = {}
+        for key in dir(module):
+            if key.isupper():
+                value = getattr(module, key)
+                module_settings[key] = value
+
+        # Use the merge logic instead of setting values directly
+        self._merge_config(module_settings)
+
+        # Process dynamic configuration items (e.g. LOG_FILE)
+        self._process_dynamic_config()
+
+    def _process_dynamic_config(self):
+        """
+        Process dynamic configuration items.
+        Some settings have to be computed from the values of other settings.
+        """
+        # Handle the LOG_FILE setting
+        if self.attributes.get('LOG_FILE') is None:
+            project_name = self.attributes.get('PROJECT_NAME', 'crawlo')
+            self.attributes['LOG_FILE'] = f'logs/{project_name}.log'
+
     def get(self, key, default=None):
         """Safely get a value without triggering recursion."""
         value = self.attributes.get(key, default)

@@ -133,19 +162,6 @@ class SettingManager(MutableMapping):
     def set(self, key, value):
         self.attributes[key] = value

-    def set_settings(self, module):
-        if isinstance(module, str):
-            module = import_module(module)
-
-        # Collect all configuration items from the module
-        module_settings = {}
-        for key in dir(module):
-            if key.isupper():
-                module_settings[key] = getattr(module, key)
-
-        # Use the merge logic instead of setting values directly
-        self._merge_config(module_settings)
-
     # Methods required by MutableMapping
     def __getitem__(self, item):
         return self.attributes[item]
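The net effect of the new _process_dynamic_config step is that a LOG_FILE left at None is filled in from PROJECT_NAME after every merge. The rule in isolation, as a standalone sketch rather than crawlo's SettingManager:

    def resolve_log_file(settings: dict) -> dict:
        """Derive LOG_FILE from PROJECT_NAME when it has not been set explicitly."""
        if settings.get('LOG_FILE') is None:
            project_name = settings.get('PROJECT_NAME', 'crawlo')
            settings['LOG_FILE'] = f'logs/{project_name}.log'
        return settings

    assert resolve_log_file({'PROJECT_NAME': 'books'})['LOG_FILE'] == 'logs/books.log'
    assert resolve_log_file({'LOG_FILE': 'custom.log'})['LOG_FILE'] == 'custom.log'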
crawlo/spider/__init__.py
CHANGED

@@ -77,7 +77,13 @@ class SpiderMeta(type):

         # Register the spider
         _DEFAULT_SPIDER_REGISTRY[spider_name] = cls
-
+        # Lazily initialize the logger to avoid blocking at module level
+        try:
+            from crawlo.utils.log import get_logger
+            get_logger(__name__).debug(f"Auto-registered spider: {spider_name} -> {cls.__name__}")
+        except:
+            # Fail silently if the logging system has not been initialized yet
+            pass

         return cls


@@ -153,12 +159,21 @@ class Spider(metaclass=SpiderMeta):

         # Initialize other attributes
         self.crawler = None
-
+        # Lazily initialize the logger to avoid blocking
+        self._logger = None
         self.stats = None

         # Apply extra keyword arguments
         for key, value in kwargs.items():
             setattr(self, key, value)
+
+    @property
+    def logger(self):
+        """Lazily initialized logger."""
+        if self._logger is None:
+            from crawlo.utils.log import get_logger
+            self._logger = get_logger(self.name)
+        return self._logger

     @classmethod
     def create_instance(cls, crawler) -> 'Spider':

@@ -172,13 +187,23 @@ class Spider(metaclass=SpiderMeta):
         spider.crawler = crawler
         spider.stats = getattr(crawler, 'stats', None)

-        # Merge custom settings
+        # Merge custom settings - applied lazily to avoid circular dependencies during initialization
         if hasattr(spider, 'custom_settings') and spider.custom_settings:
-
-
-
+            # Defer applying the settings until they are actually needed
+            spider._pending_settings = spider.custom_settings.copy()
+            spider.logger.debug(f"Preparing to apply {len(spider.custom_settings)} custom settings")

         return spider
+
+    def apply_pending_settings(self):
+        """Apply pending settings (called after initialization is complete)."""
+        if hasattr(self, '_pending_settings') and self._pending_settings:
+            for key, value in self._pending_settings.items():
+                if self.crawler and hasattr(self.crawler, 'settings'):
+                    self.crawler.settings.set(key, value)
+                    self.logger.debug(f"Applied custom setting: {key} = {value}")
+            # Clear the pending settings
+            delattr(self, '_pending_settings')

     def start_requests(self) -> Iterator[Request]:
         """

@@ -349,17 +374,10 @@
         Can be used to:
         - clean up resources
         - close database connections
-        - dump statistics
         """
-
-
-
-                'total_items': self.stats.get('total_items', 0),
-                'success_rate': self.stats.get('success_rate', 'N/A')
-            }
-            self.logger.info(f"Spider {self.name} closed, stats: {stats_summary}")
-        else:
-            self.logger.info(f"Spider {self.name} closed")
+        # No longer log anything here, to avoid duplicating the statistics output
+        # Statistics are reported by the StatsCollector
+        pass

     def __str__(self) -> str:
         return f"{self.__class__.__name__}(name='{self.name}')"
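The lazy logger property means the logging system is not touched until a spider first logs something, which avoids blocking work at import or construction time. The same pattern in isolation, using the standard logging module instead of crawlo's get_logger:

    import logging

    class LazyLoggerExample:
        name = 'example'

        def __init__(self):
            self._logger = None              # nothing is configured at construction time

        @property
        def logger(self) -> logging.Logger:
            if self._logger is None:         # created on first access only
                self._logger = logging.getLogger(self.name)
            return self._logger

    spider = LazyLoggerExample()
    spider.logger.debug("logger created only on first use")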
crawlo/stats_collector.py
CHANGED

@@ -46,9 +46,6 @@ class StatsCollector(object):

         self._stats['spider_name'] = spider_name

-        if self._dump:
-            self.logger.info(f'{spider_name} stats: \n{pformat(self._stats)}')
-
     def __getitem__(self, item):
         return self._stats[item]


@@ -57,3 +54,20 @@

     def __delitem__(self, key):
         del self._stats[key]
+
+    def close(self):
+        """Close the stats collector and dump the statistics."""
+        if self._dump:
+            # Get the spider name
+            spider_name = self._stats.get('spider_name', 'unknown')
+
+            # If the spider name has not been set yet, try to get it from the crawler
+            if spider_name == 'unknown' and hasattr(self, 'crawler') and self.crawler:
+                spider = getattr(self.crawler, 'spider', None)
+                if spider and hasattr(spider, 'name'):
+                    spider_name = spider.name
+                    # Also update spider_name in _stats
+                    self._stats['spider_name'] = spider_name
+
+            # Dump the statistics (this is the only place where they are reported)
+            self.logger.info(f'{spider_name} stats: \n{pformat(self._stats)}')
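Moving the dump out of the spider-name setter and into close() means the summary is printed exactly once, at shutdown. A condensed version of the dump-on-close pattern using plain logging and pprint rather than crawlo internals:

    import logging
    from pprint import pformat

    class MiniStats:
        def __init__(self, dump: bool = True):
            self._stats = {}
            self._dump = dump
            self.logger = logging.getLogger('stats')

        def inc_value(self, key: str, count: int = 1):
            self._stats[key] = self._stats.get(key, 0) + count

        def close(self, spider_name: str = 'unknown'):
            if self._dump:  # the single place where the summary is reported
                self.logger.info('%s stats:\n%s', spider_name, pformat(self._stats))

    stats = MiniStats()
    stats.inc_value('downloader/response_count')
    stats.close('example_spider')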
crawlo/task_manager.py
CHANGED

@@ -3,13 +3,79 @@
 import asyncio
 from asyncio import Task, Future, Semaphore
 from typing import Set, Final
+from collections import deque
+import time
+from crawlo.utils.log import get_logger
+
+
+class DynamicSemaphore(Semaphore):
+    """A semaphore that supports dynamic adjustment."""
+
+    def __init__(self, initial_value: int = 8):
+        super().__init__(initial_value)
+        self._initial_value = initial_value
+        self._current_value = initial_value
+        self._response_times = deque(maxlen=10)  # keep the 10 most recent response times
+        self._last_adjust_time = time.time()
+
+    def record_response_time(self, response_time: float):
+        """Record a response time."""
+        self._response_times.append(response_time)
+
+    def adjust_concurrency(self):
+        """Dynamically adjust concurrency based on response times."""
+        current_time = time.time()
+        # Limit the adjustment frequency to at most once per second (reduced from 2 seconds)
+        if current_time - self._last_adjust_time < 1:
+            return
+
+        self._last_adjust_time = current_time
+
+        if len(self._response_times) < 2:  # reduced from 3 to 2
+            return
+
+        # Compute the average response time
+        avg_response_time = sum(self._response_times) / len(self._response_times)
+
+        # Adjust the concurrency based on the response time
+        if avg_response_time < 0.2:  # fast responses: increase concurrency (threshold lowered from 0.3 to 0.2)
+            new_concurrency = min(self._current_value + 5, self._initial_value * 3)  # step raised from 3 to 5, cap raised from 2x to 3x
+        elif avg_response_time > 1.0:  # slow responses: decrease concurrency (threshold lowered from 1.5 to 1.0)
+            new_concurrency = max(self._current_value - 5, max(1, self._initial_value // 3))  # step raised from 3 to 5, floor lowered from one half to one third
+        else:
+            return  # keep the current concurrency
+
+        # Only adjust when the change is large enough
+        if abs(new_concurrency - self._current_value) > 1:
+            self._adjust_semaphore_value(new_concurrency)
+
+    def _adjust_semaphore_value(self, new_value: int):
+        """Adjust the value of the semaphore."""
+        if new_value > self._current_value:
+            # Increase the semaphore
+            for _ in range(new_value - self._current_value):
+                self.release()
+        elif new_value < self._current_value:
+            # Decrease the semaphore: only the new target value is recorded here,
+            # the actual reduction is handled at acquire time
+            pass
+
+        self._current_value = new_value
+        # Note: Python's Semaphore has no way to modify its internal counter directly,
+        # so increases go through release() and decreases have to be enforced at acquire time


 class TaskManager:

     def __init__(self, total_concurrency: int = 8):
         self.current_task: Final[Set] = set()
-
+        # Use the dynamic semaphore instead of a plain semaphore
+        self.semaphore: DynamicSemaphore = DynamicSemaphore(max(1, total_concurrency))
+        self.logger = get_logger(self.__class__.__name__)
+
+        # Exception statistics
+        self._exception_count = 0
+        self._total_tasks = 0

     async def create_task(self, coroutine) -> Task:
         # Wait on the semaphore to control concurrency

@@ -17,10 +83,39 @@

         task = asyncio.create_task(coroutine)
         self.current_task.add(task)
+        self._total_tasks += 1

         def done_callback(_future: Future) -> None:
-
-
+            try:
+                self.current_task.discard(task)  # use discard instead of remove to avoid KeyError
+
+                # Retrieve the task result or exception - this is the key point: result() or exception() must be called so the exception counts as "retrieved"
+                try:
+                    # Try to get the result; any exception is re-raised here
+                    result = _future.result()
+                    # On success, success statistics could be recorded here
+                except Exception as exception:
+                    # The exception has been properly "retrieved", so no "never retrieved" warning will appear
+                    self._exception_count += 1
+
+                    # Log the exception details
+                    self.logger.error(
+                        f"Task completed with exception: {type(exception).__name__}: {exception}"
+                    )
+                    self.logger.debug("Task exception details:", exc_info=exception)
+
+                    # Further exception handling could be added here, e.g. reporting to a monitoring system
+
+            except Exception as e:
+                # Guard against exceptions raised by the callback itself
+                self.logger.error(f"Error in task done callback: {e}")
+            finally:
+                # Make sure the semaphore is always released
+                self.semaphore.release()
+
+                # Periodically adjust the concurrency (now every 2 tasks instead of every 3)
+                if self._total_tasks % 2 == 0:
+                    self.semaphore.adjust_concurrency()

         task.add_done_callback(done_callback)


@@ -28,3 +123,17 @@ class TaskManager:

     def all_done(self) -> bool:
         return len(self.current_task) == 0
+
+    def record_response_time(self, response_time: float):
+        """Record a task's response time, used for dynamic concurrency adjustment."""
+        self.semaphore.record_response_time(response_time)
+
+    def get_stats(self) -> dict:
+        """Return task manager statistics."""
+        return {
+            'active_tasks': len(self.current_task),
+            'total_tasks': self._total_tasks,
+            'exception_count': self._exception_count,
+            'success_rate': (self._total_tasks - self._exception_count) / max(1, self._total_tasks) * 100,
+            'current_concurrency': self.semaphore._current_value
+        }
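The DynamicSemaphore widens or narrows concurrency from a rolling average of response times; because asyncio.Semaphore cannot shrink its counter directly, reductions only take effect as permits are re-acquired. A rough usage sketch of the TaskManager API shown above (assumes the 1.3.4 wheel is installed; the fake_download coroutine is a placeholder for real request handling):

    import asyncio
    import random
    import time

    from crawlo.task_manager import TaskManager  # module changed in this release

    async def fake_download(manager: TaskManager) -> None:
        start = time.monotonic()
        await asyncio.sleep(random.uniform(0.05, 0.5))            # stand-in for a real request
        manager.record_response_time(time.monotonic() - start)    # feeds the dynamic semaphore

    async def main() -> None:
        manager = TaskManager(total_concurrency=8)
        for _ in range(40):
            await manager.create_task(fake_download(manager))
        while not manager.all_done():
            await asyncio.sleep(0.1)
        print(manager.get_stats())  # active/total tasks, exception count, current concurrency

    asyncio.run(main())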