crawlo 1.2.9__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/commands/run.py +26 -35
- crawlo/commands/utils.py +12 -2
- crawlo/core/engine.py +1 -2
- crawlo/crawler.py +135 -69
- crawlo/extension/logging_extension.py +4 -2
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/middleware/offsite.py +2 -1
- crawlo/mode_manager.py +37 -100
- crawlo/pipelines/mysql_pipeline.py +5 -4
- crawlo/pipelines/pipeline_manager.py +15 -2
- crawlo/project.py +44 -37
- crawlo/settings/default_settings.py +13 -4
- crawlo/settings/setting_manager.py +55 -20
- crawlo/utils/log.py +21 -62
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/METADATA +13 -4
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/RECORD +20 -20
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/WHEEL +0 -0
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.9.dist-info → crawlo-1.3.1.dist-info}/top_level.txt +0 -0
crawlo/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = '1.2.9'
+__version__ = '1.3.1'
crawlo/commands/run.py
CHANGED
@@ -5,26 +5,27 @@
 # @Author : crawl-coder
 # @Desc : 命令行入口:crawlo run <spider_name>|all,用于运行指定爬虫。
 """
+import os
 import sys
 import asyncio
 import configparser
-import os
-from pathlib import Path
 from importlib import import_module
 
+from rich import box
 from rich.console import Console
 from rich.panel import Panel
+from rich.progress import Progress, SpinnerColumn, TextColumn
 from rich.table import Table
 from rich.text import Text
-from rich import box
-from rich.progress import Progress, SpinnerColumn, TextColumn
 
+from crawlo.commands.stats import record_stats
 from crawlo.crawler import CrawlerProcess
-from crawlo.utils.log import get_logger
 from crawlo.project import get_settings, _find_project_root
-
+# 使用自定义日志系统
+from crawlo.utils.log import get_logger
 
 logger = get_logger(__name__)
+
 console = Console()
 
 
@@ -77,6 +78,9 @@ def main(args):
     用法:
        crawlo run <spider_name>|all [--json] [--no-stats]
     """
+    # 添加调试信息
+    logger.debug("DEBUG: 进入main函数")
+
     if len(args) < 1:
         console.print("[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
         console.print("示例:")
@@ -187,21 +191,7 @@ def main(args):
             return 1
 
         # 显示即将运行的爬虫列表
-        table = Table(
-            title=f"启动全部 {len(spider_names)} 个爬虫",
-            box=box.ROUNDED,
-            show_header=True,
-            header_style="bold magenta"
-        )
-        table.add_column("名称", style="cyan")
-        table.add_column("类名", style="green")
-
-        for name in sorted(spider_names):
-            cls = process.get_spider_class(name)
-            table.add_row(name, cls.__name__)
-
-        console.print(table)
-        console.print()
+        # 根据用户要求,不再显示详细的爬虫列表信息
 
         # 注册 stats 记录(除非 --no-stats)
         if not no_stats:
@@ -260,20 +250,21 @@ def main(args):
         spider_class = process.get_spider_class(spider_name)
 
         # 显示启动信息
-        if not show_json:
-            info_table = Table(
-                title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
-                box=box.SIMPLE,
-                show_header=False,
-                title_style="bold green"
-            )
-            info_table.add_column("Key", style="yellow")
-            info_table.add_column("Value", style="cyan")
-            info_table.add_row("Project", project_package)
-            info_table.add_row("Class", spider_class.__name__)
-            info_table.add_row("Module", spider_class.__module__)
-            console.print(info_table)
-            console.print()
+        # 根据用户要求,不再显示项目启动信息
+        # if not show_json:
+        #     info_table = Table(
+        #         title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
+        #         box=box.SIMPLE,
+        #         show_header=False,
+        #         title_style="bold green"
+        #     )
+        #     info_table.add_column("Key", style="yellow")
+        #     info_table.add_column("Value", style="cyan")
+        #     info_table.add_row("Project", project_package)
+        #     info_table.add_row("Class", spider_class.__name__)
+        #     info_table.add_row("Module", spider_class.__module__)
+        #     console.print(info_table)
+        #     console.print()
 
         # 注册 stats 记录
         if not no_stats:
crawlo/commands/utils.py
CHANGED
@@ -133,8 +133,11 @@ def validate_spider_name(spider_name: str) -> bool:
         bool: 是否有效
     """
     import re
+    # 清理爬虫名称中的不可见字符
+    cleaned_name = ''.join(c for c in spider_name if not unicodedata.category(c).startswith('C'))
+
     # 爬虫名称应该是有效的Python标识符
-    return
+    return cleaned_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', cleaned_name)
 
 
 def format_file_size(size_bytes: int) -> str:
@@ -181,7 +184,14 @@ def is_valid_domain(domain: str) -> bool:
         bool: 是否有效
     """
     import re
+    # 清理域名中的不可见字符
+    cleaned_domain = ''.join(c for c in domain if not unicodedata.category(c).startswith('C'))
+
     pattern = re.compile(
         r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
     )
-    return bool(pattern.match(
+    return bool(pattern.match(cleaned_domain))
+
+
+# 添加导入
+import unicodedata
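Both validators now share the same idea: strip every Unicode control/format character (category "C*") before checking the value, so names or domains pasted with zero-width spaces or stray escape codes do not fail validation. A minimal standalone sketch of that cleaning step (the helper name and sample value below are illustrative, not part of the package):

    import re
    import unicodedata

    def strip_control_chars(text: str) -> str:
        # Drop characters whose Unicode category starts with "C"
        # (Cc control, Cf format, ...), e.g. a zero-width space.
        return ''.join(c for c in text if not unicodedata.category(c).startswith('C'))

    name = "my_spider\u200b"  # ends with a zero-width space (category Cf)
    cleaned = strip_control_chars(name)
    print(cleaned.isidentifier() and bool(re.match(r'^[a-z][a-z0-9_]*$', cleaned)))  # True

As in the diff, the cleaned value is only used for the check itself; the caller's original string is left untouched.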
crawlo/core/engine.py
CHANGED
@@ -75,8 +75,7 @@ class Engine(object):
         version = '1.0.0'
         # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
         self.logger.debug(
-            f"Crawlo Started version {version}"
-            # f"(project name : {self.settings.get('PROJECT_NAME')})"
+            f"Crawlo Started version {version}"
         )
 
     async def start_spider(self, spider):
crawlo/crawler.py
CHANGED
@@ -21,7 +21,7 @@ Example Usage:
     # Single crawler run
     crawler = Crawler(MySpider, settings)
     await crawler.crawl()
-
+
     # Multi-crawler concurrent management
     process = CrawlerProcess()
     await process.crawl([Spider1, Spider2])
@@ -34,24 +34,15 @@
 from typing import Type, Optional, Set, List, Union, Dict, Any
 from .spider import Spider, get_global_spider_registry
 from .core.engine import Engine
-from .utils.log import get_logger
 from .subscriber import Subscriber
 from .extension import ExtensionManager
+from crawlo.utils.log import get_logger
 from .stats_collector import StatsCollector
 from .event import spider_opened, spider_closed
 from .settings.setting_manager import SettingManager
 from crawlo.project import merge_settings, get_settings
 
-
-logger = None
-
-
-def _get_logger():
-    """延迟获取logger实例,确保在配置加载后创建"""
-    global logger
-    if logger is None:
-        logger = get_logger(__name__)
-    return logger
+logger = get_logger(__name__)
 
 
 class CrawlerContext:
@@ -110,7 +101,7 @@ class CrawlerContext:
 class Crawler:
     """
     Single crawler runtime instance, managing Spider and engine lifecycle
-
+
     Provides functionality:
     - Spider lifecycle management (initialization, running, closing)
     - Engine component coordination management
@@ -120,7 +111,12 @@ class Crawler:
     - Exception handling and cleanup
     """
 
-    def __init__(
+    def __init__(
+            self,
+            spider_cls: Type[Spider],
+            settings: SettingManager,
+            context: Optional[CrawlerContext] = None
+    ):
         self.spider_cls = spider_cls
         self.spider: Optional[Spider] = None
         self.engine: Optional[Engine] = None
@@ -145,10 +141,26 @@ class Crawler:
             'error_count': 0
         }
 
+        # Initialize components
+        self.subscriber = self._create_subscriber()
+        self.spider = self._create_spider()
+        self.engine = self._create_engine()
+        self.stats = self._create_stats()
+        # Note: Do not initialize extension manager here, let it initialize in the engine
+
+        # Validate crawler state
+        self._validate_crawler_state()
+
+        # 打印启动信息,确保在日志系统配置之后打印
+        self._log_startup_info()
+
+        # 将启动爬虫名称的日志移到这里,确保在日志系统配置之后打印
+        logger.info(f"Starting running {self.spider.name}")
+
     async def crawl(self):
         """
         Start the crawler core process
-
+
         Includes the following stages:
         1. Initialization stage: Create all components
         2. Validation stage: Check configuration and state
@@ -190,12 +202,12 @@ class Crawler:
             # Update context status
             self.context.increment_completed()
 
-
+            logger.info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")
 
         except Exception as e:
             self._performance_metrics['error_count'] += 1
             self.context.increment_failed(str(e))
-
+            logger.error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
             raise
         finally:
             self.context.decrement_active()
@@ -213,7 +225,7 @@ class Crawler:
         else:
             spider_name = 'Unknown'
 
-
+        logger.info(f"Starting running {spider_name}")
 
     def _validate_crawler_state(self):
         """
@@ -233,7 +245,7 @@ class Crawler:
         if not self.spider.name:
             raise ValueError("Spider name cannot be empty")
 
-
+        logger.debug(f"Spider {self.spider.name} state validation passed")
 
     def _get_total_duration(self) -> float:
         """Get total runtime"""
@@ -241,13 +253,59 @@ class Crawler:
             return self._end_time - self._start_time
         return 0.0
 
+    def _log_startup_info(self):
+        """Print startup information, including run mode and key configuration checks"""
+        # Get run mode
+        run_mode = self.settings.get('RUN_MODE', 'standalone')
+
+        # Get version number
+        version = self.settings.get('VERSION', '1.0.0')
+        if not version or version == 'None':
+            version = '1.0.0'
+
+        # Print framework start info
+        logger.info(f"Crawlo Framework Started {version}")
+
+        # Add mode info if available
+        mode_info = self.settings.get('_mode_info')
+        if mode_info:
+            logger.info(mode_info)
+        else:
+            # 如果没有_mode_info,添加默认信息
+            logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
+
+        # Get actual queue type
+        queue_type = self.settings.get('QUEUE_TYPE', 'memory')
+
+        # Display information based on run mode and queue type combination
+        if run_mode == 'distributed':
+            logger.info("Run Mode: distributed")
+            logger.info("Distributed Mode - Multi-node collaboration supported")
+            # Show Redis configuration
+            redis_host = self.settings.get('REDIS_HOST', 'localhost')
+            redis_port = self.settings.get('REDIS_PORT', 6379)
+            logger.info(f"Redis Address: {redis_host}:{redis_port}")
+        elif run_mode == 'standalone':
+            if queue_type == 'redis':
+                logger.info("Run Mode: standalone+redis")
+                # Show Redis configuration
+                redis_host = self.settings.get('REDIS_HOST', 'localhost')
+                redis_port = self.settings.get('REDIS_PORT', 6379)
+                logger.info(f"Redis Address: {redis_host}:{redis_port}")
+            elif queue_type == 'auto':
+                logger.info("Run Mode: standalone+auto")
+            else:  # memory
+                logger.info("Run Mode: standalone")
+        else:
+            logger.info(f"Run Mode: {run_mode}")
+
     async def _ensure_cleanup(self):
         """Ensure resource cleanup"""
         try:
             if not self._closed:
                 await self.close()
         except Exception as e:
-
+            logger.warning(f"Error cleaning up resources: {e}")
 
     def get_performance_metrics(self) -> Dict[str, Any]:
         """Get performance metrics"""
@@ -267,7 +325,7 @@ class Crawler:
     def _create_spider(self) -> Spider:
         """
         Create and validate spider instance (enhanced version)
-
+
         Performs the following validations:
         - Spider name must exist
         - start_requests method must be callable
@@ -300,7 +358,7 @@ class Crawler:
 
         # parse method check (warning instead of error)
         if not callable(getattr(spider, 'parse', None)):
-
+            logger.warning(
                 f"Spider '{spider.name}' does not define 'parse' method.\n"
                 f"Ensure all Requests specify a callback function, otherwise responses will be ignored."
             )
@@ -308,27 +366,29 @@ class Crawler:
         # Set spider configuration
         self._set_spider(spider)
 
-
+        logger.debug(f"Spider '{spider.name}' initialized successfully")
         return spider
 
     def _create_engine(self) -> Engine:
         """Create and initialize engine"""
         engine = Engine(self)
         engine.engine_start()
-
+        logger.debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
         return engine
 
     def _create_stats(self) -> StatsCollector:
         """Create stats collector"""
         stats = StatsCollector(self)
-
+        logger.debug(
+            f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
         return stats
 
     def _create_extension(self) -> ExtensionManager:
         """Create extension manager"""
         # Modify extension manager creation method, delay initialization until needed
         extension = ExtensionManager.create_instance(self)
-
+        logger.debug(
+            f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
         return extension
 
     def _set_spider(self, spider: Spider):
@@ -343,12 +403,12 @@ class Crawler:
         # Merge spider custom configuration
         merge_settings(spider, self.settings)
 
-
+        logger.debug(f"Spider '{spider.name}' configuration merged successfully")
 
     async def close(self, reason='finished') -> None:
         """
         Close crawler and clean up resources (enhanced version)
-
+
         Ensure closing only once and handle all cleanup operations
         """
         async with self._close_lock:
@@ -371,15 +431,15 @@ class Crawler:
                 from crawlo.commands.stats import record_stats
                 record_stats(self)
             except ImportError:
-
+                logger.debug("Statistics recording module does not exist, skipping statistics recording")
 
-
+            logger.info(
                 f"Spider '{getattr(self.spider, 'name', 'Unknown')}' closed, "
                 f"reason: {reason}, took: {self._get_total_duration():.2f} seconds"
             )
 
         except Exception as e:
-
+            logger.error(f"Error closing crawler: {e}", exc_info=True)
         finally:
             # Ensure resource cleanup
             await self._cleanup_resources()
@@ -413,13 +473,13 @@ class Crawler:
         if cleanup_tasks:
             await asyncio.gather(*cleanup_tasks, return_exceptions=True)
 
-
+        logger.debug("Resource cleanup completed")
 
 
 class CrawlerProcess:
     """
     Crawler process manager
-
+
     Supported features:
     - Multi-crawler concurrent scheduling and resource management
     - Automatic module discovery and spider registration
@@ -428,15 +488,15 @@ class CrawlerProcess:
     - Real-time status monitoring and statistics
     - Error recovery and retry mechanism
     - Large-scale crawler optimization support
-
+
     Usage example:
         # Basic usage
        process = CrawlerProcess()
         await process.crawl(MySpider)
-
+
         # Multi-crawler concurrency
         await process.crawl([Spider1, Spider2, 'spider_name'])
-
+
         # Custom concurrency
         process = CrawlerProcess(max_concurrency=8)
     """
@@ -489,7 +549,10 @@ class CrawlerProcess:
         signal.signal(signal.SIGINT, self._shutdown)
         signal.signal(signal.SIGTERM, self._shutdown)
 
-
+        # 注意:移除在这里调用_log_startup_info(),因为这时候日志系统可能还没有被正确配置
+        # 日志系统的配置是在project.py的get_settings函数中进行的,而CrawlerProcess的实例化
+        # 是在get_settings函数返回之前进行的,所以这时候调用_log_startup_info()可能会导致
+        # 日志信息没有被正确写入到日志文件中
 
         logger.debug(
             f"CrawlerProcess initialized successfully\n"
@@ -563,7 +626,7 @@ class CrawlerProcess:
     def auto_discover(modules: List[str]):
         """
         Automatically import modules, trigger Spider class definition and registration (enhanced version)
-
+
         Supports recursive scanning and error recovery
         """
         import importlib
@@ -617,7 +680,7 @@ class CrawlerProcess:
    async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
         """
         Start one or more crawlers
-
+
         Enhanced features:
         - Intelligent concurrency control
         - Real-time monitoring and statistics
@@ -639,7 +702,7 @@ class CrawlerProcess:
             await self.start_monitoring()
 
         try:
-            # Phase 3:
+            # Phase 3: Initialize context and monitoring
             spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())
 
             logger.debug(
@@ -738,7 +801,7 @@ class CrawlerProcess:
     ) -> List[Type[Spider]]:
         """
         Resolve input to spider class list
-
+
         Supports various input formats and validates uniqueness
         """
         inputs = self._normalize_inputs(spiders_input)
@@ -762,7 +825,8 @@ class CrawlerProcess:
                 seen_spider_names.add(spider_name)
                 spider_classes.append(spider_cls)
 
-                logger.debug(
+                logger.debug(
+                    f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")
 
             except Exception as e:
                 logger.error(f"Failed to resolve spider: {item} - {e}")
@@ -774,7 +838,7 @@ class CrawlerProcess:
     def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
         """
         Normalize input to list
-
+
         Supports more input types and provides better error information
         """
         if isinstance(spiders_input, (type, str)):
@@ -793,7 +857,7 @@ class CrawlerProcess:
     def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
         """
         Resolve single input item to spider class
-
+
         Provides better error prompts and debugging information
         """
         if isinstance(item, type) and issubclass(item, Spider):
@@ -820,7 +884,7 @@ class CrawlerProcess:
     async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
         """
         Spider running function limited by semaphore
-
+
         Includes enhanced error handling and monitoring functionality
         """
         task = asyncio.current_task()
@@ -888,7 +952,7 @@ class CrawlerProcess:
     def _shutdown(self, _signum, _frame):
         """
         Graceful shutdown signal handling
-
+
         Provides better shutdown experience and resource cleanup
         """
         signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
@@ -913,7 +977,7 @@ class CrawlerProcess:
     async def _wait_for_shutdown(self):
         """
         Wait for all active tasks to complete
-
+
         Provides better shutdown time control and progress feedback
         """
         try:
@@ -967,15 +1031,15 @@ class CrawlerProcess:
     def _get_default_settings(cls) -> SettingManager:
         """
         Load default configuration
-
+
         Provides better error handling and fallback strategy
         """
         try:
             settings = get_settings()
-
+            logger.debug("Default configuration loaded successfully")
             return settings
         except Exception as e:
-
+            logger.warning(f"Unable to load default configuration: {e}, using empty configuration")
             return SettingManager()
 
     def _log_startup_info(self):
@@ -988,39 +1052,41 @@ class CrawlerProcess:
         if not version or version == 'None':
             version = '1.0.0'
 
-        #
-
-
-
+        # Print framework start info
+        logger.info(f"Crawlo Framework Started {version}")
+
+        # Add mode info if available
+        mode_info = self.settings.get('_mode_info')
+        if mode_info:
+            logger.info(mode_info)
+        else:
+            # 如果没有_mode_info,添加默认信息
+            logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
 
         # Get actual queue type
         queue_type = self.settings.get('QUEUE_TYPE', 'memory')
 
         # Display information based on run mode and queue type combination
         if run_mode == 'distributed':
-
-
+            logger.info("Run Mode: distributed")
+            logger.info("Distributed Mode - Multi-node collaboration supported")
             # Show Redis configuration
             redis_host = self.settings.get('REDIS_HOST', 'localhost')
             redis_port = self.settings.get('REDIS_PORT', 6379)
-
+            logger.info(f"Redis Address: {redis_host}:{redis_port}")
         elif run_mode == 'standalone':
             if queue_type == 'redis':
-
+                logger.info("Run Mode: standalone+redis")
                 # Show Redis configuration
                 redis_host = self.settings.get('REDIS_HOST', 'localhost')
                 redis_port = self.settings.get('REDIS_PORT', 6379)
-
+                logger.info(f"Redis Address: {redis_host}:{redis_port}")
             elif queue_type == 'auto':
-
+                logger.info("Run Mode: standalone+auto")
             else:  # memory
-
+                logger.info("Run Mode: standalone")
         else:
-
-
-        # Print startup information
-        for info in startup_info:
-            logger.info(info)
+            logger.info(f"Run Mode: {run_mode}")
 
 
 # === Utility functions ===
@@ -1032,7 +1098,7 @@ def create_crawler_with_optimizations(
 ) -> Crawler:
     """
     Create an optimized crawler instance
-
+
     :param spider_cls: Spider class
     :param settings: Settings manager
     :param optimization_kwargs: Optimization parameters
@@ -1056,7 +1122,7 @@ def create_process_with_large_scale_config(
 ) -> CrawlerProcess:
     """
     Create a process manager that supports large-scale optimization
-
+
     :param config_type: Configuration type ('conservative', 'balanced', 'aggressive', 'memory_optimized')
     :param concurrency: Concurrency count
     :param kwargs: Other parameters
@@ -1100,4 +1166,4 @@ __all__ = [
     'CrawlerContext',
     'create_crawler_with_optimizations',
     'create_process_with_large_scale_config'
-]
+]
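The new _log_startup_info() (added to Crawler and rewritten in CrawlerProcess) derives a human-readable run-mode label from RUN_MODE and QUEUE_TYPE. A condensed, standalone sketch of that decision table, assuming settings behaves like a plain dict (the function name below is illustrative, not the package's):

    def describe_run_mode(settings: dict) -> str:
        # Mirrors the branch structure added in _log_startup_info().
        run_mode = settings.get('RUN_MODE', 'standalone')
        queue_type = settings.get('QUEUE_TYPE', 'memory')
        if run_mode == 'distributed':
            return 'distributed'
        if run_mode == 'standalone':
            if queue_type == 'redis':
                return 'standalone+redis'
            if queue_type == 'auto':
                return 'standalone+auto'
            return 'standalone'  # memory queue
        return run_mode

    print(describe_run_mode({'RUN_MODE': 'standalone', 'QUEUE_TYPE': 'redis'}))  # standalone+redis

In the distributed and standalone+redis branches the real method additionally logs the REDIS_HOST:REDIS_PORT address it will use.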
crawlo/extension/logging_extension.py
CHANGED
@@ -1,8 +1,10 @@
 from typing import Any
 from crawlo.exceptions import NotConfigured
-from crawlo.utils.log import get_logger
 from crawlo.utils.log import LoggerManager
 
+# 使用全局logger,避免每个模块都创建自己的延迟初始化函数
+logger = LoggerManager.get_logger(__name__)
+
 
 class CustomLoggerExtension:
     """
@@ -32,7 +34,7 @@ class CustomLoggerExtension:
         return cls(crawler.settings)
 
     def spider_opened(self, spider: Any) -> None:
-        logger = get_logger(__name__)
+        logger = LoggerManager.get_logger(__name__)
         try:
             logger.info(
                 f"CustomLoggerExtension: Logging initialized. "
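Both this file and crawler.py converge on the same convention in this release: one module-level logger obtained from the logging manager at import time, replacing per-module lazy _get_logger() helpers. A minimal sketch of the pattern, using the stdlib logging module as a stand-in for crawlo's LoggerManager (that substitution is an assumption for illustration only):

    import logging

    # Stand-in for LoggerManager.get_logger / crawlo.utils.log.get_logger.
    def get_logger(name: str) -> logging.Logger:
        return logging.getLogger(name)

    logger = get_logger(__name__)  # created once, reused by every function in the module

    def spider_opened() -> None:
        logger.info("Logging initialized.")

This only works cleanly if the logging system is configured before the first meaningful message, which is presumably why the release also moves the startup banner out of CrawlerProcess.__init__ and into Crawler (see the comment block added in the @@ -489,7 +549,10 @@ hunk of crawler.py).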
crawlo/middleware/middleware_manager.py
CHANGED
@@ -133,4 +133,4 @@ class MiddlewareManager:
     def _validate_middleware_method(method_name, middleware) -> bool:
         method = getattr(type(middleware), method_name)
         base_method = getattr(BaseMiddleware, method_name)
-        return False if method == base_method else True
+        return False if method == base_method else True