crawlo 1.2.9-py3-none-any.whl → 1.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = '1.2.9'
+ __version__ = '1.3.1'
crawlo/commands/run.py CHANGED
@@ -5,26 +5,27 @@
  # @Author : crawl-coder
  # @Desc : CLI entry point: crawlo run <spider_name>|all, used to run the specified spider.
  """
+ import os
  import sys
  import asyncio
  import configparser
- import os
- from pathlib import Path
  from importlib import import_module

+ from rich import box
  from rich.console import Console
  from rich.panel import Panel
+ from rich.progress import Progress, SpinnerColumn, TextColumn
  from rich.table import Table
  from rich.text import Text
- from rich import box
- from rich.progress import Progress, SpinnerColumn, TextColumn

+ from crawlo.commands.stats import record_stats
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.log import get_logger
  from crawlo.project import get_settings, _find_project_root
- from crawlo.commands.stats import record_stats
+ # Use the custom logging system
+ from crawlo.utils.log import get_logger

  logger = get_logger(__name__)
+
  console = Console()


@@ -77,6 +78,9 @@ def main(args):
  Usage:
  crawlo run <spider_name>|all [--json] [--no-stats]
  """
+ # Add debug info
+ logger.debug("DEBUG: entering the main function")
+
  if len(args) < 1:
  console.print("[bold red]Usage:[/bold red] [blue]crawlo run[/blue] <spider_name>|all [bold yellow][--json] [--no-stats][/bold yellow]")
  console.print("Examples:")
@@ -187,21 +191,7 @@ def main(args):
  return 1

  # Show the list of spiders about to run
- table = Table(
- title=f"Starting all {len(spider_names)} spiders",
- box=box.ROUNDED,
- show_header=True,
- header_style="bold magenta"
- )
- table.add_column("Name", style="cyan")
- table.add_column("Class", style="green")
-
- for name in sorted(spider_names):
- cls = process.get_spider_class(name)
- table.add_row(name, cls.__name__)
-
- console.print(table)
- console.print()
+ # Per user request, the detailed spider list is no longer displayed

  # Register stats recording (unless --no-stats)
  if not no_stats:
@@ -260,20 +250,21 @@ def main(args):
  spider_class = process.get_spider_class(spider_name)

  # Show startup info
- if not show_json:
- info_table = Table(
- title=f"Starting spider: [bold cyan]{spider_name}[/bold cyan]",
- box=box.SIMPLE,
- show_header=False,
- title_style="bold green"
- )
- info_table.add_column("Key", style="yellow")
- info_table.add_column("Value", style="cyan")
- info_table.add_row("Project", project_package)
- info_table.add_row("Class", spider_class.__name__)
- info_table.add_row("Module", spider_class.__module__)
- console.print(info_table)
- console.print()
+ # Per user request, project startup info is no longer displayed
+ # if not show_json:
+ # info_table = Table(
+ # title=f"Starting spider: [bold cyan]{spider_name}[/bold cyan]",
+ # box=box.SIMPLE,
+ # show_header=False,
+ # title_style="bold green"
+ # )
+ # info_table.add_column("Key", style="yellow")
+ # info_table.add_column("Value", style="cyan")
+ # info_table.add_row("Project", project_package)
+ # info_table.add_row("Class", spider_class.__name__)
+ # info_table.add_row("Module", spider_class.__module__)
+ # console.print(info_table)
+ # console.print()

  # Register stats recording
  if not no_stats:
crawlo/commands/utils.py CHANGED
@@ -133,8 +133,11 @@ def validate_spider_name(spider_name: str) -> bool:
  bool: whether the name is valid
  """
  import re
+ # Strip invisible characters from the spider name
+ cleaned_name = ''.join(c for c in spider_name if not unicodedata.category(c).startswith('C'))
+
  # The spider name should be a valid Python identifier
- return spider_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', spider_name)
+ return cleaned_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', cleaned_name)


  def format_file_size(size_bytes: int) -> str:
@@ -181,7 +184,14 @@ def is_valid_domain(domain: str) -> bool:
  bool: whether the domain is valid
  """
  import re
+ # Strip invisible characters from the domain
+ cleaned_domain = ''.join(c for c in domain if not unicodedata.category(c).startswith('C'))
+
  pattern = re.compile(
  r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
  )
- return bool(pattern.match(domain))
+ return bool(pattern.match(cleaned_domain))
+
+
+ # Add the import
+ import unicodedata
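
Both validators now apply the same technique: drop every character whose Unicode general category starts with "C" (control and format characters, such as zero-width spaces pasted in from a terminal) before running the identifier and regex checks. A minimal, self-contained sketch of that idea, with illustrative names rather than the package's exact code:

    import re
    import unicodedata

    def strip_control_chars(value: str) -> str:
        # Remove characters whose Unicode category starts with "C"
        # (Cc control characters, Cf format characters such as U+200B ZERO WIDTH SPACE, ...).
        return ''.join(c for c in value if not unicodedata.category(c).startswith('C'))

    def looks_like_spider_name(name: str) -> bool:
        cleaned = strip_control_chars(name)
        return cleaned.isidentifier() and bool(re.match(r'^[a-z][a-z0-9_]*$', cleaned))

    print(looks_like_spider_name("news_spider"))         # True
    print(looks_like_spider_name("news_spider\u200b"))   # True once the zero-width space is stripped

unicodedata.category returns two-letter general categories such as "Cc", "Cf", or "Ll", so the startswith('C') test covers the whole control/format family at once.
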
crawlo/core/engine.py CHANGED
@@ -75,8 +75,7 @@ class Engine(object):
  version = '1.0.0'
  # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
  self.logger.debug(
- f"Crawlo Started version {version} . "
- # f"(project name : {self.settings.get('PROJECT_NAME')})"
+ f"Crawlo Started version {version}"
  )

  async def start_spider(self, spider):
crawlo/crawler.py CHANGED
@@ -21,7 +21,7 @@ Example Usage:
  # Single crawler run
  crawler = Crawler(MySpider, settings)
  await crawler.crawl()
-
+
  # Multi-crawler concurrent management
  process = CrawlerProcess()
  await process.crawl([Spider1, Spider2])
@@ -34,24 +34,15 @@ import threading
  from typing import Type, Optional, Set, List, Union, Dict, Any
  from .spider import Spider, get_global_spider_registry
  from .core.engine import Engine
- from .utils.log import get_logger
  from .subscriber import Subscriber
  from .extension import ExtensionManager
+ from crawlo.utils.log import get_logger
  from .stats_collector import StatsCollector
  from .event import spider_opened, spider_closed
  from .settings.setting_manager import SettingManager
  from crawlo.project import merge_settings, get_settings

- # Lazily initialize the logger; obtain it via get_logger when needed
- logger = None
-
-
- def _get_logger():
- """Lazily obtain the logger instance, ensuring it is created after configuration has been loaded"""
- global logger
- if logger is None:
- logger = get_logger(__name__)
- return logger
+ logger = get_logger(__name__)


  class CrawlerContext:
@@ -110,7 +101,7 @@ class CrawlerContext:
  class Crawler:
  """
  Single crawler runtime instance, managing Spider and engine lifecycle
-
+
  Provides functionality:
  - Spider lifecycle management (initialization, running, closing)
  - Engine component coordination management
@@ -120,7 +111,12 @@ class Crawler:
  - Exception handling and cleanup
  """

- def __init__(self, spider_cls: Type[Spider], settings: SettingManager, context: Optional[CrawlerContext] = None):
+ def __init__(
+ self,
+ spider_cls: Type[Spider],
+ settings: SettingManager,
+ context: Optional[CrawlerContext] = None
+ ):
  self.spider_cls = spider_cls
  self.spider: Optional[Spider] = None
  self.engine: Optional[Engine] = None
@@ -145,10 +141,26 @@
  'error_count': 0
  }

+ # Initialize components
+ self.subscriber = self._create_subscriber()
+ self.spider = self._create_spider()
+ self.engine = self._create_engine()
+ self.stats = self._create_stats()
+ # Note: Do not initialize extension manager here, let it initialize in the engine
+
+ # Validate crawler state
+ self._validate_crawler_state()
+
+ # Print startup info; make sure it happens after the logging system has been configured
+ self._log_startup_info()
+
+ # The "starting spider name" log is moved here so it is printed after the logging system has been configured
+ logger.info(f"Starting running {self.spider.name}")
+
  async def crawl(self):
  """
  Start the crawler core process
-
+
  Includes the following stages:
  1. Initialization stage: Create all components
  2. Validation stage: Check configuration and state
@@ -190,12 +202,12 @@
  # Update context status
  self.context.increment_completed()

- _get_logger().info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")
+ logger.info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")

  except Exception as e:
  self._performance_metrics['error_count'] += 1
  self.context.increment_failed(str(e))
- _get_logger().error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
+ logger.error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
  raise
  finally:
  self.context.decrement_active()
@@ -213,7 +225,7 @@
  else:
  spider_name = 'Unknown'

- _get_logger().info(f"Starting running {spider_name}")
+ logger.info(f"Starting running {spider_name}")

  def _validate_crawler_state(self):
  """
@@ -233,7 +245,7 @@
  if not self.spider.name:
  raise ValueError("Spider name cannot be empty")

- _get_logger().debug(f"Spider {self.spider.name} state validation passed")
+ logger.debug(f"Spider {self.spider.name} state validation passed")

  def _get_total_duration(self) -> float:
  """Get total runtime"""
@@ -241,13 +253,59 @@
  return self._end_time - self._start_time
  return 0.0

+ def _log_startup_info(self):
+ """Print startup information, including run mode and key configuration checks"""
+ # Get run mode
+ run_mode = self.settings.get('RUN_MODE', 'standalone')
+
+ # Get version number
+ version = self.settings.get('VERSION', '1.0.0')
+ if not version or version == 'None':
+ version = '1.0.0'
+
+ # Print framework start info
+ logger.info(f"Crawlo Framework Started {version}")
+
+ # Add mode info if available
+ mode_info = self.settings.get('_mode_info')
+ if mode_info:
+ logger.info(mode_info)
+ else:
+ # If there is no _mode_info, fall back to a default message
+ logger.info("Standalone mode - simple and fast, suitable for development and small to medium scale crawling")
+
+ # Get actual queue type
+ queue_type = self.settings.get('QUEUE_TYPE', 'memory')
+
+ # Display information based on run mode and queue type combination
+ if run_mode == 'distributed':
+ logger.info("Run Mode: distributed")
+ logger.info("Distributed Mode - Multi-node collaboration supported")
+ # Show Redis configuration
+ redis_host = self.settings.get('REDIS_HOST', 'localhost')
+ redis_port = self.settings.get('REDIS_PORT', 6379)
+ logger.info(f"Redis Address: {redis_host}:{redis_port}")
+ elif run_mode == 'standalone':
+ if queue_type == 'redis':
+ logger.info("Run Mode: standalone+redis")
+ # Show Redis configuration
+ redis_host = self.settings.get('REDIS_HOST', 'localhost')
+ redis_port = self.settings.get('REDIS_PORT', 6379)
+ logger.info(f"Redis Address: {redis_host}:{redis_port}")
+ elif queue_type == 'auto':
+ logger.info("Run Mode: standalone+auto")
+ else: # memory
+ logger.info("Run Mode: standalone")
+ else:
+ logger.info(f"Run Mode: {run_mode}")
+
  async def _ensure_cleanup(self):
  """Ensure resource cleanup"""
  try:
  if not self._closed:
  await self.close()
  except Exception as e:
- _get_logger().warning(f"Error cleaning up resources: {e}")
+ logger.warning(f"Error cleaning up resources: {e}")

  def get_performance_metrics(self) -> Dict[str, Any]:
  """Get performance metrics"""
@@ -267,7 +325,7 @@
  def _create_spider(self) -> Spider:
  """
  Create and validate spider instance (enhanced version)
-
+
  Performs the following validations:
  - Spider name must exist
  - start_requests method must be callable
@@ -300,7 +358,7 @@

  # parse method check (warning instead of error)
  if not callable(getattr(spider, 'parse', None)):
- _get_logger().warning(
+ logger.warning(
  f"Spider '{spider.name}' does not define 'parse' method.\n"
  f"Ensure all Requests specify a callback function, otherwise responses will be ignored."
  )
@@ -308,27 +366,29 @@
  # Set spider configuration
  self._set_spider(spider)

- _get_logger().debug(f"Spider '{spider.name}' initialized successfully")
+ logger.debug(f"Spider '{spider.name}' initialized successfully")
  return spider

  def _create_engine(self) -> Engine:
  """Create and initialize engine"""
  engine = Engine(self)
  engine.engine_start()
- _get_logger().debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return engine

  def _create_stats(self) -> StatsCollector:
  """Create stats collector"""
  stats = StatsCollector(self)
- _get_logger().debug(f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(
+ f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return stats

  def _create_extension(self) -> ExtensionManager:
  """Create extension manager"""
  # Modify extension manager creation method, delay initialization until needed
  extension = ExtensionManager.create_instance(self)
- _get_logger().debug(f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(
+ f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return extension

  def _set_spider(self, spider: Spider):
@@ -343,12 +403,12 @@
  # Merge spider custom configuration
  merge_settings(spider, self.settings)

- _get_logger().debug(f"Spider '{spider.name}' configuration merged successfully")
+ logger.debug(f"Spider '{spider.name}' configuration merged successfully")

  async def close(self, reason='finished') -> None:
  """
  Close crawler and clean up resources (enhanced version)
-
+
  Ensure closing only once and handle all cleanup operations
  """
  async with self._close_lock:
@@ -371,15 +431,15 @@
  from crawlo.commands.stats import record_stats
  record_stats(self)
  except ImportError:
- _get_logger().debug("Statistics recording module does not exist, skipping statistics recording")
+ logger.debug("Statistics recording module does not exist, skipping statistics recording")

- _get_logger().info(
+ logger.info(
  f"Spider '{getattr(self.spider, 'name', 'Unknown')}' closed, "
  f"reason: {reason}, took: {self._get_total_duration():.2f} seconds"
  )

  except Exception as e:
- _get_logger().error(f"Error closing crawler: {e}", exc_info=True)
+ logger.error(f"Error closing crawler: {e}", exc_info=True)
  finally:
  # Ensure resource cleanup
  await self._cleanup_resources()
@@ -413,13 +473,13 @@
  if cleanup_tasks:
  await asyncio.gather(*cleanup_tasks, return_exceptions=True)

- _get_logger().debug("Resource cleanup completed")
+ logger.debug("Resource cleanup completed")


  class CrawlerProcess:
  """
  Crawler process manager
-
+
  Supported features:
  - Multi-crawler concurrent scheduling and resource management
  - Automatic module discovery and spider registration
@@ -428,15 +488,15 @@
  - Real-time status monitoring and statistics
  - Error recovery and retry mechanism
  - Large-scale crawler optimization support
-
+
  Usage example:
  # Basic usage
  process = CrawlerProcess()
  await process.crawl(MySpider)
-
+
  # Multi-crawler concurrency
  await process.crawl([Spider1, Spider2, 'spider_name'])
-
+
  # Custom concurrency
  process = CrawlerProcess(max_concurrency=8)
  """
@@ -489,7 +549,10 @@
  signal.signal(signal.SIGINT, self._shutdown)
  signal.signal(signal.SIGTERM, self._shutdown)

- self._log_startup_info()
+ # Note: the _log_startup_info() call is removed here because the logging system may not be configured correctly yet.
+ # The logging system is configured in project.py's get_settings function, while CrawlerProcess is instantiated
+ # before get_settings returns, so calling _log_startup_info() at this point could cause
+ # log messages not to be written to the log file correctly.

  logger.debug(
  f"CrawlerProcess initialized successfully\n"
@@ -563,7 +626,7 @@
  def auto_discover(modules: List[str]):
  """
  Automatically import modules, trigger Spider class definition and registration (enhanced version)
-
+
  Supports recursive scanning and error recovery
  """
  import importlib
@@ -617,7 +680,7 @@
  async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
  """
  Start one or more crawlers
-
+
  Enhanced features:
  - Intelligent concurrency control
  - Real-time monitoring and statistics
@@ -639,7 +702,7 @@
  await self.start_monitoring()

  try:
- # Phase 3: Sort by class name to ensure predictable startup order
+ # Phase 3: Initialize context and monitoring
  spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())

  logger.debug(
@@ -738,7 +801,7 @@
  ) -> List[Type[Spider]]:
  """
  Resolve input to spider class list
-
+
  Supports various input formats and validates uniqueness
  """
  inputs = self._normalize_inputs(spiders_input)
@@ -762,7 +825,8 @@
  seen_spider_names.add(spider_name)
  spider_classes.append(spider_cls)

- logger.debug(f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")
+ logger.debug(
+ f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")

  except Exception as e:
  logger.error(f"Failed to resolve spider: {item} - {e}")
@@ -774,7 +838,7 @@
  def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
  """
  Normalize input to list
-
+
  Supports more input types and provides better error information
  """
  if isinstance(spiders_input, (type, str)):
@@ -793,7 +857,7 @@
  def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
  """
  Resolve single input item to spider class
-
+
  Provides better error prompts and debugging information
  """
  if isinstance(item, type) and issubclass(item, Spider):
@@ -820,7 +884,7 @@
  async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
  """
  Spider running function limited by semaphore
-
+
  Includes enhanced error handling and monitoring functionality
  """
  task = asyncio.current_task()
@@ -888,7 +952,7 @@
  def _shutdown(self, _signum, _frame):
  """
  Graceful shutdown signal handling
-
+
  Provides better shutdown experience and resource cleanup
  """
  signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
@@ -913,7 +977,7 @@
  async def _wait_for_shutdown(self):
  """
  Wait for all active tasks to complete
-
+
  Provides better shutdown time control and progress feedback
  """
  try:
@@ -967,15 +1031,15 @@
  def _get_default_settings(cls) -> SettingManager:
  """
  Load default configuration
-
+
  Provides better error handling and fallback strategy
  """
  try:
  settings = get_settings()
- _get_logger().debug("Default configuration loaded successfully")
+ logger.debug("Default configuration loaded successfully")
  return settings
  except Exception as e:
- _get_logger().warning(f"Unable to load default configuration: {e}, using empty configuration")
+ logger.warning(f"Unable to load default configuration: {e}, using empty configuration")
  return SettingManager()

  def _log_startup_info(self):
@@ -988,39 +1052,41 @@
  if not version or version == 'None':
  version = '1.0.0'

- # Build startup info log
- startup_info = [
- f"Crawlo Framework Started v{version}"
- ]
+ # Print framework start info
+ logger.info(f"Crawlo Framework Started {version}")
+
+ # Add mode info if available
+ mode_info = self.settings.get('_mode_info')
+ if mode_info:
+ logger.info(mode_info)
+ else:
+ # If there is no _mode_info, fall back to a default message
+ logger.info("Standalone mode - simple and fast, suitable for development and small to medium scale crawling")

  # Get actual queue type
  queue_type = self.settings.get('QUEUE_TYPE', 'memory')

  # Display information based on run mode and queue type combination
  if run_mode == 'distributed':
- startup_info.append("Run Mode: distributed")
- startup_info.append("Distributed Mode - Multi-node collaboration supported")
+ logger.info("Run Mode: distributed")
+ logger.info("Distributed Mode - Multi-node collaboration supported")
  # Show Redis configuration
  redis_host = self.settings.get('REDIS_HOST', 'localhost')
  redis_port = self.settings.get('REDIS_PORT', 6379)
- startup_info.append(f"Redis Address: {redis_host}:{redis_port}")
+ logger.info(f"Redis Address: {redis_host}:{redis_port}")
  elif run_mode == 'standalone':
  if queue_type == 'redis':
- startup_info.append("Run Mode: standalone+redis")
+ logger.info("Run Mode: standalone+redis")
  # Show Redis configuration
  redis_host = self.settings.get('REDIS_HOST', 'localhost')
  redis_port = self.settings.get('REDIS_PORT', 6379)
- startup_info.append(f"Redis Address: {redis_host}:{redis_port}")
+ logger.info(f"Redis Address: {redis_host}:{redis_port}")
  elif queue_type == 'auto':
- startup_info.append("Run Mode: standalone+auto")
+ logger.info("Run Mode: standalone+auto")
  else: # memory
- startup_info.append("Run Mode: standalone")
+ logger.info("Run Mode: standalone")
  else:
- startup_info.append(f"Run Mode: {run_mode}")
-
- # Print startup information
- for info in startup_info:
- logger.info(info)
+ logger.info(f"Run Mode: {run_mode}")


  # === Utility functions ===
@@ -1032,7 +1098,7 @@ def create_crawler_with_optimizations(
  ) -> Crawler:
  """
  Create an optimized crawler instance
-
+
  :param spider_cls: Spider class
  :param settings: Settings manager
  :param optimization_kwargs: Optimization parameters
@@ -1056,7 +1122,7 @@ def create_process_with_large_scale_config(
  ) -> CrawlerProcess:
  """
  Create a process manager that supports large-scale optimization
-
+
  :param config_type: Configuration type ('conservative', 'balanced', 'aggressive', 'memory_optimized')
  :param concurrency: Concurrency count
  :param kwargs: Other parameters
@@ -1100,4 +1166,4 @@ __all__ = [
  'CrawlerContext',
  'create_crawler_with_optimizations',
  'create_process_with_large_scale_config'
- ]
+ ]
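
Taken together, the crawler.py changes move component construction and the startup banner out of CrawlerProcess and into Crawler.__init__, so that they run only after get_settings() has configured the logging system (as the comments above explain). A rough sketch of the resulting call order, using only names that appear in this diff; MySpider is a placeholder for a registered Spider subclass and the asyncio wiring is illustrative, not the package's actual run.py:

    import asyncio

    from crawlo.crawler import Crawler
    from crawlo.project import get_settings   # per the diff comments, this also configures logging

    async def run_one():
        settings = get_settings()              # 1. load settings first, so log handlers exist
        # MySpider: any registered Spider subclass (placeholder)
        crawler = Crawler(MySpider, settings)  # 2. in 1.3.1 this builds spider/engine/stats and logs
                                               #    "Crawlo Framework Started ..." itself
        assert crawler.spider is not None      #    (in 1.2.9 these stayed None until crawl())
        await crawler.crawl()                  # 3. run the spider

    asyncio.run(run_one())
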
@@ -1,8 +1,10 @@
  from typing import Any
  from crawlo.exceptions import NotConfigured
- from crawlo.utils.log import get_logger
  from crawlo.utils.log import LoggerManager

+ # Use a global logger to avoid each module creating its own lazy-initialization helper
+ logger = LoggerManager.get_logger(__name__)
+

  class CustomLoggerExtension:
  """
@@ -32,7 +34,7 @@ class CustomLoggerExtension:
  return cls(crawler.settings)

  def spider_opened(self, spider: Any) -> None:
- logger = get_logger(__name__)
+ logger = LoggerManager.get_logger(__name__)
  try:
  logger.info(
  f"CustomLoggerExtension: Logging initialized. "
@@ -133,4 +133,4 @@ class MiddlewareManager:
  def _validate_middleware_method(method_name, middleware) -> bool:
  method = getattr(type(middleware), method_name)
  base_method = getattr(BaseMiddleware, method_name)
- return False if method == base_method else True
+ return False if method == base_method else True