crawlo 1.2.9__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = '1.2.9'
+ __version__ = '1.3.0'
crawlo/commands/run.py CHANGED
@@ -5,26 +5,27 @@
  # @Author : crawl-coder
  # @Desc : 命令行入口:crawlo run <spider_name>|all,用于运行指定爬虫。
  """
+ import os
  import sys
  import asyncio
  import configparser
- import os
- from pathlib import Path
  from importlib import import_module

+ from rich import box
  from rich.console import Console
  from rich.panel import Panel
+ from rich.progress import Progress, SpinnerColumn, TextColumn
  from rich.table import Table
  from rich.text import Text
- from rich import box
- from rich.progress import Progress, SpinnerColumn, TextColumn

+ from crawlo.commands.stats import record_stats
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.log import get_logger
  from crawlo.project import get_settings, _find_project_root
- from crawlo.commands.stats import record_stats
+ # 使用自定义日志系统
+ from crawlo.utils.log import get_logger

  logger = get_logger(__name__)
+
  console = Console()


@@ -77,6 +78,9 @@ def main(args):
  用法:
  crawlo run <spider_name>|all [--json] [--no-stats]
  """
+ # 添加调试信息
+ logger.debug("DEBUG: 进入main函数")
+
  if len(args) < 1:
  console.print("[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
  console.print("示例:")
@@ -187,21 +191,7 @@ def main(args):
  return 1

  # 显示即将运行的爬虫列表
- table = Table(
- title=f"启动全部 {len(spider_names)} 个爬虫",
- box=box.ROUNDED,
- show_header=True,
- header_style="bold magenta"
- )
- table.add_column("名称", style="cyan")
- table.add_column("类名", style="green")
-
- for name in sorted(spider_names):
- cls = process.get_spider_class(name)
- table.add_row(name, cls.__name__)
-
- console.print(table)
- console.print()
+ # 根据用户要求,不再显示详细的爬虫列表信息

  # 注册 stats 记录(除非 --no-stats)
  if not no_stats:
@@ -260,20 +250,21 @@ def main(args):
  spider_class = process.get_spider_class(spider_name)

  # 显示启动信息
- if not show_json:
- info_table = Table(
- title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
- box=box.SIMPLE,
- show_header=False,
- title_style="bold green"
- )
- info_table.add_column("Key", style="yellow")
- info_table.add_column("Value", style="cyan")
- info_table.add_row("Project", project_package)
- info_table.add_row("Class", spider_class.__name__)
- info_table.add_row("Module", spider_class.__module__)
- console.print(info_table)
- console.print()
+ # 根据用户要求,不再显示项目启动信息
+ # if not show_json:
+ # info_table = Table(
+ # title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
+ # box=box.SIMPLE,
+ # show_header=False,
+ # title_style="bold green"
+ # )
+ # info_table.add_column("Key", style="yellow")
+ # info_table.add_column("Value", style="cyan")
+ # info_table.add_row("Project", project_package)
+ # info_table.add_row("Class", spider_class.__name__)
+ # info_table.add_row("Module", spider_class.__module__)
+ # console.print(info_table)
+ # console.print()

  # 注册 stats 记录
  if not no_stats:
crawlo/core/engine.py CHANGED
@@ -75,8 +75,7 @@ class Engine(object):
  version = '1.0.0'
  # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
  self.logger.debug(
- f"Crawlo Started version {version} . "
- # f"(project name : {self.settings.get('PROJECT_NAME')})"
+ f"Crawlo Started version {version}"
  )

  async def start_spider(self, spider):
crawlo/crawler.py CHANGED
@@ -21,7 +21,7 @@ Example Usage:
  # Single crawler run
  crawler = Crawler(MySpider, settings)
  await crawler.crawl()
-
+
  # Multi-crawler concurrent management
  process = CrawlerProcess()
  await process.crawl([Spider1, Spider2])
@@ -34,7 +34,6 @@ import threading
  from typing import Type, Optional, Set, List, Union, Dict, Any
  from .spider import Spider, get_global_spider_registry
  from .core.engine import Engine
- from .utils.log import get_logger
  from .subscriber import Subscriber
  from .extension import ExtensionManager
  from .stats_collector import StatsCollector
@@ -42,16 +41,9 @@ from .event import spider_opened, spider_closed
  from .settings.setting_manager import SettingManager
  from crawlo.project import merge_settings, get_settings

- # 延迟初始化logger,在需要时通过get_logger获取
- logger = None
-
-
- def _get_logger():
- """延迟获取logger实例,确保在配置加载后创建"""
- global logger
- if logger is None:
- logger = get_logger(__name__)
- return logger
+ # 使用自定义日志系统
+ from crawlo.utils.log import get_logger
+ logger = get_logger(__name__)


  class CrawlerContext:
@@ -110,7 +102,7 @@ class CrawlerContext:
  class Crawler:
  """
  Single crawler runtime instance, managing Spider and engine lifecycle
-
+
  Provides functionality:
  - Spider lifecycle management (initialization, running, closing)
  - Engine component coordination management
@@ -148,7 +140,7 @@ class Crawler:
  async def crawl(self):
  """
  Start the crawler core process
-
+
  Includes the following stages:
  1. Initialization stage: Create all components
  2. Validation stage: Check configuration and state
@@ -190,12 +182,12 @@
  # Update context status
  self.context.increment_completed()

- _get_logger().info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")
+ logger.info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")

  except Exception as e:
  self._performance_metrics['error_count'] += 1
  self.context.increment_failed(str(e))
- _get_logger().error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
+ logger.error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
  raise
  finally:
  self.context.decrement_active()
@@ -213,7 +205,7 @@
  else:
  spider_name = 'Unknown'

- _get_logger().info(f"Starting running {spider_name}")
+ logger.info(f"Starting running {spider_name}")

  def _validate_crawler_state(self):
  """
@@ -233,7 +225,7 @@
  if not self.spider.name:
  raise ValueError("Spider name cannot be empty")

- _get_logger().debug(f"Spider {self.spider.name} state validation passed")
+ logger.debug(f"Spider {self.spider.name} state validation passed")

  def _get_total_duration(self) -> float:
  """Get total runtime"""
@@ -247,7 +239,7 @@
  if not self._closed:
  await self.close()
  except Exception as e:
- _get_logger().warning(f"Error cleaning up resources: {e}")
+ logger.warning(f"Error cleaning up resources: {e}")

  def get_performance_metrics(self) -> Dict[str, Any]:
  """Get performance metrics"""
@@ -267,7 +259,7 @@
  def _create_spider(self) -> Spider:
  """
  Create and validate spider instance (enhanced version)
-
+
  Performs the following validations:
  - Spider name must exist
  - start_requests method must be callable
@@ -300,7 +292,7 @@

  # parse method check (warning instead of error)
  if not callable(getattr(spider, 'parse', None)):
- _get_logger().warning(
+ logger.warning(
  f"Spider '{spider.name}' does not define 'parse' method.\n"
  f"Ensure all Requests specify a callback function, otherwise responses will be ignored."
  )
@@ -308,27 +300,29 @@
  # Set spider configuration
  self._set_spider(spider)

- _get_logger().debug(f"Spider '{spider.name}' initialized successfully")
+ logger.debug(f"Spider '{spider.name}' initialized successfully")
  return spider

  def _create_engine(self) -> Engine:
  """Create and initialize engine"""
  engine = Engine(self)
  engine.engine_start()
- _get_logger().debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return engine

  def _create_stats(self) -> StatsCollector:
  """Create stats collector"""
  stats = StatsCollector(self)
- _get_logger().debug(f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(
+ f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return stats

  def _create_extension(self) -> ExtensionManager:
  """Create extension manager"""
  # Modify extension manager creation method, delay initialization until needed
  extension = ExtensionManager.create_instance(self)
- _get_logger().debug(f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(
+ f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return extension

  def _set_spider(self, spider: Spider):
@@ -343,12 +337,12 @@
  # Merge spider custom configuration
  merge_settings(spider, self.settings)

- _get_logger().debug(f"Spider '{spider.name}' configuration merged successfully")
+ logger.debug(f"Spider '{spider.name}' configuration merged successfully")

  async def close(self, reason='finished') -> None:
  """
  Close crawler and clean up resources (enhanced version)
-
+
  Ensure closing only once and handle all cleanup operations
  """
  async with self._close_lock:
@@ -371,15 +365,15 @@
  from crawlo.commands.stats import record_stats
  record_stats(self)
  except ImportError:
- _get_logger().debug("Statistics recording module does not exist, skipping statistics recording")
+ logger.debug("Statistics recording module does not exist, skipping statistics recording")

- _get_logger().info(
+ logger.info(
  f"Spider '{getattr(self.spider, 'name', 'Unknown')}' closed, "
  f"reason: {reason}, took: {self._get_total_duration():.2f} seconds"
  )

  except Exception as e:
- _get_logger().error(f"Error closing crawler: {e}", exc_info=True)
+ logger.error(f"Error closing crawler: {e}", exc_info=True)
  finally:
  # Ensure resource cleanup
  await self._cleanup_resources()
@@ -413,13 +407,13 @@
  if cleanup_tasks:
  await asyncio.gather(*cleanup_tasks, return_exceptions=True)

- _get_logger().debug("Resource cleanup completed")
+ logger.debug("Resource cleanup completed")


  class CrawlerProcess:
  """
  Crawler process manager
-
+
  Supported features:
  - Multi-crawler concurrent scheduling and resource management
  - Automatic module discovery and spider registration
@@ -428,15 +422,15 @@ class CrawlerProcess:
  - Real-time status monitoring and statistics
  - Error recovery and retry mechanism
  - Large-scale crawler optimization support
-
+
  Usage example:
  # Basic usage
  process = CrawlerProcess()
  await process.crawl(MySpider)
-
+
  # Multi-crawler concurrency
  await process.crawl([Spider1, Spider2, 'spider_name'])
-
+
  # Custom concurrency
  process = CrawlerProcess(max_concurrency=8)
  """
@@ -563,7 +557,7 @@
  def auto_discover(modules: List[str]):
  """
  Automatically import modules, trigger Spider class definition and registration (enhanced version)
-
+
  Supports recursive scanning and error recovery
  """
  import importlib
@@ -617,7 +611,7 @@
  async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
  """
  Start one or more crawlers
-
+
  Enhanced features:
  - Intelligent concurrency control
  - Real-time monitoring and statistics
@@ -639,7 +633,7 @@
  await self.start_monitoring()

  try:
- # Phase 3: Sort by class name to ensure predictable startup order
+ # Phase 3: Initialize context and monitoring
  spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())

  logger.debug(
@@ -738,7 +732,7 @@
  ) -> List[Type[Spider]]:
  """
  Resolve input to spider class list
-
+
  Supports various input formats and validates uniqueness
  """
  inputs = self._normalize_inputs(spiders_input)
@@ -762,7 +756,8 @@
  seen_spider_names.add(spider_name)
  spider_classes.append(spider_cls)

- logger.debug(f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")
+ logger.debug(
+ f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")

  except Exception as e:
  logger.error(f"Failed to resolve spider: {item} - {e}")
@@ -774,7 +769,7 @@
  def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
  """
  Normalize input to list
-
+
  Supports more input types and provides better error information
  """
  if isinstance(spiders_input, (type, str)):
@@ -793,7 +788,7 @@
  def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
  """
  Resolve single input item to spider class
-
+
  Provides better error prompts and debugging information
  """
  if isinstance(item, type) and issubclass(item, Spider):
@@ -820,7 +815,7 @@
  async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
  """
  Spider running function limited by semaphore
-
+
  Includes enhanced error handling and monitoring functionality
  """
  task = asyncio.current_task()
@@ -888,7 +883,7 @@
  def _shutdown(self, _signum, _frame):
  """
  Graceful shutdown signal handling
-
+
  Provides better shutdown experience and resource cleanup
  """
  signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
@@ -913,7 +908,7 @@
  async def _wait_for_shutdown(self):
  """
  Wait for all active tasks to complete
-
+
  Provides better shutdown time control and progress feedback
  """
  try:
@@ -967,15 +962,15 @@
  def _get_default_settings(cls) -> SettingManager:
  """
  Load default configuration
-
+
  Provides better error handling and fallback strategy
  """
  try:
  settings = get_settings()
- _get_logger().debug("Default configuration loaded successfully")
+ logger.debug("Default configuration loaded successfully")
  return settings
  except Exception as e:
- _get_logger().warning(f"Unable to load default configuration: {e}, using empty configuration")
+ logger.warning(f"Unable to load default configuration: {e}, using empty configuration")
  return SettingManager()

  def _log_startup_info(self):
@@ -990,7 +985,7 @@

  # Build startup info log
  startup_info = [
- f"Crawlo Framework Started v{version}"
+ f"Crawlo Framework Started {version}"
  ]

  # Get actual queue type
@@ -1018,7 +1013,7 @@
  else:
  startup_info.append(f"Run Mode: {run_mode}")

- # Print startup information
+ # Print startup information at INFO level
  for info in startup_info:
  logger.info(info)

@@ -1032,7 +1027,7 @@ def create_crawler_with_optimizations(
  ) -> Crawler:
  """
  Create an optimized crawler instance
-
+
  :param spider_cls: Spider class
  :param settings: Settings manager
  :param optimization_kwargs: Optimization parameters
@@ -1056,7 +1051,7 @@ def create_process_with_large_scale_config(
  ) -> CrawlerProcess:
  """
  Create a process manager that supports large-scale optimization
-
+
  :param config_type: Configuration type ('conservative', 'balanced', 'aggressive', 'memory_optimized')
  :param concurrency: Concurrency count
  :param kwargs: Other parameters
@@ -1100,4 +1095,4 @@ __all__ = [
  'CrawlerContext',
  'create_crawler_with_optimizations',
  'create_process_with_large_scale_config'
- ]
+ ]
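Note: the refactor above replaces the module's lazy _get_logger() helper with a plain module-level logger and leaves the public CrawlerProcess API unchanged. A minimal driver script under those assumptions (MySpider, its module path, and 'another_spider_name' are illustrative, not part of this diff):

    import asyncio

    from crawlo.crawler import CrawlerProcess
    from myproject.spiders.my_spider import MySpider  # hypothetical spider class

    async def main():
        # CrawlerProcess accepts spider classes or registered spider names,
        # sorts them by class name, and runs them under a concurrency limit.
        process = CrawlerProcess(max_concurrency=8)
        await process.crawl([MySpider, 'another_spider_name'])

    if __name__ == '__main__':
        asyncio.run(main())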
crawlo/extension/logging_extension.py CHANGED
@@ -1,8 +1,10 @@
  from typing import Any
  from crawlo.exceptions import NotConfigured
- from crawlo.utils.log import get_logger
  from crawlo.utils.log import LoggerManager

+ # 使用全局logger,避免每个模块都创建自己的延迟初始化函数
+ logger = LoggerManager.get_logger(__name__)
+

  class CustomLoggerExtension:
  """
@@ -32,7 +34,7 @@ class CustomLoggerExtension:
  return cls(crawler.settings)

  def spider_opened(self, spider: Any) -> None:
- logger = get_logger(__name__)
+ logger = LoggerManager.get_logger(__name__)
  try:
  logger.info(
  f"CustomLoggerExtension: Logging initialized. "
crawlo/middleware/middleware_manager.py CHANGED
@@ -133,4 +133,4 @@ class MiddlewareManager:
  def _validate_middleware_method(method_name, middleware) -> bool:
  method = getattr(type(middleware), method_name)
  base_method = getattr(BaseMiddleware, method_name)
- return False if method == base_method else True
+ return False if method == base_method else True
crawlo/mode_manager.py CHANGED
@@ -19,36 +19,37 @@ from crawlo.utils.log import get_logger

  class RunMode(Enum):
  """运行模式枚举"""
- STANDALONE = "standalone" # 单机模式
+ STANDALONE = "standalone" # 单机模式
  DISTRIBUTED = "distributed" # 分布式模式
- AUTO = "auto" # 自动检测模式
+ AUTO = "auto" # 自动检测模式


  class ModeManager:
  """运行模式管理器"""
-
+
  def __init__(self):
  self.logger = get_logger(self.__class__.__name__)
-
+
  @staticmethod
  def get_standalone_settings() -> Dict[str, Any]:
  """获取单机模式配置"""
  return {
  'QUEUE_TYPE': 'memory',
  'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
+ 'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
  'CONCURRENCY': 8,
  'MAX_RUNNING_SPIDERS': 1,
  'DOWNLOAD_DELAY': 1.0,
  'LOG_LEVEL': 'INFO',
  }
-
+
  @staticmethod
  def get_distributed_settings(
- redis_host: str = '127.0.0.1',
- redis_port: int = 6379,
- redis_password: Optional[str] = None,
- redis_db: int = 0, # 添加 redis_db 参数
- project_name: str = 'crawlo'
+ redis_host: str = '127.0.0.1',
+ redis_port: int = 6379,
+ redis_password: Optional[str] = None,
+ redis_db: int = 0, # 添加 redis_db 参数
+ project_name: str = 'crawlo'
  ) -> Dict[str, Any]:
  """获取分布式模式配置"""
  # 构建 Redis URL,使用传入的 redis_db 参数
@@ -56,7 +57,7 @@ class ModeManager:
  redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
  else:
  redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
-
+
  return {
  'PROJECT_NAME': project_name, # 添加项目名称到配置中
  'QUEUE_TYPE': 'redis',
@@ -74,7 +75,7 @@ class ModeManager:
  'DOWNLOAD_DELAY': 1.0,
  'LOG_LEVEL': 'INFO',
  }
-
+
  @staticmethod
  def get_auto_settings() -> Dict[str, Any]:
  """获取自动检测模式配置"""
@@ -86,28 +87,28 @@ class ModeManager:
  'DOWNLOAD_DELAY': 1.0,
  'LOG_LEVEL': 'INFO',
  }
-
+
  def resolve_mode_settings(
- self,
- mode: str = 'standalone',
- **kwargs
+ self,
+ mode: str = 'standalone',
+ **kwargs
  ) -> Dict[str, Any]:
  """
  解析运行模式并返回对应配置
-
+
  Args:
  mode: 运行模式 ('standalone', 'distributed', 'auto')
  **kwargs: 额外配置参数
-
+
  Returns:
  Dict[str, Any]: 配置字典
  """
  mode = RunMode(mode.lower())
-
+
  if mode == RunMode.STANDALONE:
  self.logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
  settings = self.get_standalone_settings()
-
+
  elif mode == RunMode.DISTRIBUTED:
  self.logger.info("使用分布式模式 - 支持多节点扩展,适合大规模爬取")
  settings = self.get_distributed_settings(
@@ -117,25 +118,25 @@ class ModeManager:
  redis_db=kwargs.get('redis_db', 0), # 添加 redis_db 参数
  project_name=kwargs.get('project_name', 'crawlo')
  )
-
+
  elif mode == RunMode.AUTO:
  self.logger.info("使用自动检测模式 - 智能选择最佳运行方式")
  settings = self.get_auto_settings()
-
+
  else:
  raise ValueError(f"不支持的运行模式: {mode}")
-
+
  # 合并用户自定义配置
- user_settings = {k: v for k, v in kwargs.items()
- if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
+ user_settings = {k: v for k, v in kwargs.items()
+ if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
  settings.update(user_settings)
-
+
  return settings
-
+
  def from_environment(self) -> Dict[str, Any]:
  """从环境变量构建配置"""
  config = {}
-
+
  # 扫描 CRAWLO_ 前缀的环境变量
  for key, value in os.environ.items():
  if key.startswith('CRAWLO_'):
@@ -150,7 +151,7 @@ class ModeManager:
  config[config_key] = float(value)
  except ValueError:
  config[config_key] = value
-
+
  return config


@@ -161,12 +162,12 @@ def standalone_mode(**kwargs) -> Dict[str, Any]:


  def distributed_mode(
- redis_host: str = '127.0.0.1',
- redis_port: int = 6379,
- redis_password: Optional[str] = None,
- redis_db: int = 0, # 添加 redis_db 参数
- project_name: str = 'crawlo',
- **kwargs
+ redis_host: str = '127.0.0.1',
+ redis_port: int = 6379,
+ redis_password: Optional[str] = None,
+ redis_db: int = 0, # 添加 redis_db 参数
+ project_name: str = 'crawlo',
+ **kwargs
  ) -> Dict[str, Any]:
  """快速创建分布式模式配置"""
  return ModeManager().resolve_mode_settings(
@@ -190,7 +191,7 @@ def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
  """从环境变量创建配置"""
  # 移除直接使用 os.getenv(),要求通过 settings 配置
  raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
-
+
  # 保留原有代码作为参考
  # mode = os.getenv('CRAWLO_MODE', default_mode).lower()
  #
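Note: beyond the re-indentation, the functional change here is that standalone mode now injects DEFAULT_DEDUP_PIPELINE and that redis_db is threaded into the generated Redis URL. A short usage sketch of this module's helpers, assuming standalone_mode() simply delegates to ModeManager.resolve_mode_settings (distributed_mode visibly does); all values are illustrative:

    from crawlo.mode_manager import standalone_mode, distributed_mode

    # Standalone: memory queue/filter plus the new memory-based dedup pipeline.
    sa = standalone_mode()
    print(sa['QUEUE_TYPE'], sa['DEFAULT_DEDUP_PIPELINE'])

    # Distributed: redis_db now ends up in the redis:// URL built internally.
    dist = distributed_mode(redis_host='127.0.0.1', redis_port=6379,
                            redis_db=1, project_name='crawlo')
    print(dist['QUEUE_TYPE'], dist['PROJECT_NAME'])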
crawlo/pipelines/pipeline_manager.py CHANGED
@@ -4,7 +4,6 @@ from typing import List
  from pprint import pformat
  from asyncio import create_task

-
  from crawlo.utils.log import get_logger
  from crawlo.event import item_successful, item_discard
  from crawlo.project import load_class, common_call
@@ -20,6 +19,19 @@ class PipelineManager:

  self.logger = get_logger(self.__class__.__name__, self.crawler.settings.get('LOG_LEVEL'))
  pipelines = self.crawler.settings.get_list('PIPELINES')
+ dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
+
+ # 添加调试信息
+ self.logger.debug(f"PIPELINES from settings: {pipelines}")
+ self.logger.debug(f"DEFAULT_DEDUP_PIPELINE from settings: {dedup_pipeline}")
+
+ # 确保DEFAULT_DEDUP_PIPELINE被添加到管道列表开头
+ if dedup_pipeline:
+ # 移除所有去重管道实例(如果存在)
+ pipelines = [item for item in pipelines if item != dedup_pipeline]
+ # 在开头插入去重管道
+ pipelines.insert(0, dedup_pipeline)
+
  self._add_pipelines(pipelines)
  self._add_methods()

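Note: the new block guarantees that, whatever ends up in PIPELINES, the configured dedup pipeline is moved to the front exactly once. The list manipulation in isolation (plain Python; the first pipeline path is a hypothetical user pipeline, the dedup path is the one shipped by the framework):

    pipelines = [
        'myproject.pipelines.SaveToDbPipeline',  # hypothetical user pipeline
        'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
    ]
    dedup_pipeline = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'

    if dedup_pipeline:
        # Drop any existing occurrence, then re-insert at position 0.
        pipelines = [item for item in pipelines if item != dedup_pipeline]
        pipelines.insert(0, dedup_pipeline)

    assert pipelines[0] == dedup_pipeline  # dedup runs before storage pipelines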
crawlo/project.py CHANGED
@@ -7,18 +7,10 @@ from inspect import iscoroutinefunction
  from typing import Callable, Optional, Any

  from crawlo.settings.setting_manager import SettingManager
- from crawlo.utils.log import get_logger
+ from crawlo.utils.log import get_logger, LoggerManager

- # 延迟初始化logger,在需要时通过get_logger获取
- logger = None
-
-
- def _get_logger():
- """延迟获取logger实例,确保在配置加载后创建"""
- global logger
- if logger is None:
- logger = get_logger(__name__)
- return logger
+ # 使用全局logger,避免每个模块都创建自己的延迟初始化函数
+ logger = get_logger(__name__)


  def load_class(path: str) -> Any:
@@ -50,7 +42,7 @@ def merge_settings(spider, settings):
  spider_name = getattr(spider, 'name', 'UnknownSpider')
  # 检查 settings 是否为 SettingManager 实例
  if not hasattr(settings, 'update_attributes'):
- _get_logger().error(f"merge_settings 接收到的 settings 不是 SettingManager 实例: {type(settings)}")
+ logger.error(f"merge_settings 接收到的 settings 不是 SettingManager 实例: {type(settings)}")
  # 如果是字典,创建一个新的 SettingManager 实例
  if isinstance(settings, dict):
  from crawlo.settings.setting_manager import SettingManager
@@ -58,14 +50,14 @@ def merge_settings(spider, settings):
  new_settings.update_attributes(settings)
  settings = new_settings
  else:
- _get_logger().error("无法处理的 settings 类型")
+ logger.error("无法处理的 settings 类型")
  return

  if hasattr(spider, 'custom_settings'):
  custom_settings = getattr(spider, 'custom_settings')
  settings.update_attributes(custom_settings)
  else:
- _get_logger().debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")
+ logger.debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")


  async def common_call(func: Callable, *args, **kwargs):
@@ -93,7 +85,7 @@ def _get_settings_module_from_cfg(cfg_path: str) -> str:
  config.read(cfg_path, encoding="utf-8")
  if config.has_section("settings") and config.has_option("settings", "default"):
  module_path = config.get("settings", "default")
- _get_logger().debug(f"📄 从 crawlo.cfg 加载 settings 模块: {module_path}")
+ logger.debug(f"📄 从 crawlo.cfg 加载 settings 模块: {module_path}")
  return module_path
  else:
  raise RuntimeError(f"配置文件缺少 [settings] 或 default 选项: {cfg_path}")
@@ -114,7 +106,7 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
  for root, dirs, files in os.walk(path):
  if "crawlo.cfg" in files:
  cfg_path = os.path.join(root, "crawlo.cfg")
- _get_logger().debug(f"✅ 找到项目配置文件: {cfg_path}")
+ logger.debug(f"✅ 找到项目配置文件: {cfg_path}")
  return root

  # 向上查找直到找到 crawlo.cfg 或包含 settings.py 和 __init__.py 的目录
@@ -130,20 +122,20 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
  # 检查 crawlo.cfg
  cfg_file = os.path.join(path, "crawlo.cfg")
  if os.path.isfile(cfg_file):
- _get_logger().debug(f"✅ 找到项目配置文件: {cfg_file}")
+ logger.debug(f"✅ 找到项目配置文件: {cfg_file}")
  return path

  # 检查 settings.py 和 __init__.py
  settings_file = os.path.join(path, "settings.py")
  init_file = os.path.join(path, "__init__.py")
  if os.path.isfile(settings_file) and os.path.isfile(init_file):
- _get_logger().debug(f"✅ 找到项目模块: {path}")
+ logger.debug(f"✅ 找到项目模块: {path}")
  # 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
  parent = os.path.dirname(path)
  if parent != path:
  parent_cfg = os.path.join(parent, "crawlo.cfg")
  if os.path.isfile(parent_cfg):
- _get_logger().debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
+ logger.debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
  return parent
  return path

@@ -167,19 +159,19 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:

  cfg_file = os.path.join(path, "crawlo.cfg")
  if os.path.isfile(cfg_file):
- _get_logger().debug(f"✅ 找到项目配置文件: {cfg_file}")
+ logger.debug(f"✅ 找到项目配置文件: {cfg_file}")
  return path

  settings_file = os.path.join(path, "settings.py")
  init_file = os.path.join(path, "__init__.py")
  if os.path.isfile(settings_file) and os.path.isfile(init_file):
- _get_logger().debug(f"✅ 找到项目模块: {path}")
+ logger.debug(f"✅ 找到项目模块: {path}")
  # 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
  parent = os.path.dirname(path)
  if parent != path:
  parent_cfg = os.path.join(parent, "crawlo.cfg")
  if os.path.isfile(parent_cfg):
- _get_logger().debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
+ logger.debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
  return parent
  return path

@@ -204,19 +196,19 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:

  cfg_file = os.path.join(path, "crawlo.cfg")
  if os.path.isfile(cfg_file):
- _get_logger().debug(f"找到项目配置文件: {cfg_file}")
+ logger.debug(f"找到项目配置文件: {cfg_file}")
  return path

  settings_file = os.path.join(path, "settings.py")
  init_file = os.path.join(path, "__init__.py")
  if os.path.isfile(settings_file) and os.path.isfile(init_file):
- _get_logger().debug(f"找到项目模块: {path}")
+ logger.debug(f"找到项目模块: {path}")
  # 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
  parent = os.path.dirname(path)
  if parent != path:
  parent_cfg = os.path.join(parent, "crawlo.cfg")
  if os.path.isfile(parent_cfg):
- _get_logger().debug(f"在上层目录找到项目配置文件: {parent_cfg}")
+ logger.debug(f"在上层目录找到项目配置文件: {parent_cfg}")
  return parent
  return path

@@ -227,7 +219,7 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
  except Exception:
  pass

- _get_logger().warning("未找到 Crawlo 项目根目录。请确保在包含 'crawlo.cfg' 或 'settings.py' 的目录运行。")
+ logger.warning("未找到 Crawlo 项目根目录。请确保在包含 'crawlo.cfg' 或 'settings.py' 的目录运行。")
  return None


@@ -241,8 +233,7 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
  Returns:
  SettingManager: 已加载配置的实例
  """
- # Change INFO level log to DEBUG level to avoid redundant output
- _get_logger().debug("🚀 正在初始化 Crawlo 项目配置...")
+ logger.debug("🚀 正在初始化 Crawlo 项目配置...")

  # 1. 查找项目根
  project_root = _find_project_root()
@@ -259,32 +250,35 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
  # 推断:项目目录名.settings
  project_name = os.path.basename(project_root)
  settings_module_path = f"{project_name}.settings"
- _get_logger().warning(f"⚠️ 未找到 crawlo.cfg,推断 settings 模块为: {settings_module_path}")
+ logger.warning(f"⚠️ 未找到 crawlo.cfg,推断 settings 模块为: {settings_module_path}")

  # 3. 注入 sys.path
  project_root_str = os.path.abspath(project_root)
  if project_root_str not in sys.path:
  sys.path.insert(0, project_root_str)
- _get_logger().debug(f"📁 项目根目录已加入 sys.path: {project_root_str}")
+ logger.debug(f"📁 项目根目录已加入 sys.path: {project_root_str}")

  # 4. 加载 SettingManager
- _get_logger().debug(f"⚙️ 正在加载配置模块: {settings_module_path}")
+ logger.debug(f"⚙️ 正在加载配置模块: {settings_module_path}")
  settings = SettingManager()

  try:
  settings.set_settings(settings_module_path)
- _get_logger().debug("✅ settings 模块加载成功")
+ logger.debug("✅ settings 模块加载成功")
  except Exception as e:
  raise ImportError(f"加载 settings 模块失败 '{settings_module_path}': {e}")

  # 5. 合并运行时配置
  if custom_settings:
  settings.update_attributes(custom_settings)
- _get_logger().debug(f"🔧 已应用运行时自定义配置: {list(custom_settings.keys())}")
+ logger.debug(f"🔧 已应用运行时自定义配置: {list(custom_settings.keys())}")

  # 6. 显示核心配置摘要(INFO级别)
  # _log_settings_summary(settings)

+ # 配置日志系统
+ LoggerManager.configure(settings)
+
  # 将项目初始化完成的消息改为DEBUG级别
- _get_logger().debug("🎉 Crawlo 项目配置初始化完成!")
+ logger.debug("🎉 Crawlo 项目配置初始化完成!")
  return settings
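Note: apart from the logger cleanup, the substantive addition is that get_settings() now calls LoggerManager.configure(settings) once the settings module is loaded, so loggers created afterwards inherit LOG_LEVEL / LOG_FILE. A minimal sketch, assuming it runs inside a Crawlo project directory (one containing crawlo.cfg or settings.py):

    from crawlo.project import get_settings

    # Loads <project>.settings, applies runtime overrides, and configures logging.
    settings = get_settings({'LOG_LEVEL': 'DEBUG'})
    print(settings.get('LOG_LEVEL'))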
crawlo/settings/setting_manager.py CHANGED
@@ -15,38 +15,50 @@ class SettingManager(MutableMapping):
  self.set_settings(default_settings)
  # 在初始化时合并配置
  self._merge_config(values)
-
+
  def _merge_config(self, user_config):
  """合并默认配置和用户配置"""
  if not user_config:
  return
-
+
  # 合并中间件配置
  if 'MIDDLEWARES' in user_config:
  default_middlewares = self.attributes.get('MIDDLEWARES', [])
  user_middlewares = user_config['MIDDLEWARES']
- self.attributes['MIDDLEWARES'] = default_middlewares + user_middlewares
-
+ # 如果用户配置了空列表,则仍然使用默认配置
+ if user_middlewares:
+ self.attributes['MIDDLEWARES'] = default_middlewares + user_middlewares
+
  # 合并管道配置
  if 'PIPELINES' in user_config:
  default_pipelines = self.attributes.get('PIPELINES', [])
  user_pipelines = user_config['PIPELINES']
- merged_pipelines = default_pipelines + user_pipelines
- # 特殊处理PIPELINES,确保去重管道在最前面
- dedup_pipeline = self.attributes.get('DEFAULT_DEDUP_PIPELINE')
- if dedup_pipeline:
- # 移除所有去重管道实例(如果存在)
- merged_pipelines = [item for item in merged_pipelines if item != dedup_pipeline]
- # 在开头插入去重管道
- merged_pipelines.insert(0, dedup_pipeline)
- self.attributes['PIPELINES'] = merged_pipelines
-
+ # 如果用户配置了空列表,则仍然使用默认配置
+ if user_pipelines:
+ # 过滤掉空值和注释
+ user_pipelines = [pipeline for pipeline in user_pipelines if pipeline and not pipeline.strip().startswith('#')]
+ if user_pipelines:
+ self.attributes['PIPELINES'] = user_pipelines
+
+ # 特殊处理PIPELINES,确保去重管道在最前面
+ dedup_pipeline = self.attributes.get('DEFAULT_DEDUP_PIPELINE')
+ if dedup_pipeline:
+ pipelines = self.attributes.get('PIPELINES', [])
+ # 移除所有去重管道实例(如果存在)
+ pipelines = [item for item in pipelines if item != dedup_pipeline]
+ # 在开头插入去重管道
+ pipelines.insert(0, dedup_pipeline)
+ self.attributes['PIPELINES'] = pipelines
+
  # 合并扩展配置
  if 'EXTENSIONS' in user_config:
  default_extensions = self.attributes.get('EXTENSIONS', [])
  user_extensions = user_config['EXTENSIONS']
- self.attributes['EXTENSIONS'] = default_extensions + user_extensions
-
+ # 如果用户配置了空列表,则仍然使用默认配置
+ if user_extensions:
+ self.attributes['EXTENSIONS'] = default_extensions + user_extensions
+ # 如果用户没有配置扩展,则使用默认配置
+
  # 更新其他用户配置
  for key, value in user_config.items():
  if key not in ['MIDDLEWARES', 'PIPELINES', 'EXTENSIONS']:
@@ -147,7 +159,7 @@ class SettingManager(MutableMapping):
  # 创建一个新的实例
  cls = self.__class__
  new_instance = cls.__new__(cls)
-
+
  # 复制attributes字典,但排除不可pickle的对象
  new_attributes = {}
  for key, value in self.attributes.items():
@@ -157,8 +169,8 @@ class SettingManager(MutableMapping):
  except Exception:
  # 如果复制失败,保留原始引用(对于logger等对象)
  new_attributes[key] = value
-
+
  # 设置新实例的attributes
  new_instance.attributes = new_attributes
-
+
  return new_instance
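Note: the new merge rules are easier to see outside the class. A plain-Python restatement (the default lists and class names below are invented for the example; only the rules mirror the diff):

    defaults = {
        'MIDDLEWARES': ['crawlo.middleware.default_header.DefaultHeaderMiddleware'],  # assumed default
        'PIPELINES': ['crawlo.pipelines.json_pipeline.JsonPipeline'],                 # assumed default
        'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
    }
    user = {'MIDDLEWARES': [], 'PIPELINES': ['myproject.pipelines.SaveToDbPipeline']}  # hypothetical

    merged = dict(defaults)
    # 1. An empty user list no longer wipes the default MIDDLEWARES.
    if user.get('MIDDLEWARES'):
        merged['MIDDLEWARES'] = defaults['MIDDLEWARES'] + user['MIDDLEWARES']
    # 2. A non-empty user PIPELINES list (comments and empties filtered out) replaces the default.
    if user.get('PIPELINES'):
        cleaned = [p for p in user['PIPELINES'] if p and not p.strip().startswith('#')]
        if cleaned:
            merged['PIPELINES'] = cleaned
    # 3. The dedup pipeline is always forced to the front.
    dedup = merged['DEFAULT_DEDUP_PIPELINE']
    merged['PIPELINES'] = [dedup] + [p for p in merged['PIPELINES'] if p != dedup]

    print(merged['MIDDLEWARES'])  # defaults kept
    print(merged['PIPELINES'])    # dedup first, then the user pipeline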
crawlo/utils/log.py CHANGED
@@ -8,28 +8,20 @@ from logging import (
  INFO,
  getLevelName,
  )
- # 导入日志轮转处理器
- from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler

  LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'


  class LoggerManager:
+ """日志管理器,提供统一的日志配置和获取接口"""
  logger_cache = {}
  _default_filename = None
  _default_level = DEBUG # 设置为最低级别,由handler控制实际输出
- _default_file_level = INFO
- _default_console_level = INFO
+ _default_file_level = INFO # 默认为INFO级别
+ _default_console_level = INFO # 默认为INFO级别
  _default_log_format = LOG_FORMAT
  _default_encoding = 'utf-8'
- # 添加日志轮转相关默认配置
- _default_log_max_bytes = 10 * 1024 * 1024 # 10MB
- _default_log_backup_count = 5
- _default_log_when = 'midnight'
- _default_log_interval = 1
- _default_log_use_rotation = False
- _default_log_rotation_type = 'size' # 'size' or 'time'
- _default_log_rotation_suffix = None # 轮转文件后缀格式
+ _configured = False # 标记是否已配置

  @classmethod
  def _to_level(cls, level):
@@ -67,19 +59,12 @@ class LoggerManager:
  get_val = settings.get if hasattr(settings, 'get') else (lambda k, d=None: kwargs.get(k, d))

  filename = get_val('LOG_FILE')
- level = get_val('LOG_LEVEL', 'DEBUG') # 默认为DEBUG级别
- file_level = get_val('LOG_FILE_LEVEL', 'INFO')
- console_level = get_val('LOG_CONSOLE_LEVEL', 'INFO') # 保留LOG_CONSOLE_LEVEL配置
+ level = get_val('LOG_LEVEL', 'INFO') # 默认为INFO级别
+ file_level = get_val('LOG_FILE_LEVEL', level) # 默认继承LOG_LEVEL的值
+ # 根据项目规范,已完全移除LOG_CONSOLE_LEVEL支持,统一使用LOG_LEVEL控制控制台和文件的日志输出级别
+ console_level = level # 控制台日志级别直接使用LOG_LEVEL的值
  log_format = get_val('LOG_FORMAT', LOG_FORMAT)
  encoding = get_val('LOG_ENCODING', 'utf-8')
- # 获取日志轮转配置
- use_rotation = get_val('LOG_USE_ROTATION', False)
- rotation_type = get_val('LOG_ROTATION_TYPE', 'size')
- max_bytes = get_val('LOG_MAX_BYTES', cls._default_log_max_bytes)
- backup_count = get_val('LOG_BACKUP_COUNT', cls._default_log_backup_count)
- when = get_val('LOG_WHEN', cls._default_log_when)
- interval = get_val('LOG_INTERVAL', cls._default_log_interval)
- rotation_suffix = get_val('LOG_ROTATION_SUFFIX', cls._default_log_rotation_suffix) # 轮转文件后缀

  cls._default_filename = filename
  cls._default_level = cls._to_level(level)
@@ -87,21 +72,13 @@ class LoggerManager:
  cls._default_console_level = cls._to_level(console_level)
  cls._default_log_format = log_format
  cls._default_encoding = encoding
- # 设置日志轮转配置
- cls._default_log_use_rotation = use_rotation
- cls._default_log_rotation_type = rotation_type
- cls._default_log_max_bytes = max_bytes
- cls._default_log_backup_count = backup_count
- cls._default_log_when = when
- cls._default_log_interval = interval
- cls._default_log_rotation_suffix = rotation_suffix
-
- # 移除对根日志记录器级别的修改,避免副作用
+
+ cls._configured = True

  @classmethod
  def get_logger(cls, name='default', level=None, filename=None):
  """
- 简化接口,只暴露必要参数
+ 获取logger实例
  """
  # 确定最终参数
  # 如果传入了level参数,则使用它,否则使用默认级别
@@ -110,7 +87,7 @@ class LoggerManager:
  else:
  # Logger级别设置为DEBUG(最低级别),由handler控制实际输出
  final_level = DEBUG
-
+
  final_filename = filename if filename is not None else cls._default_filename

  # 安全的字符串化 key,避免任何 unhashable 类型
@@ -146,32 +123,9 @@ class LoggerManager:
  if log_dir and not os.path.exists(log_dir):
  os.makedirs(log_dir, exist_ok=True)

- # 检查是否启用日志轮转
- if cls._default_log_use_rotation:
- if cls._default_log_rotation_type == 'size':
- # 使用大小轮转
- fh = RotatingFileHandler(
- final_filename,
- maxBytes=cls._default_log_max_bytes,
- backupCount=cls._default_log_backup_count,
- encoding=cls._default_encoding
- )
- else:
- # 使用时间轮转
- fh = TimedRotatingFileHandler(
- final_filename,
- when=cls._default_log_when,
- interval=cls._default_log_interval,
- backupCount=cls._default_log_backup_count,
- encoding=cls._default_encoding
- )
- # 如果提供了自定义后缀格式,则设置
- if cls._default_log_rotation_suffix:
- fh.suffix = cls._default_log_rotation_suffix
- else:
- # 使用普通文件处理器(默认行为,会追加到文件)
- fh = FileHandler(final_filename, mode='a', encoding=cls._default_encoding)
-
+ # 使用普通文件处理器(移除日志轮转功能)
+ fh = FileHandler(final_filename, mode='a', encoding=cls._default_encoding)
+
  fh.setFormatter(formatter)
  fh.setLevel(cls._default_file_level)
  _logger.addHandler(fh)
@@ -183,6 +137,11 @@ class LoggerManager:
  cls.logger_cache[key] = _logger
  return _logger

+ @classmethod
+ def is_configured(cls):
+ """检查日志系统是否已配置"""
+ return cls._configured
+

  # 全局快捷函数
  get_logger = LoggerManager.get_logger
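Note: with rotation and LOG_CONSOLE_LEVEL removed, LOG_LEVEL alone drives the console handler and is the fallback for LOG_FILE_LEVEL. A small sketch of the simplified interface, assuming configure() takes the settings mapping (anything with .get(), so a plain dict works) as its first argument, as the call added in crawlo/project.py suggests:

    from crawlo.utils.log import LoggerManager, get_logger

    # Configure once, typically done for you by get_settings().
    LoggerManager.configure({'LOG_LEVEL': 'INFO', 'LOG_FILE': 'logs/crawlo.log'})

    logger = get_logger(__name__)
    logger.info("goes to the console and, at INFO and above, to logs/crawlo.log")
    assert LoggerManager.is_configured()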
crawlo-1.2.9.dist-info/METADATA → crawlo-1.3.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.2.9
+ Version: 1.3.0
  Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
crawlo-1.2.9.dist-info/RECORD → crawlo-1.3.0.dist-info/RECORD
@@ -1,13 +1,13 @@
  crawlo/__init__.py,sha256=qZzTmb7hw5h_qcP2EYGUZcoSScxlKZFJ76CjSeS7UfA,1381
- crawlo/__version__.py,sha256=S2h29KjM2QfA587KjqKjzd2pK0GLkBmHIq8WDOutUiI,21
+ crawlo/__version__.py,sha256=zi_LaUT_OsChAtsPXbOeRpQkCohSsOyeXfavQPM0GoE,22
  crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
  crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
  crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
- crawlo/crawler.py,sha256=cXvzTF3W6LeZNll4hbMLK-E1xDpZL5M9fgtd_8Gk6eI,39863
+ crawlo/crawler.py,sha256=rxyjA5pXOd709bujgniqYG9tR3eoNaok6wJaeZOgzmo,39451
  crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
  crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
- crawlo/mode_manager.py,sha256=D8LOgqkqN4yglCKtXw56mq41r30FvxCUb5JOitSTp_U,7502
- crawlo/project.py,sha256=vHqDrGFgRThzFuF2RxDtSHxPerruR3liwXuGC3cVda8,10796
+ crawlo/mode_manager.py,sha256=soEgZNBt6jA0qtC1WH-MG_2WngDk2RfmQckLsK3NzmQ,7510
+ crawlo/project.py,sha256=830PPRUD6ldE8MKPdkFkPiUcecHhlWP3fUXYC96_T0Y,10506
  crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
  crawlo/subscriber.py,sha256=D3hzE7Pc_zJjc-zR7lct5pt32bz6LsDYeC8uHlS4Hso,4986
  crawlo/task_manager.py,sha256=19InAxS9oJ9EMj20Aw2urN-v6BeC22dkgPuW-B9-4UI,819
@@ -16,12 +16,12 @@ crawlo/commands/check.py,sha256=7pD43s97DD-fSLO9OEOuNcNr7o-2g94rJULL8fUzdaI,2260
  crawlo/commands/genspider.py,sha256=HhtvBLkIuhYtJUzom6PquItiC22vU9LNpOkjDUiqdM4,4937
  crawlo/commands/help.py,sha256=gwfHibRpdYDmZO6waUMOEn8SMJ_ubdjL-prD5fiuVY8,4973
  crawlo/commands/list.py,sha256=BqlPjBa5FLotjAlyZ3-nGmXg5cWcCNbHi8U5znb2_D8,5722
- crawlo/commands/run.py,sha256=b_HxEmaze0jpPNIrPbm5lyPYZ20--eUVKJZpXoATf0M,12088
+ crawlo/commands/run.py,sha256=KcJ4h4D7lavB6qQDpYMrbgJMgY5vCSLHaLckos5EUNY,11793
  crawlo/commands/startproject.py,sha256=aqKRJarKqTf5XjJnGXwjRpp0uYF16LreFbwwQLGpK-0,16070
  crawlo/commands/stats.py,sha256=8wTubR1RQ1JPTlpOKArcGcQ39bM-0cuH27lDpndnwPQ,6014
  crawlo/commands/utils.py,sha256=pXiFzwVIVXdSPO2Fty_u19P1lsE8HmuE8gTMamKZZUs,5047
  crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
- crawlo/core/engine.py,sha256=NIWbfRL_Lzjl6yYQ6LSfRJmDHBxjtk6x3nOWrnzaHME,14569
+ crawlo/core/engine.py,sha256=Hy0K_g9My6aQ3CPkxAcCiPsumdwh4O8qRhmFlNoErd4,14496
  crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
  crawlo/core/scheduler.py,sha256=D-YzXVvnP6DEkovmz9hThhzIe2UgRrQLNt9pJCPEPwY,12593
  crawlo/data/__init__.py,sha256=8MgDxcMhx-emFARcLAw_ODOZNz0neYBcx7kEbzothd8,116
@@ -37,7 +37,7 @@ crawlo/extension/__init__.py,sha256=FbOwJ4jh60xCbSh7P9CUGJsGAC-VH4MyOtCftRMlxbk,
  crawlo/extension/health_check.py,sha256=0GveZgUtFwjYEKlm3qbwIvCmb4FR0qrIKc8cEF1yQV8,5516
  crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
  crawlo/extension/log_stats.py,sha256=vrChs3bj_Dvay3kxxkBOp4-w0K-IG-2XZ0PoSUahTPs,2908
- crawlo/extension/logging_extension.py,sha256=ET6VAu1J2qNMz4NnG1G3zQLRhbsvV7l6xRIuQLE6DaE,1626
+ crawlo/extension/logging_extension.py,sha256=RfL1wI4nz-1Xtg420Ktp7RXnOPnZSHwO0Zpg1w4fO4M,1726
  crawlo/extension/memory_monitor.py,sha256=4aszl3C0GMQbqFhGZjZq5iQuXQR1sOz06VHjjEHgkyE,4290
  crawlo/extension/performance_profiler.py,sha256=EPiNuXuPPDU0Jtgy8arYHpr_8ASK13cCI2BytdJnu_I,4899
  crawlo/extension/request_recorder.py,sha256=RC23yzXClnVv9j2ljQvjBkUfWznfnDHsrQejKhE9y5E,4074
@@ -51,7 +51,7 @@ crawlo/items/items.py,sha256=e-3nXI9ckD64vcDxxQiAU6ufbtJMs09gbZQcYjxgwHY,3374
  crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
  crawlo/middleware/default_header.py,sha256=wQ7BrUHd-hRosFoKsReV9hwNNr_jwK6V0ZfxL6MOGrk,5032
  crawlo/middleware/download_delay.py,sha256=zt9R5g2HWErWA_MAOnGcw_D8l6HD769Kyaw-Hv-vcTc,3438
- crawlo/middleware/middleware_manager.py,sha256=_JnJ50u3u4zFKkNZDae8M8FHcK-V2LkbcX7DkBdtJ2M,6279
+ crawlo/middleware/middleware_manager.py,sha256=9Sj9rrWK6R9NZq9eT38sWRGuBKLKfjSgEAxu-5NCWgU,6278
  crawlo/middleware/offsite.py,sha256=b3BMwNKGR41YGJGHt1S0H7yXujEkztVvonUQGO05hoM,4026
  crawlo/middleware/proxy.py,sha256=NquB6tqHAgHs3-2_1_5220kJYfjNG5JyHRJyo_2j4wo,15636
  crawlo/middleware/request_ignore.py,sha256=xcyZ1c7r_HhbzR3r9pfjsLGW7L7FBVeYvlNt8cpP2wY,2577
@@ -71,7 +71,7 @@ crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZ
  crawlo/pipelines/memory_dedup_pipeline.py,sha256=oIksbIrmSw9s9jMh6JJMfVbv6hzseVMV_g9S8UHQUP4,3837
  crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
  crawlo/pipelines/mysql_pipeline.py,sha256=G2DMhdh0ihBBolIul4YVTDz2JbrZGJauDtWF-gqRW0w,13473
- crawlo/pipelines/pipeline_manager.py,sha256=DuIRl6FYbrKKUCrtSHnq4pnRImpZPYtg1YRcgUd1xTk,2425
+ crawlo/pipelines/pipeline_manager.py,sha256=vCgfbhgsKMLm_7jCnr3cE5GemIYkG9u4oF8u4Ta_7so,3013
  crawlo/pipelines/redis_dedup_pipeline.py,sha256=POYRiWAOp1pqDW9iTPJ8h3VcpLALeLrpw74MvJJqPiM,6342
  crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  crawlo/queue/pqueue.py,sha256=yzF900ko2RReVNZtfk_VV3PzSXEUALI6SHf97geuu6s,1058
@@ -79,7 +79,7 @@ crawlo/queue/queue_manager.py,sha256=XqS_oVbNQJWdtokOuDDPK-FzMrVdnZ3UKp1MF_DMJww
  crawlo/queue/redis_priority_queue.py,sha256=k1OChSMRovSMkbbJ9388axfhpYeMevuJTe-3N1oYhbA,13126
  crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
  crawlo/settings/default_settings.py,sha256=98URrj6QBrx_pmJ1yvK-MSAW8VrZ-pl0FfiZEHV0ZnI,9183
- crawlo/settings/setting_manager.py,sha256=55iJJambnFliZxwhNd7BtlgCV6eviuPRBrrIgyrmZgw,5659
+ crawlo/settings/setting_manager.py,sha256=V3nVJEPtusadoz5eILXFeNyDXX1u_MgIiKIFIWVDY1s,6189
  crawlo/spider/__init__.py,sha256=ZnSAL9PXLZSIH-Jdv-P6RuWmQUdukr8KPLQK6SXZZaU,20435
  crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
  crawlo/templates/run.py.tmpl,sha256=v_g-LQMYJ6pC8TZgyWj0yB2yTTKrwy9lEJufAYCXyxY,1228
@@ -117,7 +117,7 @@ crawlo/utils/error_handler.py,sha256=q6NqHxjYrKdswfmhshMYMmfBIr0M2YWPYxts4ScHl4Y
  crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
  crawlo/utils/large_scale_config.py,sha256=j7wQ5ty7pQlBRygw2vhRJ7OI19RYBZKPfYMP3WeF2WI,8154
  crawlo/utils/large_scale_helper.py,sha256=Kxdy3WMuqjzQTyCc6z4xEYxXDi4xnYKJzsVwaBYZrrg,12108
- crawlo/utils/log.py,sha256=AcRx6_9U6fY-6GEZP77Il2o3FZZm5Cx0MFubIK9R3oI,7501
+ crawlo/utils/log.py,sha256=xZe3UU78yr10lK0hxALBQB0Uv9cXShOPPksoe5n_qKI,5229
  crawlo/utils/performance_monitor.py,sha256=Q9xxuXBIfFoig_U-FQPOUuPAh1axO3MzYgpielDyku0,9547
  crawlo/utils/queue_helper.py,sha256=xpUUTOqlU1xz2Pb9NKAVGo3AfAO-7Xvx8Lm1q65Dgck,4743
  crawlo/utils/redis_connection_pool.py,sha256=amGjhaKpodMrw9X56qxZ6f3OTZhjrI89sSVGqgwAQGU,11050
@@ -212,8 +212,8 @@ tests/test_tools.py,sha256=9t9FXZ61MfdB70nck9NYzCq97yd3SLVlLiMybEAlClk,5345
  tests/test_user_agents.py,sha256=rUotyuE2iJDi2LQBrUh980U-dAMTs4ARPMJxICOoQFY,3231
  tests/tools_example.py,sha256=MtIypR-OFiWwi-skurwmq4fM0cGTt-GUX4hSekYs7BY,7739
  tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3928
- crawlo-1.2.9.dist-info/METADATA,sha256=wyjBzrx9hUZzlt0r1JPu33ouyrm_iJtphQAi2THb7LA,26298
- crawlo-1.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.2.9.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
- crawlo-1.2.9.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
- crawlo-1.2.9.dist-info/RECORD,,
+ crawlo-1.3.0.dist-info/METADATA,sha256=5BRT0EE3J1yUtWZ0l_pZqEWxTgGA1p3laxJjTSu7980,26298
+ crawlo-1.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.3.0.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+ crawlo-1.3.0.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+ crawlo-1.3.0.dist-info/RECORD,,
File without changes