crawlo 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo has been flagged by the registry as possibly problematic.

Files changed (111)
  1. crawlo/__init__.py +33 -24
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -106
  6. crawlo/commands/genspider.py +125 -110
  7. crawlo/commands/list.py +147 -92
  8. crawlo/commands/run.py +286 -181
  9. crawlo/commands/startproject.py +111 -101
  10. crawlo/commands/stats.py +188 -59
  11. crawlo/core/__init__.py +2 -2
  12. crawlo/core/engine.py +158 -158
  13. crawlo/core/processor.py +40 -40
  14. crawlo/core/scheduler.py +57 -57
  15. crawlo/crawler.py +494 -492
  16. crawlo/downloader/__init__.py +78 -78
  17. crawlo/downloader/aiohttp_downloader.py +199 -199
  18. crawlo/downloader/cffi_downloader.py +242 -277
  19. crawlo/downloader/httpx_downloader.py +246 -246
  20. crawlo/event.py +11 -11
  21. crawlo/exceptions.py +78 -78
  22. crawlo/extension/__init__.py +31 -31
  23. crawlo/extension/log_interval.py +49 -49
  24. crawlo/extension/log_stats.py +44 -44
  25. crawlo/extension/logging_extension.py +34 -34
  26. crawlo/filters/__init__.py +37 -37
  27. crawlo/filters/aioredis_filter.py +150 -150
  28. crawlo/filters/memory_filter.py +202 -202
  29. crawlo/items/__init__.py +23 -23
  30. crawlo/items/base.py +21 -21
  31. crawlo/items/fields.py +53 -53
  32. crawlo/items/items.py +104 -104
  33. crawlo/middleware/__init__.py +21 -21
  34. crawlo/middleware/default_header.py +32 -32
  35. crawlo/middleware/download_delay.py +28 -28
  36. crawlo/middleware/middleware_manager.py +135 -135
  37. crawlo/middleware/proxy.py +245 -245
  38. crawlo/middleware/request_ignore.py +30 -30
  39. crawlo/middleware/response_code.py +18 -18
  40. crawlo/middleware/response_filter.py +26 -26
  41. crawlo/middleware/retry.py +90 -90
  42. crawlo/network/__init__.py +7 -7
  43. crawlo/network/request.py +203 -203
  44. crawlo/network/response.py +166 -166
  45. crawlo/pipelines/__init__.py +13 -13
  46. crawlo/pipelines/console_pipeline.py +39 -39
  47. crawlo/pipelines/mongo_pipeline.py +116 -116
  48. crawlo/pipelines/mysql_batch_pipline.py +272 -272
  49. crawlo/pipelines/mysql_pipeline.py +195 -195
  50. crawlo/pipelines/pipeline_manager.py +56 -56
  51. crawlo/project.py +153 -0
  52. crawlo/settings/__init__.py +7 -7
  53. crawlo/settings/default_settings.py +166 -168
  54. crawlo/settings/setting_manager.py +99 -99
  55. crawlo/spider/__init__.py +129 -129
  56. crawlo/stats_collector.py +59 -59
  57. crawlo/subscriber.py +106 -106
  58. crawlo/task_manager.py +27 -27
  59. crawlo/templates/crawlo.cfg.tmpl +10 -10
  60. crawlo/templates/project/__init__.py.tmpl +3 -3
  61. crawlo/templates/project/items.py.tmpl +17 -17
  62. crawlo/templates/project/middlewares.py.tmpl +75 -75
  63. crawlo/templates/project/pipelines.py.tmpl +63 -63
  64. crawlo/templates/project/settings.py.tmpl +54 -54
  65. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  66. crawlo/templates/spider/spider.py.tmpl +31 -31
  67. crawlo/utils/__init__.py +7 -7
  68. crawlo/utils/date_tools.py +233 -233
  69. crawlo/utils/db_helper.py +343 -343
  70. crawlo/utils/func_tools.py +82 -82
  71. crawlo/utils/log.py +128 -128
  72. crawlo/utils/pqueue.py +173 -173
  73. crawlo/utils/request.py +267 -267
  74. crawlo/utils/spider_loader.py +62 -62
  75. crawlo/utils/system.py +11 -11
  76. crawlo/utils/tools.py +4 -4
  77. crawlo/utils/url.py +39 -39
  78. crawlo-1.1.1.dist-info/METADATA +220 -0
  79. crawlo-1.1.1.dist-info/RECORD +100 -0
  80. examples/__init__.py +7 -0
  81. examples/baidu_spider/__init__.py +7 -0
  82. examples/baidu_spider/demo.py +94 -0
  83. examples/baidu_spider/items.py +46 -0
  84. examples/baidu_spider/middleware.py +49 -0
  85. examples/baidu_spider/pipeline.py +55 -0
  86. examples/baidu_spider/run.py +27 -0
  87. examples/baidu_spider/settings.py +121 -0
  88. examples/baidu_spider/spiders/__init__.py +7 -0
  89. examples/baidu_spider/spiders/bai_du.py +61 -0
  90. examples/baidu_spider/spiders/miit.py +159 -0
  91. examples/baidu_spider/spiders/sina.py +79 -0
  92. tests/__init__.py +7 -7
  93. tests/test_proxy_health_check.py +32 -32
  94. tests/test_proxy_middleware_integration.py +136 -136
  95. tests/test_proxy_providers.py +56 -56
  96. tests/test_proxy_stats.py +19 -19
  97. tests/test_proxy_strategies.py +59 -59
  98. crawlo/utils/concurrency_manager.py +0 -125
  99. crawlo/utils/project.py +0 -197
  100. crawlo-1.0.9.dist-info/METADATA +0 -49
  101. crawlo-1.0.9.dist-info/RECORD +0 -97
  102. examples/gxb/__init__.py +0 -0
  103. examples/gxb/items.py +0 -36
  104. examples/gxb/run.py +0 -16
  105. examples/gxb/settings.py +0 -72
  106. examples/gxb/spider/__init__.py +0 -0
  107. examples/gxb/spider/miit_spider.py +0 -180
  108. examples/gxb/spider/telecom_device.py +0 -129
  109. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/WHEEL +0 -0
  110. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/entry_points.txt +0 -0
  111. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/top_level.txt +0 -0
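The most consequential entries in the list above are crawlo/project.py (new, +153) and crawlo/utils/project.py (removed, -197): the merge_settings and get_settings helpers now live in a top-level crawlo.project module, and crawlo/crawler.py (diffed below) imports them from there. The diff shows no compatibility shim at the old path, so downstream code that imported from crawlo.utils.project would presumably need the one-line change sketched here (the zero-argument get_settings() call mirrors its usage inside crawler.py; nothing else is implied about its signature):

```python
# Import path up to crawlo 1.0.9 (crawlo/utils/project.py, removed in 1.1.1):
# from crawlo.utils.project import merge_settings, get_settings

# Import path from crawlo 1.1.1 onward (crawlo/project.py, added in this release):
from crawlo.project import merge_settings, get_settings

settings = get_settings()  # zero-argument call, matching how crawler.py uses it
```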
crawlo/crawler.py CHANGED
@@ -1,493 +1,495 @@
  #!/usr/bin/python
  # -*- coding: UTF-8 -*-
  from __future__ import annotations
  import asyncio
  import signal
  from typing import Type, Optional, Set, List, Union, Dict
  from .spider import Spider, get_global_spider_registry
  from .core.engine import Engine
  from .utils.log import get_logger
  from .subscriber import Subscriber
  from .extension import ExtensionManager
  from .stats_collector import StatsCollector
  from .event import spider_opened, spider_closed
  from .settings.setting_manager import SettingManager
- from .utils.project import merge_settings, get_settings
+ from crawlo.project import merge_settings, get_settings


  logger = get_logger(__name__)


  class Crawler:
      """单个爬虫运行实例,绑定 Spider 与引擎"""

      def __init__(self, spider_cls: Type[Spider], settings: SettingManager):
          self.spider_cls = spider_cls
          self.spider: Optional[Spider] = None
          self.engine: Optional[Engine] = None
          self.stats: Optional[StatsCollector] = None
          self.subscriber: Optional[Subscriber] = None
          self.extension: Optional[ExtensionManager] = None
          self.settings: SettingManager = settings.copy()
          self._closed = False
          self._close_lock = asyncio.Lock()

      async def crawl(self):
          """启动爬虫核心流程"""
          self.subscriber = self._create_subscriber()
          self.spider = self._create_spider()
          self.engine = self._create_engine()
          self.stats = self._create_stats()
          self.extension = self._create_extension()
          await self.engine.start_spider(self.spider)

      @staticmethod
      def _create_subscriber() -> Subscriber:
          return Subscriber()

      def _create_spider(self) -> Spider:
          spider = self.spider_cls.create_instance(self)

          if not getattr(spider, 'name', None):
              raise AttributeError(f"爬虫类 '{self.spider_cls.__name__}' 必须定义 'name' 属性。")

          if not callable(getattr(spider, 'start_requests', None)):
              raise AttributeError(f"爬虫 '{spider.name}' 必须实现可调用的 'start_requests' 方法。")

          start_urls = getattr(spider, 'start_urls', [])
          if isinstance(start_urls, str):
              raise TypeError(f"爬虫 '{spider.name}' 的 'start_urls' 必须是列表或元组,不能是字符串。")

          if not callable(getattr(spider, 'parse', None)):
              logger.warning(
                  f"爬虫 '{spider.name}' 未定义 'parse' 方法。请确保所有 Request 都指定了回调函数,否则响应将被忽略。")

          self._set_spider(spider)
          return spider

      def _create_engine(self) -> Engine:
          engine = Engine(self)
          engine.engine_start()
          return engine

      def _create_stats(self) -> StatsCollector:
          return StatsCollector(self)

      def _create_extension(self) -> ExtensionManager:
          return ExtensionManager.create_instance(self)

      def _set_spider(self, spider: Spider):
          self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
          self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
          merge_settings(spider, self.settings)

      async def close(self, reason='finished') -> None:
          async with self._close_lock:
              if self._closed:
                  return
              self._closed = True
              await self.subscriber.notify(spider_closed)
              if self.stats and self.spider:
                  self.stats.close_spider(spider=self.spider, reason=reason)
              from crawlo.commands.stats import record_stats
              record_stats(self)


  class CrawlerProcess:
      """
      爬虫进程管理器,支持:
      - 自动发现爬虫模块
      - 通过 name 或类启动爬虫
      - 并发控制
      - 优雅关闭
      """

      def __init__(
              self,
              settings: Optional[SettingManager] = None,
              max_concurrency: Optional[int] = None,
              spider_modules: Optional[List[str]] = None
      ):
          self.settings: SettingManager = settings or self._get_default_settings()
          self.crawlers: Set[Crawler] = set()
          self._active_tasks: Set[asyncio.Task] = set()

          # 自动发现并导入爬虫模块
          if spider_modules:
              self.auto_discover(spider_modules)

          # 使用全局注册表的快照(避免后续导入影响)
          self._spider_registry: Dict[str, Type[Spider]] = get_global_spider_registry()

          self.max_concurrency: int = (
                  max_concurrency
                  or self.settings.get('MAX_RUNNING_SPIDERS')
                  or self.settings.get('CONCURRENCY', 3)
          )
          self.semaphore = asyncio.Semaphore(self.max_concurrency)

          # 注册信号量
          signal.signal(signal.SIGINT, self._shutdown)
          signal.signal(signal.SIGTERM, self._shutdown)
          logger.info(f"CrawlerProcess 初始化完成,最大并行爬虫数: {self.max_concurrency}")

-     def auto_discover(self, modules: List[str]):
+     @staticmethod
+     def auto_discover(modules: List[str]):
          """自动导入模块,触发 Spider 类定义和注册"""
          import importlib
          import pkgutil
          for module_name in modules:
              try:
                  module = importlib.import_module(module_name)
                  if hasattr(module, '__path__'):
                      for _, name, _ in pkgutil.walk_packages(module.__path__, module.__name__ + "."):
                          importlib.import_module(name)
                  else:
                      importlib.import_module(module_name)
                  logger.debug(f"已扫描模块: {module_name}")
              except Exception as e:
                  logger.error(f"扫描模块 {module_name} 失败: {e}", exc_info=True)

      # === 公共只读接口:避免直接访问 _spider_registry ===

      def get_spider_names(self) -> List[str]:
          """获取所有已注册的爬虫名称"""
          return list(self._spider_registry.keys())

      def get_spider_class(self, name: str) -> Optional[Type[Spider]]:
          """根据 name 获取爬虫类"""
          return self._spider_registry.get(name)

      def is_spider_registered(self, name: str) -> bool:
          """检查某个 name 是否已注册"""
          return name in self._spider_registry

      async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
          """启动一个或多个爬虫"""
          spider_classes_to_run = self._resolve_spiders_to_run(spiders)
          total = len(spider_classes_to_run)

          if total == 0:
              raise ValueError("至少需要提供一个爬虫类或名称")

          # 按类名排序,保证启动顺序可预测
          spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())
          logger.info(f"启动 {total} 个爬虫.")

          # 流式启动
          tasks = [
              asyncio.create_task(self._run_spider_with_limit(spider_cls, index + 1, total))
              for index, spider_cls in enumerate(spider_classes_to_run)
          ]

          # 等待完成(失败不中断)
          results = await asyncio.gather(*tasks, return_exceptions=True)
          failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
          if failed:
              logger.error(f"共 {len(failed)} 个爬虫执行异常: {[spider_classes_to_run[i].__name__ for i in failed]}")

      def _resolve_spiders_to_run(
              self,
              spiders_input: Union[Type[Spider], str, List[Union[Type[Spider], str]]]
      ) -> List[Type[Spider]]:
          """解析输入为爬虫类列表"""
          inputs = self._normalize_inputs(spiders_input)
          seen_spider_names: Set[str] = set()
          spider_classes: List[Type[Spider]] = []

          for item in inputs:
              spider_cls = self._resolve_spider_class(item)
              spider_name = spider_cls.name

              if spider_name in seen_spider_names:
                  raise ValueError(f"本次运行中爬虫名称 '{spider_name}' 重复。")

              seen_spider_names.add(spider_name)
              spider_classes.append(spider_cls)

          return spider_classes

-     def _normalize_inputs(self, spiders_input) -> List[Union[Type[Spider], str]]:
+     @staticmethod
+     def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
          """标准化输入为列表"""
          if isinstance(spiders_input, (type, str)):
              return [spiders_input]
          elif isinstance(spiders_input, (list, tuple)):
              return list(spiders_input)
          else:
              raise TypeError("spiders 必须是爬虫类、name 字符串,或它们的列表/元组")

      def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
          """解析单个输入项为爬虫类"""
          if isinstance(item, type) and issubclass(item, Spider):
              return item
          elif isinstance(item, str):
              spider_cls = self._spider_registry.get(item)
              if not spider_cls:
                  raise ValueError(f"未找到名为 '{item}' 的爬虫。")
              return spider_cls
          else:
              raise TypeError(f"无效类型 {type(item)}。必须是 Spider 类或字符串 name。")

      async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
          """受信号量限制的爬虫运行函数"""
          task = asyncio.current_task()
          self._active_tasks.add(task)
          try:
              await self.semaphore.acquire()
              logger.info(f"[{seq}/{total}] 启动爬虫: {spider_cls.__name__}")
              crawler = Crawler(spider_cls, self.settings)
              self.crawlers.add(crawler)
              await crawler.crawl()
              logger.info(f"[{seq}/{total}] 爬虫完成: {spider_cls.__name__}")
          except Exception as e:
              logger.error(f"爬虫 {spider_cls.__name__} 执行失败: {e}", exc_info=True)
              raise
          finally:
              if task in self._active_tasks:
                  self._active_tasks.remove(task)
              self.semaphore.release()

      def _shutdown(self, _signum, _frame):
          """优雅关闭信号处理"""
          logger.warning("收到关闭信号,正在停止所有爬虫...")
          for crawler in list(self.crawlers):
              if crawler.engine:
                  crawler.engine.running = False
                  crawler.engine.normal = False
          asyncio.create_task(self._wait_for_shutdown())

      async def _wait_for_shutdown(self):
          """等待所有活跃任务完成"""
          pending = [t for t in self._active_tasks if not t.done()]
          if pending:
              logger.info(f"等待 {len(pending)} 个活跃任务完成...")
              await asyncio.gather(*pending, return_exceptions=True)
          logger.info("所有爬虫已优雅关闭")

      @classmethod
      def _get_default_settings(cls) -> SettingManager:
          """加载默认配置"""
          try:
              return get_settings()
          except Exception as e:
              logger.warning(f"无法加载默认配置: {e}")
              return SettingManager()

  # #!/usr/bin/python
  # # -*- coding: UTF-8 -*-
  # import asyncio
  # import signal
  # from typing import Type, Optional, Set, List
  #
  # from crawlo.spider import Spider
  # from crawlo.core.engine import Engine
  # from crawlo.utils.log import get_logger
  # from crawlo.subscriber import Subscriber
  # from crawlo.extension import ExtensionManager
  # from crawlo.exceptions import SpiderTypeError
  # from crawlo.stats_collector import StatsCollector
  # from crawlo.event import spider_opened, spider_closed
  # from crawlo.settings.setting_manager import SettingManager
  # from crawlo.utils.project import merge_settings, get_settings
  #
  #
  # logger = get_logger(__name__)
  #
  #
  # class Crawler:
  #     """单个爬虫运行实例,绑定 Spider 与引擎"""
  #
  #     def __init__(self, spider_cls: Type[Spider], settings: SettingManager):
  #         self.spider_cls = spider_cls
  #         self.spider: Optional[Spider] = None
  #         self.engine: Optional[Engine] = None
  #         self.stats: Optional[StatsCollector] = None
  #         self.subscriber: Optional[Subscriber] = None
  #         self.extension: Optional[ExtensionManager] = None
  #         self.settings: SettingManager = settings.copy()
  #         self._closed = False  # 新增状态
  #         self._close_lock = asyncio.Lock()
  #
  #     async def crawl(self):
  #         """启动爬虫核心流程"""
  #         self.subscriber = self._create_subscriber()
  #         self.spider = self._create_spider()
  #         self.engine = self._create_engine()
  #         self.stats = self._create_stats()
  #         self.extension = self._create_extension()
  #
  #         await self.engine.start_spider(self.spider)
  #
  #     @staticmethod
  #     def _create_subscriber() -> Subscriber:
  #         return Subscriber()
  #
  #     def _create_spider(self) -> Spider:
  #         spider = self.spider_cls.create_instance(self)
  #
  #         # --- 关键属性检查 ---
  #         if not getattr(spider, 'name', None):
  #             raise AttributeError(f"爬虫类 '{self.spider_cls.__name__}' 必须定义 'name' 属性。")
  #
  #         if not callable(getattr(spider, 'start_requests', None)):
  #             raise AttributeError(f"爬虫 '{spider.name}' 必须实现可调用的 'start_requests' 方法。")
  #
  #         start_urls = getattr(spider, 'start_urls', [])
  #         if isinstance(start_urls, str):
  #             raise TypeError(f"爬虫 '{spider.name}' 'start_urls' 必须是列表或元组,不能是字符串。")
  #
  #         if not callable(getattr(spider, 'parse', None)):
  #             logger.warning(
  #                 f"爬虫 '{spider.name}' 未定义 'parse' 方法。请确保所有 Request 都指定了回调函数,否则响应将被忽略。")
  #
  #         self._set_spider(spider)
  #         return spider
  #
  #     def _create_engine(self) -> Engine:
  #         engine = Engine(self)
  #         engine.engine_start()
  #         return engine
  #
  #     def _create_stats(self) -> StatsCollector:
  #         return StatsCollector(self)
  #
  #     def _create_extension(self) -> ExtensionManager:
  #         return ExtensionManager.create_instance(self)
  #
  #     def _set_spider(self, spider: Spider):
  #         self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
  #         self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
  #         merge_settings(spider, self.settings)
  #
  #     async def close(self, reason='finished') -> None:
  #         async with self._close_lock:
  #             if self._closed:
  #                 return
  #             self._closed = True
  #             await self.subscriber.notify(spider_closed)
  #             if self.stats and self.spider:
  #                 self.stats.close_spider(spider=self.spider, reason=reason)
  #
  #
  # class CrawlerProcess:
  #     """
  #     爬虫进程管理器,支持多爬虫并发调度、信号量控制、实时日志与优雅关闭
  #     """
  #
  #     def __init__(self, settings: Optional[SettingManager] = None, max_concurrency: Optional[int] = None):
  #         self.settings: SettingManager = settings or self._get_default_settings()
  #         self.crawlers: Set[Crawler] = set()
  #         self._active_tasks: Set[asyncio.Task] = set()
  #
  #         # 使用专用配置,降级使用 CONCURRENCY
  #         self.max_concurrency: int = (
  #                 max_concurrency
  #                 or self.settings.get('MAX_RUNNING_SPIDERS')
  #                 or self.settings.get('CONCURRENCY', 3)
  #         )
  #         self.semaphore = asyncio.Semaphore(self.max_concurrency)
  #
  #         # 注册信号量
  #         signal.signal(signal.SIGINT, self._shutdown)
  #         signal.signal(signal.SIGTERM, self._shutdown)
  #         logger.info(f"CrawlerProcess 初始化完成,最大并行爬虫数: {self.max_concurrency}")
  #
  #     async def crawl(self, spiders):
  #         """
  #         启动一个或多个爬虫,流式调度,支持实时进度反馈
  #         """
  #         spider_classes = self._normalize_spiders(spiders)
  #         total = len(spider_classes)
  #
  #         if total == 0:
  #             raise ValueError("至少需要提供一个爬虫类")
  #
  #         # 按名称排序
  #         spider_classes.sort(key=lambda cls: cls.__name__.lower())
  #
  #         logger.info(f"启动 {total} 个爬虫.")
  #
  #         # 流式启动所有爬虫任务
  #         tasks = [
  #             asyncio.create_task(self._run_spider_with_limit(spider_cls, index + 1, total))
  #             for index, spider_cls in enumerate(spider_classes)
  #         ]
  #
  #         # 等待所有任务完成(失败不中断)
  #         results = await asyncio.gather(*tasks, return_exceptions=True)
  #
  #         # 统计异常
  #         failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
  #         if failed:
  #             logger.error(f"共 {len(failed)} 个爬虫执行异常: {[spider_classes[i].__name__ for i in failed]}")
  #
  #     @staticmethod
  #     def _normalize_spiders(spiders) -> List[Type[Spider]]:
  #         """标准化输入为爬虫类列表"""
  #         if isinstance(spiders, type) and issubclass(spiders, Spider):
  #             return [spiders]
  #         elif isinstance(spiders, (list, tuple)):
  #             return list(spiders)
  #         else:
  #             raise TypeError("spiders 必须是爬虫类或爬虫类列表/元组")
  #
  #     async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
  #         """
  #         受信号量限制的爬虫运行函数,带进度日志
  #         """
  #         task = asyncio.current_task()
  #         self._active_tasks.add(task)
  #
  #         try:
  #             # 获取并发许可
  #             await self.semaphore.acquire()
  #
  #             start_msg = f"[{seq}/{total}] 启动爬虫: {spider_cls.__name__}"
  #             logger.info(start_msg)
  #
  #             # 创建并运行爬虫
  #             crawler = self._create_crawler(spider_cls)
  #             self.crawlers.add(crawler)
  #             await crawler.crawl()
  #
  #             end_msg = f"[{seq}/{total}] 爬虫完成: {spider_cls.__name__}"
  #             logger.info(end_msg)
  #
  #         except Exception as e:
  #             logger.error(f"爬虫 {spider_cls.__name__} 执行失败: {e}", exc_info=True)
  #             raise
  #         finally:
  #             if task in self._active_tasks:
  #                 self._active_tasks.remove(task)
  #             self.semaphore.release()  # 必须释放
  #
  #     def _create_crawler(self, spider_cls: Type[Spider]) -> Crawler:
  #         """创建爬虫实例"""
  #         if isinstance(spider_cls, str):
  #             raise SpiderTypeError(f"不支持字符串形式的爬虫: {spider_cls}")
  #         return Crawler(spider_cls, self.settings)
  #
  #     def _shutdown(self, _signum, _frame):
  #         """优雅关闭信号处理"""
  #         logger.warning("收到关闭信号,正在停止所有爬虫...")
  #         for crawler in list(self.crawlers):
  #             if crawler.engine:
  #                 crawler.engine.running = False
  #                 crawler.engine.normal = False
  #         asyncio.create_task(self._wait_for_shutdown())
  #
  #     async def _wait_for_shutdown(self):
  #         """等待所有活跃任务完成"""
  #         pending = [t for t in self._active_tasks if not t.done()]
  #         if pending:
  #             logger.info(f"等待 {len(pending)} 个活跃任务完成...")
  #             await asyncio.gather(*pending, return_exceptions=True)
  #         logger.info("所有爬虫已优雅关闭")
  #
  #     @classmethod
  #     def _get_default_settings(cls) -> SettingManager:
  #         """加载默认配置"""
  #         try:
  #             return get_settings()
  #         except Exception as e:
  #             logger.warning(f"无法加载默认配置: {e}")
  #             return SettingManager()
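Aside from the relocated settings-helpers import, the only changes to crawler.py in this release are auto_discover and _normalize_inputs becoming static methods; the behaviour of CrawlerProcess shown above (package auto-discovery via spider_modules, starting spiders by registered name or by class, parallelism capped by MAX_RUNNING_SPIDERS/CONCURRENCY, SIGINT/SIGTERM shutdown) is otherwise unchanged. A minimal usage sketch based only on the file above; the package myproject.spiders and the spider name "example" are placeholders, not part of crawlo:

```python
import asyncio

from crawlo.crawler import CrawlerProcess


async def main():
    # spider_modules is handed to auto_discover(), which imports the listed
    # packages (recursively, via pkgutil.walk_packages) so that Spider
    # subclasses register themselves in the global registry by `name`.
    process = CrawlerProcess(
        spider_modules=["myproject.spiders"],  # placeholder project package
        max_concurrency=2,                     # else MAX_RUNNING_SPIDERS / CONCURRENCY (default 3)
    )

    print(process.get_spider_names())          # names captured in the registry snapshot

    # crawl() accepts a Spider class, a registered name, or a mixed list of both;
    # unknown names raise ValueError, and duplicate names in one call are rejected.
    await process.crawl("example")             # placeholder spider name


if __name__ == "__main__":
    # The constructor installs SIGINT/SIGTERM handlers via signal.signal(),
    # which is why it is created in the main thread here.
    asyncio.run(main())
```

Since the constructor and crawl() signatures are unchanged in this diff, the same sketch should also apply to 1.0.9; only the internal import path and the two @staticmethod annotations differ.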