crawlo-1.0.4-py3-none-any.whl → crawlo-1.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (112)
  1. crawlo/__init__.py +25 -9
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +41 -0
  4. crawlo/commands/__init__.py +10 -0
  5. crawlo/commands/genspider.py +111 -0
  6. crawlo/commands/run.py +149 -0
  7. crawlo/commands/startproject.py +101 -0
  8. crawlo/core/__init__.py +2 -2
  9. crawlo/core/engine.py +158 -158
  10. crawlo/core/processor.py +40 -40
  11. crawlo/core/scheduler.py +57 -57
  12. crawlo/crawler.py +219 -242
  13. crawlo/downloader/__init__.py +78 -78
  14. crawlo/downloader/aiohttp_downloader.py +200 -259
  15. crawlo/downloader/cffi_downloader.py +277 -0
  16. crawlo/downloader/httpx_downloader.py +246 -187
  17. crawlo/event.py +11 -11
  18. crawlo/exceptions.py +78 -64
  19. crawlo/extension/__init__.py +31 -31
  20. crawlo/extension/log_interval.py +49 -49
  21. crawlo/extension/log_stats.py +44 -44
  22. crawlo/extension/logging_extension.py +35 -0
  23. crawlo/filters/__init__.py +37 -37
  24. crawlo/filters/aioredis_filter.py +150 -150
  25. crawlo/filters/memory_filter.py +202 -202
  26. crawlo/items/__init__.py +22 -62
  27. crawlo/items/base.py +31 -0
  28. crawlo/items/fields.py +54 -0
  29. crawlo/items/items.py +105 -119
  30. crawlo/middleware/__init__.py +21 -21
  31. crawlo/middleware/default_header.py +32 -32
  32. crawlo/middleware/download_delay.py +28 -28
  33. crawlo/middleware/middleware_manager.py +135 -140
  34. crawlo/middleware/proxy.py +246 -0
  35. crawlo/middleware/request_ignore.py +30 -30
  36. crawlo/middleware/response_code.py +18 -18
  37. crawlo/middleware/response_filter.py +26 -26
  38. crawlo/middleware/retry.py +90 -90
  39. crawlo/network/__init__.py +7 -7
  40. crawlo/network/request.py +203 -204
  41. crawlo/network/response.py +166 -166
  42. crawlo/pipelines/__init__.py +13 -13
  43. crawlo/pipelines/console_pipeline.py +39 -39
  44. crawlo/pipelines/mongo_pipeline.py +116 -116
  45. crawlo/pipelines/mysql_batch_pipline.py +273 -134
  46. crawlo/pipelines/mysql_pipeline.py +195 -195
  47. crawlo/pipelines/pipeline_manager.py +56 -56
  48. crawlo/settings/__init__.py +7 -7
  49. crawlo/settings/default_settings.py +169 -94
  50. crawlo/settings/setting_manager.py +99 -99
  51. crawlo/spider/__init__.py +41 -36
  52. crawlo/stats_collector.py +59 -59
  53. crawlo/subscriber.py +106 -106
  54. crawlo/task_manager.py +27 -27
  55. crawlo/templates/crawlo.cfg.tmpl +11 -0
  56. crawlo/templates/project/__init__.py.tmpl +4 -0
  57. crawlo/templates/project/items.py.tmpl +18 -0
  58. crawlo/templates/project/middlewares.py.tmpl +76 -0
  59. crawlo/templates/project/pipelines.py.tmpl +64 -0
  60. crawlo/templates/project/settings.py.tmpl +54 -0
  61. crawlo/templates/project/spiders/__init__.py.tmpl +6 -0
  62. crawlo/templates/spider/spider.py.tmpl +32 -0
  63. crawlo/utils/__init__.py +7 -7
  64. crawlo/utils/concurrency_manager.py +124 -124
  65. crawlo/utils/date_tools.py +233 -177
  66. crawlo/utils/db_helper.py +344 -0
  67. crawlo/utils/func_tools.py +82 -82
  68. crawlo/utils/log.py +129 -39
  69. crawlo/utils/pqueue.py +173 -173
  70. crawlo/utils/project.py +199 -59
  71. crawlo/utils/request.py +267 -122
  72. crawlo/utils/spider_loader.py +63 -0
  73. crawlo/utils/system.py +11 -11
  74. crawlo/utils/tools.py +5 -303
  75. crawlo/utils/url.py +39 -39
  76. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/METADATA +49 -48
  77. crawlo-1.0.6.dist-info/RECORD +94 -0
  78. crawlo-1.0.6.dist-info/entry_points.txt +2 -0
  79. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/top_level.txt +1 -0
  80. examples/gxb/items.py +36 -0
  81. examples/gxb/run.py +16 -0
  82. examples/gxb/settings.py +72 -0
  83. examples/gxb/spider/__init__.py +0 -0
  84. examples/gxb/spider/miit_spider.py +180 -0
  85. examples/gxb/spider/telecom_device.py +129 -0
  86. tests/__init__.py +7 -7
  87. tests/test_proxy_health_check.py +33 -0
  88. tests/test_proxy_middleware_integration.py +137 -0
  89. tests/test_proxy_providers.py +57 -0
  90. tests/test_proxy_stats.py +20 -0
  91. tests/test_proxy_strategies.py +60 -0
  92. crawlo/downloader/playwright_downloader.py +0 -161
  93. crawlo/templates/item_template.tmpl +0 -22
  94. crawlo/templates/project_template/main.py +0 -33
  95. crawlo/templates/project_template/setting.py +0 -190
  96. crawlo/templates/spider_template.tmpl +0 -31
  97. crawlo-1.0.4.dist-info/RECORD +0 -79
  98. crawlo-1.0.4.dist-info/entry_points.txt +0 -2
  99. tests/baidu_spider/__init__.py +0 -7
  100. tests/baidu_spider/demo.py +0 -94
  101. tests/baidu_spider/items.py +0 -25
  102. tests/baidu_spider/middleware.py +0 -49
  103. tests/baidu_spider/pipeline.py +0 -55
  104. tests/baidu_spider/request_fingerprints.txt +0 -9
  105. tests/baidu_spider/run.py +0 -27
  106. tests/baidu_spider/settings.py +0 -80
  107. tests/baidu_spider/spiders/__init__.py +0 -7
  108. tests/baidu_spider/spiders/bai_du.py +0 -61
  109. tests/baidu_spider/spiders/sina.py +0 -79
  110. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/WHEEL +0 -0
  111. {crawlo/templates/project_template/items → examples}/__init__.py +0 -0
  112. {crawlo/templates/project_template/spiders → examples/gxb}/__init__.py +0 -0
crawlo/crawler.py CHANGED
@@ -1,242 +1,219 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*
3
- import signal
4
- import asyncio
5
- from typing import Type, Final, Set, Optional
6
-
7
- from crawlo.spider import Spider
8
- from crawlo.core.engine import Engine
9
- from crawlo.utils.log import get_logger
10
- from crawlo.subscriber import Subscriber
11
- from crawlo.extension import ExtensionManager
12
- from crawlo.exceptions import SpiderTypeError
13
- from crawlo.stats_collector import StatsCollector
14
- from crawlo.event import spider_opened, spider_closed
15
- from crawlo.settings.setting_manager import SettingManager
16
- from crawlo.utils.project import merge_settings, get_settings
17
- from crawlo.utils.concurrency_manager import calculate_optimal_concurrency
18
-
19
- logger = get_logger(__name__)
20
-
21
-
22
- class Crawler:
23
-
24
- def __init__(self, spider_cls, settings):
25
- self.spider_cls = spider_cls
26
- self.spider: Optional[Spider] = None
27
- self.engine: Optional[Engine] = None
28
- self.stats: Optional[StatsCollector] = None
29
- self.subscriber: Optional[Subscriber] = None
30
- self.extension: Optional[ExtensionManager] = None
31
- self.settings: SettingManager = settings.copy()
32
-
33
- async def crawl(self):
34
- self.subscriber = self._create_subscriber()
35
- self.spider = self._create_spider()
36
- self.engine = self._create_engine()
37
- self.stats = self._create_stats()
38
- self.extension = self._create_extension()
39
-
40
- await self.engine.start_spider(self.spider)
41
-
42
- @staticmethod
43
- def _create_subscriber():
44
- return Subscriber()
45
-
46
- def _create_spider(self) -> Spider:
47
- spider = self.spider_cls.create_instance(self)
48
-
49
- # --- 关键属性检查 ---
50
- # 1. 检查 name
51
- if not getattr(spider, 'name', None):
52
- raise AttributeError(f"Spider class '{self.spider_cls.__name__}' must have a 'name' attribute.")
53
-
54
- # 2. 检查 start_requests 是否可调用
55
- if not callable(getattr(spider, 'start_requests', None)):
56
- raise AttributeError(f"Spider '{spider.name}' must have a callable 'start_requests' method.")
57
-
58
- # 3. 检查 start_urls 类型
59
- start_urls = getattr(spider, 'start_urls', [])
60
- if isinstance(start_urls, str):
61
- raise TypeError(f"'{spider.name}.start_urls' must be a list or tuple, not a string.")
62
-
63
- # --- 日志提示 ---
64
- # 提醒用户定义 parse 方法
65
- if not callable(getattr(spider, 'parse', None)):
66
- logger.warning(f"Spider '{spider.name}' lacks a 'parse' method. Ensure all Requests have callbacks.")
67
-
68
- self._set_spider(spider)
69
- return spider
70
-
71
- def _create_engine(self) -> Engine:
72
- engine = Engine(self)
73
- engine.engine_start()
74
- return engine
75
-
76
- def _create_stats(self) -> StatsCollector:
77
- stats = StatsCollector(self)
78
- return stats
79
-
80
- def _create_extension(self) -> ExtensionManager:
81
- extension = ExtensionManager.create_instance(self)
82
- return extension
83
-
84
- def _set_spider(self, spider):
85
- self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
86
- self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
87
- merge_settings(spider, self.settings)
88
-
89
- async def close(self, reason='finished') -> None:
90
- await asyncio.create_task(self.subscriber.notify(spider_closed))
91
- self.stats.close_spider(spider=self.spider, reason=reason)
92
-
93
-
94
- class CrawlerProcess:
95
- """爬虫处理类,支持跨平台动态并发控制和精细化日志"""
96
-
97
- def __init__(self, settings=None, max_concurrency: Optional[int] = None, batch_size: int = 50):
98
- self.crawlers: Final[Set] = set()
99
- self._active_spiders: Final[Set] = set()
100
- self.settings = settings or self._get_default_settings()
101
- self.batch_size = batch_size
102
-
103
- # 使用独立模块计算最大并发数
104
- self.max_concurrency = calculate_optimal_concurrency(max_concurrency)
105
- self.semaphore = asyncio.Semaphore(self.max_concurrency)
106
-
107
- signal.signal(signal.SIGINT, self._shutdown)
108
- logger.info(f"初始化爬虫处理进程,最大并发数: {self.max_concurrency}")
109
-
110
- async def crawl(self, spiders):
111
- """支持单个或多个爬虫的批量处理,优化日志输出"""
112
- if not spiders:
113
- raise ValueError("至少需要提供一个爬虫类")
114
-
115
- # 统一转换为列表
116
- if isinstance(spiders, type) and issubclass(spiders, Spider):
117
- spiders = [spiders]
118
- elif isinstance(spiders, (list, tuple)):
119
- spiders = list(spiders)
120
- else:
121
- raise TypeError("spiders 必须是爬虫类或爬虫类列表/元组")
122
-
123
- # 按爬虫类名首字母排序(升序)
124
- spiders.sort(key=lambda x: x.__name__.lower())
125
-
126
- if len(spiders) == 1:
127
- logger.info(f"启动爬虫: {spiders[0].__name__}")
128
- else:
129
- logger.info(f"启动{len(spiders)}个爬虫,按名称排序后分批处理中")
130
-
131
- batches = [spiders[i:i + self.batch_size] for i in range(0, len(spiders), self.batch_size)]
132
-
133
- for batch_idx, batch in enumerate(batches):
134
- batch_tasks = set()
135
-
136
- for spider_cls in batch:
137
- crawler = self._create_crawler(spider_cls)
138
- self.crawlers.add(crawler)
139
-
140
- await self.semaphore.acquire()
141
- task = asyncio.create_task(self._run_crawler_with_semaphore(crawler))
142
- batch_tasks.add(task)
143
- self._active_spiders.add(task)
144
-
145
- if len(spiders) > 1: # 仅对多爬虫显示批次信息
146
- logger.info(f"启动第 {batch_idx + 1}/{len(batches)} 批爬虫,共 {len(batch)} 个")
147
-
148
- await asyncio.gather(*batch_tasks)
149
-
150
- if len(spiders) > 1: # 仅对多爬虫显示批次完成信息
151
- logger.info(f"第 {batch_idx + 1} 批爬虫处理完成")
152
-
153
- async def _run_crawler_with_semaphore(self, crawler):
154
- """使用信号量控制的爬虫运行函数"""
155
- try:
156
- await crawler.crawl()
157
- finally:
158
- self.semaphore.release() # 确保资源释放
159
-
160
- async def start(self):
161
- """启动所有爬虫任务"""
162
- if self._active_spiders:
163
- logger.info(f"启动 {len(self._active_spiders)} 个爬虫任务,并发限制: {self.max_concurrency}")
164
- await asyncio.gather(*self._active_spiders)
165
-
166
- def _create_crawler(self, spider_cls) -> Crawler:
167
- """创建爬虫实例"""
168
- if isinstance(spider_cls, str):
169
- raise SpiderTypeError(f"{type(self)}.crawl args: String is not supported.")
170
- crawler: Crawler = Crawler(spider_cls, self.settings)
171
- return crawler
172
-
173
- def _shutdown(self, _signum, _frame):
174
- """优雅关闭所有爬虫"""
175
- logger.warning(f"收到关闭信号,正在优雅关闭 {len(self.crawlers)} 个爬虫...")
176
- for crawler in self.crawlers:
177
- if crawler.engine:
178
- crawler.engine.running = False
179
- crawler.engine.normal = False
180
- crawler.stats.close_spider(crawler.spider, 'shutdown signal')
181
-
182
- # 等待所有任务完成
183
- asyncio.create_task(self._wait_for_tasks())
184
-
185
- async def _wait_for_tasks(self):
186
- """等待所有活跃任务完成"""
187
- pending = [task for task in self._active_spiders if not task.done()]
188
- if pending:
189
- logger.info(f"等待 {len(pending)} 个活跃任务完成...")
190
- await asyncio.gather(*pending)
191
- logger.info("所有爬虫已优雅关闭")
192
-
193
- @classmethod
194
- def _get_default_settings(cls):
195
- """框架自动获取默认配置"""
196
- try:
197
- return get_settings()
198
- except ImportError:
199
- return {}
200
-
201
- # class CrawlerProcess:
202
- #
203
- # def __init__(self, settings=None):
204
- # self.crawlers: Final[Set] = set()
205
- # self._active_spiders: Final[Set] = set()
206
- # self.settings = settings or self._get_default_settings()
207
- #
208
- # signal.signal(signal.SIGINT, self._shutdown)
209
- #
210
- # async def crawl(self, spider: Type[Spider]):
211
- # crawler: Crawler = self._create_crawler(spider)
212
- # self.crawlers.add(crawler)
213
- # task = await self._crawl(crawler)
214
- # self._active_spiders.add(task)
215
- #
216
- # @classmethod
217
- # def _get_default_settings(cls):
218
- # """自动获取默认配置"""
219
- # try:
220
- # return get_settings()
221
- # except ImportError:
222
- # return {}
223
- #
224
- # @staticmethod
225
- # async def _crawl(crawler):
226
- # return asyncio.create_task(crawler.crawl())
227
- #
228
- # async def start(self):
229
- # await asyncio.gather(*self._active_spiders)
230
- #
231
- # def _create_crawler(self, spider_cls) -> Crawler:
232
- # if isinstance(spider_cls, str):
233
- # raise SpiderTypeError(f"{type(self)}.crawl args: String is not supported.")
234
- # crawler: Crawler = Crawler(spider_cls, self.settings)
235
- # return crawler
236
- #
237
- # def _shutdown(self, _signum, _frame):
238
- # for crawler in self.crawlers:
239
- # crawler.engine.running = False
240
- # crawler.engine.normal = False
241
- # crawler.stats.close_spider(crawler.spider, 'Ctrl C')
242
- # logger.warning(f'spiders received: `Ctrl C` signal, closed.')
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ import asyncio
4
+ import signal
5
+ from typing import Type, Optional, Set, List
6
+
7
+ from crawlo.spider import Spider
8
+ from crawlo.core.engine import Engine
9
+ from crawlo.utils.log import get_logger
10
+ from crawlo.subscriber import Subscriber
11
+ from crawlo.extension import ExtensionManager
12
+ from crawlo.exceptions import SpiderTypeError
13
+ from crawlo.stats_collector import StatsCollector
14
+ from crawlo.event import spider_opened, spider_closed
15
+ from crawlo.settings.setting_manager import SettingManager
16
+ from crawlo.utils.project import merge_settings, get_settings
17
+
18
+
19
+ logger = get_logger(__name__)
20
+
21
+
22
+ class Crawler:
23
+ """单个爬虫运行实例,绑定 Spider 与引擎"""
24
+
25
+ def __init__(self, spider_cls: Type[Spider], settings: SettingManager):
26
+ self.spider_cls = spider_cls
27
+ self.spider: Optional[Spider] = None
28
+ self.engine: Optional[Engine] = None
29
+ self.stats: Optional[StatsCollector] = None
30
+ self.subscriber: Optional[Subscriber] = None
31
+ self.extension: Optional[ExtensionManager] = None
32
+ self.settings: SettingManager = settings.copy()
33
+ self._closed = False # 新增状态
34
+ self._close_lock = asyncio.Lock()
35
+
36
+ async def crawl(self):
37
+ """启动爬虫核心流程"""
38
+ self.subscriber = self._create_subscriber()
39
+ self.spider = self._create_spider()
40
+ self.engine = self._create_engine()
41
+ self.stats = self._create_stats()
42
+ self.extension = self._create_extension()
43
+
44
+ await self.engine.start_spider(self.spider)
45
+
46
+ @staticmethod
47
+ def _create_subscriber() -> Subscriber:
48
+ return Subscriber()
49
+
50
+ def _create_spider(self) -> Spider:
51
+ spider = self.spider_cls.create_instance(self)
52
+
53
+ # --- 关键属性检查 ---
54
+ if not getattr(spider, 'name', None):
55
+ raise AttributeError(f"爬虫类 '{self.spider_cls.__name__}' 必须定义 'name' 属性。")
56
+
57
+ if not callable(getattr(spider, 'start_requests', None)):
58
+ raise AttributeError(f"爬虫 '{spider.name}' 必须实现可调用的 'start_requests' 方法。")
59
+
60
+ start_urls = getattr(spider, 'start_urls', [])
61
+ if isinstance(start_urls, str):
62
+ raise TypeError(f"爬虫 '{spider.name}' 的 'start_urls' 必须是列表或元组,不能是字符串。")
63
+
64
+ if not callable(getattr(spider, 'parse', None)):
65
+ logger.warning(
66
+ f"爬虫 '{spider.name}' 未定义 'parse' 方法。请确保所有 Request 都指定了回调函数,否则响应将被忽略。")
67
+
68
+ self._set_spider(spider)
69
+ return spider
70
+
71
+ def _create_engine(self) -> Engine:
72
+ engine = Engine(self)
73
+ engine.engine_start()
74
+ return engine
75
+
76
+ def _create_stats(self) -> StatsCollector:
77
+ return StatsCollector(self)
78
+
79
+ def _create_extension(self) -> ExtensionManager:
80
+ return ExtensionManager.create_instance(self)
81
+
82
+ def _set_spider(self, spider: Spider):
83
+ self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
84
+ self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
85
+ merge_settings(spider, self.settings)
86
+
87
+ async def close(self, reason='finished') -> None:
88
+ async with self._close_lock:
89
+ if self._closed:
90
+ return
91
+ self._closed = True
92
+ await self.subscriber.notify(spider_closed)
93
+ if self.stats and self.spider:
94
+ self.stats.close_spider(spider=self.spider, reason=reason)
95
+
96
+
97
+ class CrawlerProcess:
98
+ """
99
+ 爬虫进程管理器,支持多爬虫并发调度、信号量控制、实时日志与优雅关闭
100
+ """
101
+
102
+ def __init__(self, settings: Optional[SettingManager] = None, max_concurrency: Optional[int] = None):
103
+ self.settings: SettingManager = settings or self._get_default_settings()
104
+ self.crawlers: Set[Crawler] = set()
105
+ self._active_tasks: Set[asyncio.Task] = set()
106
+
107
+ # 使用专用配置,降级使用 CONCURRENCY
108
+ self.max_concurrency: int = (
109
+ max_concurrency
110
+ or self.settings.get('MAX_RUNNING_SPIDERS')
111
+ or self.settings.get('CONCURRENCY', 3)
112
+ )
113
+ self.semaphore = asyncio.Semaphore(self.max_concurrency)
114
+
115
+ # 注册信号量
116
+ signal.signal(signal.SIGINT, self._shutdown)
117
+ signal.signal(signal.SIGTERM, self._shutdown)
118
+ logger.info(f"CrawlerProcess 初始化完成,最大并行爬虫数: {self.max_concurrency}")
119
+
120
+ async def crawl(self, spiders):
121
+ """
122
+ 启动一个或多个爬虫,流式调度,支持实时进度反馈
123
+ """
124
+ spider_classes = self._normalize_spiders(spiders)
125
+ total = len(spider_classes)
126
+
127
+ if total == 0:
128
+ raise ValueError("至少需要提供一个爬虫类")
129
+
130
+ # 按名称排序
131
+ spider_classes.sort(key=lambda cls: cls.__name__.lower())
132
+
133
+ logger.info(f"启动 {total} 个爬虫.")
134
+
135
+ # 流式启动所有爬虫任务
136
+ tasks = [
137
+ asyncio.create_task(self._run_spider_with_limit(spider_cls, index + 1, total))
138
+ for index, spider_cls in enumerate(spider_classes)
139
+ ]
140
+
141
+ # 等待所有任务完成(失败不中断)
142
+ results = await asyncio.gather(*tasks, return_exceptions=True)
143
+
144
+ # 统计异常
145
+ failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
146
+ if failed:
147
+ logger.error(f"共 {len(failed)} 个爬虫执行异常: {[spider_classes[i].__name__ for i in failed]}")
148
+
149
+ @staticmethod
150
+ def _normalize_spiders(spiders) -> List[Type[Spider]]:
151
+ """标准化输入为爬虫类列表"""
152
+ if isinstance(spiders, type) and issubclass(spiders, Spider):
153
+ return [spiders]
154
+ elif isinstance(spiders, (list, tuple)):
155
+ return list(spiders)
156
+ else:
157
+ raise TypeError("spiders 必须是爬虫类或爬虫类列表/元组")
158
+
159
+ async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
160
+ """
161
+ 受信号量限制的爬虫运行函数,带进度日志
162
+ """
163
+ task = asyncio.current_task()
164
+ self._active_tasks.add(task)
165
+
166
+ try:
167
+ # 获取并发许可
168
+ await self.semaphore.acquire()
169
+
170
+ start_msg = f"[{seq}/{total}] 启动爬虫: {spider_cls.__name__}"
171
+ logger.info(start_msg)
172
+
173
+ # 创建并运行爬虫
174
+ crawler = self._create_crawler(spider_cls)
175
+ self.crawlers.add(crawler)
176
+ await crawler.crawl()
177
+
178
+ end_msg = f"[{seq}/{total}] 爬虫完成: {spider_cls.__name__}"
179
+ logger.info(end_msg)
180
+
181
+ except Exception as e:
182
+ logger.error(f"爬虫 {spider_cls.__name__} 执行失败: {e}", exc_info=True)
183
+ raise
184
+ finally:
185
+ if task in self._active_tasks:
186
+ self._active_tasks.remove(task)
187
+ self.semaphore.release() # 必须释放
188
+
189
+ def _create_crawler(self, spider_cls: Type[Spider]) -> Crawler:
190
+ """创建爬虫实例"""
191
+ if isinstance(spider_cls, str):
192
+ raise SpiderTypeError(f"不支持字符串形式的爬虫: {spider_cls}")
193
+ return Crawler(spider_cls, self.settings)
194
+
195
+ def _shutdown(self, _signum, _frame):
196
+ """优雅关闭信号处理"""
197
+ logger.warning("收到关闭信号,正在停止所有爬虫...")
198
+ for crawler in list(self.crawlers):
199
+ if crawler.engine:
200
+ crawler.engine.running = False
201
+ crawler.engine.normal = False
202
+ asyncio.create_task(self._wait_for_shutdown())
203
+
204
+ async def _wait_for_shutdown(self):
205
+ """等待所有活跃任务完成"""
206
+ pending = [t for t in self._active_tasks if not t.done()]
207
+ if pending:
208
+ logger.info(f"等待 {len(pending)} 个活跃任务完成...")
209
+ await asyncio.gather(*pending, return_exceptions=True)
210
+ logger.info("所有爬虫已优雅关闭")
211
+
212
+ @classmethod
213
+ def _get_default_settings(cls) -> SettingManager:
214
+ """加载默认配置"""
215
+ try:
216
+ return get_settings()
217
+ except Exception as e:
218
+ logger.warning(f"无法加载默认配置: {e}")
219
+ return SettingManager()
crawlo/downloader/__init__.py CHANGED
@@ -1,78 +1,78 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- from abc import abstractmethod, ABCMeta
4
- from typing_extensions import Self
5
- from typing import Final, Set, Optional
6
- from contextlib import asynccontextmanager
7
-
8
- from crawlo import Response, Request
9
- from crawlo.utils.log import get_logger
10
- from crawlo.middleware.middleware_manager import MiddlewareManager
11
-
12
-
13
- class ActivateRequestManager:
14
-
15
- def __init__(self):
16
- self._active: Final[Set] = set()
17
-
18
- def add(self, request):
19
- self._active.add(request)
20
-
21
- def remove(self, request):
22
- self._active.remove(request)
23
-
24
- @asynccontextmanager
25
- async def __call__(self, request):
26
- try:
27
- yield self.add(request)
28
- finally:
29
- self.remove(request)
30
-
31
- def __len__(self):
32
- return len(self._active)
33
-
34
-
35
- class DownloaderMeta(ABCMeta):
36
- def __subclasscheck__(self, subclass):
37
- required_methods = ('fetch', 'download', 'create_instance', 'close')
38
- is_subclass = all(
39
- hasattr(subclass, method) and callable(getattr(subclass, method, None)) for method in required_methods
40
- )
41
- return is_subclass
42
-
43
-
44
- class DownloaderBase(metaclass=DownloaderMeta):
45
- def __init__(self, crawler):
46
- self.crawler = crawler
47
- self._active = ActivateRequestManager()
48
- self.middleware: Optional[MiddlewareManager] = None
49
- self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
50
-
51
- @classmethod
52
- def create_instance(cls, *args, **kwargs) -> Self:
53
- return cls(*args, **kwargs)
54
-
55
- def open(self) -> None:
56
- self.logger.info(
57
- f"{self.crawler.spider} <downloader class:{type(self).__name__}>"
58
- f"<concurrency:{self.crawler.settings.get_int('CONCURRENCY')}>"
59
- )
60
- self.middleware = MiddlewareManager.create_instance(self.crawler)
61
-
62
- async def fetch(self, request) -> Optional[Response]:
63
- async with self._active(request):
64
- response = await self.middleware.download(request)
65
- return response
66
-
67
- @abstractmethod
68
- async def download(self, request: Request) -> Response:
69
- pass
70
-
71
- async def close(self) -> None:
72
- pass
73
-
74
- def idle(self) -> bool:
75
- return len(self) == 0
76
-
77
- def __len__(self) -> int:
78
- return len(self._active)
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ from abc import abstractmethod, ABCMeta
4
+ from typing_extensions import Self
5
+ from typing import Final, Set, Optional
6
+ from contextlib import asynccontextmanager
7
+
8
+ from crawlo import Response, Request
9
+ from crawlo.utils.log import get_logger
10
+ from crawlo.middleware.middleware_manager import MiddlewareManager
11
+
12
+
13
+ class ActivateRequestManager:
14
+
15
+ def __init__(self):
16
+ self._active: Final[Set] = set()
17
+
18
+ def add(self, request):
19
+ self._active.add(request)
20
+
21
+ def remove(self, request):
22
+ self._active.remove(request)
23
+
24
+ @asynccontextmanager
25
+ async def __call__(self, request):
26
+ try:
27
+ yield self.add(request)
28
+ finally:
29
+ self.remove(request)
30
+
31
+ def __len__(self):
32
+ return len(self._active)
33
+
34
+
35
+ class DownloaderMeta(ABCMeta):
36
+ def __subclasscheck__(self, subclass):
37
+ required_methods = ('fetch', 'download', 'create_instance', 'close')
38
+ is_subclass = all(
39
+ hasattr(subclass, method) and callable(getattr(subclass, method, None)) for method in required_methods
40
+ )
41
+ return is_subclass
42
+
43
+
44
+ class DownloaderBase(metaclass=DownloaderMeta):
45
+ def __init__(self, crawler):
46
+ self.crawler = crawler
47
+ self._active = ActivateRequestManager()
48
+ self.middleware: Optional[MiddlewareManager] = None
49
+ self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
50
+
51
+ @classmethod
52
+ def create_instance(cls, *args, **kwargs) -> Self:
53
+ return cls(*args, **kwargs)
54
+
55
+ def open(self) -> None:
56
+ self.logger.info(
57
+ f"{self.crawler.spider} <downloader class:{type(self).__name__}>"
58
+ f"<concurrency:{self.crawler.settings.get_int('CONCURRENCY')}>"
59
+ )
60
+ self.middleware = MiddlewareManager.create_instance(self.crawler)
61
+
62
+ async def fetch(self, request) -> Optional[Response]:
63
+ async with self._active(request):
64
+ response = await self.middleware.download(request)
65
+ return response
66
+
67
+ @abstractmethod
68
+ async def download(self, request: Request) -> Response:
69
+ pass
70
+
71
+ async def close(self) -> None:
72
+ pass
73
+
74
+ def idle(self) -> bool:
75
+ return len(self) == 0
76
+
77
+ def __len__(self) -> int:
78
+ return len(self._active)