crawlo-1.2.2-py3-none-any.whl → crawlo-1.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (222)
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +81 -81
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +144 -142
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +323 -292
  14. crawlo/commands/startproject.py +420 -418
  15. crawlo/commands/stats.py +188 -188
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +251 -252
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -354
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -143
  23. crawlo/crawler.py +1110 -1027
  24. crawlo/data/__init__.py +6 -0
  25. crawlo/data/user_agents.py +108 -0
  26. crawlo/downloader/__init__.py +266 -266
  27. crawlo/downloader/aiohttp_downloader.py +220 -220
  28. crawlo/downloader/cffi_downloader.py +256 -256
  29. crawlo/downloader/httpx_downloader.py +259 -259
  30. crawlo/downloader/hybrid_downloader.py +212 -213
  31. crawlo/downloader/playwright_downloader.py +402 -402
  32. crawlo/downloader/selenium_downloader.py +472 -472
  33. crawlo/event.py +11 -11
  34. crawlo/exceptions.py +81 -81
  35. crawlo/extension/__init__.py +37 -37
  36. crawlo/extension/health_check.py +141 -141
  37. crawlo/extension/log_interval.py +57 -57
  38. crawlo/extension/log_stats.py +81 -81
  39. crawlo/extension/logging_extension.py +43 -43
  40. crawlo/extension/memory_monitor.py +104 -104
  41. crawlo/extension/performance_profiler.py +133 -133
  42. crawlo/extension/request_recorder.py +107 -107
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +280 -280
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/items/__init__.py +23 -23
  47. crawlo/items/base.py +21 -21
  48. crawlo/items/fields.py +52 -53
  49. crawlo/items/items.py +104 -104
  50. crawlo/middleware/__init__.py +21 -21
  51. crawlo/middleware/default_header.py +131 -131
  52. crawlo/middleware/download_delay.py +104 -104
  53. crawlo/middleware/middleware_manager.py +135 -135
  54. crawlo/middleware/offsite.py +114 -115
  55. crawlo/middleware/proxy.py +367 -366
  56. crawlo/middleware/request_ignore.py +86 -87
  57. crawlo/middleware/response_code.py +163 -164
  58. crawlo/middleware/response_filter.py +136 -137
  59. crawlo/middleware/retry.py +124 -124
  60. crawlo/mode_manager.py +211 -211
  61. crawlo/network/__init__.py +21 -21
  62. crawlo/network/request.py +338 -338
  63. crawlo/network/response.py +359 -359
  64. crawlo/pipelines/__init__.py +21 -21
  65. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  66. crawlo/pipelines/console_pipeline.py +39 -39
  67. crawlo/pipelines/csv_pipeline.py +316 -316
  68. crawlo/pipelines/database_dedup_pipeline.py +222 -224
  69. crawlo/pipelines/json_pipeline.py +218 -218
  70. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  71. crawlo/pipelines/mongo_pipeline.py +131 -131
  72. crawlo/pipelines/mysql_pipeline.py +317 -316
  73. crawlo/pipelines/pipeline_manager.py +61 -61
  74. crawlo/pipelines/redis_dedup_pipeline.py +165 -167
  75. crawlo/project.py +279 -187
  76. crawlo/queue/pqueue.py +37 -37
  77. crawlo/queue/queue_manager.py +337 -337
  78. crawlo/queue/redis_priority_queue.py +298 -298
  79. crawlo/settings/__init__.py +7 -7
  80. crawlo/settings/default_settings.py +217 -226
  81. crawlo/settings/setting_manager.py +122 -122
  82. crawlo/spider/__init__.py +639 -639
  83. crawlo/stats_collector.py +59 -59
  84. crawlo/subscriber.py +129 -130
  85. crawlo/task_manager.py +30 -30
  86. crawlo/templates/crawlo.cfg.tmpl +10 -10
  87. crawlo/templates/project/__init__.py.tmpl +3 -3
  88. crawlo/templates/project/items.py.tmpl +17 -17
  89. crawlo/templates/project/middlewares.py.tmpl +118 -118
  90. crawlo/templates/project/pipelines.py.tmpl +96 -96
  91. crawlo/templates/project/run.py.tmpl +47 -45
  92. crawlo/templates/project/settings.py.tmpl +350 -327
  93. crawlo/templates/project/settings_distributed.py.tmpl +160 -119
  94. crawlo/templates/project/settings_gentle.py.tmpl +133 -94
  95. crawlo/templates/project/settings_high_performance.py.tmpl +155 -151
  96. crawlo/templates/project/settings_simple.py.tmpl +108 -68
  97. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  98. crawlo/templates/spider/spider.py.tmpl +143 -143
  99. crawlo/tools/__init__.py +182 -182
  100. crawlo/tools/anti_crawler.py +268 -268
  101. crawlo/tools/authenticated_proxy.py +240 -240
  102. crawlo/tools/data_validator.py +180 -180
  103. crawlo/tools/date_tools.py +35 -35
  104. crawlo/tools/distributed_coordinator.py +386 -386
  105. crawlo/tools/retry_mechanism.py +220 -220
  106. crawlo/tools/scenario_adapter.py +262 -262
  107. crawlo/utils/__init__.py +35 -35
  108. crawlo/utils/batch_processor.py +259 -260
  109. crawlo/utils/controlled_spider_mixin.py +439 -439
  110. crawlo/utils/date_tools.py +290 -290
  111. crawlo/utils/db_helper.py +343 -343
  112. crawlo/utils/enhanced_error_handler.py +356 -359
  113. crawlo/utils/env_config.py +105 -105
  114. crawlo/utils/error_handler.py +123 -125
  115. crawlo/utils/func_tools.py +82 -82
  116. crawlo/utils/large_scale_config.py +286 -286
  117. crawlo/utils/large_scale_helper.py +344 -343
  118. crawlo/utils/log.py +128 -128
  119. crawlo/utils/performance_monitor.py +285 -284
  120. crawlo/utils/queue_helper.py +175 -175
  121. crawlo/utils/redis_connection_pool.py +334 -334
  122. crawlo/utils/redis_key_validator.py +198 -199
  123. crawlo/utils/request.py +267 -267
  124. crawlo/utils/request_serializer.py +218 -219
  125. crawlo/utils/spider_loader.py +61 -62
  126. crawlo/utils/system.py +11 -11
  127. crawlo/utils/tools.py +4 -4
  128. crawlo/utils/url.py +39 -39
  129. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/METADATA +764 -692
  130. crawlo-1.2.4.dist-info/RECORD +206 -0
  131. examples/__init__.py +7 -7
  132. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  133. tests/__init__.py +7 -7
  134. tests/advanced_tools_example.py +275 -275
  135. tests/authenticated_proxy_example.py +236 -236
  136. tests/cleaners_example.py +160 -160
  137. tests/config_validation_demo.py +102 -102
  138. tests/controlled_spider_example.py +205 -205
  139. tests/date_tools_example.py +180 -180
  140. tests/dynamic_loading_example.py +523 -523
  141. tests/dynamic_loading_test.py +104 -104
  142. tests/env_config_example.py +133 -133
  143. tests/error_handling_example.py +171 -171
  144. tests/redis_key_validation_demo.py +130 -130
  145. tests/response_improvements_example.py +144 -144
  146. tests/test_advanced_tools.py +148 -148
  147. tests/test_all_redis_key_configs.py +145 -145
  148. tests/test_authenticated_proxy.py +141 -141
  149. tests/test_cleaners.py +54 -54
  150. tests/test_comprehensive.py +146 -146
  151. tests/test_config_validator.py +193 -193
  152. tests/test_crawlo_proxy_integration.py +172 -172
  153. tests/test_date_tools.py +123 -123
  154. tests/test_default_header_middleware.py +158 -158
  155. tests/test_double_crawlo_fix.py +207 -207
  156. tests/test_double_crawlo_fix_simple.py +124 -124
  157. tests/test_download_delay_middleware.py +221 -221
  158. tests/test_downloader_proxy_compatibility.py +268 -268
  159. tests/test_dynamic_downloaders_proxy.py +124 -124
  160. tests/test_dynamic_proxy.py +92 -92
  161. tests/test_dynamic_proxy_config.py +146 -146
  162. tests/test_dynamic_proxy_real.py +109 -109
  163. tests/test_edge_cases.py +303 -303
  164. tests/test_enhanced_error_handler.py +270 -270
  165. tests/test_env_config.py +121 -121
  166. tests/test_error_handler_compatibility.py +112 -112
  167. tests/test_final_validation.py +153 -153
  168. tests/test_framework_env_usage.py +103 -103
  169. tests/test_integration.py +356 -356
  170. tests/test_item_dedup_redis_key.py +122 -122
  171. tests/test_offsite_middleware.py +221 -221
  172. tests/test_parsel.py +29 -29
  173. tests/test_performance.py +327 -327
  174. tests/test_proxy_api.py +264 -264
  175. tests/test_proxy_health_check.py +32 -32
  176. tests/test_proxy_middleware.py +121 -121
  177. tests/test_proxy_middleware_enhanced.py +216 -216
  178. tests/test_proxy_middleware_integration.py +136 -136
  179. tests/test_proxy_providers.py +56 -56
  180. tests/test_proxy_stats.py +19 -19
  181. tests/test_proxy_strategies.py +59 -59
  182. tests/test_queue_manager_double_crawlo.py +173 -173
  183. tests/test_queue_manager_redis_key.py +176 -176
  184. tests/test_real_scenario_proxy.py +195 -195
  185. tests/test_redis_config.py +28 -28
  186. tests/test_redis_connection_pool.py +294 -294
  187. tests/test_redis_key_naming.py +181 -181
  188. tests/test_redis_key_validator.py +123 -123
  189. tests/test_redis_queue.py +224 -224
  190. tests/test_request_ignore_middleware.py +182 -182
  191. tests/test_request_serialization.py +70 -70
  192. tests/test_response_code_middleware.py +349 -349
  193. tests/test_response_filter_middleware.py +427 -427
  194. tests/test_response_improvements.py +152 -152
  195. tests/test_retry_middleware.py +241 -241
  196. tests/test_scheduler.py +241 -241
  197. tests/test_simple_response.py +61 -61
  198. tests/test_telecom_spider_redis_key.py +205 -205
  199. tests/test_template_content.py +87 -87
  200. tests/test_template_redis_key.py +134 -134
  201. tests/test_tools.py +153 -153
  202. tests/tools_example.py +257 -257
  203. crawlo-1.2.2.dist-info/RECORD +0 -220
  204. examples/aiohttp_settings.py +0 -42
  205. examples/curl_cffi_settings.py +0 -41
  206. examples/default_header_middleware_example.py +0 -107
  207. examples/default_header_spider_example.py +0 -129
  208. examples/download_delay_middleware_example.py +0 -160
  209. examples/httpx_settings.py +0 -42
  210. examples/multi_downloader_proxy_example.py +0 -81
  211. examples/offsite_middleware_example.py +0 -55
  212. examples/offsite_spider_example.py +0 -107
  213. examples/proxy_spider_example.py +0 -166
  214. examples/request_ignore_middleware_example.py +0 -51
  215. examples/request_ignore_spider_example.py +0 -99
  216. examples/response_code_middleware_example.py +0 -52
  217. examples/response_filter_middleware_example.py +0 -67
  218. examples/tong_hua_shun_settings.py +0 -62
  219. examples/tong_hua_shun_spider.py +0 -170
  220. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/WHEEL +0 -0
  221. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/entry_points.txt +0 -0
  222. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/top_level.txt +0 -0
crawlo/spider/__init__.py CHANGED
@@ -1,639 +1,639 @@
#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Crawlo Spider Module
==================
Provides the spider base class and related functionality.

Core features:
- Spider base class: the foundation class for all spiders
- Automatic registration: spiders register themselves via a metaclass
- Configuration management: custom settings with chainable calls
- Lifecycle management: open/close hook methods
- Distributed support: automatic run-mode detection

Usage example:
    class MySpider(Spider):
        name = 'my_spider'
        start_urls = ['http://example.com']

        # Custom configuration
        custom_settings = {
            'DOWNLOADER_TYPE': 'httpx',
            'CONCURRENCY': 10
        }

        def parse(self, response):
            # Parsing logic
            yield Item(data=response.json())
"""
from __future__ import annotations
from typing import Type, Any, Optional, List, Dict, Union, Iterator, AsyncIterator
from ..network.request import Request
from ..utils.log import get_logger


# Global spider registry
_DEFAULT_SPIDER_REGISTRY: dict[str, Type[Spider]] = {}


class SpiderMeta(type):
    """
    Spider metaclass that provides automatic registration.

    Responsibilities:
    - Automatically register spiders in the global registry
    - Validate that spider names are unique
    - Provide complete error messages
    """

    def __new__(mcs, name: str, bases: tuple[type], namespace: dict[str, Any], **kwargs):
        cls = super().__new__(mcs, name, bases, namespace)

        # Check whether this is a Spider subclass
        is_spider_subclass = any(
            base is Spider or (isinstance(base, type) and issubclass(base, Spider))
            for base in bases
        )
        if not is_spider_subclass:
            return cls

        # Validate the spider name
        spider_name = namespace.get('name')
        if not isinstance(spider_name, str):
            raise AttributeError(
                f"Spider class '{cls.__name__}' must define a string 'name' attribute.\n"
                f"Example: name = 'my_spider'"
            )

        # Check name uniqueness
        if spider_name in _DEFAULT_SPIDER_REGISTRY:
            existing_class = _DEFAULT_SPIDER_REGISTRY[spider_name]
            raise ValueError(
                f"Spider name '{spider_name}' is already used by {existing_class.__name__}.\n"
                f"Make sure every spider's name attribute is globally unique.\n"
                f"Suggested format: 'project_module_function'"
            )

        # Register the spider
        _DEFAULT_SPIDER_REGISTRY[spider_name] = cls
        get_logger(__name__).debug(f"Auto-registered spider: {spider_name} -> {cls.__name__}")

        return cls


class Spider(metaclass=SpiderMeta):
    """
    Spider base class - the foundation of every spider implementation.

    Required attributes:
    - name: spider name, must be globally unique

    Optional configuration:
    - start_urls: list of start URLs
    - custom_settings: dictionary of custom settings
    - allowed_domains: list of allowed domains

    Required methods:
    - parse(response): the main response-parsing method

    Optional methods:
    - spider_opened(): called when the spider is opened
    - spider_closed(): called when the spider is closed
    - start_requests(): generates the initial requests (defaults to start_urls)

    Example:
        class MySpider(Spider):
            name = 'example_spider'
            start_urls = ['https://example.com']

            custom_settings = {
                'DOWNLOADER_TYPE': 'httpx',
                'CONCURRENCY': 5,
                'DOWNLOAD_DELAY': 1.0
            }

            def parse(self, response):
                # Extract data
                data = response.css('title::text').get()
                yield {'title': data}

                # Generate new requests
                for link in response.css('a::attr(href)').getall():
                    yield Request(url=link, callback=self.parse_detail)
    """

    # Required attribute
    name: str = None

    # Optional attributes
    start_urls: List[str] = None
    custom_settings: Dict[str, Any] = None
    allowed_domains: List[str] = None

    def __init__(self, name: str = None, **kwargs):
        """
        Initialize the spider instance.

        :param name: spider name (optional; defaults to the class attribute)
        :param kwargs: additional initialization parameters
        """
        # Initialize basic attributes
        if not hasattr(self, 'start_urls') or self.start_urls is None:
            self.start_urls = []
        if not hasattr(self, 'custom_settings') or self.custom_settings is None:
            self.custom_settings = {}
        if not hasattr(self, 'allowed_domains') or self.allowed_domains is None:
            self.allowed_domains = []

        # Set the spider name
        self.name = name or self.name
        if not self.name:
            raise ValueError(f"Spider {self.__class__.__name__} must specify a name attribute")

        # Initialize other attributes
        self.crawler = None
        self.logger = get_logger(self.name)
        self.stats = None

        # Apply extra keyword arguments
        for key, value in kwargs.items():
            setattr(self, key, value)

    @classmethod
    def create_instance(cls, crawler) -> 'Spider':
        """
        Create a spider instance and bind the crawler.

        :param crawler: Crawler instance
        :return: spider instance
        """
        spider = cls()
        spider.crawler = crawler
        spider.stats = getattr(crawler, 'stats', None)

        # Merge custom settings
        if hasattr(spider, 'custom_settings') and spider.custom_settings:
            for key, value in spider.custom_settings.items():
                crawler.settings.set(key, value)
                spider.logger.debug(f"Applied custom setting: {key} = {value}")

        return spider

    def start_requests(self) -> Iterator[Request]:
        """
        Generate the initial requests.

        Default behavior:
        - Generate requests from start_urls
        - Detect distributed mode to decide whether to deduplicate
        - Support a single start_url attribute (for compatibility)
        - Support batched generation for large URL sets

        :return: iterator of Request objects
        """
        # Detect whether we are running in distributed mode
        is_distributed = self._is_distributed_mode()

        # Get the batch-processing configuration
        batch_size = self._get_batch_size()

        # Generate requests from start_urls
        if self.start_urls:
            generated_count = 0
            for url in self.start_urls:
                if self._is_allowed_domain(url):
                    yield Request(
                        url=url,
                        callback=self.parse,
                        dont_filter=not is_distributed,
                        meta={'spider_name': self.name}
                    )
                    generated_count += 1

                    # Batch control for large URL sets
                    if batch_size > 0 and generated_count % batch_size == 0:
                        self.logger.debug(f"Generated {generated_count} requests (batch size: {batch_size})")
                else:
                    self.logger.warning(f"Skipping disallowed domain: {url}")

        # Compatibility with a single start_url attribute
        elif hasattr(self, 'start_url') and isinstance(getattr(self, 'start_url'), str):
            url = getattr(self, 'start_url')
            if self._is_allowed_domain(url):
                yield Request(
                    url=url,
                    callback=self.parse,
                    dont_filter=not is_distributed,
                    meta={'spider_name': self.name}
                )
            else:
                self.logger.warning(f"Skipping disallowed domain: {url}")

        else:
            self.logger.warning(
                f"Spider {self.name} defines neither start_urls nor start_url.\n"
                f"Define them on the spider class or override start_requests()."
            )

    def _get_batch_size(self) -> int:
        """
        Get the batch-size configuration.

        Used as a performance optimization for large URL sets.

        :return: batch size (0 means unlimited)
        """
        if not self.crawler:
            return 0

        # Read the batch size from the settings
        batch_size = self.crawler.settings.get_int('SPIDER_BATCH_SIZE', 0)

        # If start_urls exceeds a threshold, enable batch mode automatically
        if batch_size == 0 and self.start_urls and len(self.start_urls) > 1000:
            batch_size = 500  # default batch size
            self.logger.info(f"Large start_urls detected ({len(self.start_urls)}); enabling batch mode (batch size: {batch_size})")

        return batch_size

    def _is_distributed_mode(self) -> bool:
        """
        Detect whether the spider is running in distributed mode.

        Detection criteria:
        - QUEUE_TYPE = 'redis'
        - FILTER_CLASS contains 'aioredis_filter'
        - RUN_MODE = 'distributed'

        :return: True if running in distributed mode
        """
        if not self.crawler:
            return False

        settings = self.crawler.settings

        # Check several conditions to decide whether this is distributed mode
        queue_type = settings.get('QUEUE_TYPE', 'memory')
        filter_class = settings.get('FILTER_CLASS', '')
        run_mode = settings.get('RUN_MODE', 'standalone')

        # Indicators of distributed mode
        is_redis_queue = queue_type == 'redis'
        is_redis_filter = 'aioredis_filter' in filter_class.lower()
        is_distributed_run_mode = run_mode == 'distributed'

        distributed = is_redis_queue or is_redis_filter or is_distributed_run_mode

        if distributed:
            self.logger.debug("Distributed mode detected; request deduplication enabled")
        else:
            self.logger.debug("Standalone mode detected; request deduplication disabled")

        return distributed

    def _is_allowed_domain(self, url: str) -> bool:
        """
        Check whether the URL belongs to an allowed domain.

        :param url: URL to check
        :return: True if allowed
        """
        if not self.allowed_domains:
            return True

        from urllib.parse import urlparse
        try:
            domain = urlparse(url).netloc.lower()
            return any(
                domain == allowed.lower() or domain.endswith('.' + allowed.lower())
                for allowed in self.allowed_domains
            )
        except Exception as e:
            self.logger.warning(f"Failed to parse URL: {url} - {e}")
            return False

    def parse(self, response):
        """
        Main response-parsing method (must be implemented).

        :param response: response object
        :return: generated Item or Request objects
        """
        raise NotImplementedError(
            f"Spider {self.__class__.__name__} must implement the parse() method\n"
            f"Example:\n"
            f"def parse(self, response):\n"
            f"    # Extract data\n"
            f"    yield {{'title': response.css('title::text').get()}}\n"
            f"    # Generate new requests\n"
            f"    for link in response.css('a::attr(href)').getall():\n"
            f"        yield Request(url=link)"
        )

    async def spider_opened(self):
        """
        Hook called when the spider is opened.

        Typical uses:
        - Initialize resources
        - Connect to databases
        - Set initial state
        """
        self.logger.info(f"Spider {self.name} opened")

    async def spider_closed(self):
        """
        Hook called when the spider is closed.

        Typical uses:
        - Clean up resources
        - Close database connections
        - Emit statistics
        """
        if self.stats:
            stats_summary = {
                'total_requests': self.stats.get('total_requests', 0),
                'total_items': self.stats.get('total_items', 0),
                'success_rate': self.stats.get('success_rate', 'N/A')
            }
            self.logger.info(f"Spider {self.name} closed, stats: {stats_summary}")
        else:
            self.logger.info(f"Spider {self.name} closed")

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name='{self.name}')"

    def __repr__(self) -> str:
        return self.__str__()

    def set_custom_setting(self, key: str, value: Any) -> 'Spider':
        """
        Set a custom setting (chainable).

        :param key: setting key
        :param value: setting value
        :return: self (supports chaining)

        Example:
            spider.set_custom_setting('CONCURRENCY', 10)\
                  .set_custom_setting('DOWNLOAD_DELAY', 1.0)
        """
        if not hasattr(self, 'custom_settings') or self.custom_settings is None:
            self.custom_settings = {}

        self.custom_settings[key] = value
        self.logger.debug(f"Set custom setting: {key} = {value}")

        # If a crawler is already bound, apply the setting immediately
        if self.crawler:
            self.crawler.settings.set(key, value)

        return self

    def get_custom_setting(self, key: str, default: Any = None) -> Any:
        """
        Get a custom setting value.

        :param key: setting key
        :param default: default value
        :return: setting value
        """
        if hasattr(self, 'custom_settings') and self.custom_settings:
            return self.custom_settings.get(key, default)
        return default

    def get_spider_info(self) -> Dict[str, Any]:
        """
        Get detailed information about the spider.

        :return: dictionary of spider information
        """
        info = {
            'name': self.name,
            'class_name': self.__class__.__name__,
            'module': self.__module__,
            'start_urls_count': len(self.start_urls) if self.start_urls else 0,
            'allowed_domains_count': len(self.allowed_domains) if self.allowed_domains else 0,
            'custom_settings_count': len(self.custom_settings) if self.custom_settings else 0,
            'is_distributed': self._is_distributed_mode() if self.crawler else None,
            'has_crawler': self.crawler is not None,
            'logger_name': self.logger.name if hasattr(self, 'logger') else None
        }

        # Method checks
        info['methods'] = {
            'has_parse': callable(getattr(self, 'parse', None)),
            'has_spider_opened': callable(getattr(self, 'spider_opened', None)),
            'has_spider_closed': callable(getattr(self, 'spider_closed', None)),
            'has_start_requests': callable(getattr(self, 'start_requests', None))
        }

        return info

    def make_request(self, url: str, callback=None, **kwargs) -> Request:
        """
        Convenience method for creating a Request object.

        :param url: request URL
        :param callback: callback function (defaults to parse)
        :param kwargs: additional Request parameters
        :return: Request object
        """
        return Request(
            url=url,
            callback=callback or self.parse,
            meta={'spider_name': self.name},
            **kwargs
        )


# === Advanced spider feature extensions ===

class SpiderStatsTracker:
    """
    Spider statistics tracker.
    Provides detailed performance monitoring.
    """

    def __init__(self, spider_name: str):
        self.spider_name = spider_name
        self.start_time = None
        self.end_time = None
        self.request_count = 0
        self.response_count = 0
        self.item_count = 0
        self.error_count = 0
        self.domain_stats = {}

    def start_tracking(self):
        """Start tracking."""
        import time
        self.start_time = time.time()

    def stop_tracking(self):
        """Stop tracking."""
        import time
        self.end_time = time.time()

    def record_request(self, url: str):
        """Record a request."""
        self.request_count += 1
        from urllib.parse import urlparse
        domain = urlparse(url).netloc
        self.domain_stats[domain] = self.domain_stats.get(domain, 0) + 1

    def record_response(self):
        """Record a response."""
        self.response_count += 1

    def record_item(self):
        """Record an item."""
        self.item_count += 1

    def record_error(self):
        """Record an error."""
        self.error_count += 1

    def get_summary(self) -> Dict[str, Any]:
        """Get a summary of the statistics."""
        duration = (self.end_time - self.start_time) if (self.start_time and self.end_time) else 0

        return {
            'spider_name': self.spider_name,
            'duration_seconds': round(duration, 2),
            'requests': self.request_count,
            'responses': self.response_count,
            'items': self.item_count,
            'errors': self.error_count,
            'success_rate': round((self.response_count / max(1, self.request_count)) * 100, 2),
            'requests_per_second': round(self.request_count / max(1, duration), 2),
            'top_domains': sorted(
                self.domain_stats.items(),
                key=lambda x: x[1],
                reverse=True
            )[:5]
        }


def create_spider_from_template(name: str, start_urls: List[str], **options) -> Type[Spider]:
    """
    Quickly create a spider class from a template.

    :param name: spider name
    :param start_urls: list of start URLs
    :param options: additional options
    :return: the newly created spider class

    Example:
        MySpider = create_spider_from_template(
            name='quick_spider',
            start_urls=['http://example.com'],
            allowed_domains=['example.com'],
            custom_settings={'CONCURRENCY': 5}
        )
    """

    # Dynamically build the spider class attributes
    class_attrs = {
        'name': name,
        'start_urls': start_urls,
        'allowed_domains': options.get('allowed_domains', []),
        'custom_settings': options.get('custom_settings', {})
    }

    # Add a custom parse method
    if 'parse_function' in options:
        class_attrs['parse'] = options['parse_function']
    else:
        def default_parse(self, response):
            """Default parse method."""
            yield {'url': response.url, 'title': getattr(response, 'title', 'N/A')}
        class_attrs['parse'] = default_parse

    # Build the class name
    class_name = options.get('class_name', f"Generated{name.replace('_', '').title()}Spider")

    # Dynamically create the class
    spider_class = type(class_name, (Spider,), class_attrs)

    get_logger(__name__).info(f"Dynamically created spider class: {class_name} (name='{name}')")

    return spider_class


# === Public read-only interface ===
def get_global_spider_registry() -> dict[str, Type[Spider]]:
    """
    Get a copy of the global spider registry.

    :return: copy of the spider registry
    """
    return _DEFAULT_SPIDER_REGISTRY.copy()


def get_spider_by_name(name: str) -> Optional[Type[Spider]]:
    """
    Get a spider class by name.

    :param name: spider name
    :return: the spider class, or None
    """
    return _DEFAULT_SPIDER_REGISTRY.get(name)


def get_all_spider_classes() -> List[Type[Spider]]:
    """
    Get all registered spider classes.

    :return: list of spider classes
    """
    return list(set(_DEFAULT_SPIDER_REGISTRY.values()))


def get_spider_names() -> List[str]:
    """
    Get all spider names.

    :return: list of spider names
    """
    return list(_DEFAULT_SPIDER_REGISTRY.keys())


def is_spider_registered(name: str) -> bool:
    """
    Check whether a spider is registered.

    :param name: spider name
    :return: True if registered
    """
    return name in _DEFAULT_SPIDER_REGISTRY


def unregister_spider(name: str) -> bool:
    """
    Unregister a spider (intended for tests only).

    :param name: spider name
    :return: True if the spider was unregistered
    """
    if name in _DEFAULT_SPIDER_REGISTRY:
        del _DEFAULT_SPIDER_REGISTRY[name]
        return True
    return False


# Public exports
__all__ = [
    'Spider',
    'SpiderMeta',
    'SpiderStatsTracker',
    'create_spider_from_template',
    'get_global_spider_registry',
    'get_spider_by_name',
    'get_all_spider_classes',
    'get_spider_names',
    'is_spider_registered',
    'unregister_spider'
]
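
For orientation, below is a minimal usage sketch of the public interface shown in the hunk above (Spider, create_spider_from_template, and the registry helpers exported by crawlo/spider/__init__.py). The import path follows from the file location; the spider name, URLs, domains, and setting values are illustrative and not taken from the package.

from crawlo.spider import (
    Spider,
    create_spider_from_template,
    get_spider_names,
    is_spider_registered,
)


class QuotesSpider(Spider):
    # 'name' must be globally unique; SpiderMeta registers the class when it is defined.
    name = 'quotes_spider'
    start_urls = ['https://quotes.toscrape.com']  # illustrative URL
    allowed_domains = ['quotes.toscrape.com']
    custom_settings = {'CONCURRENCY': 5, 'DOWNLOAD_DELAY': 1.0}

    def parse(self, response):
        # Yield an item, then follow links via the make_request() helper.
        yield {'title': response.css('title::text').get()}
        for link in response.css('a::attr(href)').getall():
            yield self.make_request(link)


# create_spider_from_template() builds and registers a Spider subclass at runtime.
QuickSpider = create_spider_from_template(
    name='quick_spider',
    start_urls=['http://example.com'],
    allowed_domains=['example.com'],
    custom_settings={'CONCURRENCY': 2},
)

# Both spiders are now visible through the registry helpers.
assert is_spider_registered('quotes_spider')
assert 'quick_spider' in get_spider_names()

During a crawl, start_requests() sets dont_filter from _is_distributed_mode(), so the same subclass runs unchanged in standalone mode and in a Redis-backed distributed deployment.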