crawlo 1.4.7-py3-none-any.whl → 1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (348)
  1. crawlo/__init__.py +90 -90
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -140
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -379
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -320
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -451
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -290
  19. crawlo/crawler.py +698 -698
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -280
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -250
  25. crawlo/downloader/httpx_downloader.py +265 -265
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -425
  28. crawlo/downloader/selenium_downloader.py +486 -486
  29. crawlo/event.py +45 -45
  30. crawlo/exceptions.py +214 -214
  31. crawlo/extension/__init__.py +64 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -53
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -104
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +134 -134
  44. crawlo/filters/__init__.py +170 -170
  45. crawlo/filters/aioredis_filter.py +347 -347
  46. crawlo/filters/memory_filter.py +261 -261
  47. crawlo/framework.py +306 -306
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -391
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -240
  52. crawlo/initialization/phases.py +229 -229
  53. crawlo/initialization/registry.py +143 -143
  54. crawlo/initialization/utils.py +48 -48
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -42
  61. crawlo/logging/config.py +280 -276
  62. crawlo/logging/factory.py +175 -175
  63. crawlo/logging/manager.py +104 -104
  64. crawlo/middleware/__init__.py +87 -87
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -287
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +408 -376
  77. crawlo/network/response.py +598 -569
  78. crawlo/pipelines/__init__.py +52 -52
  79. crawlo/pipelines/base_pipeline.py +452 -452
  80. crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +196 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +104 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -139
  87. crawlo/pipelines/mysql_pipeline.py +468 -469
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -155
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +9 -9
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -591
  94. crawlo/queue/redis_priority_queue.py +518 -518
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +287 -284
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +658 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +1 -1
  104. crawlo/templates/project/items.py.tmpl +13 -13
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -35
  107. crawlo/templates/project/settings.py.tmpl +113 -109
  108. crawlo/templates/project/settings_distributed.py.tmpl +160 -156
  109. crawlo/templates/project/settings_gentle.py.tmpl +174 -170
  110. crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
  111. crawlo/templates/project/settings_minimal.py.tmpl +102 -98
  112. crawlo/templates/project/settings_simple.py.tmpl +172 -168
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -32
  116. crawlo/templates/spiders_init.py.tmpl +4 -4
  117. crawlo/tools/__init__.py +86 -86
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +74 -50
  123. crawlo/utils/batch_processor.py +276 -276
  124. crawlo/utils/config_manager.py +442 -442
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/encoding_helper.py +190 -0
  128. crawlo/utils/error_handler.py +410 -410
  129. crawlo/utils/fingerprint.py +121 -121
  130. crawlo/utils/func_tools.py +82 -82
  131. crawlo/utils/large_scale_helper.py +344 -344
  132. crawlo/utils/leak_detector.py +335 -335
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -157
  135. crawlo/utils/mysql_connection_pool.py +197 -197
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +90 -90
  139. crawlo/utils/redis_connection_pool.py +578 -578
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -278
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -337
  144. crawlo/utils/response_helper.py +113 -0
  145. crawlo/utils/selector_helper.py +138 -137
  146. crawlo/utils/singleton.py +69 -69
  147. crawlo/utils/spider_loader.py +201 -201
  148. crawlo/utils/text_helper.py +94 -94
  149. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
  150. crawlo-1.4.8.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -217
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -467
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -72
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  192. tests/ofweek_scrapy/scrapy.cfg +11 -11
  193. tests/optimized_performance_test.py +211 -211
  194. tests/performance_comparison.py +244 -244
  195. tests/queue_blocking_test.py +113 -113
  196. tests/queue_test.py +89 -89
  197. tests/redis_key_validation_demo.py +130 -130
  198. tests/request_params_example.py +150 -150
  199. tests/response_improvements_example.py +144 -144
  200. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  201. tests/scrapy_comparison/scrapy_test.py +133 -133
  202. tests/simple_cli_test.py +54 -54
  203. tests/simple_command_test.py +119 -119
  204. tests/simple_crawlo_test.py +126 -126
  205. tests/simple_follow_test.py +38 -38
  206. tests/simple_log_test2.py +137 -137
  207. tests/simple_optimization_test.py +128 -128
  208. tests/simple_queue_type_test.py +41 -41
  209. tests/simple_response_selector_test.py +94 -94
  210. tests/simple_selector_helper_test.py +154 -154
  211. tests/simple_selector_test.py +207 -207
  212. tests/simple_spider_test.py +49 -49
  213. tests/simple_url_test.py +73 -73
  214. tests/simulate_mysql_update_test.py +139 -139
  215. tests/spider_log_timing_test.py +177 -177
  216. tests/test_advanced_tools.py +148 -148
  217. tests/test_all_commands.py +230 -230
  218. tests/test_all_pipeline_fingerprints.py +133 -133
  219. tests/test_all_redis_key_configs.py +145 -145
  220. tests/test_asyncmy_usage.py +56 -56
  221. tests/test_batch_processor.py +178 -178
  222. tests/test_cleaners.py +54 -54
  223. tests/test_cli_arguments.py +118 -118
  224. tests/test_component_factory.py +174 -174
  225. tests/test_config_consistency.py +80 -80
  226. tests/test_config_merge.py +152 -152
  227. tests/test_config_validator.py +182 -182
  228. tests/test_controlled_spider_mixin.py +79 -79
  229. tests/test_crawler_process_import.py +38 -38
  230. tests/test_crawler_process_spider_modules.py +47 -47
  231. tests/test_crawlo_proxy_integration.py +114 -114
  232. tests/test_date_tools.py +123 -123
  233. tests/test_dedup_fix.py +220 -220
  234. tests/test_dedup_pipeline_consistency.py +124 -124
  235. tests/test_default_header_middleware.py +313 -313
  236. tests/test_distributed.py +65 -65
  237. tests/test_double_crawlo_fix.py +204 -204
  238. tests/test_double_crawlo_fix_simple.py +124 -124
  239. tests/test_download_delay_middleware.py +221 -221
  240. tests/test_downloader_proxy_compatibility.py +272 -272
  241. tests/test_edge_cases.py +305 -305
  242. tests/test_encoding_core.py +56 -56
  243. tests/test_encoding_detection.py +126 -126
  244. tests/test_enhanced_error_handler.py +270 -270
  245. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  246. tests/test_error_handler_compatibility.py +112 -112
  247. tests/test_factories.py +252 -252
  248. tests/test_factory_compatibility.py +196 -196
  249. tests/test_final_validation.py +153 -153
  250. tests/test_fingerprint_consistency.py +135 -135
  251. tests/test_fingerprint_simple.py +51 -51
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_helper.py +235 -235
  257. tests/test_logging_enhancements.py +374 -374
  258. tests/test_logging_final.py +184 -184
  259. tests/test_logging_integration.py +312 -312
  260. tests/test_logging_system.py +282 -282
  261. tests/test_middleware_debug.py +141 -141
  262. tests/test_mode_consistency.py +51 -51
  263. tests/test_multi_directory.py +67 -67
  264. tests/test_multiple_spider_modules.py +80 -80
  265. tests/test_mysql_pipeline_config.py +164 -164
  266. tests/test_mysql_pipeline_error.py +98 -98
  267. tests/test_mysql_pipeline_init_log.py +82 -82
  268. tests/test_mysql_pipeline_integration.py +132 -132
  269. tests/test_mysql_pipeline_refactor.py +143 -143
  270. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  271. tests/test_mysql_pipeline_robustness.py +195 -195
  272. tests/test_mysql_pipeline_types.py +88 -88
  273. tests/test_mysql_update_columns.py +93 -93
  274. tests/test_offsite_middleware.py +244 -244
  275. tests/test_offsite_middleware_simple.py +203 -203
  276. tests/test_optimized_selector_naming.py +100 -100
  277. tests/test_parsel.py +29 -29
  278. tests/test_performance.py +327 -327
  279. tests/test_performance_monitor.py +115 -115
  280. tests/test_pipeline_fingerprint_consistency.py +86 -86
  281. tests/test_priority_behavior.py +211 -211
  282. tests/test_priority_consistency.py +151 -151
  283. tests/test_priority_consistency_fixed.py +249 -249
  284. tests/test_proxy_health_check.py +32 -32
  285. tests/test_proxy_middleware.py +217 -217
  286. tests/test_proxy_middleware_enhanced.py +212 -212
  287. tests/test_proxy_middleware_integration.py +142 -142
  288. tests/test_proxy_middleware_refactored.py +207 -207
  289. tests/test_proxy_only.py +83 -83
  290. tests/test_proxy_providers.py +56 -56
  291. tests/test_proxy_stats.py +19 -19
  292. tests/test_proxy_strategies.py +59 -59
  293. tests/test_proxy_with_downloader.py +152 -152
  294. tests/test_queue_empty_check.py +41 -41
  295. tests/test_queue_manager_double_crawlo.py +173 -173
  296. tests/test_queue_manager_redis_key.py +179 -179
  297. tests/test_queue_naming.py +154 -154
  298. tests/test_queue_type.py +106 -106
  299. tests/test_queue_type_redis_config_consistency.py +130 -130
  300. tests/test_random_headers_default.py +322 -322
  301. tests/test_random_headers_necessity.py +308 -308
  302. tests/test_random_user_agent.py +72 -72
  303. tests/test_redis_config.py +28 -28
  304. tests/test_redis_connection_pool.py +294 -294
  305. tests/test_redis_key_naming.py +181 -181
  306. tests/test_redis_key_validator.py +123 -123
  307. tests/test_redis_queue.py +224 -224
  308. tests/test_redis_queue_name_fix.py +175 -175
  309. tests/test_redis_queue_type_fallback.py +129 -129
  310. tests/test_request_ignore_middleware.py +182 -182
  311. tests/test_request_params.py +111 -111
  312. tests/test_request_serialization.py +70 -70
  313. tests/test_response_code_middleware.py +349 -349
  314. tests/test_response_filter_middleware.py +427 -427
  315. tests/test_response_follow.py +104 -104
  316. tests/test_response_improvements.py +152 -152
  317. tests/test_response_selector_methods.py +92 -92
  318. tests/test_response_url_methods.py +70 -70
  319. tests/test_response_urljoin.py +86 -86
  320. tests/test_retry_middleware.py +333 -333
  321. tests/test_retry_middleware_realistic.py +273 -273
  322. tests/test_scheduler.py +252 -252
  323. tests/test_scheduler_config_update.py +133 -133
  324. tests/test_scrapy_style_encoding.py +112 -112
  325. tests/test_selector_helper.py +100 -100
  326. tests/test_selector_optimizations.py +146 -146
  327. tests/test_simple_response.py +61 -61
  328. tests/test_spider_loader.py +49 -49
  329. tests/test_spider_loader_comprehensive.py +69 -69
  330. tests/test_spider_modules.py +84 -84
  331. tests/test_spiders/test_spider.py +9 -9
  332. tests/test_telecom_spider_redis_key.py +205 -205
  333. tests/test_template_content.py +87 -87
  334. tests/test_template_redis_key.py +134 -134
  335. tests/test_tools.py +159 -159
  336. tests/test_user_agent_randomness.py +176 -176
  337. tests/test_user_agents.py +96 -96
  338. tests/untested_features_report.md +138 -138
  339. tests/verify_debug.py +51 -51
  340. tests/verify_distributed.py +117 -117
  341. tests/verify_log_fix.py +111 -111
  342. tests/verify_mysql_warnings.py +109 -109
  343. crawlo/utils/log.py +0 -80
  344. crawlo/utils/url_utils.py +0 -40
  345. crawlo-1.4.7.dist-info/RECORD +0 -347
  346. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  347. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  348. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
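
The only substantive change visible in the crawlo/spider/__init__.py diff below is the logging import: the module now pulls get_logger from crawlo.logging instead of the removed crawlo.utils.log (entry 343 above, +0 -80). As a minimal, hypothetical sketch — assuming downstream project code also imported the logger from the old path, which is not confirmed by this diff — an update for 1.4.8 could look like:

    # Hypothetical downstream adjustment; crawlo/utils/log.py no longer exists in 1.4.8.
    # Previously: from crawlo.utils.log import get_logger
    from crawlo.logging import get_logger  # import path used by crawlo/spider/__init__.py in 1.4.8

    logger = get_logger(__name__)
    logger.debug("logging via crawlo.logging, matching the 1.4.8 spider module")
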
crawlo/spider/__init__.py CHANGED
@@ -1,657 +1,658 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- Crawlo Spider Module
5
- ==================
6
- 提供爬虫基类和相关功能。
7
-
8
- 核心功能:
9
- - Spider基类:所有爬虫的基础类
10
- - 自动注册机制:通过元类自动注册爬虫
11
- - 配置管理:支持自定义设置和链式调用
12
- - 生命周期管理:开启/关闭钩子函数
13
- - 分布式支持:智能检测运行模式
14
-
15
- 使用示例:
16
- class MySpider(Spider):
17
- name = 'my_spider'
18
- start_urls = ['http://example.com']
19
-
20
- # 自定义配置
21
- custom_settings = {
22
- 'DOWNLOADER_TYPE': 'httpx',
23
- 'CONCURRENCY': 10
24
- }
25
-
26
- def parse(self, response):
27
- # 解析逻辑
28
- yield Item(data=response.json())
29
- """
30
- from __future__ import annotations
31
- from typing import Type, Any, Optional, List, Dict, Union, Iterator, AsyncIterator
32
- from ..network.request import Request
33
- from ..utils.log import get_logger
34
-
35
-
36
- # 全局爬虫注册表
37
- _DEFAULT_SPIDER_REGISTRY: dict[str, Type[Spider]] = {}
38
-
39
-
40
- class SpiderMeta(type):
41
- """
42
- 爬虫元类,提供自动注册功能
43
-
44
- 功能:
45
- - 自动注册爬虫到全局注册表
46
- - 验证爬虫名称的唯一性
47
- - 提供完整的错误提示
48
- """
49
-
50
- def __new__(mcs, name: str, bases: tuple[type], namespace: dict[str, Any], **kwargs):
51
- cls = super().__new__(mcs, name, bases, namespace)
52
-
53
- # 检查是否为Spider子类
54
- is_spider_subclass = any(
55
- base is Spider or (isinstance(base, type) and issubclass(base, Spider))
56
- for base in bases
57
- )
58
- if not is_spider_subclass:
59
- return cls
60
-
61
- # 验证爬虫名称
62
- spider_name = namespace.get('name')
63
- if not isinstance(spider_name, str):
64
- raise AttributeError(
65
- f"爬虫类 '{cls.__name__}' 必须定义字符串类型的 'name' 属性。\n"
66
- f"示例: name = 'my_spider'"
67
- )
68
-
69
- # 检查名称唯一性
70
- if spider_name in _DEFAULT_SPIDER_REGISTRY:
71
- existing_class = _DEFAULT_SPIDER_REGISTRY[spider_name]
72
- raise ValueError(
73
- f"爬虫名称 '{spider_name}' 已被 {existing_class.__name__} 占用。\n"
74
- f"请确保每个爬虫的 name 属性全局唯一。\n"
75
- f"建议使用格式: 'project_module_function'"
76
- )
77
-
78
- # 注册爬虫
79
- _DEFAULT_SPIDER_REGISTRY[spider_name] = cls
80
- # 延迟初始化logger避免模块级别阻塞
81
- try:
82
- from crawlo.logging import get_logger
83
- get_logger(__name__).debug(f"自动注册爬虫: {spider_name} -> {cls.__name__}")
84
- except:
85
- # 如果日志系统未初始化,静默失败
86
- pass
87
-
88
- return cls
89
-
90
-
91
- class Spider(metaclass=SpiderMeta):
92
- """
93
- 爬虫基类 - 所有爬虫实现的基础
94
-
95
- 必须定义的属性:
96
- - name: 爬虫名称,必须全局唯一
97
-
98
- 可选配置:
99
- - start_urls: 起始 URL 列表
100
- - custom_settings: 自定义设置字典
101
- - allowed_domains: 允许的域名列表
102
-
103
- 必须实现的方法:
104
- - parse(response): 解析响应的主方法
105
-
106
- 可选实现的方法:
107
- - spider_opened(): 爬虫开启时调用
108
- - spider_closed(): 爬虫关闭时调用
109
- - start_requests(): 生成初始请求(默认使用start_urls)
110
-
111
- 示例:
112
- class MySpider(Spider):
113
- name = 'example_spider'
114
- start_urls = ['https://example.com']
115
-
116
- custom_settings = {
117
- 'DOWNLOADER_TYPE': 'httpx',
118
- 'CONCURRENCY': 5,
119
- 'DOWNLOAD_DELAY': 1.0
120
- }
121
-
122
- def parse(self, response):
123
- # 提取数据
124
- data = response.css('title::text').get()
125
- yield {'title': data}
126
-
127
- # 生成新请求
128
- for link in response.css('a::attr(href)').getall():
129
- yield Request(url=link, callback=self.parse_detail)
130
- """
131
-
132
- # 必须定义的属性
133
- name: str = None
134
-
135
- # 可选属性
136
- start_urls: List[str] = None
137
- custom_settings: Dict[str, Any] = None
138
- allowed_domains: List[str] = None
139
-
140
- def __init__(self, name: str = None, **kwargs):
141
- """
142
- 初始化爬虫实例
143
-
144
- :param name: 爬虫名称(可选,默认使用类属性)
145
- :param kwargs: 其他初始化参数
146
- """
147
- # 初始化基本属性
148
- if not hasattr(self, 'start_urls') or self.start_urls is None:
149
- self.start_urls = []
150
- if not hasattr(self, 'custom_settings') or self.custom_settings is None:
151
- self.custom_settings = {}
152
- if not hasattr(self, 'allowed_domains') or self.allowed_domains is None:
153
- self.allowed_domains = []
154
-
155
- # 设置爬虫名称
156
- self.name = name or self.name
157
- if not self.name:
158
- raise ValueError(f"爬虫 {self.__class__.__name__} 必须指定 name 属性")
159
-
160
- # 初始化其他属性
161
- self.crawler = None
162
- # 延迟初始化logger避免阻塞
163
- self._logger = None
164
- self.stats = None
165
-
166
- # 应用额外参数
167
- for key, value in kwargs.items():
168
- setattr(self, key, value)
169
-
170
- @property
171
- def logger(self):
172
- """延迟初始化logger"""
173
- if self._logger is None:
174
- from crawlo.logging import get_logger
175
- self._logger = get_logger(self.name)
176
- return self._logger
177
-
178
- @classmethod
179
- def create_instance(cls, crawler) -> 'Spider':
180
- """
181
- 创建爬虫实例并绑定 crawler
182
-
183
- :param crawler: Crawler 实例
184
- :return: 爬虫实例
185
- """
186
- spider = cls()
187
- spider.crawler = crawler
188
- spider.stats = getattr(crawler, 'stats', None)
189
-
190
- # 合并自定义设置 - 使用延迟应用避免初始化时的循环依赖
191
- if hasattr(spider, 'custom_settings') and spider.custom_settings:
192
- # 延迟到真正需要时才应用设置
193
- spider._pending_settings = spider.custom_settings.copy()
194
- spider.logger.debug(f"准备应用 {len(spider.custom_settings)} 项自定义设置")
195
-
196
- return spider
197
-
198
- def apply_pending_settings(self):
199
- """应用待处理的设置(在初始化完成后调用)"""
200
- if hasattr(self, '_pending_settings') and self._pending_settings:
201
- for key, value in self._pending_settings.items():
202
- if self.crawler and hasattr(self.crawler, 'settings'):
203
- self.crawler.settings.set(key, value)
204
- self.logger.debug(f"应用自定义设置: {key} = {value}")
205
- # 清除待处理的设置
206
- delattr(self, '_pending_settings')
207
-
208
- def start_requests(self) -> Iterator[Request]:
209
- """
210
- 生成初始请求
211
-
212
- 默认行为:
213
- - 使用 start_urls 生成请求
214
- - 智能检测分布式模式决定是否去重
215
- - 支持单个 start_url 属性(兼容性)
216
- - 支持批量生成优化(大规模URL场景)
217
-
218
- :return: Request 迭代器
219
- """
220
- # 检测是否为分布式模式
221
- is_distributed = self._is_distributed_mode()
222
-
223
- # 获取批量处理配置
224
- batch_size = self._get_batch_size()
225
-
226
- # 从 start_urls 生成请求
227
- if self.start_urls:
228
- generated_count = 0
229
- for url in self.start_urls:
230
- if self._is_allowed_domain(url):
231
- yield Request(
232
- url=url,
233
- callback=self.parse,
234
- dont_filter=not is_distributed,
235
- meta={'spider_name': self.name}
236
- )
237
- generated_count += 1
238
-
239
- # 大规模URL时进行批量控制
240
- if batch_size > 0 and generated_count % batch_size == 0:
241
- self.logger.debug(f"已生成 {generated_count} 个请求(批量大小: {batch_size})")
242
- else:
243
- self.logger.warning(f"跳过不允许的域名: {url}")
244
-
245
- # 兼容单个 start_url 属性
246
- elif hasattr(self, 'start_url') and isinstance(getattr(self, 'start_url'), str):
247
- url = getattr(self, 'start_url')
248
- if self._is_allowed_domain(url):
249
- yield Request(
250
- url=url,
251
- callback=self.parse,
252
- dont_filter=not is_distributed,
253
- meta={'spider_name': self.name}
254
- )
255
- else:
256
- self.logger.warning(f"跳过不允许的域名: {url}")
257
-
258
- else:
259
- self.logger.warning(
260
- f"爬虫 {self.name} 没有定义 start_urls 或 start_url。\n"
261
- f"请在爬虫类中定义或重写 start_requests() 方法。"
262
- )
263
-
264
- def _get_batch_size(self) -> int:
265
- """
266
- 获取批量处理大小配置
267
-
268
- 用于大规模URL场景的性能优化
269
-
270
- :return: 批量大小(0表示无限制)
271
- """
272
- if not self.crawler:
273
- return 0
274
-
275
- # 从设置中获取批量大小
276
- batch_size = self.crawler.settings.get_int('SPIDER_BATCH_SIZE', 0)
277
-
278
- # 如果start_urls超过一定数量,自动启用批量模式
279
- if batch_size == 0 and self.start_urls and len(self.start_urls) > 1000:
280
- batch_size = 500 # 默认批量大小
281
- self.logger.info(f"检测到大量start_urls ({len(self.start_urls)}),启用批量模式 (批量大小: {batch_size})")
282
-
283
- return batch_size
284
-
285
- def _is_distributed_mode(self) -> bool:
286
- """
287
- 智能检测是否为分布式模式
288
-
289
- 检测条件:
290
- - QUEUE_TYPE = 'redis'
291
- - FILTER_CLASS 包含 'aioredis_filter'
292
- - RUN_MODE = 'distributed'
293
-
294
- :return: 是否为分布式模式
295
- """
296
- if not self.crawler:
297
- return False
298
-
299
- settings = self.crawler.settings
300
-
301
- # 检查多个条件来判断是否为分布式模式
302
- queue_type = settings.get('QUEUE_TYPE', 'memory')
303
- filter_class = settings.get('FILTER_CLASS', '')
304
- run_mode = settings.get('RUN_MODE', 'standalone')
305
-
306
- # 分布式模式的标志
307
- is_redis_queue = queue_type == 'redis'
308
- is_redis_filter = 'aioredis_filter' in filter_class.lower()
309
- is_distributed_run_mode = run_mode == 'distributed'
310
-
311
- distributed = is_redis_queue or is_redis_filter or is_distributed_run_mode
312
-
313
- if distributed:
314
- self.logger.debug("检测到分布式模式,启用请求去重")
315
- else:
316
- self.logger.debug("检测到单机模式,禁用请求去重")
317
-
318
- return distributed
319
-
320
- def _is_allowed_domain(self, url: str) -> bool:
321
- """
322
- 检查URL是否在允许的域名列表中
323
-
324
- :param url: 要检查的URL
325
- :return: 是否允许
326
- """
327
- if not self.allowed_domains:
328
- return True
329
-
330
- from urllib.parse import urlparse
331
- try:
332
- domain = urlparse(url).netloc.lower()
333
- return any(
334
- domain == allowed.lower() or domain.endswith('.' + allowed.lower())
335
- for allowed in self.allowed_domains
336
- )
337
- except Exception as e:
338
- self.logger.warning(f"URL解析失败: {url} - {e}")
339
- return False
340
-
341
- def parse(self, response):
342
- """
343
- 解析响应的主方法(必须实现)
344
-
345
- :param response: 响应对象
346
- :return: 生成的 Item 或 Request
347
- """
348
- raise NotImplementedError(
349
- f"爬虫 {self.__class__.__name__} 必须实现 parse() 方法\n"
350
- f"示例:\n"
351
- f"def parse(self, response):\n"
352
- f" # 提取数据\n"
353
- f" yield {{'title': response.css('title::text').get()}}\n"
354
- f" # 生成新请求\n"
355
- f" for link in response.css('a::attr(href)').getall():\n"
356
- f" yield Request(url=link)"
357
- )
358
-
359
- async def spider_opened(self):
360
- """
361
- 爬虫开启时调用的钩子函数
362
-
363
- 可用于:
364
- - 初始化资源
365
- - 连接数据库
366
- - 设置初始状态
367
- """
368
- self.logger.info(f"Spider {self.name} opened")
369
-
370
- async def spider_closed(self):
371
- """
372
- 爬虫关闭时调用的钩子函数
373
-
374
- 可用于:
375
- - 清理资源
376
- - 关闭数据库连接
377
- """
378
- # 不再输出任何信息,避免与统计信息重复
379
- # 统计信息由StatsCollector负责输出
380
- pass
381
-
382
- def __str__(self) -> str:
383
- return f"{self.__class__.__name__}(name='{self.name}')"
384
-
385
- def __repr__(self) -> str:
386
- return self.__str__()
387
-
388
- def set_custom_setting(self, key: str, value: Any) -> 'Spider':
389
- """
390
- 设置自定义配置(链式调用)
391
-
392
- :param key: 配置键名
393
- :param value: 配置值
394
- :return: self(支持链式调用)
395
-
396
- 示例:
397
- spider.set_custom_setting('CONCURRENCY', 10)\
398
- .set_custom_setting('DOWNLOAD_DELAY', 1.0)
399
- """
400
- if not hasattr(self, 'custom_settings') or self.custom_settings is None:
401
- self.custom_settings = {}
402
-
403
- self.custom_settings[key] = value
404
- self.logger.debug(f"设置自定义配置: {key} = {value}")
405
-
406
- # 如果已绑定crawler,立即应用设置
407
- if self.crawler:
408
- self.crawler.settings.set(key, value)
409
-
410
- return self
411
-
412
- def get_custom_setting(self, key: str, default: Any = None) -> Any:
413
- """
414
- 获取自定义配置值
415
-
416
- :param key: 配置键名
417
- :param default: 默认值
418
- :return: 配置值
419
- """
420
- if hasattr(self, 'custom_settings') and self.custom_settings:
421
- return self.custom_settings.get(key, default)
422
- return default
423
-
424
- def get_spider_info(self) -> Dict[str, Any]:
425
- """
426
- 获取爬虫详细信息
427
-
428
- :return: 爬虫信息字典
429
- """
430
- info = {
431
- 'name': self.name,
432
- 'class_name': self.__class__.__name__,
433
- 'module': self.__module__,
434
- 'start_urls_count': len(self.start_urls) if self.start_urls else 0,
435
- 'allowed_domains_count': len(self.allowed_domains) if self.allowed_domains else 0,
436
- 'custom_settings_count': len(self.custom_settings) if self.custom_settings else 0,
437
- 'is_distributed': self._is_distributed_mode() if self.crawler else None,
438
- 'has_crawler': self.crawler is not None,
439
- 'logger_name': self.logger.name if hasattr(self, 'logger') else None
440
- }
441
-
442
- # 添加方法检查
443
- info['methods'] = {
444
- 'has_parse': callable(getattr(self, 'parse', None)),
445
- 'has_spider_opened': callable(getattr(self, 'spider_opened', None)),
446
- 'has_spider_closed': callable(getattr(self, 'spider_closed', None)),
447
- 'has_start_requests': callable(getattr(self, 'start_requests', None))
448
- }
449
-
450
- return info
451
-
452
- def make_request(self, url: str, callback=None, **kwargs) -> Request:
453
- """
454
- 便捷方法:创建 Request 对象
455
-
456
- :param url: 请求URL
457
- :param callback: 回调函数(默认为parse)
458
- :param kwargs: 其他Request参数
459
- :return: Request对象
460
- """
461
- return Request(
462
- url=url,
463
- callback=callback or self.parse,
464
- meta={'spider_name': self.name},
465
- **kwargs
466
- )
467
-
468
-
469
- # === 高级爬虫功能扩展 ===
470
-
471
- class SpiderStatsTracker:
472
- """
473
- 爬虫统计跟踪器
474
- 提供详细的性能监控功能
475
- """
476
-
477
- def __init__(self, spider_name: str):
478
- self.spider_name = spider_name
479
- self.start_time = None
480
- self.end_time = None
481
- self.request_count = 0
482
- self.response_count = 0
483
- self.item_count = 0
484
- self.error_count = 0
485
- self.domain_stats = {}
486
-
487
- def start_tracking(self):
488
- """开始统计"""
489
- import time
490
- self.start_time = time.time()
491
-
492
- def stop_tracking(self):
493
- """停止统计"""
494
- import time
495
- self.end_time = time.time()
496
-
497
- def record_request(self, url: str):
498
- """记录请求"""
499
- self.request_count += 1
500
- from urllib.parse import urlparse
501
- domain = urlparse(url).netloc
502
- self.domain_stats[domain] = self.domain_stats.get(domain, 0) + 1
503
-
504
- def record_response(self):
505
- """记录响应"""
506
- self.response_count += 1
507
-
508
- def record_item(self):
509
- """记录Item"""
510
- self.item_count += 1
511
-
512
- def record_error(self):
513
- """记录错误"""
514
- self.error_count += 1
515
-
516
- def get_summary(self) -> Dict[str, Any]:
517
- """获取统计摘要"""
518
- duration = (self.end_time - self.start_time) if (self.start_time and self.end_time) else 0
519
-
520
- return {
521
- 'spider_name': self.spider_name,
522
- 'duration_seconds': round(duration, 2),
523
- 'requests': self.request_count,
524
- 'responses': self.response_count,
525
- 'items': self.item_count,
526
- 'errors': self.error_count,
527
- 'success_rate': round((self.response_count / max(1, self.request_count)) * 100, 2),
528
- 'requests_per_second': round(self.request_count / max(1, duration), 2),
529
- 'top_domains': sorted(
530
- self.domain_stats.items(),
531
- key=lambda x: x[1],
532
- reverse=True
533
- )[:5]
534
- }
535
-
536
-
537
- def create_spider_from_template(name: str, start_urls: List[str], **options) -> Type[Spider]:
538
- """
539
- 从模板快速创建爬虫类
540
-
541
- :param name: 爬虫名称
542
- :param start_urls: 起始URL列表
543
- :param options: 其他选项
544
- :return: 新创建的爬虫类
545
-
546
- 示例:
547
- MySpider = create_spider_from_template(
548
- name='quick_spider',
549
- start_urls=['http://example.com'],
550
- allowed_domains=['example.com'],
551
- custom_settings={'CONCURRENCY': 5}
552
- )
553
- """
554
-
555
- # 动态创建爬虫类
556
- class_attrs = {
557
- 'name': name,
558
- 'start_urls': start_urls,
559
- 'allowed_domains': options.get('allowed_domains', []),
560
- 'custom_settings': options.get('custom_settings', {})
561
- }
562
-
563
- # 添加自定义parse方法
564
- if 'parse_function' in options:
565
- class_attrs['parse'] = options['parse_function']
566
- else:
567
- def default_parse(self, response):
568
- """默认解析方法"""
569
- yield {'url': response.url, 'title': getattr(response, 'title', 'N/A')}
570
- class_attrs['parse'] = default_parse
571
-
572
- # 创建类名
573
- class_name = options.get('class_name', f"Generated{name.replace('_', '').title()}Spider")
574
-
575
- # 动态创建类
576
- spider_class = type(class_name, (Spider,), class_attrs)
577
-
578
- get_logger(__name__).info(f"动态创建爬虫类: {class_name} (name='{name}')")
579
-
580
- return spider_class
581
-
582
-
583
- # === 公共只读接口 ===
584
- def get_global_spider_registry() -> dict[str, Type[Spider]]:
585
- """
586
- 获取全局爬虫注册表的副本
587
-
588
- :return: 爬虫注册表的副本
589
- """
590
- return _DEFAULT_SPIDER_REGISTRY.copy()
591
-
592
-
593
- def get_spider_by_name(name: str) -> Optional[Type[Spider]]:
594
- """
595
- 根据名称获取爬虫类
596
-
597
- :param name: 爬虫名称
598
- :return: 爬虫类或None
599
- """
600
- return _DEFAULT_SPIDER_REGISTRY.get(name)
601
-
602
-
603
- def get_all_spider_classes() -> List[Type[Spider]]:
604
- """
605
- 获取所有注册的爬虫类
606
-
607
- :return: 爬虫类列表
608
- """
609
- return list(set(_DEFAULT_SPIDER_REGISTRY.values()))
610
-
611
-
612
- def get_spider_names() -> List[str]:
613
- """
614
- 获取所有爬虫名称
615
-
616
- :return: 爬虫名称列表
617
- """
618
- return list(_DEFAULT_SPIDER_REGISTRY.keys())
619
-
620
-
621
- def is_spider_registered(name: str) -> bool:
622
- """
623
- 检查爬虫是否已注册
624
-
625
- :param name: 爬虫名称
626
- :return: 是否已注册
627
- """
628
- return name in _DEFAULT_SPIDER_REGISTRY
629
-
630
-
631
- def unregister_spider(name: str) -> bool:
632
- """
633
- 取消注册爬虫(仅用于测试)
634
-
635
- :param name: 爬虫名称
636
- :return: 是否成功取消注册
637
- """
638
- if name in _DEFAULT_SPIDER_REGISTRY:
639
- del _DEFAULT_SPIDER_REGISTRY[name]
640
- return True
641
- return False
642
-
643
-
644
- # 导出的公共接口
645
- __all__ = [
646
- 'Spider',
647
- 'SpiderMeta',
648
- 'SpiderStatsTracker',
649
- 'create_spider_from_template',
650
- 'get_global_spider_registry',
651
- 'get_spider_by_name',
652
- 'get_all_spider_classes',
653
- 'get_spider_names',
654
- 'is_spider_registered',
655
- 'unregister_spider'
656
- ]
657
-
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ Crawlo Spider Module
5
+ ==================
6
+ 提供爬虫基类和相关功能。
7
+
8
+ 核心功能:
9
+ - Spider基类:所有爬虫的基础类
10
+ - 自动注册机制:通过元类自动注册爬虫
11
+ - 配置管理:支持自定义设置和链式调用
12
+ - 生命周期管理:开启/关闭钩子函数
13
+ - 分布式支持:智能检测运行模式
14
+
15
+ 使用示例:
16
+ class MySpider(Spider):
17
+ name = 'my_spider'
18
+ start_urls = ['http://example.com']
19
+
20
+ # 自定义配置
21
+ custom_settings = {
22
+ 'DOWNLOADER_TYPE': 'httpx',
23
+ 'CONCURRENCY': 10
24
+ }
25
+
26
+ def parse(self, response):
27
+ # 解析逻辑
28
+ yield Item(data=response.json())
29
+ """
30
+ from __future__ import annotations
31
+
32
+ from typing import Type, Any, Optional, List, Dict, Iterator
33
+
34
+ from ..logging import get_logger
35
+ from ..network.request import Request
36
+
37
+ # 全局爬虫注册表
38
+ _DEFAULT_SPIDER_REGISTRY: dict[str, Type[Spider]] = {}
39
+
40
+
41
+ class SpiderMeta(type):
42
+ """
43
+ 爬虫元类,提供自动注册功能
44
+
45
+ 功能:
46
+ - 自动注册爬虫到全局注册表
47
+ - 验证爬虫名称的唯一性
48
+ - 提供完整的错误提示
49
+ """
50
+
51
+ def __new__(mcs, name: str, bases: tuple[type], namespace: dict[str, Any], **kwargs):
52
+ cls = super().__new__(mcs, name, bases, namespace)
53
+
54
+ # 检查是否为Spider子类
55
+ is_spider_subclass = any(
56
+ base is Spider or (isinstance(base, type) and issubclass(base, Spider))
57
+ for base in bases
58
+ )
59
+ if not is_spider_subclass:
60
+ return cls
61
+
62
+ # 验证爬虫名称
63
+ spider_name = namespace.get('name')
64
+ if not isinstance(spider_name, str):
65
+ raise AttributeError(
66
+ f"爬虫类 '{cls.__name__}' 必须定义字符串类型的 'name' 属性。\n"
67
+ f"示例: name = 'my_spider'"
68
+ )
69
+
70
+ # 检查名称唯一性
71
+ if spider_name in _DEFAULT_SPIDER_REGISTRY:
72
+ existing_class = _DEFAULT_SPIDER_REGISTRY[spider_name]
73
+ raise ValueError(
74
+ f"爬虫名称 '{spider_name}' 已被 {existing_class.__name__} 占用。\n"
75
+ f"请确保每个爬虫的 name 属性全局唯一。\n"
76
+ f"建议使用格式: 'project_module_function'"
77
+ )
78
+
79
+ # 注册爬虫
80
+ _DEFAULT_SPIDER_REGISTRY[spider_name] = cls
81
+ # 延迟初始化logger避免模块级别阻塞
82
+ try:
83
+ from crawlo.logging import get_logger
84
+ get_logger(__name__).debug(f"自动注册爬虫: {spider_name} -> {cls.__name__}")
85
+ except:
86
+ # 如果日志系统未初始化,静默失败
87
+ pass
88
+
89
+ return cls
90
+
91
+
92
+ class Spider(metaclass=SpiderMeta):
93
+ """
94
+ 爬虫基类 - 所有爬虫实现的基础
95
+
96
+ 必须定义的属性:
97
+ - name: 爬虫名称,必须全局唯一
98
+
99
+ 可选配置:
100
+ - start_urls: 起始 URL 列表
101
+ - custom_settings: 自定义设置字典
102
+ - allowed_domains: 允许的域名列表
103
+
104
+ 必须实现的方法:
105
+ - parse(response): 解析响应的主方法
106
+
107
+ 可选实现的方法:
108
+ - spider_opened(): 爬虫开启时调用
109
+ - spider_closed(): 爬虫关闭时调用
110
+ - start_requests(): 生成初始请求(默认使用start_urls)
111
+
112
+ 示例:
113
+ class MySpider(Spider):
114
+ name = 'example_spider'
115
+ start_urls = ['https://example.com']
116
+
117
+ custom_settings = {
118
+ 'DOWNLOADER_TYPE': 'httpx',
119
+ 'CONCURRENCY': 5,
120
+ 'DOWNLOAD_DELAY': 1.0
121
+ }
122
+
123
+ def parse(self, response):
124
+ # 提取数据
125
+ data = response.css('title::text').get()
126
+ yield {'title': data}
127
+
128
+ # 生成新请求
129
+ for link in response.css('a::attr(href)').getall():
130
+ yield Request(url=link, callback=self.parse_detail)
131
+ """
132
+
133
+ # 必须定义的属性
134
+ name: str = None
135
+
136
+ # 可选属性
137
+ start_urls: List[str] = None
138
+ custom_settings: Dict[str, Any] = None
139
+ allowed_domains: List[str] = None
140
+
141
+ def __init__(self, name: str = None, **kwargs):
142
+ """
143
+ 初始化爬虫实例
144
+
145
+ :param name: 爬虫名称(可选,默认使用类属性)
146
+ :param kwargs: 其他初始化参数
147
+ """
148
+ # 初始化基本属性
149
+ if not hasattr(self, 'start_urls') or self.start_urls is None:
150
+ self.start_urls = []
151
+ if not hasattr(self, 'custom_settings') or self.custom_settings is None:
152
+ self.custom_settings = {}
153
+ if not hasattr(self, 'allowed_domains') or self.allowed_domains is None:
154
+ self.allowed_domains = []
155
+
156
+ # 设置爬虫名称
157
+ self.name = name or self.name
158
+ if not self.name:
159
+ raise ValueError(f"爬虫 {self.__class__.__name__} 必须指定 name 属性")
160
+
161
+ # 初始化其他属性
162
+ self.crawler = None
163
+ # 延迟初始化logger避免阻塞
164
+ self._logger = None
165
+ self.stats = None
166
+
167
+ # 应用额外参数
168
+ for key, value in kwargs.items():
169
+ setattr(self, key, value)
170
+
171
+ @property
172
+ def logger(self):
173
+ """延迟初始化logger"""
174
+ if self._logger is None:
175
+ from crawlo.logging import get_logger
176
+ self._logger = get_logger(self.name)
177
+ return self._logger
178
+
179
+ @classmethod
180
+ def create_instance(cls, crawler) -> 'Spider':
181
+ """
182
+ 创建爬虫实例并绑定 crawler
183
+
184
+ :param crawler: Crawler 实例
185
+ :return: 爬虫实例
186
+ """
187
+ spider = cls()
188
+ spider.crawler = crawler
189
+ spider.stats = getattr(crawler, 'stats', None)
190
+
191
+ # 合并自定义设置 - 使用延迟应用避免初始化时的循环依赖
192
+ if hasattr(spider, 'custom_settings') and spider.custom_settings:
193
+ # 延迟到真正需要时才应用设置
194
+ spider._pending_settings = spider.custom_settings.copy()
195
+ spider.logger.debug(f"准备应用 {len(spider.custom_settings)} 项自定义设置")
196
+
197
+ return spider
198
+
199
+ def apply_pending_settings(self):
200
+ """应用待处理的设置(在初始化完成后调用)"""
201
+ if hasattr(self, '_pending_settings') and self._pending_settings:
202
+ for key, value in self._pending_settings.items():
203
+ if self.crawler and hasattr(self.crawler, 'settings'):
204
+ self.crawler.settings.set(key, value)
205
+ self.logger.debug(f"应用自定义设置: {key} = {value}")
206
+ # 清除待处理的设置
207
+ delattr(self, '_pending_settings')
208
+
209
+ def start_requests(self) -> Iterator[Request]:
210
+ """
211
+ 生成初始请求
212
+
213
+ 默认行为:
214
+ - 使用 start_urls 生成请求
215
+ - 智能检测分布式模式决定是否去重
216
+ - 支持单个 start_url 属性(兼容性)
217
+ - 支持批量生成优化(大规模URL场景)
218
+
219
+ :return: Request 迭代器
220
+ """
221
+ # 检测是否为分布式模式
222
+ is_distributed = self._is_distributed_mode()
223
+
224
+ # 获取批量处理配置
225
+ batch_size = self._get_batch_size()
226
+
227
+ # start_urls 生成请求
228
+ if self.start_urls:
229
+ generated_count = 0
230
+ for url in self.start_urls:
231
+ if self._is_allowed_domain(url):
232
+ yield Request(
233
+ url=url,
234
+ callback=self.parse,
235
+ dont_filter=not is_distributed,
236
+ meta={'spider_name': self.name}
237
+ )
238
+ generated_count += 1
239
+
240
+ # 大规模URL时进行批量控制
241
+ if batch_size > 0 and generated_count % batch_size == 0:
242
+ self.logger.debug(f"已生成 {generated_count} 个请求(批量大小: {batch_size})")
243
+ else:
244
+ self.logger.warning(f"跳过不允许的域名: {url}")
245
+
246
+ # 兼容单个 start_url 属性
247
+ elif hasattr(self, 'start_url') and isinstance(getattr(self, 'start_url'), str):
248
+ url = getattr(self, 'start_url')
249
+ if self._is_allowed_domain(url):
250
+ yield Request(
251
+ url=url,
252
+ callback=self.parse,
253
+ dont_filter=not is_distributed,
254
+ meta={'spider_name': self.name}
255
+ )
256
+ else:
257
+ self.logger.warning(f"跳过不允许的域名: {url}")
258
+
259
+ else:
260
+ self.logger.warning(
261
+ f"爬虫 {self.name} 没有定义 start_urls 或 start_url。\n"
262
+ f"请在爬虫类中定义或重写 start_requests() 方法。"
263
+ )
264
+
265
+ def _get_batch_size(self) -> int:
266
+ """
267
+ 获取批量处理大小配置
268
+
269
+ 用于大规模URL场景的性能优化
270
+
271
+ :return: 批量大小(0表示无限制)
272
+ """
273
+ if not self.crawler:
274
+ return 0
275
+
276
+ # 从设置中获取批量大小
277
+ batch_size = self.crawler.settings.get_int('SPIDER_BATCH_SIZE', 0)
278
+
279
+ # 如果start_urls超过一定数量,自动启用批量模式
280
+ if batch_size == 0 and self.start_urls and len(self.start_urls) > 1000:
281
+ batch_size = 500 # 默认批量大小
282
+ self.logger.info(f"检测到大量start_urls ({len(self.start_urls)}),启用批量模式 (批量大小: {batch_size})")
283
+
284
+ return batch_size
285
+
286
+ def _is_distributed_mode(self) -> bool:
287
+ """
288
+ 智能检测是否为分布式模式
289
+
290
+ 检测条件:
291
+ - QUEUE_TYPE = 'redis'
292
+ - FILTER_CLASS 包含 'aioredis_filter'
293
+ - RUN_MODE = 'distributed'
294
+
295
+ :return: 是否为分布式模式
296
+ """
297
+ if not self.crawler:
298
+ return False
299
+
300
+ settings = self.crawler.settings
301
+
302
+ # 检查多个条件来判断是否为分布式模式
303
+ queue_type = settings.get('QUEUE_TYPE', 'memory')
304
+ filter_class = settings.get('FILTER_CLASS', '')
305
+ run_mode = settings.get('RUN_MODE', 'standalone')
306
+
307
+ # 分布式模式的标志
308
+ is_redis_queue = queue_type == 'redis'
309
+ is_redis_filter = 'aioredis_filter' in filter_class.lower()
310
+ is_distributed_run_mode = run_mode == 'distributed'
311
+
312
+ distributed = is_redis_queue or is_redis_filter or is_distributed_run_mode
313
+
314
+ if distributed:
315
+ self.logger.debug("检测到分布式模式,启用请求去重")
316
+ else:
317
+ self.logger.debug("检测到单机模式,禁用请求去重")
318
+
319
+ return distributed
320
+
321
+ def _is_allowed_domain(self, url: str) -> bool:
322
+ """
323
+ 检查URL是否在允许的域名列表中
324
+
325
+ :param url: 要检查的URL
326
+ :return: 是否允许
327
+ """
328
+ if not self.allowed_domains:
329
+ return True
330
+
331
+ from urllib.parse import urlparse
332
+ try:
333
+ domain = urlparse(url).netloc.lower()
334
+ return any(
335
+ domain == allowed.lower() or domain.endswith('.' + allowed.lower())
336
+ for allowed in self.allowed_domains
337
+ )
338
+ except Exception as e:
339
+ self.logger.warning(f"URL解析失败: {url} - {e}")
340
+ return False
341
+
342
+ def parse(self, response):
343
+ """
344
+ 解析响应的主方法(必须实现)
345
+
346
+ :param response: 响应对象
347
+ :return: 生成的 Item 或 Request
348
+ """
349
+ raise NotImplementedError(
350
+ f"爬虫 {self.__class__.__name__} 必须实现 parse() 方法\n"
351
+ f"示例:\n"
352
+ f"def parse(self, response):\n"
353
+ f" # 提取数据\n"
354
+ f" yield {{'title': response.css('title::text').get()}}\n"
355
+ f" # 生成新请求\n"
356
+ f" for link in response.css('a::attr(href)').getall():\n"
357
+ f" yield Request(url=link)"
358
+ )
359
+
360
+ async def spider_opened(self):
361
+ """
362
+ 爬虫开启时调用的钩子函数
363
+
364
+ 可用于:
365
+ - 初始化资源
366
+ - 连接数据库
367
+ - 设置初始状态
368
+ """
369
+ self.logger.info(f"Spider {self.name} opened")
370
+
371
+ async def spider_closed(self):
372
+ """
373
+ 爬虫关闭时调用的钩子函数
374
+
375
+ 可用于:
376
+ - 清理资源
377
+ - 关闭数据库连接
378
+ """
379
+ # 不再输出任何信息,避免与统计信息重复
380
+ # 统计信息由StatsCollector负责输出
381
+ pass
382
+
383
+ def __str__(self) -> str:
384
+ return f"{self.__class__.__name__}(name='{self.name}')"
385
+
386
+ def __repr__(self) -> str:
387
+ return self.__str__()
388
+
389
+ def set_custom_setting(self, key: str, value: Any) -> 'Spider':
390
+ """
391
+ 设置自定义配置(链式调用)
392
+
393
+ :param key: 配置键名
394
+ :param value: 配置值
395
+ :return: self(支持链式调用)
396
+
397
+ 示例:
398
+ spider.set_custom_setting('CONCURRENCY', 10)\
399
+ .set_custom_setting('DOWNLOAD_DELAY', 1.0)
400
+ """
401
+ if not hasattr(self, 'custom_settings') or self.custom_settings is None:
402
+ self.custom_settings = {}
403
+
404
+ self.custom_settings[key] = value
405
+ self.logger.debug(f"设置自定义配置: {key} = {value}")
406
+
407
+ # 如果已绑定crawler,立即应用设置
408
+ if self.crawler:
409
+ self.crawler.settings.set(key, value)
410
+
411
+ return self
412
+
413
+ def get_custom_setting(self, key: str, default: Any = None) -> Any:
414
+ """
415
+ 获取自定义配置值
416
+
417
+ :param key: 配置键名
418
+ :param default: 默认值
419
+ :return: 配置值
420
+ """
421
+ if hasattr(self, 'custom_settings') and self.custom_settings:
422
+ return self.custom_settings.get(key, default)
423
+ return default
424
+
425
+ def get_spider_info(self) -> Dict[str, Any]:
426
+ """
427
+ 获取爬虫详细信息
428
+
429
+ :return: 爬虫信息字典
430
+ """
431
+ info = {
432
+ 'name': self.name,
433
+ 'class_name': self.__class__.__name__,
434
+ 'module': self.__module__,
435
+ 'start_urls_count': len(self.start_urls) if self.start_urls else 0,
436
+ 'allowed_domains_count': len(self.allowed_domains) if self.allowed_domains else 0,
437
+ 'custom_settings_count': len(self.custom_settings) if self.custom_settings else 0,
438
+ 'is_distributed': self._is_distributed_mode() if self.crawler else None,
439
+ 'has_crawler': self.crawler is not None,
440
+ 'logger_name': self.logger.name if hasattr(self, 'logger') else None
441
+ }
442
+
443
+ # 添加方法检查
444
+ info['methods'] = {
445
+ 'has_parse': callable(getattr(self, 'parse', None)),
446
+ 'has_spider_opened': callable(getattr(self, 'spider_opened', None)),
447
+ 'has_spider_closed': callable(getattr(self, 'spider_closed', None)),
448
+ 'has_start_requests': callable(getattr(self, 'start_requests', None))
449
+ }
450
+
451
+ return info
452
+
453
+ def make_request(self, url: str, callback=None, **kwargs) -> Request:
454
+ """
455
+ 便捷方法:创建 Request 对象
456
+
457
+ :param url: 请求URL
458
+ :param callback: 回调函数(默认为parse)
459
+ :param kwargs: 其他Request参数
460
+ :return: Request对象
461
+ """
462
+ return Request(
463
+ url=url,
464
+ callback=callback or self.parse,
465
+ meta={'spider_name': self.name},
466
+ **kwargs
467
+ )
468
+
469
+
470
+ # === 高级爬虫功能扩展 ===
471
+
472
+ class SpiderStatsTracker:
473
+ """
474
+ 爬虫统计跟踪器
475
+ 提供详细的性能监控功能
476
+ """
477
+
478
+ def __init__(self, spider_name: str):
479
+ self.spider_name = spider_name
480
+ self.start_time = None
481
+ self.end_time = None
482
+ self.request_count = 0
483
+ self.response_count = 0
484
+ self.item_count = 0
485
+ self.error_count = 0
486
+ self.domain_stats = {}
487
+
488
+ def start_tracking(self):
489
+ """开始统计"""
490
+ import time
491
+ self.start_time = time.time()
492
+
493
+ def stop_tracking(self):
494
+ """停止统计"""
495
+ import time
496
+ self.end_time = time.time()
497
+
498
+ def record_request(self, url: str):
499
+ """记录请求"""
500
+ self.request_count += 1
501
+ from urllib.parse import urlparse
502
+ domain = urlparse(url).netloc
503
+ self.domain_stats[domain] = self.domain_stats.get(domain, 0) + 1
504
+
505
+ def record_response(self):
506
+ """记录响应"""
507
+ self.response_count += 1
508
+
509
+ def record_item(self):
510
+ """记录Item"""
511
+ self.item_count += 1
512
+
513
+ def record_error(self):
514
+ """记录错误"""
515
+ self.error_count += 1
516
+
517
+ def get_summary(self) -> Dict[str, Any]:
518
+ """获取统计摘要"""
519
+ duration = (self.end_time - self.start_time) if (self.start_time and self.end_time) else 0
520
+
521
+ return {
522
+ 'spider_name': self.spider_name,
523
+ 'duration_seconds': round(duration, 2),
524
+ 'requests': self.request_count,
525
+ 'responses': self.response_count,
526
+ 'items': self.item_count,
527
+ 'errors': self.error_count,
528
+ 'success_rate': round((self.response_count / max(1, self.request_count)) * 100, 2),
529
+ 'requests_per_second': round(self.request_count / max(1, duration), 2),
530
+ 'top_domains': sorted(
531
+ self.domain_stats.items(),
532
+ key=lambda x: x[1],
533
+ reverse=True
534
+ )[:5]
535
+ }
536
+
537
+
538
+ def create_spider_from_template(name: str, start_urls: List[str], **options) -> Type[Spider]:
539
+ """
540
+ 从模板快速创建爬虫类
541
+
542
+ :param name: 爬虫名称
543
+ :param start_urls: 起始URL列表
544
+ :param options: 其他选项
545
+ :return: 新创建的爬虫类
546
+
547
+ 示例:
548
+ MySpider = create_spider_from_template(
549
+ name='quick_spider',
550
+ start_urls=['http://example.com'],
551
+ allowed_domains=['example.com'],
552
+ custom_settings={'CONCURRENCY': 5}
553
+ )
554
+ """
555
+
556
+ # 动态创建爬虫类
557
+ class_attrs = {
558
+ 'name': name,
559
+ 'start_urls': start_urls,
560
+ 'allowed_domains': options.get('allowed_domains', []),
561
+ 'custom_settings': options.get('custom_settings', {})
562
+ }
563
+
564
+ # 添加自定义parse方法
565
+ if 'parse_function' in options:
566
+ class_attrs['parse'] = options['parse_function']
567
+ else:
568
+ def default_parse(self, response):
569
+ """默认解析方法"""
570
+ yield {'url': response.url, 'title': getattr(response, 'title', 'N/A')}
571
+ class_attrs['parse'] = default_parse
572
+
573
+ # 创建类名
574
+ class_name = options.get('class_name', f"Generated{name.replace('_', '').title()}Spider")
575
+
576
+ # 动态创建类
577
+ spider_class = type(class_name, (Spider,), class_attrs)
578
+
579
+ get_logger(__name__).info(f"动态创建爬虫类: {class_name} (name='{name}')")
580
+
581
+ return spider_class
582
+
583
+
584
+ # === 公共只读接口 ===
585
+ def get_global_spider_registry() -> dict[str, Type[Spider]]:
586
+ """
587
+ 获取全局爬虫注册表的副本
588
+
589
+ :return: 爬虫注册表的副本
590
+ """
591
+ return _DEFAULT_SPIDER_REGISTRY.copy()
592
+
593
+
594
+ def get_spider_by_name(name: str) -> Optional[Type[Spider]]:
595
+ """
596
+ 根据名称获取爬虫类
597
+
598
+ :param name: 爬虫名称
599
+ :return: 爬虫类或None
600
+ """
601
+ return _DEFAULT_SPIDER_REGISTRY.get(name)
602
+
603
+
604
+ def get_all_spider_classes() -> List[Type[Spider]]:
605
+ """
606
+ 获取所有注册的爬虫类
607
+
608
+ :return: 爬虫类列表
609
+ """
610
+ return list(set(_DEFAULT_SPIDER_REGISTRY.values()))
611
+
612
+
613
+ def get_spider_names() -> List[str]:
614
+ """
615
+ 获取所有爬虫名称
616
+
617
+ :return: 爬虫名称列表
618
+ """
619
+ return list(_DEFAULT_SPIDER_REGISTRY.keys())
620
+
621
+
622
+ def is_spider_registered(name: str) -> bool:
623
+ """
624
+ 检查爬虫是否已注册
625
+
626
+ :param name: 爬虫名称
627
+ :return: 是否已注册
628
+ """
629
+ return name in _DEFAULT_SPIDER_REGISTRY
630
+
631
+
632
+ def unregister_spider(name: str) -> bool:
633
+ """
634
+ 取消注册爬虫(仅用于测试)
635
+
636
+ :param name: 爬虫名称
637
+ :return: 是否成功取消注册
638
+ """
639
+ if name in _DEFAULT_SPIDER_REGISTRY:
640
+ del _DEFAULT_SPIDER_REGISTRY[name]
641
+ return True
642
+ return False
643
+
644
+
645
+ # 导出的公共接口
646
+ __all__ = [
647
+ 'Spider',
648
+ 'SpiderMeta',
649
+ 'SpiderStatsTracker',
650
+ 'create_spider_from_template',
651
+ 'get_global_spider_registry',
652
+ 'get_spider_by_name',
653
+ 'get_all_spider_classes',
654
+ 'get_spider_names',
655
+ 'is_spider_registered',
656
+ 'unregister_spider'
657
+ ]
658
+