crawlo 1.4.6-py3-none-any.whl → 1.4.7-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (374)
  1. crawlo/__init__.py +90 -89
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -341
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -438
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -291
  19. crawlo/crawler.py +698 -657
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -276
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -247
  25. crawlo/downloader/httpx_downloader.py +265 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -402
  28. crawlo/downloader/selenium_downloader.py +486 -472
  29. crawlo/event.py +45 -11
  30. crawlo/exceptions.py +215 -82
  31. crawlo/extension/__init__.py +65 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +135 -0
  44. crawlo/filters/__init__.py +170 -153
  45. crawlo/filters/aioredis_filter.py +348 -264
  46. crawlo/filters/memory_filter.py +261 -276
  47. crawlo/framework.py +306 -292
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -434
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -194
  52. crawlo/initialization/phases.py +230 -149
  53. crawlo/initialization/registry.py +143 -145
  54. crawlo/initialization/utils.py +49 -0
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -46
  61. crawlo/logging/config.py +277 -197
  62. crawlo/logging/factory.py +175 -171
  63. crawlo/logging/manager.py +104 -112
  64. crawlo/middleware/__init__.py +87 -24
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -253
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +375 -379
  77. crawlo/network/response.py +569 -664
  78. crawlo/pipelines/__init__.py +53 -22
  79. crawlo/pipelines/base_pipeline.py +452 -0
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -132
  87. crawlo/pipelines/mysql_pipeline.py +469 -476
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +10 -0
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -525
  94. crawlo/queue/redis_priority_queue.py +519 -370
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +284 -277
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +657 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +2 -4
  104. crawlo/templates/project/items.py.tmpl +13 -17
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -36
  107. crawlo/templates/project/settings.py.tmpl +109 -111
  108. crawlo/templates/project/settings_distributed.py.tmpl +156 -159
  109. crawlo/templates/project/settings_gentle.py.tmpl +170 -176
  110. crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
  111. crawlo/templates/project/settings_minimal.py.tmpl +98 -100
  112. crawlo/templates/project/settings_simple.py.tmpl +168 -174
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -40
  116. crawlo/templates/spiders_init.py.tmpl +5 -10
  117. crawlo/tools/__init__.py +86 -189
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +50 -50
  123. crawlo/utils/batch_processor.py +276 -259
  124. crawlo/utils/config_manager.py +442 -0
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/error_handler.py +410 -410
  128. crawlo/utils/fingerprint.py +121 -121
  129. crawlo/utils/func_tools.py +82 -82
  130. crawlo/utils/large_scale_helper.py +344 -344
  131. crawlo/utils/leak_detector.py +335 -0
  132. crawlo/utils/log.py +79 -79
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -0
  135. crawlo/utils/mysql_connection_pool.py +197 -0
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +91 -0
  139. crawlo/utils/redis_connection_pool.py +578 -388
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -256
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -0
  144. crawlo/utils/selector_helper.py +137 -137
  145. crawlo/utils/singleton.py +70 -0
  146. crawlo/utils/spider_loader.py +201 -201
  147. crawlo/utils/text_helper.py +94 -94
  148. crawlo/utils/{url.py → url_utils.py} +39 -39
  149. crawlo-1.4.7.dist-info/METADATA +689 -0
  150. crawlo-1.4.7.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -275
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -0
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -0
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/scrapy.cfg +11 -11
  192. tests/optimized_performance_test.py +211 -211
  193. tests/performance_comparison.py +244 -244
  194. tests/queue_blocking_test.py +113 -113
  195. tests/queue_test.py +89 -89
  196. tests/redis_key_validation_demo.py +130 -130
  197. tests/request_params_example.py +150 -150
  198. tests/response_improvements_example.py +144 -144
  199. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  200. tests/scrapy_comparison/scrapy_test.py +133 -133
  201. tests/simple_cli_test.py +55 -0
  202. tests/simple_command_test.py +119 -119
  203. tests/simple_crawlo_test.py +126 -126
  204. tests/simple_follow_test.py +38 -38
  205. tests/simple_log_test2.py +137 -137
  206. tests/simple_optimization_test.py +128 -128
  207. tests/simple_queue_type_test.py +41 -41
  208. tests/simple_response_selector_test.py +94 -94
  209. tests/simple_selector_helper_test.py +154 -154
  210. tests/simple_selector_test.py +207 -207
  211. tests/simple_spider_test.py +49 -49
  212. tests/simple_url_test.py +73 -73
  213. tests/simulate_mysql_update_test.py +139 -139
  214. tests/spider_log_timing_test.py +177 -177
  215. tests/test_advanced_tools.py +148 -148
  216. tests/test_all_commands.py +230 -230
  217. tests/test_all_pipeline_fingerprints.py +133 -133
  218. tests/test_all_redis_key_configs.py +145 -145
  219. tests/test_asyncmy_usage.py +56 -56
  220. tests/test_batch_processor.py +178 -178
  221. tests/test_cleaners.py +54 -54
  222. tests/test_cli_arguments.py +119 -0
  223. tests/test_component_factory.py +174 -174
  224. tests/test_config_consistency.py +80 -80
  225. tests/test_config_merge.py +152 -152
  226. tests/test_config_validator.py +182 -182
  227. tests/test_controlled_spider_mixin.py +79 -79
  228. tests/test_crawler_process_import.py +38 -38
  229. tests/test_crawler_process_spider_modules.py +47 -47
  230. tests/test_crawlo_proxy_integration.py +114 -114
  231. tests/test_date_tools.py +123 -123
  232. tests/test_dedup_fix.py +220 -220
  233. tests/test_dedup_pipeline_consistency.py +124 -124
  234. tests/test_default_header_middleware.py +313 -313
  235. tests/test_distributed.py +65 -65
  236. tests/test_double_crawlo_fix.py +204 -204
  237. tests/test_double_crawlo_fix_simple.py +124 -124
  238. tests/test_download_delay_middleware.py +221 -221
  239. tests/test_downloader_proxy_compatibility.py +272 -272
  240. tests/test_edge_cases.py +305 -305
  241. tests/test_encoding_core.py +56 -56
  242. tests/test_encoding_detection.py +126 -126
  243. tests/test_enhanced_error_handler.py +270 -270
  244. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  245. tests/test_error_handler_compatibility.py +112 -112
  246. tests/test_factories.py +252 -252
  247. tests/test_factory_compatibility.py +196 -196
  248. tests/test_final_validation.py +153 -153
  249. tests/test_fingerprint_consistency.py +135 -135
  250. tests/test_fingerprint_simple.py +51 -51
  251. tests/test_get_component_logger.py +83 -83
  252. tests/test_hash_performance.py +99 -99
  253. tests/test_integration.py +169 -169
  254. tests/test_item_dedup_redis_key.py +122 -122
  255. tests/test_large_scale_helper.py +235 -235
  256. tests/test_logging_enhancements.py +374 -374
  257. tests/test_logging_final.py +184 -184
  258. tests/test_logging_integration.py +312 -312
  259. tests/test_logging_system.py +282 -282
  260. tests/test_middleware_debug.py +141 -141
  261. tests/test_mode_consistency.py +51 -51
  262. tests/test_multi_directory.py +67 -67
  263. tests/test_multiple_spider_modules.py +80 -80
  264. tests/test_mysql_pipeline_config.py +164 -164
  265. tests/test_mysql_pipeline_error.py +98 -98
  266. tests/test_mysql_pipeline_init_log.py +82 -82
  267. tests/test_mysql_pipeline_integration.py +132 -132
  268. tests/test_mysql_pipeline_refactor.py +143 -143
  269. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  270. tests/test_mysql_pipeline_robustness.py +195 -195
  271. tests/test_mysql_pipeline_types.py +88 -88
  272. tests/test_mysql_update_columns.py +93 -93
  273. tests/test_offsite_middleware.py +244 -244
  274. tests/test_offsite_middleware_simple.py +203 -203
  275. tests/test_optimized_selector_naming.py +100 -100
  276. tests/test_parsel.py +29 -29
  277. tests/test_performance.py +327 -327
  278. tests/test_performance_monitor.py +115 -115
  279. tests/test_pipeline_fingerprint_consistency.py +86 -86
  280. tests/test_priority_behavior.py +211 -211
  281. tests/test_priority_consistency.py +151 -151
  282. tests/test_priority_consistency_fixed.py +249 -249
  283. tests/test_proxy_health_check.py +32 -32
  284. tests/test_proxy_middleware.py +217 -217
  285. tests/test_proxy_middleware_enhanced.py +212 -212
  286. tests/test_proxy_middleware_integration.py +142 -142
  287. tests/test_proxy_middleware_refactored.py +207 -207
  288. tests/test_proxy_only.py +83 -83
  289. tests/test_proxy_providers.py +56 -56
  290. tests/test_proxy_stats.py +19 -19
  291. tests/test_proxy_strategies.py +59 -59
  292. tests/test_proxy_with_downloader.py +152 -152
  293. tests/test_queue_empty_check.py +41 -41
  294. tests/test_queue_manager_double_crawlo.py +173 -173
  295. tests/test_queue_manager_redis_key.py +179 -179
  296. tests/test_queue_naming.py +154 -154
  297. tests/test_queue_type.py +106 -106
  298. tests/test_queue_type_redis_config_consistency.py +130 -130
  299. tests/test_random_headers_default.py +322 -322
  300. tests/test_random_headers_necessity.py +308 -308
  301. tests/test_random_user_agent.py +72 -72
  302. tests/test_redis_config.py +28 -28
  303. tests/test_redis_connection_pool.py +294 -294
  304. tests/test_redis_key_naming.py +181 -181
  305. tests/test_redis_key_validator.py +123 -123
  306. tests/test_redis_queue.py +224 -224
  307. tests/test_redis_queue_name_fix.py +175 -175
  308. tests/test_redis_queue_type_fallback.py +129 -129
  309. tests/test_request_ignore_middleware.py +182 -182
  310. tests/test_request_params.py +111 -111
  311. tests/test_request_serialization.py +70 -70
  312. tests/test_response_code_middleware.py +349 -349
  313. tests/test_response_filter_middleware.py +427 -427
  314. tests/test_response_follow.py +104 -104
  315. tests/test_response_improvements.py +152 -152
  316. tests/test_response_selector_methods.py +92 -92
  317. tests/test_response_url_methods.py +70 -70
  318. tests/test_response_urljoin.py +86 -86
  319. tests/test_retry_middleware.py +333 -333
  320. tests/test_retry_middleware_realistic.py +273 -273
  321. tests/test_scheduler.py +252 -252
  322. tests/test_scheduler_config_update.py +133 -133
  323. tests/test_scrapy_style_encoding.py +112 -112
  324. tests/test_selector_helper.py +100 -100
  325. tests/test_selector_optimizations.py +146 -146
  326. tests/test_simple_response.py +61 -61
  327. tests/test_spider_loader.py +49 -49
  328. tests/test_spider_loader_comprehensive.py +69 -69
  329. tests/test_spider_modules.py +84 -84
  330. tests/test_spiders/test_spider.py +9 -9
  331. tests/test_telecom_spider_redis_key.py +205 -205
  332. tests/test_template_content.py +87 -87
  333. tests/test_template_redis_key.py +134 -134
  334. tests/test_tools.py +159 -159
  335. tests/test_user_agent_randomness.py +176 -176
  336. tests/test_user_agents.py +96 -96
  337. tests/untested_features_report.md +138 -138
  338. tests/verify_debug.py +51 -51
  339. tests/verify_distributed.py +117 -117
  340. tests/verify_log_fix.py +111 -111
  341. tests/verify_mysql_warnings.py +109 -109
  342. crawlo/logging/async_handler.py +0 -181
  343. crawlo/logging/monitor.py +0 -153
  344. crawlo/logging/sampler.py +0 -167
  345. crawlo/tools/authenticated_proxy.py +0 -241
  346. crawlo/tools/data_formatter.py +0 -226
  347. crawlo/tools/data_validator.py +0 -181
  348. crawlo/tools/encoding_converter.py +0 -127
  349. crawlo/tools/network_diagnostic.py +0 -365
  350. crawlo/tools/request_tools.py +0 -83
  351. crawlo/tools/retry_mechanism.py +0 -224
  352. crawlo/utils/env_config.py +0 -143
  353. crawlo/utils/large_scale_config.py +0 -287
  354. crawlo/utils/system.py +0 -11
  355. crawlo/utils/tools.py +0 -5
  356. crawlo-1.4.6.dist-info/METADATA +0 -329
  357. crawlo-1.4.6.dist-info/RECORD +0 -361
  358. tests/env_config_example.py +0 -134
  359. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  360. tests/test_authenticated_proxy.py +0 -142
  361. tests/test_comprehensive.py +0 -147
  362. tests/test_dynamic_downloaders_proxy.py +0 -125
  363. tests/test_dynamic_proxy.py +0 -93
  364. tests/test_dynamic_proxy_config.py +0 -147
  365. tests/test_dynamic_proxy_real.py +0 -110
  366. tests/test_env_config.py +0 -122
  367. tests/test_framework_env_usage.py +0 -104
  368. tests/test_large_scale_config.py +0 -113
  369. tests/test_proxy_api.py +0 -265
  370. tests/test_real_scenario_proxy.py +0 -196
  371. tests/tools_example.py +0 -261
  372. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
  373. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
  374. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-05-11 11:08
-# @Author : oscar
-# @Desc : None
-"""
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-05-11 11:08
+# @Author : oscar
+# @Desc : None
+"""
@@ -1,278 +1,285 @@
-# -*- coding:UTF-8 -*-
-"""
-Default settings file
-Contains all of the Crawlo framework's default settings
-"""
-# Import the environment-variable configuration helpers
-from crawlo.utils.env_config import get_redis_config, get_runtime_config, get_version
-
-# --------------------------------- 1. Framework base configuration ------------------------------------
-
-# Framework initialization control
-FRAMEWORK_INIT_ORDER = [
-    'log_system',          # logging system
-    'settings_system',     # settings system
-    'core_components',     # core components
-    'extensions',          # extension components
-    'full_initialization'  # full initialization
-]
-FRAMEWORK_INIT_STATE = 'uninitialized'
-
-# Project base configuration
-runtime_config = get_runtime_config()
-PROJECT_NAME = runtime_config['PROJECT_NAME']  # project name (used in logs, Redis keys, etc.)
-VERSION = get_version()  # project version - read from the framework's __version__.py, falling back to a default
-RUN_MODE = runtime_config['CRAWLO_MODE']  # run mode: standalone/distributed/auto
-CONCURRENCY = runtime_config['CONCURRENCY']  # concurrency setting
-
-# Spider module configuration
-SPIDER_MODULES = []  # list of spider modules
-SPIDER_LOADER_WARN_ONLY = False  # whether the spider loader only warns instead of raising
-
-# --------------------------------- 2. Crawler core configuration ------------------------------------
-
-# Downloader configuration
-DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"  # default downloader
-DOWNLOAD_DELAY = 0.5  # request delay (seconds)
-RANDOMNESS = True  # whether to enable random delay
-RANDOM_RANGE = [0.5, 1.5]  # random delay factors; actual delay = DOWNLOAD_DELAY * RANDOM_RANGE[0] to DOWNLOAD_DELAY * RANDOM_RANGE[1]
-
-# Scheduler configuration
-DEPTH_PRIORITY = 1  # depth priority (negative = depth-first, positive = breadth-first)
-SCHEDULER_MAX_QUEUE_SIZE = 5000  # maximum scheduler queue size
-BACKPRESSURE_RATIO = 0.9  # backpressure threshold (triggered when the queue reaches 90% of capacity)
-
-# Request generation control
-REQUEST_GENERATION_BATCH_SIZE = 10  # request generation batch size
-REQUEST_GENERATION_INTERVAL = 0.01  # request generation interval (seconds)
-ENABLE_CONTROLLED_REQUEST_GENERATION = False  # whether to enable controlled request generation
-
-# Queue configuration
-QUEUE_TYPE = 'auto'  # queue type: memory/redis/auto
-# SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"  # scheduler queue name (follows the unified naming convention)
-QUEUE_MAX_RETRIES = 3  # maximum retries for queue operations
-QUEUE_TIMEOUT = 300  # queue operation timeout (seconds)
-
-# --------------------------------- 3. Database and filter configuration ------------------------------------
-
-# MySQL configuration
-MYSQL_HOST = '127.0.0.1'
-MYSQL_PORT = 3306
-MYSQL_USER = 'root'
-MYSQL_PASSWORD = '123456'
-MYSQL_DB = 'crawl_pro'
-MYSQL_TABLE = 'crawlo'
-MYSQL_BATCH_SIZE = 100
-MYSQL_USE_BATCH = False  # whether to enable batch inserts
-# MySQL SQL-generation behavior control
-MYSQL_AUTO_UPDATE = False  # whether to use REPLACE INTO (fully overwrite existing records)
-MYSQL_INSERT_IGNORE = False  # whether to use INSERT IGNORE (skip duplicate rows)
-MYSQL_UPDATE_COLUMNS = ()  # columns to update on conflict; when set, MYSQL_AUTO_UPDATE is ignored
-
-# Redis configuration
-redis_config = get_redis_config()
-REDIS_HOST = redis_config['REDIS_HOST']
-REDIS_PORT = redis_config['REDIS_PORT']
-REDIS_PASSWORD = redis_config['REDIS_PASSWORD']
-REDIS_DB = redis_config['REDIS_DB']
-
-# Build a different URL format depending on whether a password is set
-if REDIS_PASSWORD:
-    REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
-else:
-    REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
-
-# Redis key naming conventions are built into internal framework components; users need not configure them:
-# - request dedup:    crawlo:{PROJECT_NAME}:filter:fingerprint
-# - item dedup:       crawlo:{PROJECT_NAME}:item:fingerprint
-# - request queue:    crawlo:{PROJECT_NAME}:queue:requests
-# - processing queue: crawlo:{PROJECT_NAME}:queue:processing
-# - failed queue:     crawlo:{PROJECT_NAME}:queue:failed
-
-REDIS_TTL = 0  # fingerprint TTL (0 = never expire)
-CLEANUP_FP = 0  # whether to clean up fingerprints on exit (0 = keep)
-FILTER_DEBUG = True  # whether to enable dedup debug logging
-DECODE_RESPONSES = True  # whether Redis responses are decoded to strings
-
-# Filter configuration
-DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'  # memory filter and dedup pipeline by default
-FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
-
-# Bloom filter configuration
-BLOOM_FILTER_CAPACITY = 1000000  # Bloom filter capacity
-BLOOM_FILTER_ERROR_RATE = 0.001  # Bloom filter error rate
-
-# --------------------------------- 4. Middleware configuration ------------------------------------
-
-# Framework middleware list (framework defaults + user-defined middleware)
-MIDDLEWARES = [
-    # === Request preprocessing phase ===
-    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',    # 1. ignore invalid requests
-    'crawlo.middleware.download_delay.DownloadDelayMiddleware',    # 2. throttle request rate
-    'crawlo.middleware.default_header.DefaultHeaderMiddleware',    # 3. add default request headers
-    'crawlo.middleware.offsite.OffsiteMiddleware',                 # 5. filter off-site requests
-
-    # === Response processing phase ===
-    'crawlo.middleware.retry.RetryMiddleware',                     # 6. retry failed requests
-    'crawlo.middleware.response_code.ResponseCodeMiddleware',      # 7. handle special status codes
-    'crawlo.middleware.response_filter.ResponseFilterMiddleware',  # 8. filter response content
-]
-
-# --------------------------------- 5. Pipeline configuration ------------------------------------
-
-# Framework data-processing pipeline list (framework defaults + user-defined pipelines)
-PIPELINES = [
-    'crawlo.pipelines.console_pipeline.ConsolePipeline',
-]
-
-# --------------------------------- 6. Extension configuration ------------------------------------
-
-# Framework extension list (framework defaults + user-defined extensions)
-EXTENSIONS = [
-    'crawlo.extension.log_interval.LogIntervalExtension',        # periodic logging
-    'crawlo.extension.log_stats.LogStats',                       # statistics
-    'crawlo.extension.logging_extension.CustomLoggerExtension',  # custom logging
-]
-
-# --------------------------------- 7. Logging and monitoring configuration ------------------------------------
-
-# Logging configuration
-LOG_LEVEL = None  # log level: DEBUG/INFO/WARNING/ERROR; defaults to None, set by the user in project settings
-STATS_DUMP = True  # whether to dump statistics periodically
-LOG_FILE = None  # log file path, to be set in project settings
-LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
-LOG_ENCODING = 'utf-8'
-LOG_MAX_BYTES = 10 * 1024 * 1024  # log rotation size (bytes)
-LOG_BACKUP_COUNT = 5  # number of log backups
-
-# Log interval configuration
-INTERVAL = 60  # log output interval (seconds)
-
-# Memory monitoring configuration
-MEMORY_MONITOR_ENABLED = False  # whether to enable memory monitoring
-MEMORY_MONITOR_INTERVAL = 60  # memory monitor check interval (seconds)
-MEMORY_WARNING_THRESHOLD = 80.0  # memory usage warning threshold (percent)
-MEMORY_CRITICAL_THRESHOLD = 90.0  # memory usage critical threshold (percent)
-
-# Performance profiling configuration
-PERFORMANCE_PROFILER_ENABLED = False  # whether to enable profiling
-PERFORMANCE_PROFILER_OUTPUT_DIR = 'profiling'  # profiler output directory
-PERFORMANCE_PROFILER_INTERVAL = 300  # profiling interval (seconds)
-
-# Health check configuration
-HEALTH_CHECK_ENABLED = True  # whether to enable health checks
-
-# --------------------------------- 8. Network request configuration ------------------------------------
-
-# Default request header configuration
-DEFAULT_REQUEST_HEADERS = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
-    'Accept-Encoding': 'gzip, deflate, br',
-}  # default request headers
-
-# Default User-Agent (a modern browser User-Agent)
-USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
-
-# Whether to enable the random User-Agent feature (disabled by default; enable as needed)
-RANDOM_USER_AGENT_ENABLED = False  # whether to enable random user agents
-
-# Off-site filtering configuration
-ALLOWED_DOMAINS = []  # list of allowed domains
-
-# Proxy configuration (generic; supports both a static proxy list and a dynamic proxy API)
-PROXY_LIST = []  # static proxy list
-PROXY_API_URL = ""  # dynamic proxy API
-# Proxy extraction config: specifies how to extract proxy addresses from the API response
-# Accepted values:
-# - string: used directly as a field name, e.g. "proxy" (the default)
-# - dict: contains type and value fields, supporting several extraction modes
-#   - {"type": "field", "value": "data"}: extract from the given field
-#   - {"type": "jsonpath", "value": "$.data[0].proxy"}: extract with a JSONPath expression
-#   - {"type": "custom", "function": your_function}: extract with a custom function
-PROXY_EXTRACTOR = "proxy"  # proxy extraction config
-# Proxy failure handling
-PROXY_MAX_FAILED_ATTEMPTS = 3  # maximum failed attempts before a proxy is marked as dead
-
-# Proxy usage examples:
-# 1. Static proxy list:
-#    PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]
-#    PROXY_API_URL = ""  # no dynamic proxy
-#
-# 2. Dynamic proxy API (default field extraction):
-#    PROXY_LIST = []  # no static proxies
-#    PROXY_API_URL = "http://api.example.com/get_proxy"
-#    PROXY_EXTRACTOR = "proxy"  # extract from the "proxy" field
-#
-# 3. Dynamic proxy API (custom field extraction):
-#    PROXY_LIST = []  # no static proxies
-#    PROXY_API_URL = "http://api.example.com/get_proxy"
-#    PROXY_EXTRACTOR = "data"  # extract from the "data" field
-#
-# 4. Dynamic proxy API (nested field extraction):
-#    PROXY_LIST = []  # no static proxies
-#    PROXY_API_URL = "http://api.example.com/get_proxy"
-#    PROXY_EXTRACTOR = {"type": "field", "value": "result"}  # extract from the "result" field
-
-# Common downloader configuration
-DOWNLOAD_TIMEOUT = 30  # download timeout (seconds)
-VERIFY_SSL = True  # whether to verify SSL certificates
-CONNECTION_POOL_LIMIT = 100  # connection pool size limit
-CONNECTION_POOL_LIMIT_PER_HOST = 20  # per-host connection pool limit
-DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # maximum download size (bytes)
-DOWNLOAD_STATS = True  # whether to enable download statistics
-DOWNLOAD_WARN_SIZE = 1024 * 1024  # download size warning threshold (bytes)
-DOWNLOAD_RETRY_TIMES = 3  # download retry count
-MAX_RETRY_TIMES = 3  # maximum retry count
-
-# Downloader health check
-DOWNLOADER_HEALTH_CHECK = True  # whether to enable downloader health checks
-HEALTH_CHECK_INTERVAL = 60  # health check interval (seconds)
-REQUEST_STATS_ENABLED = True  # whether to enable request statistics
-STATS_RESET_ON_START = False  # whether to reset statistics on startup
-
-# HttpX downloader-specific configuration
-HTTPX_HTTP2 = True  # whether to enable HTTP/2 support
-HTTPX_FOLLOW_REDIRECTS = True  # whether to follow redirects automatically
-
-# AioHttp downloader-specific configuration
-AIOHTTP_AUTO_DECOMPRESS = True  # whether to auto-decompress responses
-AIOHTTP_FORCE_CLOSE = False  # whether to force-close connections
-
-# Curl-Cffi-specific configuration
-CURL_BROWSER_TYPE = "chrome"  # browser fingerprint emulation (CurlCffi downloader only)
-CURL_BROWSER_VERSION_MAP = {  # custom browser version map (can override the default behavior)
-    "chrome": "chrome136",
-    "edge": "edge101",
-    "safari": "safari184",
-    "firefox": "firefox135",
-}
-
-# Selenium downloader configuration
-SELENIUM_BROWSER_TYPE = "chrome"  # browser type: chrome, firefox, edge
-SELENIUM_HEADLESS = True  # whether to run headless
-SELENIUM_TIMEOUT = 30  # timeout (seconds)
-SELENIUM_LOAD_TIMEOUT = 10  # page load timeout (seconds)
-SELENIUM_WINDOW_WIDTH = 1920  # window width
-SELENIUM_WINDOW_HEIGHT = 1080  # window height
-SELENIUM_WAIT_FOR_ELEMENT = None  # selector of an element to wait for
-SELENIUM_ENABLE_JS = True  # whether to enable JavaScript
-SELENIUM_PROXY = None  # proxy settings
-SELENIUM_SINGLE_BROWSER_MODE = True  # single browser, multiple tabs mode
-SELENIUM_MAX_TABS_PER_BROWSER = 10  # maximum tabs per browser
-
-# Playwright downloader configuration
-PLAYWRIGHT_BROWSER_TYPE = "chromium"  # browser type: chromium, firefox, webkit
-PLAYWRIGHT_HEADLESS = True  # whether to run headless
-PLAYWRIGHT_TIMEOUT = 30000  # timeout (milliseconds)
-PLAYWRIGHT_LOAD_TIMEOUT = 10000  # page load timeout (milliseconds)
-PLAYWRIGHT_VIEWPORT_WIDTH = 1920  # viewport width
-PLAYWRIGHT_VIEWPORT_HEIGHT = 1080  # viewport height
-PLAYWRIGHT_WAIT_FOR_ELEMENT = None  # selector of an element to wait for
-PLAYWRIGHT_PROXY = None  # proxy settings
-PLAYWRIGHT_SINGLE_BROWSER_MODE = True  # single browser, multiple pages mode
-PLAYWRIGHT_MAX_PAGES_PER_BROWSER = 10  # maximum pages per browser
-
-# General optimization configuration
-CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
+# -*- coding:UTF-8 -*-
+"""
+Default settings file
+Contains all of the Crawlo framework's default settings
+"""
+# Import the environment-variable configuration helpers
+from crawlo.utils.config_manager import EnvConfigManager
+
+# --------------------------------- 1. Framework base configuration ------------------------------------
+
+# Framework initialization control
+FRAMEWORK_INIT_ORDER = [
+    'log_system',          # logging system
+    'settings_system',     # settings system
+    'core_components',     # core components
+    'extensions',          # extension components
+    'full_initialization'  # full initialization
+]
+FRAMEWORK_INIT_STATE = 'uninitialized'
+
+# Project base configuration
+runtime_config = EnvConfigManager.get_runtime_config()
+PROJECT_NAME = runtime_config['PROJECT_NAME']  # project name (used in logs, Redis keys, etc.)
+VERSION = EnvConfigManager.get_version()  # project version - read from the framework's __version__.py, falling back to a default
+RUN_MODE = runtime_config['CRAWLO_MODE']  # run mode: standalone/distributed/auto
+CONCURRENCY = runtime_config['CONCURRENCY']  # concurrency setting
+
+# Spider module configuration
+SPIDER_MODULES = []  # list of spider modules
+SPIDER_LOADER_WARN_ONLY = False  # whether the spider loader only warns instead of raising
+
+# --------------------------------- 2. Crawler core configuration ------------------------------------
+
+# Downloader configuration
+DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"  # default downloader
+DOWNLOAD_DELAY = 0.5  # request delay (seconds)
+RANDOMNESS = True  # whether to enable random delay
+RANDOM_RANGE = [0.5, 1.5]  # random delay factors; actual delay = DOWNLOAD_DELAY * RANDOM_RANGE[0] to DOWNLOAD_DELAY * RANDOM_RANGE[1]
+
+# Scheduler configuration
+DEPTH_PRIORITY = 1  # depth priority (negative = depth-first, positive = breadth-first)
+SCHEDULER_MAX_QUEUE_SIZE = 5000  # maximum scheduler queue size
+BACKPRESSURE_RATIO = 0.9  # backpressure threshold (triggered when the queue reaches 90% of capacity)
+
+# Request generation control
+REQUEST_GENERATION_BATCH_SIZE = 10  # request generation batch size
+REQUEST_GENERATION_INTERVAL = 0.01  # request generation interval (seconds)
+ENABLE_CONTROLLED_REQUEST_GENERATION = False  # whether to enable controlled request generation
+
+# Queue configuration
+QUEUE_TYPE = 'auto'  # queue type: memory/redis/auto
+# SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"  # scheduler queue name (follows the unified naming convention)
+QUEUE_MAX_RETRIES = 3  # maximum retries for queue operations
+QUEUE_TIMEOUT = 300  # queue operation timeout (seconds)
+
+# --------------------------------- 3. Database and filter configuration ------------------------------------
+
+# MySQL configuration
+MYSQL_HOST = '127.0.0.1'
+MYSQL_PORT = 3306
+MYSQL_USER = 'root'
+MYSQL_PASSWORD = '123456'
+MYSQL_DB = 'crawl_pro'
+MYSQL_TABLE = 'crawlo'
+MYSQL_BATCH_SIZE = 100
+MYSQL_USE_BATCH = False  # whether to enable batch inserts
+# MySQL SQL-generation behavior control
+MYSQL_AUTO_UPDATE = False  # whether to use REPLACE INTO (fully overwrite existing records)
+MYSQL_INSERT_IGNORE = False  # whether to use INSERT IGNORE (skip duplicate rows)
+MYSQL_UPDATE_COLUMNS = ()  # columns to update on conflict; when set, MYSQL_AUTO_UPDATE is ignored
+
+# Redis configuration
+redis_config = EnvConfigManager.get_redis_config()
+REDIS_HOST = redis_config['REDIS_HOST']
+REDIS_PORT = redis_config['REDIS_PORT']
+REDIS_PASSWORD = redis_config['REDIS_PASSWORD']
+REDIS_DB = redis_config['REDIS_DB']
+
+# Redis cluster support notes:
+# The Crawlo framework supports intelligent switching between single-instance and cluster Redis
+# Cluster mode can be configured in two ways:
+# 1. a comma-separated node list: '192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+# 2. a cluster URL: 'redis-cluster://192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+# The framework auto-detects the URL format and picks the appropriate mode
+
+# Build a different URL format depending on whether a password is set
+if REDIS_PASSWORD:
+    REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+else:
+    REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+
+# Redis key naming conventions are built into internal framework components; users need not configure them:
+# - request dedup:    crawlo:{PROJECT_NAME}:filter:fingerprint
+# - item dedup:       crawlo:{PROJECT_NAME}:item:fingerprint
+# - request queue:    crawlo:{PROJECT_NAME}:queue:requests
+# - processing queue: crawlo:{PROJECT_NAME}:queue:processing
+# - failed queue:     crawlo:{PROJECT_NAME}:queue:failed
+
+REDIS_TTL = 0  # fingerprint TTL (0 = never expire)
+CLEANUP_FP = 0  # whether to clean up fingerprints on exit (0 = keep)
+FILTER_DEBUG = True  # whether to enable dedup debug logging
+DECODE_RESPONSES = True  # whether Redis responses are decoded to strings
+
+# Filter configuration
+DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'  # memory filter and dedup pipeline by default
+FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+
+# Bloom filter configuration
+BLOOM_FILTER_CAPACITY = 1000000  # Bloom filter capacity
+BLOOM_FILTER_ERROR_RATE = 0.001  # Bloom filter error rate
+
+# --------------------------------- 4. Middleware configuration ------------------------------------
+
+# Framework middleware list (framework defaults + user-defined middleware)
+MIDDLEWARES = [
+    # === Request preprocessing phase ===
+    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',    # 1. ignore invalid requests
+    'crawlo.middleware.download_delay.DownloadDelayMiddleware',    # 2. throttle request rate
+    'crawlo.middleware.default_header.DefaultHeaderMiddleware',    # 3. add default request headers
+    'crawlo.middleware.offsite.OffsiteMiddleware',                 # 5. filter off-site requests
+
+    # === Response processing phase ===
+    'crawlo.middleware.retry.RetryMiddleware',                     # 6. retry failed requests
+    'crawlo.middleware.response_code.ResponseCodeMiddleware',      # 7. handle special status codes
+    'crawlo.middleware.response_filter.ResponseFilterMiddleware',  # 8. filter response content
+]
+
+# --------------------------------- 5. Pipeline configuration ------------------------------------
+
+# Framework data-processing pipeline list (framework defaults + user-defined pipelines)
+PIPELINES = [
+    'crawlo.pipelines.console_pipeline.ConsolePipeline',
+]
+
+# --------------------------------- 6. Extension configuration ------------------------------------
+
+# Framework extension list (framework defaults + user-defined extensions)
+EXTENSIONS = [
+    'crawlo.extension.log_interval.LogIntervalExtension',        # periodic logging
+    'crawlo.extension.log_stats.LogStats',                       # statistics
+    'crawlo.extension.logging_extension.CustomLoggerExtension',  # custom logging
+]
+
+# --------------------------------- 7. Logging and monitoring configuration ------------------------------------
+
+# Logging configuration
+LOG_LEVEL = None  # log level: DEBUG/INFO/WARNING/ERROR; defaults to None, set by the user in project settings
+STATS_DUMP = True  # whether to dump statistics periodically
+LOG_FILE = None  # log file path, to be set in project settings
+LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
+LOG_ENCODING = 'utf-8'
+LOG_MAX_BYTES = 10 * 1024 * 1024  # log rotation size (bytes)
+LOG_BACKUP_COUNT = 5  # number of log backups
+
+# Log interval configuration
+INTERVAL = 60  # log output interval (seconds)
+
+# Memory monitoring configuration
+MEMORY_MONITOR_ENABLED = False  # whether to enable memory monitoring
+MEMORY_MONITOR_INTERVAL = 60  # memory monitor check interval (seconds)
+MEMORY_WARNING_THRESHOLD = 80.0  # memory usage warning threshold (percent)
+MEMORY_CRITICAL_THRESHOLD = 90.0  # memory usage critical threshold (percent)
+
+# Performance profiling configuration
+PERFORMANCE_PROFILER_ENABLED = False  # whether to enable profiling
+PERFORMANCE_PROFILER_OUTPUT_DIR = 'profiling'  # profiler output directory
+PERFORMANCE_PROFILER_INTERVAL = 300  # profiling interval (seconds)
+
+# Health check configuration
+HEALTH_CHECK_ENABLED = True  # whether to enable health checks
+
+# --------------------------------- 8. Network request configuration ------------------------------------
+
+# Default request header configuration
+DEFAULT_REQUEST_HEADERS = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+    'Accept-Encoding': 'gzip, deflate, br',
+}  # default request headers
+
+# Default User-Agent (a modern browser User-Agent)
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
+
+# Whether to enable the random User-Agent feature (disabled by default; enable as needed)
+RANDOM_USER_AGENT_ENABLED = False  # whether to enable random user agents
+
+# Off-site filtering configuration
+ALLOWED_DOMAINS = []  # list of allowed domains
+
+# Proxy configuration (generic; supports both a static proxy list and a dynamic proxy API)
+PROXY_LIST = []  # static proxy list
+PROXY_API_URL = ""  # dynamic proxy API
+# Proxy extraction config: specifies how to extract proxy addresses from the API response
+# Accepted values:
+# - string: used directly as a field name, e.g. "proxy" (the default)
+# - dict: contains type and value fields, supporting several extraction modes
+#   - {"type": "field", "value": "data"}: extract from the given field
+#   - {"type": "jsonpath", "value": "$.data[0].proxy"}: extract with a JSONPath expression
+#   - {"type": "custom", "function": your_function}: extract with a custom function
+PROXY_EXTRACTOR = "proxy"  # proxy extraction config
+# Proxy failure handling
+PROXY_MAX_FAILED_ATTEMPTS = 3  # maximum failed attempts before a proxy is marked as dead
+
+# Proxy usage examples:
+# 1. Static proxy list:
+#    PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]
+#    PROXY_API_URL = ""  # no dynamic proxy
+#
+# 2. Dynamic proxy API (default field extraction):
+#    PROXY_LIST = []  # no static proxies
+#    PROXY_API_URL = "http://api.example.com/get_proxy"
+#    PROXY_EXTRACTOR = "proxy"  # extract from the "proxy" field
+#
+# 3. Dynamic proxy API (custom field extraction):
+#    PROXY_LIST = []  # no static proxies
+#    PROXY_API_URL = "http://api.example.com/get_proxy"
+#    PROXY_EXTRACTOR = "data"  # extract from the "data" field
+#
+# 4. Dynamic proxy API (nested field extraction):
+#    PROXY_LIST = []  # no static proxies
+#    PROXY_API_URL = "http://api.example.com/get_proxy"
+#    PROXY_EXTRACTOR = {"type": "field", "value": "result"}  # extract from the "result" field
+
+# Common downloader configuration
+DOWNLOAD_TIMEOUT = 30  # download timeout (seconds)
+VERIFY_SSL = True  # whether to verify SSL certificates
+CONNECTION_POOL_LIMIT = 100  # connection pool size limit
+CONNECTION_POOL_LIMIT_PER_HOST = 20  # per-host connection pool limit
+DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # maximum download size (bytes)
+DOWNLOAD_STATS = True  # whether to enable download statistics
+DOWNLOAD_WARN_SIZE = 1024 * 1024  # download size warning threshold (bytes)
+DOWNLOAD_RETRY_TIMES = 3  # download retry count
+MAX_RETRY_TIMES = 3  # maximum retry count
+
+# Downloader health check
+DOWNLOADER_HEALTH_CHECK = True  # whether to enable downloader health checks
+HEALTH_CHECK_INTERVAL = 60  # health check interval (seconds)
+REQUEST_STATS_ENABLED = True  # whether to enable request statistics
+STATS_RESET_ON_START = False  # whether to reset statistics on startup
+
+# HttpX downloader-specific configuration
+HTTPX_HTTP2 = True  # whether to enable HTTP/2 support
+HTTPX_FOLLOW_REDIRECTS = True  # whether to follow redirects automatically
+
+# AioHttp downloader-specific configuration
+AIOHTTP_AUTO_DECOMPRESS = True  # whether to auto-decompress responses
+AIOHTTP_FORCE_CLOSE = False  # whether to force-close connections
+
+# Curl-Cffi-specific configuration
+CURL_BROWSER_TYPE = "chrome"  # browser fingerprint emulation (CurlCffi downloader only)
+CURL_BROWSER_VERSION_MAP = {  # custom browser version map (can override the default behavior)
+    "chrome": "chrome136",
+    "edge": "edge101",
+    "safari": "safari184",
+    "firefox": "firefox135",
+}
+
+# Selenium downloader configuration
+SELENIUM_BROWSER_TYPE = "chrome"  # browser type: chrome, firefox, edge
+SELENIUM_HEADLESS = True  # whether to run headless
+SELENIUM_TIMEOUT = 30  # timeout (seconds)
+SELENIUM_LOAD_TIMEOUT = 10  # page load timeout (seconds)
+SELENIUM_WINDOW_WIDTH = 1920  # window width
+SELENIUM_WINDOW_HEIGHT = 1080  # window height
+SELENIUM_WAIT_FOR_ELEMENT = None  # selector of an element to wait for
+SELENIUM_ENABLE_JS = True  # whether to enable JavaScript
+SELENIUM_PROXY = None  # proxy settings
+SELENIUM_SINGLE_BROWSER_MODE = True  # single browser, multiple tabs mode
+SELENIUM_MAX_TABS_PER_BROWSER = 10  # maximum tabs per browser
+
+# Playwright downloader configuration
+PLAYWRIGHT_BROWSER_TYPE = "chromium"  # browser type: chromium, firefox, webkit
+PLAYWRIGHT_HEADLESS = True  # whether to run headless
+PLAYWRIGHT_TIMEOUT = 30000  # timeout (milliseconds)
+PLAYWRIGHT_LOAD_TIMEOUT = 10000  # page load timeout (milliseconds)
+PLAYWRIGHT_VIEWPORT_WIDTH = 1920  # viewport width
+PLAYWRIGHT_VIEWPORT_HEIGHT = 1080  # viewport height
+PLAYWRIGHT_WAIT_FOR_ELEMENT = None  # selector of an element to wait for
+PLAYWRIGHT_PROXY = None  # proxy settings
+PLAYWRIGHT_SINGLE_BROWSER_MODE = True  # single browser, multiple pages mode
+PLAYWRIGHT_MAX_PAGES_PER_BROWSER = 10  # maximum pages per browser
+
+# General optimization configuration
+CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
 CONNECTION_KEEPALIVE = True  # whether to enable HTTP keep-alive
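
For reference, the REDIS_URL branch in the settings above yields two URL shapes. A small sketch (host, port, db, and password values here are placeholders, not crawlo defaults):

def build_redis_url(host: str, port: int, db: int, password: str = "") -> str:
    # Mirrors the if/else over REDIS_PASSWORD in default_settings.py
    if password:
        return f"redis://:{password}@{host}:{port}/{db}"
    return f"redis://{host}:{port}/{db}"

print(build_redis_url("localhost", 6379, 0))            # redis://localhost:6379/0
print(build_redis_url("localhost", 6379, 0, "secret"))  # redis://:secret@localhost:6379/0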
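The three MYSQL_* SQL-generation flags map to three INSERT variants. REPLACE INTO and INSERT IGNORE are named explicitly in the settings comments; the ON DUPLICATE KEY UPDATE form for MYSQL_UPDATE_COLUMNS is an assumption based on the comment's description, and the table/column names are illustrative, not the output of crawlo's actual SQL builder (crawlo/pipelines/mysql_pipeline.py, not shown in this diff):

# MYSQL_AUTO_UPDATE = True -> overwrite the whole existing row
REPLACE_SQL = "REPLACE INTO crawlo (url, title) VALUES (%s, %s)"

# MYSQL_INSERT_IGNORE = True -> silently skip duplicate rows
IGNORE_SQL = "INSERT IGNORE INTO crawlo (url, title) VALUES (%s, %s)"

# MYSQL_UPDATE_COLUMNS = ('title',) -> update only the listed columns on conflict
# (takes precedence over MYSQL_AUTO_UPDATE); assumed to use ON DUPLICATE KEY UPDATE
UPSERT_SQL = ("INSERT INTO crawlo (url, title) VALUES (%s, %s) "
              "ON DUPLICATE KEY UPDATE title = VALUES(title)")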
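PROXY_EXTRACTOR accepts either a field name or a typed dict. A hedged sketch of how such a spec could be interpreted, assuming the proxy API response has already been parsed into a dict; extract_proxy is a hypothetical helper, and crawlo's real extractor lives in its proxy middleware, which is not shown in this diff:

def extract_proxy(payload: dict, spec):
    # Hypothetical interpreter for the PROXY_EXTRACTOR formats documented above.
    if isinstance(spec, str):                  # "proxy" -> payload["proxy"]
        return payload[spec]
    kind = spec.get("type")
    if kind == "field":                        # {"type": "field", "value": "data"}
        return payload[spec["value"]]
    if kind == "custom":                       # {"type": "custom", "function": fn}
        return spec["function"](payload)
    if kind == "jsonpath":                     # {"type": "jsonpath", "value": "$.data[0].proxy"}
        raise NotImplementedError("needs a JSONPath library such as jsonpath-ng")
    raise ValueError(f"unsupported PROXY_EXTRACTOR spec: {spec!r}")

extract_proxy({"proxy": "http://proxy1:8080"}, "proxy")  # -> "http://proxy1:8080"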