crawlo-1.4.7-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (348)
  1. crawlo/__init__.py +90 -90
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -140
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -379
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -320
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -451
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -290
  19. crawlo/crawler.py +698 -698
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -280
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -250
  25. crawlo/downloader/httpx_downloader.py +265 -265
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -425
  28. crawlo/downloader/selenium_downloader.py +486 -486
  29. crawlo/event.py +45 -45
  30. crawlo/exceptions.py +214 -214
  31. crawlo/extension/__init__.py +64 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -53
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -104
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +134 -134
  44. crawlo/filters/__init__.py +170 -170
  45. crawlo/filters/aioredis_filter.py +347 -347
  46. crawlo/filters/memory_filter.py +261 -261
  47. crawlo/framework.py +306 -306
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -391
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -240
  52. crawlo/initialization/phases.py +229 -229
  53. crawlo/initialization/registry.py +143 -143
  54. crawlo/initialization/utils.py +48 -48
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -42
  61. crawlo/logging/config.py +280 -276
  62. crawlo/logging/factory.py +175 -175
  63. crawlo/logging/manager.py +104 -104
  64. crawlo/middleware/__init__.py +87 -87
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -287
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +408 -376
  77. crawlo/network/response.py +598 -569
  78. crawlo/pipelines/__init__.py +52 -52
  79. crawlo/pipelines/base_pipeline.py +452 -452
  80. crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +196 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +104 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -139
  87. crawlo/pipelines/mysql_pipeline.py +468 -469
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -155
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +9 -9
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -591
  94. crawlo/queue/redis_priority_queue.py +518 -518
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +287 -284
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +658 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +1 -1
  104. crawlo/templates/project/items.py.tmpl +13 -13
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -35
  107. crawlo/templates/project/settings.py.tmpl +113 -109
  108. crawlo/templates/project/settings_distributed.py.tmpl +160 -156
  109. crawlo/templates/project/settings_gentle.py.tmpl +174 -170
  110. crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
  111. crawlo/templates/project/settings_minimal.py.tmpl +102 -98
  112. crawlo/templates/project/settings_simple.py.tmpl +172 -168
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -32
  116. crawlo/templates/spiders_init.py.tmpl +4 -4
  117. crawlo/tools/__init__.py +86 -86
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +74 -50
  123. crawlo/utils/batch_processor.py +276 -276
  124. crawlo/utils/config_manager.py +442 -442
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/encoding_helper.py +190 -0
  128. crawlo/utils/error_handler.py +410 -410
  129. crawlo/utils/fingerprint.py +121 -121
  130. crawlo/utils/func_tools.py +82 -82
  131. crawlo/utils/large_scale_helper.py +344 -344
  132. crawlo/utils/leak_detector.py +335 -335
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -157
  135. crawlo/utils/mysql_connection_pool.py +197 -197
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +90 -90
  139. crawlo/utils/redis_connection_pool.py +578 -578
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -278
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -337
  144. crawlo/utils/response_helper.py +113 -0
  145. crawlo/utils/selector_helper.py +138 -137
  146. crawlo/utils/singleton.py +69 -69
  147. crawlo/utils/spider_loader.py +201 -201
  148. crawlo/utils/text_helper.py +94 -94
  149. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
  150. crawlo-1.4.8.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -217
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -467
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -72
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  192. tests/ofweek_scrapy/scrapy.cfg +11 -11
  193. tests/optimized_performance_test.py +211 -211
  194. tests/performance_comparison.py +244 -244
  195. tests/queue_blocking_test.py +113 -113
  196. tests/queue_test.py +89 -89
  197. tests/redis_key_validation_demo.py +130 -130
  198. tests/request_params_example.py +150 -150
  199. tests/response_improvements_example.py +144 -144
  200. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  201. tests/scrapy_comparison/scrapy_test.py +133 -133
  202. tests/simple_cli_test.py +54 -54
  203. tests/simple_command_test.py +119 -119
  204. tests/simple_crawlo_test.py +126 -126
  205. tests/simple_follow_test.py +38 -38
  206. tests/simple_log_test2.py +137 -137
  207. tests/simple_optimization_test.py +128 -128
  208. tests/simple_queue_type_test.py +41 -41
  209. tests/simple_response_selector_test.py +94 -94
  210. tests/simple_selector_helper_test.py +154 -154
  211. tests/simple_selector_test.py +207 -207
  212. tests/simple_spider_test.py +49 -49
  213. tests/simple_url_test.py +73 -73
  214. tests/simulate_mysql_update_test.py +139 -139
  215. tests/spider_log_timing_test.py +177 -177
  216. tests/test_advanced_tools.py +148 -148
  217. tests/test_all_commands.py +230 -230
  218. tests/test_all_pipeline_fingerprints.py +133 -133
  219. tests/test_all_redis_key_configs.py +145 -145
  220. tests/test_asyncmy_usage.py +56 -56
  221. tests/test_batch_processor.py +178 -178
  222. tests/test_cleaners.py +54 -54
  223. tests/test_cli_arguments.py +118 -118
  224. tests/test_component_factory.py +174 -174
  225. tests/test_config_consistency.py +80 -80
  226. tests/test_config_merge.py +152 -152
  227. tests/test_config_validator.py +182 -182
  228. tests/test_controlled_spider_mixin.py +79 -79
  229. tests/test_crawler_process_import.py +38 -38
  230. tests/test_crawler_process_spider_modules.py +47 -47
  231. tests/test_crawlo_proxy_integration.py +114 -114
  232. tests/test_date_tools.py +123 -123
  233. tests/test_dedup_fix.py +220 -220
  234. tests/test_dedup_pipeline_consistency.py +124 -124
  235. tests/test_default_header_middleware.py +313 -313
  236. tests/test_distributed.py +65 -65
  237. tests/test_double_crawlo_fix.py +204 -204
  238. tests/test_double_crawlo_fix_simple.py +124 -124
  239. tests/test_download_delay_middleware.py +221 -221
  240. tests/test_downloader_proxy_compatibility.py +272 -272
  241. tests/test_edge_cases.py +305 -305
  242. tests/test_encoding_core.py +56 -56
  243. tests/test_encoding_detection.py +126 -126
  244. tests/test_enhanced_error_handler.py +270 -270
  245. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  246. tests/test_error_handler_compatibility.py +112 -112
  247. tests/test_factories.py +252 -252
  248. tests/test_factory_compatibility.py +196 -196
  249. tests/test_final_validation.py +153 -153
  250. tests/test_fingerprint_consistency.py +135 -135
  251. tests/test_fingerprint_simple.py +51 -51
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_helper.py +235 -235
  257. tests/test_logging_enhancements.py +374 -374
  258. tests/test_logging_final.py +184 -184
  259. tests/test_logging_integration.py +312 -312
  260. tests/test_logging_system.py +282 -282
  261. tests/test_middleware_debug.py +141 -141
  262. tests/test_mode_consistency.py +51 -51
  263. tests/test_multi_directory.py +67 -67
  264. tests/test_multiple_spider_modules.py +80 -80
  265. tests/test_mysql_pipeline_config.py +164 -164
  266. tests/test_mysql_pipeline_error.py +98 -98
  267. tests/test_mysql_pipeline_init_log.py +82 -82
  268. tests/test_mysql_pipeline_integration.py +132 -132
  269. tests/test_mysql_pipeline_refactor.py +143 -143
  270. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  271. tests/test_mysql_pipeline_robustness.py +195 -195
  272. tests/test_mysql_pipeline_types.py +88 -88
  273. tests/test_mysql_update_columns.py +93 -93
  274. tests/test_offsite_middleware.py +244 -244
  275. tests/test_offsite_middleware_simple.py +203 -203
  276. tests/test_optimized_selector_naming.py +100 -100
  277. tests/test_parsel.py +29 -29
  278. tests/test_performance.py +327 -327
  279. tests/test_performance_monitor.py +115 -115
  280. tests/test_pipeline_fingerprint_consistency.py +86 -86
  281. tests/test_priority_behavior.py +211 -211
  282. tests/test_priority_consistency.py +151 -151
  283. tests/test_priority_consistency_fixed.py +249 -249
  284. tests/test_proxy_health_check.py +32 -32
  285. tests/test_proxy_middleware.py +217 -217
  286. tests/test_proxy_middleware_enhanced.py +212 -212
  287. tests/test_proxy_middleware_integration.py +142 -142
  288. tests/test_proxy_middleware_refactored.py +207 -207
  289. tests/test_proxy_only.py +83 -83
  290. tests/test_proxy_providers.py +56 -56
  291. tests/test_proxy_stats.py +19 -19
  292. tests/test_proxy_strategies.py +59 -59
  293. tests/test_proxy_with_downloader.py +152 -152
  294. tests/test_queue_empty_check.py +41 -41
  295. tests/test_queue_manager_double_crawlo.py +173 -173
  296. tests/test_queue_manager_redis_key.py +179 -179
  297. tests/test_queue_naming.py +154 -154
  298. tests/test_queue_type.py +106 -106
  299. tests/test_queue_type_redis_config_consistency.py +130 -130
  300. tests/test_random_headers_default.py +322 -322
  301. tests/test_random_headers_necessity.py +308 -308
  302. tests/test_random_user_agent.py +72 -72
  303. tests/test_redis_config.py +28 -28
  304. tests/test_redis_connection_pool.py +294 -294
  305. tests/test_redis_key_naming.py +181 -181
  306. tests/test_redis_key_validator.py +123 -123
  307. tests/test_redis_queue.py +224 -224
  308. tests/test_redis_queue_name_fix.py +175 -175
  309. tests/test_redis_queue_type_fallback.py +129 -129
  310. tests/test_request_ignore_middleware.py +182 -182
  311. tests/test_request_params.py +111 -111
  312. tests/test_request_serialization.py +70 -70
  313. tests/test_response_code_middleware.py +349 -349
  314. tests/test_response_filter_middleware.py +427 -427
  315. tests/test_response_follow.py +104 -104
  316. tests/test_response_improvements.py +152 -152
  317. tests/test_response_selector_methods.py +92 -92
  318. tests/test_response_url_methods.py +70 -70
  319. tests/test_response_urljoin.py +86 -86
  320. tests/test_retry_middleware.py +333 -333
  321. tests/test_retry_middleware_realistic.py +273 -273
  322. tests/test_scheduler.py +252 -252
  323. tests/test_scheduler_config_update.py +133 -133
  324. tests/test_scrapy_style_encoding.py +112 -112
  325. tests/test_selector_helper.py +100 -100
  326. tests/test_selector_optimizations.py +146 -146
  327. tests/test_simple_response.py +61 -61
  328. tests/test_spider_loader.py +49 -49
  329. tests/test_spider_loader_comprehensive.py +69 -69
  330. tests/test_spider_modules.py +84 -84
  331. tests/test_spiders/test_spider.py +9 -9
  332. tests/test_telecom_spider_redis_key.py +205 -205
  333. tests/test_template_content.py +87 -87
  334. tests/test_template_redis_key.py +134 -134
  335. tests/test_tools.py +159 -159
  336. tests/test_user_agent_randomness.py +176 -176
  337. tests/test_user_agents.py +96 -96
  338. tests/untested_features_report.md +138 -138
  339. tests/verify_debug.py +51 -51
  340. tests/verify_distributed.py +117 -117
  341. tests/verify_log_fix.py +111 -111
  342. tests/verify_mysql_warnings.py +109 -109
  343. crawlo/utils/log.py +0 -80
  344. crawlo/utils/url_utils.py +0 -40
  345. crawlo-1.4.7.dist-info/RECORD +0 -347
  346. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  347. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  348. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-05-11 11:08
- # @Author : oscar
- # @Desc : None
- """
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-05-11 11:08
+ # @Author : oscar
+ # @Desc : None
+ """
@@ -1,285 +1,288 @@
- # -*- coding:UTF-8 -*-
- """
- Default settings file
- Contains all of the Crawlo framework's default settings
- """
- # Import the environment-variable configuration helper
- from crawlo.utils.config_manager import EnvConfigManager
-
- # --------------------------------- 1. Framework basic configuration ------------------------------------
-
- # Framework initialization control
- FRAMEWORK_INIT_ORDER = [
-     'log_system',          # logging system
-     'settings_system',     # settings system
-     'core_components',     # core components
-     'extensions',          # extension components
-     'full_initialization'  # full initialization
- ]
- FRAMEWORK_INIT_STATE = 'uninitialized'
-
- # Project basics
- runtime_config = EnvConfigManager.get_runtime_config()
- PROJECT_NAME = runtime_config['PROJECT_NAME']  # Project name (used for logging, Redis keys, etc.)
- VERSION = EnvConfigManager.get_version()  # Project version - read from the framework's __version__.py, with a fallback default if absent
- RUN_MODE = runtime_config['CRAWLO_MODE']  # Run mode: standalone/distributed/auto
- CONCURRENCY = runtime_config['CONCURRENCY']  # Concurrency level
-
- # Spider module configuration
- SPIDER_MODULES = []  # List of spider modules
- SPIDER_LOADER_WARN_ONLY = False  # Whether the spider loader only warns instead of raising errors
-
- # --------------------------------- 2. Crawler core configuration ------------------------------------
-
- # Downloader configuration
- DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"  # Default downloader
- DOWNLOAD_DELAY = 0.5  # Delay between requests (seconds)
- RANDOMNESS = True  # Whether to randomize the delay
- RANDOM_RANGE = [0.5, 1.5]  # Random delay factor range; actual delay = DOWNLOAD_DELAY * RANDOM_RANGE[0] to DOWNLOAD_DELAY * RANDOM_RANGE[1]
-
- # Scheduler configuration
- DEPTH_PRIORITY = 1  # Depth priority (negative = depth-first, positive = breadth-first)
- SCHEDULER_MAX_QUEUE_SIZE = 5000  # Maximum scheduler queue size
- BACKPRESSURE_RATIO = 0.9  # Backpressure threshold (triggered when the queue reaches 90% of capacity)
-
- # Request generation control
- REQUEST_GENERATION_BATCH_SIZE = 10  # Request generation batch size
- REQUEST_GENERATION_INTERVAL = 0.01  # Request generation interval (seconds)
- ENABLE_CONTROLLED_REQUEST_GENERATION = False  # Whether to enable controlled request generation
-
- # Queue configuration
- QUEUE_TYPE = 'auto'  # Queue type: memory/redis/auto
- # SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"  # Scheduler queue name (follows the unified naming convention)
- QUEUE_MAX_RETRIES = 3  # Maximum retries for queue operations
- QUEUE_TIMEOUT = 300  # Queue operation timeout (seconds)
-
- # --------------------------------- 3. Database and filter configuration ------------------------------------
-
- # MySQL configuration
- MYSQL_HOST = '127.0.0.1'
- MYSQL_PORT = 3306
- MYSQL_USER = 'root'
- MYSQL_PASSWORD = '123456'
- MYSQL_DB = 'crawl_pro'
- MYSQL_TABLE = 'crawlo'
- MYSQL_BATCH_SIZE = 100
- MYSQL_USE_BATCH = False  # Whether to enable batch inserts
- # MySQL SQL generation behavior
- MYSQL_AUTO_UPDATE = False  # Whether to use REPLACE INTO (fully overwrites existing records)
- MYSQL_INSERT_IGNORE = False  # Whether to use INSERT IGNORE (skips duplicate rows)
- MYSQL_UPDATE_COLUMNS = ()  # Columns to update on conflict; when set, MYSQL_AUTO_UPDATE is ignored
-
- # Redis configuration
- redis_config = EnvConfigManager.get_redis_config()
- REDIS_HOST = redis_config['REDIS_HOST']
- REDIS_PORT = redis_config['REDIS_PORT']
- REDIS_PASSWORD = redis_config['REDIS_PASSWORD']
- REDIS_DB = redis_config['REDIS_DB']
-
- # Redis cluster support:
- # Crawlo can switch intelligently between single-instance and cluster Redis.
- # Cluster mode can be configured in two ways:
- # 1. A comma-separated node list: '192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
- # 2. A cluster URL: 'redis-cluster://192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
- # The framework auto-detects the URL format and picks the appropriate mode
-
- # Build a different URL format depending on whether a password is set
- if REDIS_PASSWORD:
-     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
- else:
-     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
-
- # Redis key naming is encapsulated in framework components; users never configure it by hand:
- # - Request dedup:     crawlo:{PROJECT_NAME}:filter:fingerprint
- # - Item dedup:        crawlo:{PROJECT_NAME}:item:fingerprint
- # - Request queue:     crawlo:{PROJECT_NAME}:queue:requests
- # - Processing queue:  crawlo:{PROJECT_NAME}:queue:processing
- # - Failed queue:      crawlo:{PROJECT_NAME}:queue:failed
-
- REDIS_TTL = 0  # Fingerprint TTL (0 = never expires)
- CLEANUP_FP = 0  # Whether to clean up fingerprints on shutdown (0 = keep)
- FILTER_DEBUG = True  # Whether to log dedup debug messages
- DECODE_RESPONSES = True  # Whether Redis responses are decoded to strings
-
- # Filter configuration
- DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'  # In-memory filter and dedup pipeline by default
- FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
-
- # Bloom filter configuration
- BLOOM_FILTER_CAPACITY = 1000000  # Bloom filter capacity
- BLOOM_FILTER_ERROR_RATE = 0.001  # Bloom filter error rate
-
- # --------------------------------- 4. Middleware configuration ------------------------------------
-
- # Framework middleware list (framework defaults + user-defined middleware)
- MIDDLEWARES = [
-     # === Request preprocessing stage ===
-     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',    # 1. Ignore invalid requests
-     'crawlo.middleware.download_delay.DownloadDelayMiddleware',    # 2. Throttle request rate
-     'crawlo.middleware.default_header.DefaultHeaderMiddleware',    # 3. Add default request headers
-     'crawlo.middleware.offsite.OffsiteMiddleware',                 # 5. Filter offsite requests
-
-     # === Response processing stage ===
-     'crawlo.middleware.retry.RetryMiddleware',                     # 6. Retry failed requests
-     'crawlo.middleware.response_code.ResponseCodeMiddleware',      # 7. Handle special status codes
-     'crawlo.middleware.response_filter.ResponseFilterMiddleware',  # 8. Filter response content
- ]
-
- # --------------------------------- 5. Pipeline configuration ------------------------------------
-
- # Framework data-processing pipeline list (framework defaults + user-defined pipelines)
- PIPELINES = [
-     'crawlo.pipelines.console_pipeline.ConsolePipeline',
- ]
-
- # --------------------------------- 6. Extension configuration ------------------------------------
-
- # Framework extension list (framework defaults + user-defined extensions)
- EXTENSIONS = [
-     'crawlo.extension.log_interval.LogIntervalExtension',        # periodic logging
-     'crawlo.extension.log_stats.LogStats',                       # statistics
-     'crawlo.extension.logging_extension.CustomLoggerExtension',  # custom logging
- ]
-
- # --------------------------------- 7. Logging and monitoring configuration ------------------------------------
-
- # Logging configuration
- LOG_LEVEL = None  # Log level: DEBUG/INFO/WARNING/ERROR; defaults to None, set by the user in the project settings
- STATS_DUMP = True  # Whether to dump statistics periodically
- LOG_FILE = None  # Log file path, set in the project configuration
- LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
- LOG_ENCODING = 'utf-8'
- LOG_MAX_BYTES = 10 * 1024 * 1024  # Log rotation size (bytes)
- LOG_BACKUP_COUNT = 5  # Number of rotated log backups
-
- # Log interval configuration
- INTERVAL = 60  # Log output interval (seconds)
-
- # Memory monitor configuration
- MEMORY_MONITOR_ENABLED = False  # Whether to enable memory monitoring
- MEMORY_MONITOR_INTERVAL = 60  # Memory monitor check interval (seconds)
- MEMORY_WARNING_THRESHOLD = 80.0  # Memory usage warning threshold (percent)
- MEMORY_CRITICAL_THRESHOLD = 90.0  # Memory usage critical threshold (percent)
-
- # Performance profiler configuration
- PERFORMANCE_PROFILER_ENABLED = False  # Whether to enable profiling
- PERFORMANCE_PROFILER_OUTPUT_DIR = 'profiling'  # Profiler output directory
- PERFORMANCE_PROFILER_INTERVAL = 300  # Profiling interval (seconds)
-
- # Health check configuration
- HEALTH_CHECK_ENABLED = True  # Whether to enable health checks
-
- # --------------------------------- 8. Network request configuration ------------------------------------
-
- # Default request headers
- DEFAULT_REQUEST_HEADERS = {
-     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
-     'Accept-Encoding': 'gzip, deflate, br',
- }  # default request headers
-
- # Default User-Agent (a modern browser UA)
- USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
-
- # Whether to enable random User-Agents (disabled by default; enable as needed)
- RANDOM_USER_AGENT_ENABLED = False  # Whether to use a random user agent
-
- # Offsite filtering
- ALLOWED_DOMAINS = []  # List of allowed domains
-
- # Proxy configuration (generic; supports both a static proxy list and a dynamic proxy API)
- PROXY_LIST = []  # Static proxy list
- PROXY_API_URL = ""  # Dynamic proxy API
- # Proxy extractor: specifies how to pull proxy addresses out of the API response
- # Accepted values:
- # - A string: used directly as a field name, e.g. "proxy" (the default)
- # - A dict with type and value keys, supporting several extraction modes:
- #   - {"type": "field", "value": "data"}: extract from the given field
- #   - {"type": "jsonpath", "value": "$.data[0].proxy"}: extract with a JSONPath expression
- #   - {"type": "custom", "function": your_function}: extract with a custom function
- PROXY_EXTRACTOR = "proxy"  # Proxy extractor setting
- # Proxy failure handling
- PROXY_MAX_FAILED_ATTEMPTS = 3  # Maximum failed attempts before a proxy is marked as dead
-
- # Proxy usage examples:
- # 1. Static proxy list:
- #    PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]
- #    PROXY_API_URL = ""  # no dynamic proxy
- #
- # 2. Dynamic proxy API (default field extraction):
- #    PROXY_LIST = []  # no static proxies
- #    PROXY_API_URL = "http://api.example.com/get_proxy"
- #    PROXY_EXTRACTOR = "proxy"  # extract from the "proxy" field
- #
- # 3. Dynamic proxy API (custom field extraction):
- #    PROXY_LIST = []  # no static proxies
- #    PROXY_API_URL = "http://api.example.com/get_proxy"
- #    PROXY_EXTRACTOR = "data"  # extract from the "data" field
- #
- # 4. Dynamic proxy API (nested field extraction):
- #    PROXY_LIST = []  # no static proxies
- #    PROXY_API_URL = "http://api.example.com/get_proxy"
- #    PROXY_EXTRACTOR = {"type": "field", "value": "result"}  # extract from the "result" field
-
- # Common downloader configuration
- DOWNLOAD_TIMEOUT = 30  # Download timeout (seconds)
- VERIFY_SSL = True  # Whether to verify SSL certificates
- CONNECTION_POOL_LIMIT = 100  # Connection pool size limit
- CONNECTION_POOL_LIMIT_PER_HOST = 20  # Per-host connection pool limit
- DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # Maximum download size (bytes)
- DOWNLOAD_STATS = True  # Whether to collect download statistics
- DOWNLOAD_WARN_SIZE = 1024 * 1024  # Download size warning threshold (bytes)
- DOWNLOAD_RETRY_TIMES = 3  # Download retry count
- MAX_RETRY_TIMES = 3  # Maximum retry count
-
- # Downloader health checks
- DOWNLOADER_HEALTH_CHECK = True  # Whether to enable downloader health checks
- HEALTH_CHECK_INTERVAL = 60  # Health check interval (seconds)
- REQUEST_STATS_ENABLED = True  # Whether to collect request statistics
- STATS_RESET_ON_START = False  # Whether to reset statistics on startup
-
- # HttpX downloader settings
- HTTPX_HTTP2 = True  # Whether to enable HTTP/2 support
- HTTPX_FOLLOW_REDIRECTS = True  # Whether to follow redirects automatically
-
- # AioHttp downloader settings
- AIOHTTP_AUTO_DECOMPRESS = True  # Whether to auto-decompress responses
- AIOHTTP_FORCE_CLOSE = False  # Whether to force-close connections
-
- # Curl-Cffi specific settings
- CURL_BROWSER_TYPE = "chrome"  # Browser fingerprint emulation (CurlCffi downloader only)
- CURL_BROWSER_VERSION_MAP = {  # Custom browser version map (overrides the default behavior)
-     "chrome": "chrome136",
-     "edge": "edge101",
-     "safari": "safari184",
-     "firefox": "firefox135",
- }
-
- # Selenium downloader settings
- SELENIUM_BROWSER_TYPE = "chrome"  # Browser type: chrome, firefox, edge
- SELENIUM_HEADLESS = True  # Whether to run headless
- SELENIUM_TIMEOUT = 30  # Timeout (seconds)
- SELENIUM_LOAD_TIMEOUT = 10  # Page load timeout (seconds)
- SELENIUM_WINDOW_WIDTH = 1920  # Window width
- SELENIUM_WINDOW_HEIGHT = 1080  # Window height
- SELENIUM_WAIT_FOR_ELEMENT = None  # Selector of an element to wait for
- SELENIUM_ENABLE_JS = True  # Whether to enable JavaScript
- SELENIUM_PROXY = None  # Proxy setting
- SELENIUM_SINGLE_BROWSER_MODE = True  # Single browser with multiple tabs
- SELENIUM_MAX_TABS_PER_BROWSER = 10  # Maximum tabs per browser
-
- # Playwright downloader settings
- PLAYWRIGHT_BROWSER_TYPE = "chromium"  # Browser type: chromium, firefox, webkit
- PLAYWRIGHT_HEADLESS = True  # Whether to run headless
- PLAYWRIGHT_TIMEOUT = 30000  # Timeout (milliseconds)
- PLAYWRIGHT_LOAD_TIMEOUT = 10000  # Page load timeout (milliseconds)
- PLAYWRIGHT_VIEWPORT_WIDTH = 1920  # Viewport width
- PLAYWRIGHT_VIEWPORT_HEIGHT = 1080  # Viewport height
- PLAYWRIGHT_WAIT_FOR_ELEMENT = None  # Selector of an element to wait for
- PLAYWRIGHT_PROXY = None  # Proxy setting
- PLAYWRIGHT_SINGLE_BROWSER_MODE = True  # Single browser with multiple pages
- PLAYWRIGHT_MAX_PAGES_PER_BROWSER = 10  # Maximum pages per browser
-
- # General optimization settings
- CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
+ # -*- coding:UTF-8 -*-
+ """
+ Default settings file
+ Contains all of the Crawlo framework's default settings
+ """
+ # Import the environment-variable configuration helper
+ from crawlo.utils.config_manager import EnvConfigManager
+
+ # --------------------------------- 1. Framework basic configuration ------------------------------------
+
+ # Framework initialization control
+ FRAMEWORK_INIT_ORDER = [
+     'log_system',          # logging system
+     'settings_system',     # settings system
+     'core_components',     # core components
+     'extensions',          # extension components
+     'full_initialization'  # full initialization
+ ]
+ FRAMEWORK_INIT_STATE = 'uninitialized'
+
+ # Project basics
+ runtime_config = EnvConfigManager.get_runtime_config()
+ PROJECT_NAME = runtime_config['PROJECT_NAME']  # Project name (used for logging, Redis keys, etc.)
+ VERSION = EnvConfigManager.get_version()  # Project version - read from the framework's __version__.py, with a fallback default if absent
+ RUN_MODE = runtime_config['CRAWLO_MODE']  # Run mode: standalone/distributed/auto
+ CONCURRENCY = runtime_config['CONCURRENCY']  # Concurrency level
+
+ # Spider module configuration
+ SPIDER_MODULES = []  # List of spider modules
+ SPIDER_LOADER_WARN_ONLY = False  # Whether the spider loader only warns instead of raising errors
+
+ # --------------------------------- 2. Crawler core configuration ------------------------------------
+
+ # Downloader configuration
+ DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"  # Default downloader
+ DOWNLOAD_DELAY = 0.5  # Delay between requests (seconds)
+ RANDOMNESS = True  # Whether to randomize the delay
+ RANDOM_RANGE = [0.5, 1.5]  # Random delay factor range; actual delay = DOWNLOAD_DELAY * RANDOM_RANGE[0] to DOWNLOAD_DELAY * RANDOM_RANGE[1]
+
+ # Scheduler configuration
+ DEPTH_PRIORITY = 1  # Depth priority (negative = depth-first, positive = breadth-first)
+ SCHEDULER_MAX_QUEUE_SIZE = 5000  # Maximum scheduler queue size
+ BACKPRESSURE_RATIO = 0.9  # Backpressure threshold (triggered when the queue reaches 90% of capacity)
+
+ # Request generation control
+ REQUEST_GENERATION_BATCH_SIZE = 10  # Request generation batch size
+ REQUEST_GENERATION_INTERVAL = 0.01  # Request generation interval (seconds)
+ ENABLE_CONTROLLED_REQUEST_GENERATION = False  # Whether to enable controlled request generation
+
+ # Queue configuration
+ QUEUE_TYPE = 'auto'  # Queue type: memory/redis/auto
+ # SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"  # Scheduler queue name (follows the unified naming convention)
+ QUEUE_MAX_RETRIES = 3  # Maximum retries for queue operations
+ QUEUE_TIMEOUT = 300  # Queue operation timeout (seconds)
+
+ # --------------------------------- 3. Database and filter configuration ------------------------------------
+
+ # MySQL configuration
+ MYSQL_HOST = '127.0.0.1'
+ MYSQL_PORT = 3306
+ MYSQL_USER = 'root'
+ MYSQL_PASSWORD = '123456'
+ MYSQL_DB = 'crawl_pro'
+ MYSQL_TABLE = 'crawlo'
+ MYSQL_BATCH_SIZE = 100
+ MYSQL_USE_BATCH = False  # Whether to enable batch inserts
+ # MySQL SQL generation behavior
+ MYSQL_AUTO_UPDATE = False  # Whether to use REPLACE INTO (fully overwrites existing records)
+ MYSQL_INSERT_IGNORE = False  # Whether to use INSERT IGNORE (skips duplicate rows)
+ MYSQL_UPDATE_COLUMNS = ()  # Columns to update on conflict; when set, MYSQL_AUTO_UPDATE is ignored
+
+ # Redis configuration
+ redis_config = EnvConfigManager.get_redis_config()
+ REDIS_HOST = redis_config['REDIS_HOST']
+ REDIS_PORT = redis_config['REDIS_PORT']
+ REDIS_PASSWORD = redis_config['REDIS_PASSWORD']
+ REDIS_DB = redis_config['REDIS_DB']
+
+ # Redis cluster support:
+ # Crawlo can switch intelligently between single-instance and cluster Redis.
+ # Cluster mode can be configured in two ways:
+ # 1. A comma-separated node list: '192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+ # 2. A cluster URL: 'redis-cluster://192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+ # The framework auto-detects the URL format and picks the appropriate mode
+
+ # Build a different URL format depending on whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+
+ # Redis key naming is encapsulated in framework components; users never configure it by hand:
+ # - Request dedup:     crawlo:{PROJECT_NAME}:filter:fingerprint
+ # - Item dedup:        crawlo:{PROJECT_NAME}:item:fingerprint
+ # - Request queue:     crawlo:{PROJECT_NAME}:queue:requests
+ # - Processing queue:  crawlo:{PROJECT_NAME}:queue:processing
+ # - Failed queue:      crawlo:{PROJECT_NAME}:queue:failed
+
+ REDIS_TTL = 0  # Fingerprint TTL (0 = never expires)
+ CLEANUP_FP = 0  # Whether to clean up fingerprints on shutdown (0 = keep)
+ FILTER_DEBUG = True  # Whether to log dedup debug messages
+ DECODE_RESPONSES = True  # Whether Redis responses are decoded to strings
+
+ # Filter configuration
+ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'  # In-memory filter and dedup pipeline by default
+ FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+
+ # Bloom filter configuration
+ BLOOM_FILTER_CAPACITY = 1000000  # Bloom filter capacity
+ BLOOM_FILTER_ERROR_RATE = 0.001  # Bloom filter error rate
+
+ # --------------------------------- 4. Middleware configuration ------------------------------------
+
+ # Framework middleware list (framework defaults + user-defined middleware)
+ MIDDLEWARES = [
+     # === Request preprocessing stage ===
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',    # 1. Ignore invalid requests
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',    # 2. Throttle request rate
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',    # 3. Add default request headers
+     'crawlo.middleware.offsite.OffsiteMiddleware',                 # 5. Filter offsite requests
+
+     # === Response processing stage ===
+     'crawlo.middleware.retry.RetryMiddleware',                     # 6. Retry failed requests
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',      # 7. Handle special status codes
+     'crawlo.middleware.response_filter.ResponseFilterMiddleware',  # 8. Filter response content
+ ]
+
+ # --------------------------------- 5. Pipeline configuration ------------------------------------
+
+ # Framework data-processing pipeline list (framework defaults + user-defined pipelines)
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+ ]
+
+ # --------------------------------- 6. Extension configuration ------------------------------------
+
+ # Framework extension list (framework defaults + user-defined extensions)
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',        # periodic logging
+     'crawlo.extension.log_stats.LogStats',                       # statistics
+     'crawlo.extension.logging_extension.CustomLoggerExtension',  # custom logging
+ ]
+
+ # --------------------------------- 7. Logging and monitoring configuration ------------------------------------
+
+ # Logging configuration
+ LOG_LEVEL = None  # Log level: DEBUG/INFO/WARNING/ERROR; defaults to None, set by the user in the project settings
+ STATS_DUMP = True  # Whether to dump statistics periodically
+ LOG_FILE = None  # Log file path, set in the project configuration
+ LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
+ LOG_ENCODING = 'utf-8'
+ LOG_MAX_BYTES = 10 * 1024 * 1024  # Log rotation size (bytes); 20 MB recommended for production
+ LOG_BACKUP_COUNT = 5  # Number of rotated log backups; 10 recommended for production
+ # To disable log rotation entirely, set LOG_MAX_BYTES = 0
+ # Note: when LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation never happens and the log file grows indefinitely
+ # Disk space must then be managed some other way, e.g. a system-level rotation tool such as logrotate
+
+ # Log interval configuration
+ INTERVAL = 60  # Log output interval (seconds)
+
+ # Memory monitor configuration
+ MEMORY_MONITOR_ENABLED = False  # Whether to enable memory monitoring
+ MEMORY_MONITOR_INTERVAL = 60  # Memory monitor check interval (seconds)
+ MEMORY_WARNING_THRESHOLD = 80.0  # Memory usage warning threshold (percent)
+ MEMORY_CRITICAL_THRESHOLD = 90.0  # Memory usage critical threshold (percent)
+
+ # Performance profiler configuration
+ PERFORMANCE_PROFILER_ENABLED = False  # Whether to enable profiling
+ PERFORMANCE_PROFILER_OUTPUT_DIR = 'profiling'  # Profiler output directory
+ PERFORMANCE_PROFILER_INTERVAL = 300  # Profiling interval (seconds)
+
+ # Health check configuration
+ HEALTH_CHECK_ENABLED = True  # Whether to enable health checks
+
+ # --------------------------------- 8. Network request configuration ------------------------------------
+
+ # Default request headers
+ DEFAULT_REQUEST_HEADERS = {
+     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+     'Accept-Encoding': 'gzip, deflate, br',
+ }  # default request headers
+
+ # Default User-Agent (a modern browser UA)
+ USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
+
+ # Whether to enable random User-Agents (disabled by default; enable as needed)
+ RANDOM_USER_AGENT_ENABLED = False  # Whether to use a random user agent
+
+ # Offsite filtering
+ ALLOWED_DOMAINS = []  # List of allowed domains
+
+ # Proxy configuration (generic; supports both a static proxy list and a dynamic proxy API)
+ PROXY_LIST = []  # Static proxy list
+ PROXY_API_URL = ""  # Dynamic proxy API
+ # Proxy extractor: specifies how to pull proxy addresses out of the API response
+ # Accepted values:
+ # - A string: used directly as a field name, e.g. "proxy" (the default)
+ # - A dict with type and value keys, supporting several extraction modes:
+ #   - {"type": "field", "value": "data"}: extract from the given field
+ #   - {"type": "jsonpath", "value": "$.data[0].proxy"}: extract with a JSONPath expression
+ #   - {"type": "custom", "function": your_function}: extract with a custom function
+ PROXY_EXTRACTOR = "proxy"  # Proxy extractor setting
+ # Proxy failure handling
+ PROXY_MAX_FAILED_ATTEMPTS = 3  # Maximum failed attempts before a proxy is marked as dead
+
+ # Proxy usage examples:
+ # 1. Static proxy list:
+ #    PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]
+ #    PROXY_API_URL = ""  # no dynamic proxy
+ #
+ # 2. Dynamic proxy API (default field extraction):
+ #    PROXY_LIST = []  # no static proxies
+ #    PROXY_API_URL = "http://api.example.com/get_proxy"
+ #    PROXY_EXTRACTOR = "proxy"  # extract from the "proxy" field
+ #
+ # 3. Dynamic proxy API (custom field extraction):
+ #    PROXY_LIST = []  # no static proxies
+ #    PROXY_API_URL = "http://api.example.com/get_proxy"
+ #    PROXY_EXTRACTOR = "data"  # extract from the "data" field
+ #
+ # 4. Dynamic proxy API (nested field extraction):
+ #    PROXY_LIST = []  # no static proxies
+ #    PROXY_API_URL = "http://api.example.com/get_proxy"
+ #    PROXY_EXTRACTOR = {"type": "field", "value": "result"}  # extract from the "result" field
+
+ # Common downloader configuration
+ DOWNLOAD_TIMEOUT = 30  # Download timeout (seconds)
+ VERIFY_SSL = True  # Whether to verify SSL certificates
+ CONNECTION_POOL_LIMIT = 100  # Connection pool size limit
+ CONNECTION_POOL_LIMIT_PER_HOST = 20  # Per-host connection pool limit
+ DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # Maximum download size (bytes)
+ DOWNLOAD_STATS = True  # Whether to collect download statistics
+ DOWNLOAD_WARN_SIZE = 1024 * 1024  # Download size warning threshold (bytes)
+ DOWNLOAD_RETRY_TIMES = 3  # Download retry count
+ MAX_RETRY_TIMES = 3  # Maximum retry count
+
+ # Downloader health checks
+ DOWNLOADER_HEALTH_CHECK = True  # Whether to enable downloader health checks
+ HEALTH_CHECK_INTERVAL = 60  # Health check interval (seconds)
+ REQUEST_STATS_ENABLED = True  # Whether to collect request statistics
+ STATS_RESET_ON_START = False  # Whether to reset statistics on startup
+
+ # HttpX downloader settings
+ HTTPX_HTTP2 = True  # Whether to enable HTTP/2 support
+ HTTPX_FOLLOW_REDIRECTS = True  # Whether to follow redirects automatically
+
+ # AioHttp downloader settings
+ AIOHTTP_AUTO_DECOMPRESS = True  # Whether to auto-decompress responses
+ AIOHTTP_FORCE_CLOSE = False  # Whether to force-close connections
+
+ # Curl-Cffi specific settings
+ CURL_BROWSER_TYPE = "chrome"  # Browser fingerprint emulation (CurlCffi downloader only)
+ CURL_BROWSER_VERSION_MAP = {  # Custom browser version map (overrides the default behavior)
+     "chrome": "chrome136",
+     "edge": "edge101",
+     "safari": "safari184",
+     "firefox": "firefox135",
+ }
+
+ # Selenium downloader settings
+ SELENIUM_BROWSER_TYPE = "chrome"  # Browser type: chrome, firefox, edge
+ SELENIUM_HEADLESS = True  # Whether to run headless
+ SELENIUM_TIMEOUT = 30  # Timeout (seconds)
+ SELENIUM_LOAD_TIMEOUT = 10  # Page load timeout (seconds)
+ SELENIUM_WINDOW_WIDTH = 1920  # Window width
+ SELENIUM_WINDOW_HEIGHT = 1080  # Window height
+ SELENIUM_WAIT_FOR_ELEMENT = None  # Selector of an element to wait for
+ SELENIUM_ENABLE_JS = True  # Whether to enable JavaScript
+ SELENIUM_PROXY = None  # Proxy setting
+ SELENIUM_SINGLE_BROWSER_MODE = True  # Single browser with multiple tabs
+ SELENIUM_MAX_TABS_PER_BROWSER = 10  # Maximum tabs per browser
+
+ # Playwright downloader settings
+ PLAYWRIGHT_BROWSER_TYPE = "chromium"  # Browser type: chromium, firefox, webkit
+ PLAYWRIGHT_HEADLESS = True  # Whether to run headless
+ PLAYWRIGHT_TIMEOUT = 30000  # Timeout (milliseconds)
+ PLAYWRIGHT_LOAD_TIMEOUT = 10000  # Page load timeout (milliseconds)
+ PLAYWRIGHT_VIEWPORT_WIDTH = 1920  # Viewport width
+ PLAYWRIGHT_VIEWPORT_HEIGHT = 1080  # Viewport height
+ PLAYWRIGHT_WAIT_FOR_ELEMENT = None  # Selector of an element to wait for
+ PLAYWRIGHT_PROXY = None  # Proxy setting
+ PLAYWRIGHT_SINGLE_BROWSER_MODE = True  # Single browser with multiple pages
+ PLAYWRIGHT_MAX_PAGES_PER_BROWSER = 10  # Maximum pages per browser
+
+ # General optimization settings
+ CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
  CONNECTION_KEEPALIVE = True  # Whether to enable HTTP keep-alive
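
For reference, the delay settings in the diff above combine exactly as the RANDOM_RANGE comment describes. A minimal sketch of that computation (illustrative only; the framework's actual logic lives in crawlo/middleware/download_delay.py and may differ):

    import random

    DOWNLOAD_DELAY = 0.5
    RANDOMNESS = True
    RANDOM_RANGE = [0.5, 1.5]

    def effective_delay() -> float:
        # actual delay = DOWNLOAD_DELAY * factor, factor drawn from RANDOM_RANGE
        if not RANDOMNESS:
            return DOWNLOAD_DELAY
        return DOWNLOAD_DELAY * random.uniform(*RANDOM_RANGE)

    # With the defaults this yields a delay between 0.25s and 0.75s per request.
    print(f"sleeping {effective_delay():.2f}s before the next request")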
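The Redis comments describe auto-detection between single-instance and cluster targets. A hypothetical sketch of such detection (the real logic is in crawlo/utils/redis_connection_pool.py and may differ; parse_redis_target is an illustrative name, not a framework function):

    def parse_redis_target(url: str):
        """Classify a Redis target string as 'cluster' or 'single'."""
        prefix = "redis-cluster://"
        if url.startswith(prefix):
            nodes = url[len(prefix):]
            return "cluster", [tuple(node.split(":")) for node in nodes.split(",")]
        if "," in url:  # bare comma-separated host:port node list
            return "cluster", [tuple(node.split(":")) for node in url.split(",")]
        return "single", url  # ordinary redis:// URL

    print(parse_redis_target("redis-cluster://192.168.1.100:7000,192.168.1.101:7000"))
    print(parse_redis_target("redis://127.0.0.1:6379/0"))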
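The PROXY_EXTRACTOR modes listed in the settings can be read as a small dispatch over the API response. A hypothetical sketch (the real implementation is in crawlo/middleware/proxy.py; the jsonpath branch assumes the third-party jsonpath-ng package is installed):

    def extract_proxy(api_response: dict, extractor):
        if isinstance(extractor, str):  # plain field name, e.g. "proxy"
            return api_response[extractor]
        kind = extractor.get("type")
        if kind == "field":  # {"type": "field", "value": "result"}
            return api_response[extractor["value"]]
        if kind == "jsonpath":  # {"type": "jsonpath", "value": "$.data[0].proxy"}
            from jsonpath_ng import parse  # assumption: jsonpath-ng is available
            matches = parse(extractor["value"]).find(api_response)
            return matches[0].value if matches else None
        if kind == "custom":  # {"type": "custom", "function": your_function}
            return extractor["function"](api_response)
        raise ValueError(f"unsupported PROXY_EXTRACTOR: {extractor!r}")

    response = {"data": [{"proxy": "http://proxy1:8080"}]}
    print(extract_proxy(response, {"type": "jsonpath", "value": "$.data[0].proxy"}))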
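The new LOG_MAX_BYTES / LOG_BACKUP_COUNT comments in 1.4.8 match the semantics of the stdlib RotatingFileHandler, where a zero maxBytes or backupCount means rollover never occurs. A sketch assuming crawlo's logging system maps onto that handler (the log file path here is hypothetical, since LOG_FILE defaults to None):

    import logging
    from logging.handlers import RotatingFileHandler

    LOG_MAX_BYTES = 10 * 1024 * 1024  # 0 would disable rotation: the file grows without bound
    LOG_BACKUP_COUNT = 5              # 0 likewise means rollover never occurs

    handler = RotatingFileHandler(
        "crawlo.log",                  # hypothetical path
        maxBytes=LOG_MAX_BYTES,
        backupCount=LOG_BACKUP_COUNT,  # keeps crawlo.log.1 ... crawlo.log.5
        encoding="utf-8",
    )
    handler.setFormatter(logging.Formatter('%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'))
    logging.getLogger("crawlo").addHandler(handler)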