crawlo 1.4.6__py3-none-any.whl → 1.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (374)
  1. crawlo/__init__.py +90 -89
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -341
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -438
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -291
  19. crawlo/crawler.py +698 -657
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -276
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -247
  25. crawlo/downloader/httpx_downloader.py +265 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -402
  28. crawlo/downloader/selenium_downloader.py +486 -472
  29. crawlo/event.py +45 -11
  30. crawlo/exceptions.py +215 -82
  31. crawlo/extension/__init__.py +65 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +135 -0
  44. crawlo/filters/__init__.py +170 -153
  45. crawlo/filters/aioredis_filter.py +348 -264
  46. crawlo/filters/memory_filter.py +261 -276
  47. crawlo/framework.py +306 -292
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -434
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -194
  52. crawlo/initialization/phases.py +230 -149
  53. crawlo/initialization/registry.py +143 -145
  54. crawlo/initialization/utils.py +49 -0
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -46
  61. crawlo/logging/config.py +277 -197
  62. crawlo/logging/factory.py +175 -171
  63. crawlo/logging/manager.py +104 -112
  64. crawlo/middleware/__init__.py +87 -24
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -253
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +375 -379
  77. crawlo/network/response.py +569 -664
  78. crawlo/pipelines/__init__.py +53 -22
  79. crawlo/pipelines/base_pipeline.py +452 -0
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -132
  87. crawlo/pipelines/mysql_pipeline.py +469 -476
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +10 -0
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -525
  94. crawlo/queue/redis_priority_queue.py +519 -370
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +284 -277
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +657 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +2 -4
  104. crawlo/templates/project/items.py.tmpl +13 -17
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -36
  107. crawlo/templates/project/settings.py.tmpl +109 -111
  108. crawlo/templates/project/settings_distributed.py.tmpl +156 -159
  109. crawlo/templates/project/settings_gentle.py.tmpl +170 -176
  110. crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
  111. crawlo/templates/project/settings_minimal.py.tmpl +98 -100
  112. crawlo/templates/project/settings_simple.py.tmpl +168 -174
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -40
  116. crawlo/templates/spiders_init.py.tmpl +5 -10
  117. crawlo/tools/__init__.py +86 -189
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +50 -50
  123. crawlo/utils/batch_processor.py +276 -259
  124. crawlo/utils/config_manager.py +442 -0
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/error_handler.py +410 -410
  128. crawlo/utils/fingerprint.py +121 -121
  129. crawlo/utils/func_tools.py +82 -82
  130. crawlo/utils/large_scale_helper.py +344 -344
  131. crawlo/utils/leak_detector.py +335 -0
  132. crawlo/utils/log.py +79 -79
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -0
  135. crawlo/utils/mysql_connection_pool.py +197 -0
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +91 -0
  139. crawlo/utils/redis_connection_pool.py +578 -388
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -256
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -0
  144. crawlo/utils/selector_helper.py +137 -137
  145. crawlo/utils/singleton.py +70 -0
  146. crawlo/utils/spider_loader.py +201 -201
  147. crawlo/utils/text_helper.py +94 -94
  148. crawlo/utils/{url.py → url_utils.py} +39 -39
  149. crawlo-1.4.7.dist-info/METADATA +689 -0
  150. crawlo-1.4.7.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -275
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -0
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -0
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/scrapy.cfg +11 -11
  192. tests/optimized_performance_test.py +211 -211
  193. tests/performance_comparison.py +244 -244
  194. tests/queue_blocking_test.py +113 -113
  195. tests/queue_test.py +89 -89
  196. tests/redis_key_validation_demo.py +130 -130
  197. tests/request_params_example.py +150 -150
  198. tests/response_improvements_example.py +144 -144
  199. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  200. tests/scrapy_comparison/scrapy_test.py +133 -133
  201. tests/simple_cli_test.py +55 -0
  202. tests/simple_command_test.py +119 -119
  203. tests/simple_crawlo_test.py +126 -126
  204. tests/simple_follow_test.py +38 -38
  205. tests/simple_log_test2.py +137 -137
  206. tests/simple_optimization_test.py +128 -128
  207. tests/simple_queue_type_test.py +41 -41
  208. tests/simple_response_selector_test.py +94 -94
  209. tests/simple_selector_helper_test.py +154 -154
  210. tests/simple_selector_test.py +207 -207
  211. tests/simple_spider_test.py +49 -49
  212. tests/simple_url_test.py +73 -73
  213. tests/simulate_mysql_update_test.py +139 -139
  214. tests/spider_log_timing_test.py +177 -177
  215. tests/test_advanced_tools.py +148 -148
  216. tests/test_all_commands.py +230 -230
  217. tests/test_all_pipeline_fingerprints.py +133 -133
  218. tests/test_all_redis_key_configs.py +145 -145
  219. tests/test_asyncmy_usage.py +56 -56
  220. tests/test_batch_processor.py +178 -178
  221. tests/test_cleaners.py +54 -54
  222. tests/test_cli_arguments.py +119 -0
  223. tests/test_component_factory.py +174 -174
  224. tests/test_config_consistency.py +80 -80
  225. tests/test_config_merge.py +152 -152
  226. tests/test_config_validator.py +182 -182
  227. tests/test_controlled_spider_mixin.py +79 -79
  228. tests/test_crawler_process_import.py +38 -38
  229. tests/test_crawler_process_spider_modules.py +47 -47
  230. tests/test_crawlo_proxy_integration.py +114 -114
  231. tests/test_date_tools.py +123 -123
  232. tests/test_dedup_fix.py +220 -220
  233. tests/test_dedup_pipeline_consistency.py +124 -124
  234. tests/test_default_header_middleware.py +313 -313
  235. tests/test_distributed.py +65 -65
  236. tests/test_double_crawlo_fix.py +204 -204
  237. tests/test_double_crawlo_fix_simple.py +124 -124
  238. tests/test_download_delay_middleware.py +221 -221
  239. tests/test_downloader_proxy_compatibility.py +272 -272
  240. tests/test_edge_cases.py +305 -305
  241. tests/test_encoding_core.py +56 -56
  242. tests/test_encoding_detection.py +126 -126
  243. tests/test_enhanced_error_handler.py +270 -270
  244. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  245. tests/test_error_handler_compatibility.py +112 -112
  246. tests/test_factories.py +252 -252
  247. tests/test_factory_compatibility.py +196 -196
  248. tests/test_final_validation.py +153 -153
  249. tests/test_fingerprint_consistency.py +135 -135
  250. tests/test_fingerprint_simple.py +51 -51
  251. tests/test_get_component_logger.py +83 -83
  252. tests/test_hash_performance.py +99 -99
  253. tests/test_integration.py +169 -169
  254. tests/test_item_dedup_redis_key.py +122 -122
  255. tests/test_large_scale_helper.py +235 -235
  256. tests/test_logging_enhancements.py +374 -374
  257. tests/test_logging_final.py +184 -184
  258. tests/test_logging_integration.py +312 -312
  259. tests/test_logging_system.py +282 -282
  260. tests/test_middleware_debug.py +141 -141
  261. tests/test_mode_consistency.py +51 -51
  262. tests/test_multi_directory.py +67 -67
  263. tests/test_multiple_spider_modules.py +80 -80
  264. tests/test_mysql_pipeline_config.py +164 -164
  265. tests/test_mysql_pipeline_error.py +98 -98
  266. tests/test_mysql_pipeline_init_log.py +82 -82
  267. tests/test_mysql_pipeline_integration.py +132 -132
  268. tests/test_mysql_pipeline_refactor.py +143 -143
  269. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  270. tests/test_mysql_pipeline_robustness.py +195 -195
  271. tests/test_mysql_pipeline_types.py +88 -88
  272. tests/test_mysql_update_columns.py +93 -93
  273. tests/test_offsite_middleware.py +244 -244
  274. tests/test_offsite_middleware_simple.py +203 -203
  275. tests/test_optimized_selector_naming.py +100 -100
  276. tests/test_parsel.py +29 -29
  277. tests/test_performance.py +327 -327
  278. tests/test_performance_monitor.py +115 -115
  279. tests/test_pipeline_fingerprint_consistency.py +86 -86
  280. tests/test_priority_behavior.py +211 -211
  281. tests/test_priority_consistency.py +151 -151
  282. tests/test_priority_consistency_fixed.py +249 -249
  283. tests/test_proxy_health_check.py +32 -32
  284. tests/test_proxy_middleware.py +217 -217
  285. tests/test_proxy_middleware_enhanced.py +212 -212
  286. tests/test_proxy_middleware_integration.py +142 -142
  287. tests/test_proxy_middleware_refactored.py +207 -207
  288. tests/test_proxy_only.py +83 -83
  289. tests/test_proxy_providers.py +56 -56
  290. tests/test_proxy_stats.py +19 -19
  291. tests/test_proxy_strategies.py +59 -59
  292. tests/test_proxy_with_downloader.py +152 -152
  293. tests/test_queue_empty_check.py +41 -41
  294. tests/test_queue_manager_double_crawlo.py +173 -173
  295. tests/test_queue_manager_redis_key.py +179 -179
  296. tests/test_queue_naming.py +154 -154
  297. tests/test_queue_type.py +106 -106
  298. tests/test_queue_type_redis_config_consistency.py +130 -130
  299. tests/test_random_headers_default.py +322 -322
  300. tests/test_random_headers_necessity.py +308 -308
  301. tests/test_random_user_agent.py +72 -72
  302. tests/test_redis_config.py +28 -28
  303. tests/test_redis_connection_pool.py +294 -294
  304. tests/test_redis_key_naming.py +181 -181
  305. tests/test_redis_key_validator.py +123 -123
  306. tests/test_redis_queue.py +224 -224
  307. tests/test_redis_queue_name_fix.py +175 -175
  308. tests/test_redis_queue_type_fallback.py +129 -129
  309. tests/test_request_ignore_middleware.py +182 -182
  310. tests/test_request_params.py +111 -111
  311. tests/test_request_serialization.py +70 -70
  312. tests/test_response_code_middleware.py +349 -349
  313. tests/test_response_filter_middleware.py +427 -427
  314. tests/test_response_follow.py +104 -104
  315. tests/test_response_improvements.py +152 -152
  316. tests/test_response_selector_methods.py +92 -92
  317. tests/test_response_url_methods.py +70 -70
  318. tests/test_response_urljoin.py +86 -86
  319. tests/test_retry_middleware.py +333 -333
  320. tests/test_retry_middleware_realistic.py +273 -273
  321. tests/test_scheduler.py +252 -252
  322. tests/test_scheduler_config_update.py +133 -133
  323. tests/test_scrapy_style_encoding.py +112 -112
  324. tests/test_selector_helper.py +100 -100
  325. tests/test_selector_optimizations.py +146 -146
  326. tests/test_simple_response.py +61 -61
  327. tests/test_spider_loader.py +49 -49
  328. tests/test_spider_loader_comprehensive.py +69 -69
  329. tests/test_spider_modules.py +84 -84
  330. tests/test_spiders/test_spider.py +9 -9
  331. tests/test_telecom_spider_redis_key.py +205 -205
  332. tests/test_template_content.py +87 -87
  333. tests/test_template_redis_key.py +134 -134
  334. tests/test_tools.py +159 -159
  335. tests/test_user_agent_randomness.py +176 -176
  336. tests/test_user_agents.py +96 -96
  337. tests/untested_features_report.md +138 -138
  338. tests/verify_debug.py +51 -51
  339. tests/verify_distributed.py +117 -117
  340. tests/verify_log_fix.py +111 -111
  341. tests/verify_mysql_warnings.py +109 -109
  342. crawlo/logging/async_handler.py +0 -181
  343. crawlo/logging/monitor.py +0 -153
  344. crawlo/logging/sampler.py +0 -167
  345. crawlo/tools/authenticated_proxy.py +0 -241
  346. crawlo/tools/data_formatter.py +0 -226
  347. crawlo/tools/data_validator.py +0 -181
  348. crawlo/tools/encoding_converter.py +0 -127
  349. crawlo/tools/network_diagnostic.py +0 -365
  350. crawlo/tools/request_tools.py +0 -83
  351. crawlo/tools/retry_mechanism.py +0 -224
  352. crawlo/utils/env_config.py +0 -143
  353. crawlo/utils/large_scale_config.py +0 -287
  354. crawlo/utils/system.py +0 -11
  355. crawlo/utils/tools.py +0 -5
  356. crawlo-1.4.6.dist-info/METADATA +0 -329
  357. crawlo-1.4.6.dist-info/RECORD +0 -361
  358. tests/env_config_example.py +0 -134
  359. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  360. tests/test_authenticated_proxy.py +0 -142
  361. tests/test_comprehensive.py +0 -147
  362. tests/test_dynamic_downloaders_proxy.py +0 -125
  363. tests/test_dynamic_proxy.py +0 -93
  364. tests/test_dynamic_proxy_config.py +0 -147
  365. tests/test_dynamic_proxy_real.py +0 -110
  366. tests/test_env_config.py +0 -122
  367. tests/test_framework_env_usage.py +0 -104
  368. tests/test_large_scale_config.py +0 -113
  369. tests/test_proxy_api.py +0 -265
  370. tests/test_real_scenario_proxy.py +0 -196
  371. tests/tools_example.py +0 -261
  372. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
  373. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
  374. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
crawlo/logging/config.py CHANGED
@@ -1,197 +1,277 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 日志配置管理
5
- """
6
-
7
- import os
8
- from dataclasses import dataclass, field
9
- from typing import Optional, Dict, Any
10
-
11
-
12
- @dataclass
13
- class LogConfig:
14
- """日志配置数据类 - 简单明确的配置结构"""
15
-
16
- # 基本配置
17
- level: str = "INFO"
18
- format: str = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
19
- encoding: str = "utf-8"
20
-
21
- # 文件配置
22
- file_path: Optional[str] = None
23
- max_bytes: int = 10 * 1024 * 1024 # 10MB
24
- backup_count: int = 5
25
-
26
- # 控制台配置
27
- console_enabled: bool = True
28
- file_enabled: bool = True
29
-
30
- # 分别控制台和文件的日志级别
31
- console_level: Optional[str] = None
32
- file_level: Optional[str] = None
33
-
34
- # 上下文信息配置
35
- include_thread_id: bool = False
36
- include_process_id: bool = False
37
- include_module_path: bool = False
38
-
39
- # 模块级别配置
40
- module_levels: Dict[str, str] = field(default_factory=dict)
41
-
42
- @classmethod
43
- def from_settings(cls, settings) -> 'LogConfig':
44
- """从settings对象创建配置"""
45
- if not settings:
46
- return cls()
47
-
48
- # 使用settings的get方法而不是getattr
49
- if hasattr(settings, 'get'):
50
- get_val = settings.get
51
- else:
52
- get_val = lambda k, d=None: getattr(settings, k, d)
53
-
54
- # 获取默认值
55
- format_default_value = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
56
-
57
- return cls(
58
- level=get_val('LOG_LEVEL', 'INFO'),
59
- format=get_val('LOG_FORMAT', format_default_value),
60
- encoding=get_val('LOG_ENCODING', 'utf-8'),
61
- file_path=get_val('LOG_FILE'),
62
- max_bytes=get_val('LOG_MAX_BYTES', 10 * 1024 * 1024),
63
- backup_count=get_val('LOG_BACKUP_COUNT', 5),
64
- console_enabled=get_val('LOG_CONSOLE_ENABLED', True),
65
- file_enabled=get_val('LOG_FILE_ENABLED', True),
66
- console_level=get_val('LOG_CONSOLE_LEVEL'), # 允许单独设置控制台级别
67
- file_level=get_val('LOG_FILE_LEVEL'), # 允许单独设置文件级别
68
- include_thread_id=get_val('LOG_INCLUDE_THREAD_ID', False),
69
- include_process_id=get_val('LOG_INCLUDE_PROCESS_ID', False),
70
- include_module_path=get_val('LOG_INCLUDE_MODULE_PATH', False),
71
- module_levels=get_val('LOG_LEVELS', {})
72
- )
73
-
74
- @classmethod
75
- def from_dict(cls, config_dict: Dict[str, Any]) -> 'LogConfig':
76
- """从字典创建配置"""
77
- # 映射字典键到类属性名
78
- key_mapping = {
79
- 'LOG_LEVEL': 'level',
80
- 'LOG_FORMAT': 'format',
81
- 'LOG_ENCODING': 'encoding',
82
- 'LOG_FILE': 'file_path',
83
- 'LOG_MAX_BYTES': 'max_bytes',
84
- 'LOG_BACKUP_COUNT': 'backup_count',
85
- 'LOG_CONSOLE_ENABLED': 'console_enabled',
86
- 'LOG_FILE_ENABLED': 'file_enabled',
87
- 'LOG_CONSOLE_LEVEL': 'console_level',
88
- 'LOG_FILE_LEVEL': 'file_level',
89
- 'LOG_INCLUDE_THREAD_ID': 'include_thread_id',
90
- 'LOG_INCLUDE_PROCESS_ID': 'include_process_id',
91
- 'LOG_INCLUDE_MODULE_PATH': 'include_module_path',
92
- 'LOG_LEVELS': 'module_levels'
93
- }
94
-
95
- # 应用键映射
96
- mapped_dict = {}
97
- for k, v in config_dict.items():
98
- mapped_key = key_mapping.get(k, k)
99
- if mapped_key in cls.__annotations__:
100
- mapped_dict[mapped_key] = v
101
-
102
- return cls(**mapped_dict)
103
-
104
- def get_module_level(self, module_name: str) -> str:
105
- """获取模块的日志级别"""
106
- # 先查找精确匹配
107
- if module_name in self.module_levels:
108
- return self.module_levels[module_name]
109
-
110
- # 查找父模块匹配
111
- parts = module_name.split('.')
112
- for i in range(len(parts) - 1, 0, -1):
113
- parent_module = '.'.join(parts[:i])
114
- if parent_module in self.module_levels:
115
- return self.module_levels[parent_module]
116
-
117
- # 返回默认级别
118
- return self.level
119
-
120
- def get_console_level(self) -> str:
121
- """获取控制台日志级别"""
122
- return self.console_level or self.level
123
-
124
- def get_file_level(self) -> str:
125
- """获取文件日志级别"""
126
- return self.file_level or self.level
127
-
128
- def get_format(self) -> str:
129
- """
130
- 获取日志格式,包含上下文信息
131
-
132
- Returns:
133
- 日志格式字符串
134
- """
135
- base_format = self.format
136
-
137
- # 添加线程ID
138
- if self.include_thread_id:
139
- if '[%(thread)d]' not in base_format:
140
- # 在时间戳后添加线程ID
141
- base_format = base_format.replace(
142
- '%(asctime)s',
143
- '%(asctime)s [%(thread)d]'
144
- )
145
-
146
- # 添加进程ID
147
- if self.include_process_id:
148
- if '[%(process)d]' not in base_format:
149
- # 在时间戳后添加进程ID(如果已经有线程ID,则在线程ID后添加)
150
- if '[%(thread)d]' in base_format:
151
- base_format = base_format.replace(
152
- '%(asctime)s [%(thread)d]',
153
- '%(asctime)s [%(thread)d] [%(process)d]'
154
- )
155
- else:
156
- base_format = base_format.replace(
157
- '%(asctime)s',
158
- '%(asctime)s [%(process)d]'
159
- )
160
-
161
- # 添加模块路径
162
- if self.include_module_path:
163
- if '%(pathname)s:%(lineno)d' not in base_format:
164
- # 在消息前添加文件路径和行号
165
- base_format = base_format.replace(
166
- '%(message)s',
167
- '%(pathname)s:%(lineno)d - %(message)s'
168
- )
169
-
170
- return base_format
171
-
172
- def validate(self) -> bool:
173
- """验证配置有效性"""
174
- valid_levels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'}
175
-
176
- # 验证主级别
177
- if self.level.upper() not in valid_levels:
178
- return False
179
-
180
- # 验证控制台级别
181
- if self.console_level and self.console_level.upper() not in valid_levels:
182
- return False
183
-
184
- # 验证文件级别
185
- if self.file_level and self.file_level.upper() not in valid_levels:
186
- return False
187
-
188
- # 确保日志目录存在
189
- if self.file_path and self.file_enabled:
190
- try:
191
- log_dir = os.path.dirname(self.file_path)
192
- if log_dir and not os.path.exists(log_dir):
193
- os.makedirs(log_dir, exist_ok=True)
194
- except (OSError, PermissionError):
195
- return False
196
-
197
- return True
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 日志配置管理
5
+ """
6
+
7
+ import os
8
+ from dataclasses import dataclass, field
9
+ from typing import Optional, Dict, Any
10
+
11
+
12
+ @dataclass
13
+ class LogConfig:
14
+ """日志配置数据类 - 简单明确的配置结构"""
15
+
16
+ # 预设配置模板
17
+ TEMPLATES = {
18
+ 'minimal': {
19
+ 'level': 'INFO',
20
+ 'format': '%(asctime)s - %(levelname)s: %(message)s',
21
+ 'console_enabled': True,
22
+ 'file_enabled': False
23
+ },
24
+ 'standard': {
25
+ 'level': 'INFO',
26
+ 'format': '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s',
27
+ 'console_enabled': True,
28
+ 'file_enabled': True,
29
+ 'file_path': 'logs/crawlo.log'
30
+ },
31
+ 'detailed': {
32
+ 'level': 'DEBUG',
33
+ 'format': '%(asctime)s - [%(name)s] - %(levelname)s - %(pathname)s:%(lineno)d: %(message)s',
34
+ 'console_enabled': True,
35
+ 'file_enabled': True,
36
+ 'file_path': 'logs/crawlo.log',
37
+ 'max_bytes': 20 * 1024 * 1024,
38
+ 'backup_count': 10
39
+ },
40
+ 'production': {
41
+ 'level': 'WARNING',
42
+ 'format': '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s',
43
+ 'console_enabled': False,
44
+ 'file_enabled': True,
45
+ 'file_path': 'logs/crawlo.log',
46
+ 'max_bytes': 50 * 1024 * 1024,
47
+ 'backup_count': 20
48
+ }
49
+ }
50
+
51
+ # 基本配置
52
+ level: str = "INFO"
53
+ format: str = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
54
+ encoding: str = "utf-8"
55
+
56
+ # 文件配置
57
+ file_path: Optional[str] = None
58
+ max_bytes: int = 10 * 1024 * 1024 # 10MB
59
+ backup_count: int = 5
60
+
61
+ # 控制台配置
62
+ console_enabled: bool = True
63
+ file_enabled: bool = True
64
+
65
+ # 分别控制台和文件的日志级别
66
+ console_level: Optional[str] = None
67
+ file_level: Optional[str] = None
68
+
69
+ # 上下文信息配置
70
+ include_thread_id: bool = False
71
+ include_process_id: bool = False
72
+ include_module_path: bool = False
73
+
74
+ # 模块级别配置
75
+ module_levels: Dict[str, str] = field(default_factory=dict)
76
+
77
+ @classmethod
78
+ def from_settings(cls, settings) -> 'LogConfig':
79
+ """从settings对象创建配置"""
80
+ if not settings:
81
+ return cls()
82
+
83
+ # 使用settings的get方法而不是getattr
84
+ if hasattr(settings, 'get'):
85
+ get_val = settings.get
86
+ else:
87
+ get_val = lambda k, d=None: getattr(settings, k, d)
88
+
89
+ # 获取默认值
90
+ format_default_value = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
91
+
92
+ # 确保类型安全
93
+ def safe_get_str(key: str, default: str = '') -> str:
94
+ value = get_val(key, default)
95
+ return str(value) if value is not None else default
96
+
97
+ def safe_get_int(key: str, default: int) -> int:
98
+ value = get_val(key, default)
99
+ try:
100
+ return int(value) if value is not None else default
101
+ except (ValueError, TypeError):
102
+ return default
103
+
104
+ def safe_get_bool(key: str, default: bool) -> bool:
105
+ value = get_val(key, default)
106
+ if isinstance(value, bool):
107
+ return value
108
+ if isinstance(value, str):
109
+ return value.lower() in ('1', 'true', 'yes', 'on')
110
+ return bool(value) if value is not None else default
111
+
112
+ def safe_get_dict(key: str, default: dict) -> dict:
113
+ value = get_val(key, default)
114
+ return value if isinstance(value, dict) else default
115
+
116
+ return cls(
117
+ level=safe_get_str('LOG_LEVEL', 'INFO'),
118
+ format=safe_get_str('LOG_FORMAT', format_default_value),
119
+ encoding=safe_get_str('LOG_ENCODING', 'utf-8'),
120
+ file_path=safe_get_str('LOG_FILE'),
121
+ max_bytes=safe_get_int('LOG_MAX_BYTES', 10 * 1024 * 1024),
122
+ backup_count=safe_get_int('LOG_BACKUP_COUNT', 5),
123
+ console_enabled=safe_get_bool('LOG_CONSOLE_ENABLED', True),
124
+ file_enabled=safe_get_bool('LOG_FILE_ENABLED', True),
125
+ console_level=safe_get_str('LOG_CONSOLE_LEVEL'), # 允许单独设置控制台级别
126
+ file_level=safe_get_str('LOG_FILE_LEVEL'), # 允许单独设置文件级别
127
+ include_thread_id=safe_get_bool('LOG_INCLUDE_THREAD_ID', False),
128
+ include_process_id=safe_get_bool('LOG_INCLUDE_PROCESS_ID', False),
129
+ include_module_path=safe_get_bool('LOG_INCLUDE_MODULE_PATH', False),
130
+ module_levels=safe_get_dict('LOG_LEVELS', {})
131
+ )
132
+
133
+ @classmethod
134
+ def from_dict(cls, config_dict: Dict[str, Any]) -> 'LogConfig':
135
+ """从字典创建配置"""
136
+ # 映射字典键到类属性名
137
+ key_mapping = {
138
+ 'LOG_LEVEL': 'level',
139
+ 'LOG_FORMAT': 'format',
140
+ 'LOG_ENCODING': 'encoding',
141
+ 'LOG_FILE': 'file_path',
142
+ 'LOG_MAX_BYTES': 'max_bytes',
143
+ 'LOG_BACKUP_COUNT': 'backup_count',
144
+ 'LOG_CONSOLE_ENABLED': 'console_enabled',
145
+ 'LOG_FILE_ENABLED': 'file_enabled',
146
+ 'LOG_CONSOLE_LEVEL': 'console_level',
147
+ 'LOG_FILE_LEVEL': 'file_level',
148
+ 'LOG_INCLUDE_THREAD_ID': 'include_thread_id',
149
+ 'LOG_INCLUDE_PROCESS_ID': 'include_process_id',
150
+ 'LOG_INCLUDE_MODULE_PATH': 'include_module_path',
151
+ 'LOG_LEVELS': 'module_levels'
152
+ }
153
+
154
+ # 应用键映射
155
+ mapped_dict = {}
156
+ for k, v in config_dict.items():
157
+ mapped_key = key_mapping.get(k, k)
158
+ if mapped_key in cls.__annotations__:
159
+ mapped_dict[mapped_key] = v
160
+
161
+ return cls(**mapped_dict)
162
+
163
+ @classmethod
164
+ def from_template(cls, template_name: str) -> 'LogConfig':
165
+ """从模板创建配置
166
+
167
+ Args:
168
+ template_name: 模板名称 (minimal, standard, detailed, production)
169
+
170
+ Returns:
171
+ LogConfig: 配置对象
172
+ """
173
+ if template_name not in cls.TEMPLATES:
174
+ raise ValueError(f"未知的模板名称: {template_name},可用模板: {', '.join(cls.TEMPLATES.keys())}")
175
+
176
+ template_config = cls.TEMPLATES[template_name]
177
+ return cls(**template_config)
178
+
179
+ def get_module_level(self, module_name: str) -> str:
180
+ """获取模块的日志级别"""
181
+ # 先查找精确匹配
182
+ if module_name in self.module_levels:
183
+ return self.module_levels[module_name]
184
+
185
+ # 查找父模块匹配
186
+ parts = module_name.split('.')
187
+ for i in range(len(parts) - 1, 0, -1):
188
+ parent_module = '.'.join(parts[:i])
189
+ if parent_module in self.module_levels:
190
+ return self.module_levels[parent_module]
191
+
192
+ # 返回默认级别
193
+ return self.level
194
+
195
+ def get_console_level(self) -> str:
196
+ """获取控制台日志级别"""
197
+ return self.console_level or self.level
198
+
199
+ def get_file_level(self) -> str:
200
+ """获取文件日志级别"""
201
+ return self.file_level or self.level
202
+
203
+ def get_format(self) -> str:
204
+ """
205
+ 获取日志格式,包含上下文信息
206
+
207
+ Returns:
208
+ 日志格式字符串
209
+ """
210
+ base_format = self.format
211
+
212
+ # 添加线程ID
213
+ if self.include_thread_id:
214
+ if '[%(thread)d]' not in base_format:
215
+ # 在时间戳后添加线程ID
216
+ base_format = base_format.replace(
217
+ '%(asctime)s',
218
+ '%(asctime)s [%(thread)d]'
219
+ )
220
+
221
+ # 添加进程ID
222
+ if self.include_process_id:
223
+ if '[%(process)d]' not in base_format:
224
+ # 在时间戳后添加进程ID(如果已经有线程ID,则在线程ID后添加)
225
+ if '[%(thread)d]' in base_format:
226
+ base_format = base_format.replace(
227
+ '%(asctime)s [%(thread)d]',
228
+ '%(asctime)s [%(thread)d] [%(process)d]'
229
+ )
230
+ else:
231
+ base_format = base_format.replace(
232
+ '%(asctime)s',
233
+ '%(asctime)s [%(process)d]'
234
+ )
235
+
236
+ # 添加模块路径
237
+ if self.include_module_path:
238
+ if '%(pathname)s:%(lineno)d' not in base_format:
239
+ # 在消息前添加文件路径和行号
240
+ base_format = base_format.replace(
241
+ '%(message)s',
242
+ '%(pathname)s:%(lineno)d - %(message)s'
243
+ )
244
+
245
+ return base_format
246
+
247
+ def validate(self) -> tuple[bool, str]:
248
+ """验证配置有效性
249
+
250
+ Returns:
251
+ tuple[bool, str]: (是否有效, 错误信息)
252
+ """
253
+ valid_levels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'}
254
+
255
+ # 验证主级别
256
+ if self.level.upper() not in valid_levels:
257
+ return False, f"无效的日志级别: {self.level},有效级别为: {', '.join(valid_levels)}"
258
+
259
+ # 验证控制台级别
260
+ if self.console_level and self.console_level.upper() not in valid_levels:
261
+ return False, f"无效的控制台日志级别: {self.console_level},有效级别为: {', '.join(valid_levels)}"
262
+
263
+ # 验证文件级别
264
+ if self.file_level and self.file_level.upper() not in valid_levels:
265
+ return False, f"无效的文件日志级别: {self.file_level},有效级别为: {', '.join(valid_levels)}"
266
+
267
+ # 确保日志目录存在
268
+ if self.file_path and self.file_enabled:
269
+ try:
270
+ log_dir = os.path.dirname(self.file_path)
271
+ if log_dir and not os.path.exists(log_dir):
272
+ os.makedirs(log_dir, exist_ok=True)
273
+ except (OSError, PermissionError) as e:
274
+ log_dir = os.path.dirname(self.file_path) if self.file_path else "未知"
275
+ return False, f"无法创建日志目录 {log_dir}: {e}"
276
+
277
+ return True, "配置有效"