crawlo 1.4.2__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (326)
  1. crawlo/__init__.py +93 -93
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -341
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +438 -439
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +291 -257
  19. crawlo/crawler.py +650 -650
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +63 -63
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +61 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +103 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -257
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -292
  47. crawlo/initialization/__init__.py +44 -44
  48. crawlo/initialization/built_in.py +425 -425
  49. crawlo/initialization/context.py +141 -141
  50. crawlo/initialization/core.py +193 -193
  51. crawlo/initialization/phases.py +148 -148
  52. crawlo/initialization/registry.py +145 -145
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -23
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +45 -37
  58. crawlo/logging/async_handler.py +181 -0
  59. crawlo/logging/config.py +196 -96
  60. crawlo/logging/factory.py +171 -128
  61. crawlo/logging/manager.py +111 -111
  62. crawlo/logging/monitor.py +153 -0
  63. crawlo/logging/sampler.py +167 -0
  64. crawlo/middleware/__init__.py +21 -21
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +135 -135
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +386 -386
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/middleware/simple_proxy.py +65 -65
  75. crawlo/mode_manager.py +219 -219
  76. crawlo/network/__init__.py +21 -21
  77. crawlo/network/request.py +379 -379
  78. crawlo/network/response.py +359 -359
  79. crawlo/pipelines/__init__.py +21 -21
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +131 -131
  87. crawlo/pipelines/mysql_pipeline.py +325 -325
  88. crawlo/pipelines/pipeline_manager.py +100 -84
  89. crawlo/pipelines/redis_dedup_pipeline.py +156 -156
  90. crawlo/project.py +349 -338
  91. crawlo/queue/pqueue.py +42 -42
  92. crawlo/queue/queue_manager.py +526 -522
  93. crawlo/queue/redis_priority_queue.py +370 -367
  94. crawlo/settings/__init__.py +7 -7
  95. crawlo/settings/default_settings.py +284 -284
  96. crawlo/settings/setting_manager.py +219 -219
  97. crawlo/spider/__init__.py +657 -657
  98. crawlo/stats_collector.py +73 -73
  99. crawlo/subscriber.py +129 -129
  100. crawlo/task_manager.py +138 -138
  101. crawlo/templates/crawlo.cfg.tmpl +10 -10
  102. crawlo/templates/project/__init__.py.tmpl +3 -3
  103. crawlo/templates/project/items.py.tmpl +17 -17
  104. crawlo/templates/project/middlewares.py.tmpl +118 -118
  105. crawlo/templates/project/pipelines.py.tmpl +96 -96
  106. crawlo/templates/project/settings.py.tmpl +170 -170
  107. crawlo/templates/project/settings_distributed.py.tmpl +169 -169
  108. crawlo/templates/project/settings_gentle.py.tmpl +166 -166
  109. crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
  110. crawlo/templates/project/settings_minimal.py.tmpl +65 -65
  111. crawlo/templates/project/settings_simple.py.tmpl +164 -164
  112. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  113. crawlo/templates/run.py.tmpl +34 -34
  114. crawlo/templates/spider/spider.py.tmpl +143 -143
  115. crawlo/templates/spiders_init.py.tmpl +9 -9
  116. crawlo/tools/__init__.py +200 -200
  117. crawlo/tools/anti_crawler.py +268 -268
  118. crawlo/tools/authenticated_proxy.py +240 -240
  119. crawlo/tools/data_formatter.py +225 -225
  120. crawlo/tools/data_validator.py +180 -180
  121. crawlo/tools/date_tools.py +289 -289
  122. crawlo/tools/distributed_coordinator.py +384 -384
  123. crawlo/tools/encoding_converter.py +127 -127
  124. crawlo/tools/network_diagnostic.py +364 -364
  125. crawlo/tools/request_tools.py +82 -82
  126. crawlo/tools/retry_mechanism.py +224 -224
  127. crawlo/tools/scenario_adapter.py +262 -262
  128. crawlo/tools/text_cleaner.py +232 -232
  129. crawlo/utils/__init__.py +34 -34
  130. crawlo/utils/batch_processor.py +259 -259
  131. crawlo/utils/class_loader.py +25 -25
  132. crawlo/utils/controlled_spider_mixin.py +439 -439
  133. crawlo/utils/db_helper.py +343 -343
  134. crawlo/utils/enhanced_error_handler.py +356 -356
  135. crawlo/utils/env_config.py +142 -142
  136. crawlo/utils/error_handler.py +165 -165
  137. crawlo/utils/fingerprint.py +122 -122
  138. crawlo/utils/func_tools.py +82 -82
  139. crawlo/utils/large_scale_config.py +286 -286
  140. crawlo/utils/large_scale_helper.py +344 -344
  141. crawlo/utils/log.py +79 -79
  142. crawlo/utils/performance_monitor.py +285 -285
  143. crawlo/utils/queue_helper.py +175 -175
  144. crawlo/utils/redis_connection_pool.py +388 -388
  145. crawlo/utils/redis_key_validator.py +198 -198
  146. crawlo/utils/request.py +267 -267
  147. crawlo/utils/request_serializer.py +225 -225
  148. crawlo/utils/spider_loader.py +61 -61
  149. crawlo/utils/system.py +11 -11
  150. crawlo/utils/tools.py +4 -4
  151. crawlo/utils/url.py +39 -39
  152. crawlo-1.4.3.dist-info/METADATA +190 -0
  153. crawlo-1.4.3.dist-info/RECORD +326 -0
  154. examples/__init__.py +7 -7
  155. examples/test_project/__init__.py +7 -7
  156. examples/test_project/run.py +34 -34
  157. examples/test_project/test_project/__init__.py +3 -3
  158. examples/test_project/test_project/items.py +17 -17
  159. examples/test_project/test_project/middlewares.py +118 -118
  160. examples/test_project/test_project/pipelines.py +96 -96
  161. examples/test_project/test_project/settings.py +169 -169
  162. examples/test_project/test_project/spiders/__init__.py +9 -9
  163. examples/test_project/test_project/spiders/of_week_dis.py +143 -143
  164. tests/__init__.py +7 -7
  165. tests/advanced_tools_example.py +275 -275
  166. tests/authenticated_proxy_example.py +106 -106
  167. tests/baidu_performance_test.py +108 -108
  168. tests/baidu_test.py +59 -59
  169. tests/cleaners_example.py +160 -160
  170. tests/comprehensive_framework_test.py +212 -212
  171. tests/comprehensive_test.py +81 -81
  172. tests/comprehensive_testing_summary.md +186 -186
  173. tests/config_validation_demo.py +142 -142
  174. tests/controlled_spider_example.py +205 -205
  175. tests/date_tools_example.py +180 -180
  176. tests/debug_configure.py +69 -69
  177. tests/debug_framework_logger.py +84 -84
  178. tests/debug_log_config.py +126 -126
  179. tests/debug_log_levels.py +63 -63
  180. tests/debug_pipelines.py +66 -66
  181. tests/detailed_log_test.py +233 -233
  182. tests/distributed_test.py +66 -66
  183. tests/distributed_test_debug.py +76 -76
  184. tests/dynamic_loading_example.py +523 -523
  185. tests/dynamic_loading_test.py +104 -104
  186. tests/env_config_example.py +133 -133
  187. tests/error_handling_example.py +171 -171
  188. tests/final_comprehensive_test.py +151 -151
  189. tests/final_log_test.py +260 -260
  190. tests/final_validation_test.py +182 -182
  191. tests/fix_log_test.py +142 -142
  192. tests/framework_performance_test.py +202 -202
  193. tests/log_buffering_test.py +111 -111
  194. tests/log_generation_timing_test.py +153 -153
  195. tests/optimized_performance_test.py +211 -211
  196. tests/performance_comparison.py +245 -245
  197. tests/queue_blocking_test.py +113 -113
  198. tests/queue_test.py +89 -89
  199. tests/redis_key_validation_demo.py +130 -130
  200. tests/request_params_example.py +150 -150
  201. tests/response_improvements_example.py +144 -144
  202. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  203. tests/scrapy_comparison/scrapy_test.py +133 -133
  204. tests/simple_command_test.py +119 -119
  205. tests/simple_crawlo_test.py +127 -127
  206. tests/simple_log_test.py +57 -57
  207. tests/simple_log_test2.py +137 -137
  208. tests/simple_optimization_test.py +128 -128
  209. tests/simple_queue_type_test.py +41 -41
  210. tests/simple_spider_test.py +49 -49
  211. tests/simple_test.py +47 -47
  212. tests/spider_log_timing_test.py +177 -177
  213. tests/test_advanced_tools.py +148 -148
  214. tests/test_all_commands.py +230 -230
  215. tests/test_all_pipeline_fingerprints.py +133 -133
  216. tests/test_all_redis_key_configs.py +145 -145
  217. tests/test_authenticated_proxy.py +141 -141
  218. tests/test_batch_processor.py +178 -178
  219. tests/test_cleaners.py +54 -54
  220. tests/test_component_factory.py +174 -174
  221. tests/test_comprehensive.py +146 -146
  222. tests/test_config_consistency.py +80 -80
  223. tests/test_config_merge.py +152 -152
  224. tests/test_config_validator.py +182 -182
  225. tests/test_controlled_spider_mixin.py +79 -79
  226. tests/test_crawlo_proxy_integration.py +108 -108
  227. tests/test_date_tools.py +123 -123
  228. tests/test_dedup_fix.py +220 -220
  229. tests/test_dedup_pipeline_consistency.py +125 -0
  230. tests/test_default_header_middleware.py +313 -313
  231. tests/test_distributed.py +65 -65
  232. tests/test_double_crawlo_fix.py +204 -204
  233. tests/test_double_crawlo_fix_simple.py +124 -124
  234. tests/test_download_delay_middleware.py +221 -221
  235. tests/test_downloader_proxy_compatibility.py +268 -268
  236. tests/test_dynamic_downloaders_proxy.py +124 -124
  237. tests/test_dynamic_proxy.py +92 -92
  238. tests/test_dynamic_proxy_config.py +146 -146
  239. tests/test_dynamic_proxy_real.py +109 -109
  240. tests/test_edge_cases.py +303 -303
  241. tests/test_enhanced_error_handler.py +270 -270
  242. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  243. tests/test_env_config.py +121 -121
  244. tests/test_error_handler_compatibility.py +112 -112
  245. tests/test_factories.py +252 -252
  246. tests/test_final_validation.py +153 -153
  247. tests/test_fingerprint_consistency.py +135 -135
  248. tests/test_fingerprint_simple.py +51 -51
  249. tests/test_framework_env_usage.py +103 -103
  250. tests/test_framework_logger.py +66 -66
  251. tests/test_framework_startup.py +64 -64
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_config.py +112 -112
  257. tests/test_large_scale_helper.py +235 -235
  258. tests/test_logging_enhancements.py +375 -0
  259. tests/test_logging_final.py +185 -0
  260. tests/test_logging_integration.py +313 -0
  261. tests/test_logging_system.py +282 -282
  262. tests/test_middleware_debug.py +142 -0
  263. tests/test_mode_change.py +72 -72
  264. tests/test_mode_consistency.py +51 -51
  265. tests/test_offsite_middleware.py +244 -244
  266. tests/test_offsite_middleware_simple.py +203 -203
  267. tests/test_parsel.py +29 -29
  268. tests/test_performance.py +327 -327
  269. tests/test_performance_monitor.py +115 -115
  270. tests/test_pipeline_fingerprint_consistency.py +86 -86
  271. tests/test_priority_behavior.py +212 -0
  272. tests/test_priority_consistency.py +152 -0
  273. tests/test_priority_consistency_fixed.py +250 -0
  274. tests/test_proxy_api.py +264 -264
  275. tests/test_proxy_health_check.py +32 -32
  276. tests/test_proxy_middleware.py +121 -121
  277. tests/test_proxy_middleware_enhanced.py +216 -216
  278. tests/test_proxy_middleware_integration.py +136 -136
  279. tests/test_proxy_middleware_refactored.py +184 -184
  280. tests/test_proxy_providers.py +56 -56
  281. tests/test_proxy_stats.py +19 -19
  282. tests/test_proxy_strategies.py +59 -59
  283. tests/test_queue_empty_check.py +41 -41
  284. tests/test_queue_manager_double_crawlo.py +173 -173
  285. tests/test_queue_manager_redis_key.py +179 -179
  286. tests/test_queue_naming.py +154 -154
  287. tests/test_queue_type.py +106 -106
  288. tests/test_queue_type_redis_config_consistency.py +131 -0
  289. tests/test_random_headers_default.py +323 -0
  290. tests/test_random_headers_necessity.py +309 -0
  291. tests/test_random_user_agent.py +72 -72
  292. tests/test_real_scenario_proxy.py +195 -195
  293. tests/test_redis_config.py +28 -28
  294. tests/test_redis_connection_pool.py +294 -294
  295. tests/test_redis_key_naming.py +181 -181
  296. tests/test_redis_key_validator.py +123 -123
  297. tests/test_redis_queue.py +224 -224
  298. tests/test_redis_queue_name_fix.py +175 -175
  299. tests/test_redis_queue_type_fallback.py +130 -0
  300. tests/test_request_ignore_middleware.py +182 -182
  301. tests/test_request_params.py +111 -111
  302. tests/test_request_serialization.py +70 -70
  303. tests/test_response_code_middleware.py +349 -349
  304. tests/test_response_filter_middleware.py +427 -427
  305. tests/test_response_improvements.py +152 -152
  306. tests/test_retry_middleware.py +334 -242
  307. tests/test_retry_middleware_realistic.py +274 -0
  308. tests/test_scheduler.py +252 -252
  309. tests/test_scheduler_config_update.py +133 -133
  310. tests/test_simple_response.py +61 -61
  311. tests/test_telecom_spider_redis_key.py +205 -205
  312. tests/test_template_content.py +87 -87
  313. tests/test_template_redis_key.py +134 -134
  314. tests/test_tools.py +159 -159
  315. tests/test_user_agent_randomness.py +177 -0
  316. tests/test_user_agents.py +96 -96
  317. tests/tools_example.py +260 -260
  318. tests/untested_features_report.md +138 -138
  319. tests/verify_debug.py +51 -51
  320. tests/verify_distributed.py +117 -117
  321. tests/verify_log_fix.py +111 -111
  322. crawlo-1.4.2.dist-info/METADATA +0 -1199
  323. crawlo-1.4.2.dist-info/RECORD +0 -309
  324. {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
  325. {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
  326. {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
crawlo/logging/config.py CHANGED
@@ -1,97 +1,197 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 日志配置管理
5
- """
6
-
7
- import os
8
- from dataclasses import dataclass, field
9
- from typing import Optional, Dict, Any
10
-
11
-
12
- @dataclass
13
- class LogConfig:
14
- """日志配置数据类 - 简单明确的配置结构"""
15
-
16
- # 基本配置
17
- level: str = "INFO"
18
- format: str = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
19
- encoding: str = "utf-8"
20
-
21
- # 文件配置
22
- file_path: Optional[str] = None
23
- max_bytes: int = 10 * 1024 * 1024 # 10MB
24
- backup_count: int = 5
25
-
26
- # 控制台配置
27
- console_enabled: bool = True
28
- file_enabled: bool = True
29
-
30
- # 模块级别配置
31
- module_levels: Dict[str, str] = field(default_factory=dict)
32
-
33
- @classmethod
34
- def from_settings(cls, settings) -> 'LogConfig':
35
- """从settings对象创建配置"""
36
- if not settings:
37
- return cls()
38
-
39
- # 使用settings的get方法而不是getattr
40
- if hasattr(settings, 'get'):
41
- get_val = settings.get
42
- else:
43
- get_val = lambda k, d=None: getattr(settings, k, d)
44
-
45
- # 获取默认值
46
- format_default_value = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
47
-
48
- return cls(
49
- level=get_val('LOG_LEVEL', 'INFO'),
50
- format=get_val('LOG_FORMAT', format_default_value),
51
- encoding=get_val('LOG_ENCODING', 'utf-8'),
52
- file_path=get_val('LOG_FILE'),
53
- max_bytes=get_val('LOG_MAX_BYTES', 10 * 1024 * 1024),
54
- backup_count=get_val('LOG_BACKUP_COUNT', 5),
55
- console_enabled=get_val('LOG_CONSOLE_ENABLED', True),
56
- file_enabled=get_val('LOG_FILE_ENABLED', True),
57
- module_levels=get_val('LOG_LEVELS', {})
58
- )
59
-
60
- @classmethod
61
- def from_dict(cls, config_dict: Dict[str, Any]) -> 'LogConfig':
62
- """从字典创建配置"""
63
- return cls(**{k: v for k, v in config_dict.items() if k in cls.__annotations__})
64
-
65
- def get_module_level(self, module_name: str) -> str:
66
- """获取模块的日志级别"""
67
- # 先查找精确匹配
68
- if module_name in self.module_levels:
69
- return self.module_levels[module_name]
70
-
71
- # 查找父模块匹配
72
- parts = module_name.split('.')
73
- for i in range(len(parts) - 1, 0, -1):
74
- parent_module = '.'.join(parts[:i])
75
- if parent_module in self.module_levels:
76
- return self.module_levels[parent_module]
77
-
78
- # 返回默认级别
79
- return self.level
80
-
81
- def validate(self) -> bool:
82
- """验证配置有效性"""
83
- valid_levels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'}
84
-
85
- if self.level.upper() not in valid_levels:
86
- return False
87
-
88
- # 确保日志目录存在
89
- if self.file_path and self.file_enabled:
90
- try:
91
- log_dir = os.path.dirname(self.file_path)
92
- if log_dir and not os.path.exists(log_dir):
93
- os.makedirs(log_dir, exist_ok=True)
94
- except (OSError, PermissionError):
95
- return False
96
-
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 日志配置管理
5
+ """
6
+
7
+ import os
8
+ from dataclasses import dataclass, field
9
+ from typing import Optional, Dict, Any
10
+
11
+
12
@dataclass
class LogConfig:
    """Logging configuration for the crawlo logging subsystem.

    Holds the global level and format, file-rotation parameters, optional
    per-handler levels, optional context fields (thread id, process id,
    source location) and per-module level overrides.
    """

    # Single source of truth for the default format string (not a dataclass
    # field because it carries no annotation).
    _DEFAULT_FORMAT = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"

    # Maps settings-style keys (``LOG_*``) to dataclass attribute names.
    # Not annotated, so the dataclass machinery ignores it.
    _SETTINGS_KEY_MAP = {
        'LOG_LEVEL': 'level',
        'LOG_FORMAT': 'format',
        'LOG_ENCODING': 'encoding',
        'LOG_FILE': 'file_path',
        'LOG_MAX_BYTES': 'max_bytes',
        'LOG_BACKUP_COUNT': 'backup_count',
        'LOG_CONSOLE_ENABLED': 'console_enabled',
        'LOG_FILE_ENABLED': 'file_enabled',
        'LOG_CONSOLE_LEVEL': 'console_level',
        'LOG_FILE_LEVEL': 'file_level',
        'LOG_INCLUDE_THREAD_ID': 'include_thread_id',
        'LOG_INCLUDE_PROCESS_ID': 'include_process_id',
        'LOG_INCLUDE_MODULE_PATH': 'include_module_path',
        'LOG_LEVELS': 'module_levels',
    }

    # Basic settings
    level: str = "INFO"
    format: str = _DEFAULT_FORMAT
    encoding: str = "utf-8"

    # File output settings
    file_path: Optional[str] = None
    max_bytes: int = 10 * 1024 * 1024  # 10MB
    backup_count: int = 5

    # Handler switches
    console_enabled: bool = True
    file_enabled: bool = True

    # Optional per-handler levels; ``None`` means "fall back to ``level``"
    console_level: Optional[str] = None
    file_level: Optional[str] = None

    # Extra context fields injected into the format by ``get_format``
    include_thread_id: bool = False
    include_process_id: bool = False
    include_module_path: bool = False

    # Per-module level overrides, keyed by dotted logger name
    module_levels: Dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_settings(cls, settings) -> 'LogConfig':
        """Build a configuration from a settings object.

        Args:
            settings: Either a mapping-like object exposing ``get(key, default)``
                or any object whose ``LOG_*`` values are plain attributes.
                A falsy value yields the default configuration.

        Returns:
            A ``LogConfig`` populated from the ``LOG_*`` settings.
        """
        if not settings:
            return cls()

        # Prefer the object's own ``get``; otherwise read attributes.
        if hasattr(settings, 'get'):
            get_val = settings.get
        else:
            def get_val(key, default=None):
                return getattr(settings, key, default)

        def get_or(key, default):
            # An explicit ``None`` would later break ``validate`` and
            # ``get_module_level``, so treat it like a missing key.
            value = get_val(key, default)
            return default if value is None else value

        return cls(
            level=get_or('LOG_LEVEL', 'INFO'),
            format=get_or('LOG_FORMAT', cls._DEFAULT_FORMAT),
            encoding=get_or('LOG_ENCODING', 'utf-8'),
            file_path=get_val('LOG_FILE'),
            max_bytes=get_val('LOG_MAX_BYTES', 10 * 1024 * 1024),
            backup_count=get_val('LOG_BACKUP_COUNT', 5),
            console_enabled=get_val('LOG_CONSOLE_ENABLED', True),
            file_enabled=get_val('LOG_FILE_ENABLED', True),
            console_level=get_val('LOG_CONSOLE_LEVEL'),  # optional console-only level
            file_level=get_val('LOG_FILE_LEVEL'),  # optional file-only level
            include_thread_id=get_val('LOG_INCLUDE_THREAD_ID', False),
            include_process_id=get_val('LOG_INCLUDE_PROCESS_ID', False),
            include_module_path=get_val('LOG_INCLUDE_MODULE_PATH', False),
            module_levels=get_or('LOG_LEVELS', {}),
        )

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> 'LogConfig':
        """Build a configuration from a plain dictionary.

        Keys may be either settings-style (``LOG_LEVEL``) or attribute-style
        (``level``). Keys that match no field are ignored.
        """
        mapped = {}
        for key, value in config_dict.items():
            attr = cls._SETTINGS_KEY_MAP.get(key, key)
            if attr in cls.__annotations__:
                mapped[attr] = value
        return cls(**mapped)

    def get_module_level(self, module_name: str) -> str:
        """Return the level for ``module_name``.

        Lookup order: exact match in ``module_levels``, then the nearest
        dotted parent, then the global ``level``.
        """
        overrides = self.module_levels or {}
        if module_name in overrides:
            return overrides[module_name]

        # Walk up the dotted path: "a.b.c" -> "a.b" -> "a".
        parts = module_name.split('.')
        for end in range(len(parts) - 1, 0, -1):
            parent = '.'.join(parts[:end])
            if parent in overrides:
                return overrides[parent]

        return self.level

    def get_console_level(self) -> str:
        """Return the console handler level, defaulting to ``level``."""
        return self.console_level or self.level

    def get_file_level(self) -> str:
        """Return the file handler level, defaulting to ``level``."""
        return self.file_level or self.level

    @staticmethod
    def _insert_token(fmt: str, anchors, token: str) -> str:
        """Insert ``token`` after the first anchor found, else prepend it."""
        for anchor in anchors:
            if anchor in fmt:
                return fmt.replace(anchor, f'{anchor} {token}', 1)
        return f'{token} {fmt}'

    def get_format(self) -> str:
        """Return the format string with the requested context fields added.

        The thread id goes right after ``%(asctime)s``. The process id goes
        after a bracketed thread id if there is one, otherwise after
        ``%(asctime)s``. When no anchor exists the field is prepended, so a
        requested field is never silently dropped. A field already present
        in the format is not added again.

        Returns:
            The final ``logging.Formatter`` format string.
        """
        fmt = self.format

        if self.include_thread_id and '%(thread)d' not in fmt:
            fmt = self._insert_token(fmt, ('%(asctime)s',), '[%(thread)d]')

        if self.include_process_id and '%(process)d' not in fmt:
            fmt = self._insert_token(
                fmt, ('[%(thread)d]', '%(asctime)s'), '[%(process)d]'
            )

        if self.include_module_path and '%(pathname)s:%(lineno)d' not in fmt:
            location = '%(pathname)s:%(lineno)d'
            if '%(message)s' in fmt:
                # Place the source location immediately before the message.
                fmt = fmt.replace('%(message)s', f'{location} - %(message)s', 1)
            else:
                fmt = f'{fmt} {location}'

        return fmt

    def validate(self) -> bool:
        """Check the configuration and prepare the log directory.

        Returns ``False`` when the global level, or a per-handler level that
        is set, is not one of the five standard level names (including when
        it is not a string), or when the log directory cannot be created.
        As a side effect the directory of ``file_path`` is created if file
        logging is enabled.
        """
        valid_levels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'}

        def is_valid(name) -> bool:
            return isinstance(name, str) and name.upper() in valid_levels

        # The global level is mandatory.
        if not is_valid(self.level):
            return False

        # Per-handler levels are optional; check them only when set.
        for optional_level in (self.console_level, self.file_level):
            if optional_level and not is_valid(optional_level):
                return False

        # Make sure the log directory exists.
        if self.file_path and self.file_enabled:
            try:
                log_dir = os.path.dirname(self.file_path)
                if log_dir:
                    os.makedirs(log_dir, exist_ok=True)
            except OSError:  # PermissionError is a subclass of OSError
                return False

        return True
crawlo/logging/factory.py CHANGED
@@ -1,129 +1,172 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 日志器工厂 - 创建和缓存Logger实例
5
- """
6
-
7
- import logging
8
- import os
9
- import threading
10
- from logging.handlers import RotatingFileHandler
11
- from typing import Dict, Optional
12
- from weakref import WeakValueDictionary
13
-
14
- from .manager import get_config, is_configured, configure
15
- from .config import LogConfig
16
-
17
-
18
- class LoggerFactory:
19
- """
20
- Logger工厂类 - 负责创建和缓存Logger实例
21
-
22
- 特点:
23
- 1. 使用WeakValueDictionary避免内存泄漏
24
- 2. 线程安全的Logger创建
25
- 3. 自动配置管理
26
- 4. 简单的缓存策略
27
- """
28
-
29
- # Logger缓存 - 使用弱引用避免内存泄漏
30
- _logger_cache: WeakValueDictionary = WeakValueDictionary()
31
- _cache_lock = threading.RLock()
32
-
33
- @classmethod
34
- def get_logger(cls, name: str = 'crawlo') -> logging.Logger:
35
- """
36
- 获取Logger实例
37
-
38
- Args:
39
- name: Logger名称
40
-
41
- Returns:
42
- logging.Logger: 配置好的Logger实例
43
- """
44
- # 确保日志系统已配置
45
- if not is_configured():
46
- configure() # 使用默认配置
47
-
48
- # 检查缓存
49
- with cls._cache_lock:
50
- if name in cls._logger_cache:
51
- return cls._logger_cache[name]
52
-
53
- # 创建新的Logger
54
- logger = cls._create_logger(name)
55
- cls._logger_cache[name] = logger
56
- return logger
57
-
58
- @classmethod
59
- def _create_logger(cls, name: str) -> logging.Logger:
60
- """创建新的Logger实例"""
61
- config = get_config()
62
- if not config:
63
- raise RuntimeError("Log system not configured")
64
-
65
- # 创建Logger
66
- logger = logging.getLogger(name)
67
- logger.setLevel(logging.DEBUG) # Logger本身设为最低级别
68
-
69
- # 清除现有handlers(避免重复添加)
70
- logger.handlers.clear()
71
-
72
- # 获取模块级别
73
- module_level = config.get_module_level(name)
74
- level = getattr(logging, module_level.upper(), logging.INFO)
75
-
76
- # 创建formatter
77
- formatter = logging.Formatter(config.format)
78
-
79
- # 添加控制台Handler
80
- if config.console_enabled:
81
- console_handler = logging.StreamHandler()
82
- console_handler.setFormatter(formatter)
83
- console_handler.setLevel(level)
84
- logger.addHandler(console_handler)
85
-
86
- # 添加文件Handler
87
- if config.file_enabled and config.file_path:
88
- try:
89
- # 确保日志目录存在
90
- log_dir = os.path.dirname(config.file_path)
91
- if log_dir and not os.path.exists(log_dir):
92
- os.makedirs(log_dir, exist_ok=True)
93
-
94
- file_handler = RotatingFileHandler(
95
- filename=config.file_path,
96
- maxBytes=config.max_bytes,
97
- backupCount=config.backup_count,
98
- encoding=config.encoding
99
- )
100
- file_handler.setFormatter(formatter)
101
- file_handler.setLevel(level)
102
- logger.addHandler(file_handler)
103
- except Exception as e:
104
- # 文件Handler创建失败时,至少保证控制台输出
105
- pass
106
-
107
- # 防止向上传播(避免重复输出)
108
- logger.propagate = False
109
-
110
- return logger
111
-
112
- @classmethod
113
- def clear_cache(cls):
114
- """清空Logger缓存"""
115
- with cls._cache_lock:
116
- cls._logger_cache.clear()
117
-
118
- @classmethod
119
- def refresh_loggers(cls, new_config: LogConfig):
120
- """刷新所有缓存的Logger(配置更新时使用)"""
121
- with cls._cache_lock:
122
- # 清空缓存,强制重新创建
123
- cls._logger_cache.clear()
124
-
125
-
126
- # 便捷函数
127
- def get_logger(name: str = 'crawlo') -> logging.Logger:
128
- """获取Logger实例的便捷函数"""
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 日志器工厂 - 创建和缓存Logger实例
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ import sys
10
+ import threading
11
+ from typing import Dict, Optional
12
+ from weakref import WeakValueDictionary
13
+
14
+ # 尝试导入concurrent-log-handler,如果不可用则回退到标准库
15
+ try:
16
+ from concurrent_log_handler import ConcurrentRotatingFileHandler
17
+ USE_CONCURRENT_HANDLER = True
18
+ except ImportError:
19
+ from logging.handlers import RotatingFileHandler
20
+ USE_CONCURRENT_HANDLER = False
21
+
22
+ from .manager import get_config, is_configured, configure
23
+ from .config import LogConfig
24
+
25
+
26
class LoggerFactory:
    """Create and cache configured ``logging.Logger`` instances.

    Features:
    1. Loggers are cached in a ``WeakValueDictionary``.
    2. Logger creation is serialized by a re-entrant lock.
    3. The logging system is configured with defaults on first use.
    4. The file handler uses ``ConcurrentRotatingFileHandler`` when the
       ``concurrent-log-handler`` package is importable, otherwise the
       standard ``RotatingFileHandler``.
    """

    # Logger cache - weak references so the cache itself keeps nothing alive
    _logger_cache: WeakValueDictionary = WeakValueDictionary()
    _cache_lock = threading.RLock()

    @classmethod
    def get_logger(cls, name: str = 'crawlo') -> logging.Logger:
        """Return the configured logger for ``name``, creating it if needed.

        Args:
            name: Logger name.

        Returns:
            logging.Logger: The configured logger instance.
        """
        # Make sure the logging system has a configuration.
        if not is_configured():
            configure()  # use default configuration

        with cls._cache_lock:
            # A single ``get`` avoids the ``in`` / ``[]`` race on a weak dict.
            cached = cls._logger_cache.get(name)
            if cached is not None:
                return cached

            logger = cls._create_logger(name)
            cls._logger_cache[name] = logger
            return logger

    @staticmethod
    def _resolve_level(level_name) -> int:
        """Translate a level name to its numeric value (INFO if unknown)."""
        resolved = getattr(logging, str(level_name).upper(), None)
        return resolved if isinstance(resolved, int) else logging.INFO

    @staticmethod
    def _ensure_console_fallback(logger: logging.Logger, formatter: logging.Formatter) -> None:
        """Attach a WARNING-level console handler unless one is already attached.

        Prevents the fallback paths from stacking a second console handler
        on top of the regular one, which would print every warning twice.
        """
        for handler in logger.handlers:
            # FileHandler subclasses StreamHandler, so exclude it explicitly.
            if isinstance(handler, logging.StreamHandler) and not isinstance(handler, logging.FileHandler):
                return
        fallback = logging.StreamHandler()
        fallback.setFormatter(formatter)
        fallback.setLevel(logging.WARNING)
        logger.addHandler(fallback)

    @classmethod
    def _build_file_handler(cls, config, logger: logging.Logger, formatter: logging.Formatter):
        """Create the rotating file handler described by ``config``.

        Creates the log directory if needed. Any exception is left to the
        caller, which falls back to console output.
        """
        log_dir = os.path.dirname(config.file_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        if USE_CONCURRENT_HANDLER:
            handler_cls = ConcurrentRotatingFileHandler
        else:
            if sys.platform.startswith('win'):
                # Best-effort probe: if the file cannot even be opened for
                # append, another process probably holds it and rotation is
                # likely to fail, so warn the user.
                try:
                    with open(config.file_path, 'a'):
                        pass
                except (PermissionError, OSError):
                    cls._ensure_console_fallback(logger, formatter)
                    logger.warning(f"日志文件 {config.file_path} 可能正在被其他进程使用,这可能导致日志轮转失败。建议安装 concurrent-log-handler 库以获得更好的Windows兼容性。")
            handler_cls = RotatingFileHandler

        return handler_cls(
            filename=config.file_path,
            maxBytes=config.max_bytes,
            backupCount=config.backup_count,
            encoding=config.encoding
        )

    @classmethod
    def _create_logger(cls, name: str) -> logging.Logger:
        """Build a fresh logger for ``name`` from the active configuration.

        Raises:
            RuntimeError: If no logging configuration is available.
        """
        config = get_config()
        if not config:
            raise RuntimeError("Log system not configured")

        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG)  # logger stays wide open; handlers filter

        # Detach existing handlers to avoid duplicates, and close them so
        # their file descriptors are released.
        for stale in list(logger.handlers):
            logger.removeHandler(stale)
            stale.close()

        # Per-module override (falls back to the global level inside config).
        module_level = config.get_module_level(name)

        formatter = logging.Formatter(config.get_format())

        # Console handler
        if config.console_enabled:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
            # Dedicated console level if set, otherwise the module level.
            console_handler.setLevel(
                cls._resolve_level(config.console_level or module_level)
            )
            logger.addHandler(console_handler)

        # File handler
        if config.file_enabled and config.file_path:
            try:
                file_handler = cls._build_file_handler(config, logger, formatter)
                file_handler.setFormatter(formatter)
                # Dedicated file level if set, otherwise the module level.
                file_handler.setLevel(
                    cls._resolve_level(config.file_level or module_level)
                )
                logger.addHandler(file_handler)
            except Exception as e:
                # Deliberate best-effort: if the file handler cannot be
                # created, keep at least console output and report why.
                cls._ensure_console_fallback(logger, formatter)
                logger.warning(f"无法创建文件日志处理器: {e},仅使用控制台输出。")

        # Do not propagate to ancestors (avoids duplicate output).
        logger.propagate = False

        return logger

    @classmethod
    def clear_cache(cls):
        """Empty the logger cache."""
        with cls._cache_lock:
            cls._logger_cache.clear()

    @classmethod
    def refresh_loggers(cls, new_config: 'LogConfig'):
        """Drop all cached loggers so they are rebuilt on next request.

        NOTE(review): ``new_config`` is not used here; this method only
        clears the cache.
        """
        with cls._cache_lock:
            # Clear the cache to force re-creation.
            cls._logger_cache.clear()
167
+
168
+
169
+ # 便捷函数
170
def get_logger(name: str = 'crawlo') -> logging.Logger:
    """Module-level shortcut that returns ``LoggerFactory.get_logger(name)``.

    Args:
        name: Logger name; defaults to ``'crawlo'``.

    Returns:
        The logger produced by ``LoggerFactory`` for that name.
    """
    logger = LoggerFactory.get_logger(name)
    return logger