crawlo 1.4.6__py3-none-any.whl → 1.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (374)
  1. crawlo/__init__.py +90 -89
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -341
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -438
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -291
  19. crawlo/crawler.py +698 -657
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -276
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -247
  25. crawlo/downloader/httpx_downloader.py +265 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -402
  28. crawlo/downloader/selenium_downloader.py +486 -472
  29. crawlo/event.py +45 -11
  30. crawlo/exceptions.py +215 -82
  31. crawlo/extension/__init__.py +65 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +135 -0
  44. crawlo/filters/__init__.py +170 -153
  45. crawlo/filters/aioredis_filter.py +348 -264
  46. crawlo/filters/memory_filter.py +261 -276
  47. crawlo/framework.py +306 -292
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -434
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -194
  52. crawlo/initialization/phases.py +230 -149
  53. crawlo/initialization/registry.py +143 -145
  54. crawlo/initialization/utils.py +49 -0
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -46
  61. crawlo/logging/config.py +277 -197
  62. crawlo/logging/factory.py +175 -171
  63. crawlo/logging/manager.py +104 -112
  64. crawlo/middleware/__init__.py +87 -24
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -253
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +375 -379
  77. crawlo/network/response.py +569 -664
  78. crawlo/pipelines/__init__.py +53 -22
  79. crawlo/pipelines/base_pipeline.py +452 -0
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -132
  87. crawlo/pipelines/mysql_pipeline.py +469 -476
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +10 -0
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -525
  94. crawlo/queue/redis_priority_queue.py +519 -370
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +284 -277
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +657 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +2 -4
  104. crawlo/templates/project/items.py.tmpl +13 -17
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -36
  107. crawlo/templates/project/settings.py.tmpl +109 -111
  108. crawlo/templates/project/settings_distributed.py.tmpl +156 -159
  109. crawlo/templates/project/settings_gentle.py.tmpl +170 -176
  110. crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
  111. crawlo/templates/project/settings_minimal.py.tmpl +98 -100
  112. crawlo/templates/project/settings_simple.py.tmpl +168 -174
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -40
  116. crawlo/templates/spiders_init.py.tmpl +5 -10
  117. crawlo/tools/__init__.py +86 -189
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +50 -50
  123. crawlo/utils/batch_processor.py +276 -259
  124. crawlo/utils/config_manager.py +442 -0
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/error_handler.py +410 -410
  128. crawlo/utils/fingerprint.py +121 -121
  129. crawlo/utils/func_tools.py +82 -82
  130. crawlo/utils/large_scale_helper.py +344 -344
  131. crawlo/utils/leak_detector.py +335 -0
  132. crawlo/utils/log.py +79 -79
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -0
  135. crawlo/utils/mysql_connection_pool.py +197 -0
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +91 -0
  139. crawlo/utils/redis_connection_pool.py +578 -388
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -256
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -0
  144. crawlo/utils/selector_helper.py +137 -137
  145. crawlo/utils/singleton.py +70 -0
  146. crawlo/utils/spider_loader.py +201 -201
  147. crawlo/utils/text_helper.py +94 -94
  148. crawlo/utils/{url.py → url_utils.py} +39 -39
  149. crawlo-1.4.7.dist-info/METADATA +689 -0
  150. crawlo-1.4.7.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -275
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -0
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -0
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/scrapy.cfg +11 -11
  192. tests/optimized_performance_test.py +211 -211
  193. tests/performance_comparison.py +244 -244
  194. tests/queue_blocking_test.py +113 -113
  195. tests/queue_test.py +89 -89
  196. tests/redis_key_validation_demo.py +130 -130
  197. tests/request_params_example.py +150 -150
  198. tests/response_improvements_example.py +144 -144
  199. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  200. tests/scrapy_comparison/scrapy_test.py +133 -133
  201. tests/simple_cli_test.py +55 -0
  202. tests/simple_command_test.py +119 -119
  203. tests/simple_crawlo_test.py +126 -126
  204. tests/simple_follow_test.py +38 -38
  205. tests/simple_log_test2.py +137 -137
  206. tests/simple_optimization_test.py +128 -128
  207. tests/simple_queue_type_test.py +41 -41
  208. tests/simple_response_selector_test.py +94 -94
  209. tests/simple_selector_helper_test.py +154 -154
  210. tests/simple_selector_test.py +207 -207
  211. tests/simple_spider_test.py +49 -49
  212. tests/simple_url_test.py +73 -73
  213. tests/simulate_mysql_update_test.py +139 -139
  214. tests/spider_log_timing_test.py +177 -177
  215. tests/test_advanced_tools.py +148 -148
  216. tests/test_all_commands.py +230 -230
  217. tests/test_all_pipeline_fingerprints.py +133 -133
  218. tests/test_all_redis_key_configs.py +145 -145
  219. tests/test_asyncmy_usage.py +56 -56
  220. tests/test_batch_processor.py +178 -178
  221. tests/test_cleaners.py +54 -54
  222. tests/test_cli_arguments.py +119 -0
  223. tests/test_component_factory.py +174 -174
  224. tests/test_config_consistency.py +80 -80
  225. tests/test_config_merge.py +152 -152
  226. tests/test_config_validator.py +182 -182
  227. tests/test_controlled_spider_mixin.py +79 -79
  228. tests/test_crawler_process_import.py +38 -38
  229. tests/test_crawler_process_spider_modules.py +47 -47
  230. tests/test_crawlo_proxy_integration.py +114 -114
  231. tests/test_date_tools.py +123 -123
  232. tests/test_dedup_fix.py +220 -220
  233. tests/test_dedup_pipeline_consistency.py +124 -124
  234. tests/test_default_header_middleware.py +313 -313
  235. tests/test_distributed.py +65 -65
  236. tests/test_double_crawlo_fix.py +204 -204
  237. tests/test_double_crawlo_fix_simple.py +124 -124
  238. tests/test_download_delay_middleware.py +221 -221
  239. tests/test_downloader_proxy_compatibility.py +272 -272
  240. tests/test_edge_cases.py +305 -305
  241. tests/test_encoding_core.py +56 -56
  242. tests/test_encoding_detection.py +126 -126
  243. tests/test_enhanced_error_handler.py +270 -270
  244. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  245. tests/test_error_handler_compatibility.py +112 -112
  246. tests/test_factories.py +252 -252
  247. tests/test_factory_compatibility.py +196 -196
  248. tests/test_final_validation.py +153 -153
  249. tests/test_fingerprint_consistency.py +135 -135
  250. tests/test_fingerprint_simple.py +51 -51
  251. tests/test_get_component_logger.py +83 -83
  252. tests/test_hash_performance.py +99 -99
  253. tests/test_integration.py +169 -169
  254. tests/test_item_dedup_redis_key.py +122 -122
  255. tests/test_large_scale_helper.py +235 -235
  256. tests/test_logging_enhancements.py +374 -374
  257. tests/test_logging_final.py +184 -184
  258. tests/test_logging_integration.py +312 -312
  259. tests/test_logging_system.py +282 -282
  260. tests/test_middleware_debug.py +141 -141
  261. tests/test_mode_consistency.py +51 -51
  262. tests/test_multi_directory.py +67 -67
  263. tests/test_multiple_spider_modules.py +80 -80
  264. tests/test_mysql_pipeline_config.py +164 -164
  265. tests/test_mysql_pipeline_error.py +98 -98
  266. tests/test_mysql_pipeline_init_log.py +82 -82
  267. tests/test_mysql_pipeline_integration.py +132 -132
  268. tests/test_mysql_pipeline_refactor.py +143 -143
  269. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  270. tests/test_mysql_pipeline_robustness.py +195 -195
  271. tests/test_mysql_pipeline_types.py +88 -88
  272. tests/test_mysql_update_columns.py +93 -93
  273. tests/test_offsite_middleware.py +244 -244
  274. tests/test_offsite_middleware_simple.py +203 -203
  275. tests/test_optimized_selector_naming.py +100 -100
  276. tests/test_parsel.py +29 -29
  277. tests/test_performance.py +327 -327
  278. tests/test_performance_monitor.py +115 -115
  279. tests/test_pipeline_fingerprint_consistency.py +86 -86
  280. tests/test_priority_behavior.py +211 -211
  281. tests/test_priority_consistency.py +151 -151
  282. tests/test_priority_consistency_fixed.py +249 -249
  283. tests/test_proxy_health_check.py +32 -32
  284. tests/test_proxy_middleware.py +217 -217
  285. tests/test_proxy_middleware_enhanced.py +212 -212
  286. tests/test_proxy_middleware_integration.py +142 -142
  287. tests/test_proxy_middleware_refactored.py +207 -207
  288. tests/test_proxy_only.py +83 -83
  289. tests/test_proxy_providers.py +56 -56
  290. tests/test_proxy_stats.py +19 -19
  291. tests/test_proxy_strategies.py +59 -59
  292. tests/test_proxy_with_downloader.py +152 -152
  293. tests/test_queue_empty_check.py +41 -41
  294. tests/test_queue_manager_double_crawlo.py +173 -173
  295. tests/test_queue_manager_redis_key.py +179 -179
  296. tests/test_queue_naming.py +154 -154
  297. tests/test_queue_type.py +106 -106
  298. tests/test_queue_type_redis_config_consistency.py +130 -130
  299. tests/test_random_headers_default.py +322 -322
  300. tests/test_random_headers_necessity.py +308 -308
  301. tests/test_random_user_agent.py +72 -72
  302. tests/test_redis_config.py +28 -28
  303. tests/test_redis_connection_pool.py +294 -294
  304. tests/test_redis_key_naming.py +181 -181
  305. tests/test_redis_key_validator.py +123 -123
  306. tests/test_redis_queue.py +224 -224
  307. tests/test_redis_queue_name_fix.py +175 -175
  308. tests/test_redis_queue_type_fallback.py +129 -129
  309. tests/test_request_ignore_middleware.py +182 -182
  310. tests/test_request_params.py +111 -111
  311. tests/test_request_serialization.py +70 -70
  312. tests/test_response_code_middleware.py +349 -349
  313. tests/test_response_filter_middleware.py +427 -427
  314. tests/test_response_follow.py +104 -104
  315. tests/test_response_improvements.py +152 -152
  316. tests/test_response_selector_methods.py +92 -92
  317. tests/test_response_url_methods.py +70 -70
  318. tests/test_response_urljoin.py +86 -86
  319. tests/test_retry_middleware.py +333 -333
  320. tests/test_retry_middleware_realistic.py +273 -273
  321. tests/test_scheduler.py +252 -252
  322. tests/test_scheduler_config_update.py +133 -133
  323. tests/test_scrapy_style_encoding.py +112 -112
  324. tests/test_selector_helper.py +100 -100
  325. tests/test_selector_optimizations.py +146 -146
  326. tests/test_simple_response.py +61 -61
  327. tests/test_spider_loader.py +49 -49
  328. tests/test_spider_loader_comprehensive.py +69 -69
  329. tests/test_spider_modules.py +84 -84
  330. tests/test_spiders/test_spider.py +9 -9
  331. tests/test_telecom_spider_redis_key.py +205 -205
  332. tests/test_template_content.py +87 -87
  333. tests/test_template_redis_key.py +134 -134
  334. tests/test_tools.py +159 -159
  335. tests/test_user_agent_randomness.py +176 -176
  336. tests/test_user_agents.py +96 -96
  337. tests/untested_features_report.md +138 -138
  338. tests/verify_debug.py +51 -51
  339. tests/verify_distributed.py +117 -117
  340. tests/verify_log_fix.py +111 -111
  341. tests/verify_mysql_warnings.py +109 -109
  342. crawlo/logging/async_handler.py +0 -181
  343. crawlo/logging/monitor.py +0 -153
  344. crawlo/logging/sampler.py +0 -167
  345. crawlo/tools/authenticated_proxy.py +0 -241
  346. crawlo/tools/data_formatter.py +0 -226
  347. crawlo/tools/data_validator.py +0 -181
  348. crawlo/tools/encoding_converter.py +0 -127
  349. crawlo/tools/network_diagnostic.py +0 -365
  350. crawlo/tools/request_tools.py +0 -83
  351. crawlo/tools/retry_mechanism.py +0 -224
  352. crawlo/utils/env_config.py +0 -143
  353. crawlo/utils/large_scale_config.py +0 -287
  354. crawlo/utils/system.py +0 -11
  355. crawlo/utils/tools.py +0 -5
  356. crawlo-1.4.6.dist-info/METADATA +0 -329
  357. crawlo-1.4.6.dist-info/RECORD +0 -361
  358. tests/env_config_example.py +0 -134
  359. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  360. tests/test_authenticated_proxy.py +0 -142
  361. tests/test_comprehensive.py +0 -147
  362. tests/test_dynamic_downloaders_proxy.py +0 -125
  363. tests/test_dynamic_proxy.py +0 -93
  364. tests/test_dynamic_proxy_config.py +0 -147
  365. tests/test_dynamic_proxy_real.py +0 -110
  366. tests/test_env_config.py +0 -122
  367. tests/test_framework_env_usage.py +0 -104
  368. tests/test_large_scale_config.py +0 -113
  369. tests/test_proxy_api.py +0 -265
  370. tests/test_real_scenario_proxy.py +0 -196
  371. tests/tools_example.py +0 -261
  372. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
  373. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
  374. {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
tests/verify_log_fix.py CHANGED
@@ -1,112 +1,112 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 验证日志级别修复效果
5
- 创建一个简化的测试来验证控制台和日志文件级别的一致性
6
- """
7
- import sys
8
- import os
9
- import tempfile
10
-
11
- # 添加项目根目录到Python路径
12
- sys.path.insert(0, '/')
13
-
14
- from crawlo.utils.log import LoggerManager, get_logger
15
-
16
-
17
- def main():
18
- """验证日志级别修复效果"""
19
- print("🔧 验证日志级别修复效果")
20
- print("=" * 50)
21
-
22
- # 创建临时日志文件
23
- temp_log = tempfile.NamedTemporaryFile(mode='w+', suffix='.log', delete=False)
24
- temp_log_path = temp_log.name
25
- temp_log.close()
26
-
27
- try:
28
- # 重置LoggerManager状态
29
- LoggerManager.reset()
30
-
31
- # 使用INFO级别配置
32
- LoggerManager.configure(
33
- LOG_LEVEL='INFO',
34
- LOG_FILE=temp_log_path,
35
- LOG_FORMAT='%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
36
- )
37
-
38
- print(f"✅ 配置完成:")
39
- print(f" 默认级别: {LoggerManager._default_level}")
40
- print(f" 控制台级别: {LoggerManager._default_console_level}")
41
- print(f" 文件级别: {LoggerManager._default_file_level}")
42
- print(f" 日志文件: {temp_log_path}")
43
-
44
- # 创建测试logger
45
- test_logger = get_logger('crawlo.test')
46
-
47
- # 检查handler配置
48
- print(f"\n📋 Handler配置:")
49
- for i, handler in enumerate(test_logger.handlers):
50
- handler_type = type(handler).__name__
51
- handler_level = handler.level
52
- print(f" Handler {i} ({handler_type}): 级别 {handler_level}")
53
-
54
- # 测试日志输出
55
- print(f"\n📝 测试日志输出(控制台):")
56
- test_logger.debug("这是DEBUG级别日志 - 不应该显示")
57
- test_logger.info("这是INFO级别日志 - 应该显示")
58
- test_logger.warning("这是WARNING级别日志 - 应该显示")
59
- test_logger.error("这是ERROR级别日志 - 应该显示")
60
-
61
- # 检查日志文件内容
62
- print(f"\n📄 检查日志文件内容:")
63
- with open(temp_log_path, 'r', encoding='utf-8') as f:
64
- log_content = f.read()
65
- if log_content:
66
- print("日志文件内容:")
67
- print(log_content)
68
- else:
69
- print("❌ 日志文件为空")
70
-
71
- # 分析结果
72
- lines = log_content.strip().split('\n') if log_content.strip() else []
73
- debug_lines = [line for line in lines if '- DEBUG:' in line]
74
- info_lines = [line for line in lines if '- INFO:' in line]
75
- warning_lines = [line for line in lines if '- WARNING:' in line]
76
- error_lines = [line for line in lines if '- ERROR:' in line]
77
-
78
- print(f"\n📊 分析结果:")
79
- print(f" DEBUG级别日志: {len(debug_lines)}条 {'✅ 正确' if len(debug_lines) == 0 else '❌ 错误'}")
80
- print(f" INFO级别日志: {len(info_lines)}条 {'✅ 正确' if len(info_lines) >= 1 else '❌ 错误'}")
81
- print(f" WARNING级别日志: {len(warning_lines)}条 {'✅ 正确' if len(warning_lines) >= 1 else '❌ 错误'}")
82
- print(f" ERROR级别日志: {len(error_lines)}条 {'✅ 正确' if len(error_lines) >= 1 else '❌ 错误'}")
83
-
84
- # 判断修复是否成功
85
- success = (len(debug_lines) == 0 and len(info_lines) >= 1 and
86
- len(warning_lines) >= 1 and len(error_lines) >= 1)
87
-
88
- print(f"\n🎯 修复结果: {'✅ 成功' if success else '❌ 失败'}")
89
-
90
- if success:
91
- print("📋 控制台和日志文件现在使用相同的INFO级别")
92
- print("🎉 日志级别一致性问题已解决")
93
- else:
94
- print("❌ 仍存在日志级别不一致问题,需要进一步调试")
95
-
96
- except Exception as e:
97
- print(f"❌ 验证过程中发生错误: {e}")
98
- import traceback
99
- traceback.print_exc()
100
- return 1
101
- finally:
102
- # 清理临时文件
103
- try:
104
- os.unlink(temp_log_path)
105
- except:
106
- pass
107
-
108
- return 0 if success else 1
109
-
110
-
111
- if __name__ == '__main__':
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 验证日志级别修复效果
5
+ 创建一个简化的测试来验证控制台和日志文件级别的一致性
6
+ """
7
+ import sys
8
+ import os
9
+ import tempfile
10
+
11
+ # 添加项目根目录到Python路径
12
+ sys.path.insert(0, '/')
13
+
14
+ from crawlo.utils.log import LoggerManager, get_logger
15
+
16
+
17
+ def main():
18
+ """验证日志级别修复效果"""
19
+ print("🔧 验证日志级别修复效果")
20
+ print("=" * 50)
21
+
22
+ # 创建临时日志文件
23
+ temp_log = tempfile.NamedTemporaryFile(mode='w+', suffix='.log', delete=False)
24
+ temp_log_path = temp_log.name
25
+ temp_log.close()
26
+
27
+ try:
28
+ # 重置LoggerManager状态
29
+ LoggerManager.reset()
30
+
31
+ # 使用INFO级别配置
32
+ LoggerManager.configure(
33
+ LOG_LEVEL='INFO',
34
+ LOG_FILE=temp_log_path,
35
+ LOG_FORMAT='%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
36
+ )
37
+
38
+ print(f"✅ 配置完成:")
39
+ print(f" 默认级别: {LoggerManager._default_level}")
40
+ print(f" 控制台级别: {LoggerManager._default_console_level}")
41
+ print(f" 文件级别: {LoggerManager._default_file_level}")
42
+ print(f" 日志文件: {temp_log_path}")
43
+
44
+ # 创建测试logger
45
+ test_logger = get_logger('crawlo.test')
46
+
47
+ # 检查handler配置
48
+ print(f"\n📋 Handler配置:")
49
+ for i, handler in enumerate(test_logger.handlers):
50
+ handler_type = type(handler).__name__
51
+ handler_level = handler.level
52
+ print(f" Handler {i} ({handler_type}): 级别 {handler_level}")
53
+
54
+ # 测试日志输出
55
+ print(f"\n📝 测试日志输出(控制台):")
56
+ test_logger.debug("这是DEBUG级别日志 - 不应该显示")
57
+ test_logger.info("这是INFO级别日志 - 应该显示")
58
+ test_logger.warning("这是WARNING级别日志 - 应该显示")
59
+ test_logger.error("这是ERROR级别日志 - 应该显示")
60
+
61
+ # 检查日志文件内容
62
+ print(f"\n📄 检查日志文件内容:")
63
+ with open(temp_log_path, 'r', encoding='utf-8') as f:
64
+ log_content = f.read()
65
+ if log_content:
66
+ print("日志文件内容:")
67
+ print(log_content)
68
+ else:
69
+ print("❌ 日志文件为空")
70
+
71
+ # 分析结果
72
+ lines = log_content.strip().split('\n') if log_content.strip() else []
73
+ debug_lines = [line for line in lines if '- DEBUG:' in line]
74
+ info_lines = [line for line in lines if '- INFO:' in line]
75
+ warning_lines = [line for line in lines if '- WARNING:' in line]
76
+ error_lines = [line for line in lines if '- ERROR:' in line]
77
+
78
+ print(f"\n📊 分析结果:")
79
+ print(f" DEBUG级别日志: {len(debug_lines)}条 {'✅ 正确' if len(debug_lines) == 0 else '❌ 错误'}")
80
+ print(f" INFO级别日志: {len(info_lines)}条 {'✅ 正确' if len(info_lines) >= 1 else '❌ 错误'}")
81
+ print(f" WARNING级别日志: {len(warning_lines)}条 {'✅ 正确' if len(warning_lines) >= 1 else '❌ 错误'}")
82
+ print(f" ERROR级别日志: {len(error_lines)}条 {'✅ 正确' if len(error_lines) >= 1 else '❌ 错误'}")
83
+
84
+ # 判断修复是否成功
85
+ success = (len(debug_lines) == 0 and len(info_lines) >= 1 and
86
+ len(warning_lines) >= 1 and len(error_lines) >= 1)
87
+
88
+ print(f"\n🎯 修复结果: {'✅ 成功' if success else '❌ 失败'}")
89
+
90
+ if success:
91
+ print("📋 控制台和日志文件现在使用相同的INFO级别")
92
+ print("🎉 日志级别一致性问题已解决")
93
+ else:
94
+ print("❌ 仍存在日志级别不一致问题,需要进一步调试")
95
+
96
+ except Exception as e:
97
+ print(f"❌ 验证过程中发生错误: {e}")
98
+ import traceback
99
+ traceback.print_exc()
100
+ return 1
101
+ finally:
102
+ # 清理临时文件
103
+ try:
104
+ os.unlink(temp_log_path)
105
+ except:
106
+ pass
107
+
108
+ return 0 if success else 1
109
+
110
+
111
+ if __name__ == '__main__':
112
112
  sys.exit(main())
tests/verify_mysql_warnings.py CHANGED
@@ -1,110 +1,110 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- 验证 MySQL 警告是否已解决
4
- 通过模拟实际运行环境来检查
5
- """
6
- import asyncio
7
- import sys
8
- import os
9
-
10
- # 添加项目根目录到 Python 路径
11
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
12
-
13
- from crawlo.utils.db_helper import SQLBuilder
14
- from crawlo.pipelines.mysql_pipeline import BaseMySQLPipeline, AsyncmyMySQLPipeline, AiomysqlMySQLPipeline
15
-
16
-
17
- def verify_mysql_syntax():
18
- """验证 MySQL 语法是否正确,不会产生警告"""
19
- print("=== 验证 MySQL 语法是否正确 ===\n")
20
-
21
- # 模拟实际使用的数据
22
- test_data = {
23
- 'title': '新一代OLED屏下光谱颜色传感技术:解锁显示新密码,重塑视觉新体验',
24
- 'publish_time': '2025-10-09 09:57',
25
- 'url': 'https://ee.ofweek.com/2025-10/ART-8460-2806-30671544.html',
26
- 'source': '',
27
- 'content': '在全球智能手机市场竞争日趋白热化的当下,消费者对手机屏幕显示效果的要求愈发严苛...'
28
- }
29
-
30
- # 模拟 ofweek_standalone 项目的配置
31
- update_columns = ('title', 'publish_time')
32
-
33
- print("1. 检查 SQLBuilder 生成的语法...")
34
- sql = SQLBuilder.make_insert(
35
- table="news_items",
36
- data=test_data,
37
- auto_update=False,
38
- update_columns=update_columns,
39
- insert_ignore=False
40
- )
41
-
42
- print("生成的 SQL:")
43
- print(sql[:200] + "..." if len(sql) > 200 else sql)
44
- print()
45
-
46
- # 检查是否包含弃用的 VALUES() 函数用法
47
- if "VALUES(`title`)" in sql or "VALUES(`publish_time`)" in sql:
48
- print("✗ 发现弃用的 VALUES() 函数用法,会产生警告")
49
- return False
50
- else:
51
- print("✓ 未发现弃用的 VALUES() 函数用法")
52
-
53
- if "AS `excluded`" in sql and "ON DUPLICATE KEY UPDATE" in sql:
54
- print("✓ 正确使用了新的 MySQL 语法")
55
- else:
56
- print("✗ 未正确使用新的 MySQL 语法")
57
- return False
58
-
59
- # 检查更新子句
60
- if "`title`=`excluded`.`title`" in sql and "`publish_time`=`excluded`.`publish_time`" in sql:
61
- print("✓ 更新子句正确使用了 excluded 别名")
62
- else:
63
- print("✗ 更新子句语法不正确")
64
- return False
65
-
66
- print("\n2. 检查批量插入语法...")
67
- batch_result = SQLBuilder.make_batch(
68
- table="news_items",
69
- datas=[test_data, test_data],
70
- auto_update=False,
71
- update_columns=update_columns
72
- )
73
-
74
- if batch_result:
75
- batch_sql, _ = batch_result
76
- print("生成的批量 SQL:")
77
- print(batch_sql[:200] + "..." if len(batch_sql) > 200 else batch_sql)
78
- print()
79
-
80
- # 检查批量插入语法
81
- if "VALUES(`title`)" in batch_sql or "VALUES(`publish_time`)" in batch_sql:
82
- print("✗ 批量插入中发现弃用的 VALUES() 函数用法,会产生警告")
83
- return False
84
- else:
85
- print("✓ 批量插入未发现弃用的 VALUES() 函数用法")
86
-
87
- if "AS `excluded`" in batch_sql and "ON DUPLICATE KEY UPDATE" in batch_sql:
88
- print("✓ 批量插入正确使用了新的 MySQL 语法")
89
- else:
90
- print("✗ 批量插入未正确使用新的 MySQL 语法")
91
- return False
92
-
93
- # 检查批量更新子句
94
- if "`title`=`excluded`.`title`" in batch_sql and "`publish_time`=`excluded`.`publish_time`" in batch_sql:
95
- print("✓ 批量插入更新子句正确使用了 excluded 别名")
96
- else:
97
- print("✗ 批量插入更新子句语法不正确")
98
- return False
99
-
100
- print("\n=== 验证完成 ===")
101
- print("✓ 所有语法检查通过,应该不会再出现 MySQL 的 VALUES() 函数弃用警告")
102
- return True
103
-
104
-
105
- if __name__ == "__main__":
106
- success = verify_mysql_syntax()
107
- if success:
108
- print("\n🎉 MySQL 语法问题已解决!")
109
- else:
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 验证 MySQL 警告是否已解决
4
+ 通过模拟实际运行环境来检查
5
+ """
6
+ import asyncio
7
+ import sys
8
+ import os
9
+
10
+ # 添加项目根目录到 Python 路径
11
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
12
+
13
+ from crawlo.utils.db_helper import SQLBuilder
14
+ from crawlo.pipelines.mysql_pipeline import BaseMySQLPipeline, AsyncmyMySQLPipeline, AiomysqlMySQLPipeline
15
+
16
+
17
+ def verify_mysql_syntax():
18
+ """验证 MySQL 语法是否正确,不会产生警告"""
19
+ print("=== 验证 MySQL 语法是否正确 ===\n")
20
+
21
+ # 模拟实际使用的数据
22
+ test_data = {
23
+ 'title': '新一代OLED屏下光谱颜色传感技术:解锁显示新密码,重塑视觉新体验',
24
+ 'publish_time': '2025-10-09 09:57',
25
+ 'url': 'https://ee.ofweek.com/2025-10/ART-8460-2806-30671544.html',
26
+ 'source': '',
27
+ 'content': '在全球智能手机市场竞争日趋白热化的当下,消费者对手机屏幕显示效果的要求愈发严苛...'
28
+ }
29
+
30
+ # 模拟 ofweek_standalone 项目的配置
31
+ update_columns = ('title', 'publish_time')
32
+
33
+ print("1. 检查 SQLBuilder 生成的语法...")
34
+ sql = SQLBuilder.make_insert(
35
+ table="news_items",
36
+ data=test_data,
37
+ auto_update=False,
38
+ update_columns=update_columns,
39
+ insert_ignore=False
40
+ )
41
+
42
+ print("生成的 SQL:")
43
+ print(sql[:200] + "..." if len(sql) > 200 else sql)
44
+ print()
45
+
46
+ # 检查是否包含弃用的 VALUES() 函数用法
47
+ if "VALUES(`title`)" in sql or "VALUES(`publish_time`)" in sql:
48
+ print("✗ 发现弃用的 VALUES() 函数用法,会产生警告")
49
+ return False
50
+ else:
51
+ print("✓ 未发现弃用的 VALUES() 函数用法")
52
+
53
+ if "AS `excluded`" in sql and "ON DUPLICATE KEY UPDATE" in sql:
54
+ print("✓ 正确使用了新的 MySQL 语法")
55
+ else:
56
+ print("✗ 未正确使用新的 MySQL 语法")
57
+ return False
58
+
59
+ # 检查更新子句
60
+ if "`title`=`excluded`.`title`" in sql and "`publish_time`=`excluded`.`publish_time`" in sql:
61
+ print("✓ 更新子句正确使用了 excluded 别名")
62
+ else:
63
+ print("✗ 更新子句语法不正确")
64
+ return False
65
+
66
+ print("\n2. 检查批量插入语法...")
67
+ batch_result = SQLBuilder.make_batch(
68
+ table="news_items",
69
+ datas=[test_data, test_data],
70
+ auto_update=False,
71
+ update_columns=update_columns
72
+ )
73
+
74
+ if batch_result:
75
+ batch_sql, _ = batch_result
76
+ print("生成的批量 SQL:")
77
+ print(batch_sql[:200] + "..." if len(batch_sql) > 200 else batch_sql)
78
+ print()
79
+
80
+ # 检查批量插入语法
81
+ if "VALUES(`title`)" in batch_sql or "VALUES(`publish_time`)" in batch_sql:
82
+ print("✗ 批量插入中发现弃用的 VALUES() 函数用法,会产生警告")
83
+ return False
84
+ else:
85
+ print("✓ 批量插入未发现弃用的 VALUES() 函数用法")
86
+
87
+ if "AS `excluded`" in batch_sql and "ON DUPLICATE KEY UPDATE" in batch_sql:
88
+ print("✓ 批量插入正确使用了新的 MySQL 语法")
89
+ else:
90
+ print("✗ 批量插入未正确使用新的 MySQL 语法")
91
+ return False
92
+
93
+ # 检查批量更新子句
94
+ if "`title`=`excluded`.`title`" in batch_sql and "`publish_time`=`excluded`.`publish_time`" in batch_sql:
95
+ print("✓ 批量插入更新子句正确使用了 excluded 别名")
96
+ else:
97
+ print("✗ 批量插入更新子句语法不正确")
98
+ return False
99
+
100
+ print("\n=== 验证完成 ===")
101
+ print("✓ 所有语法检查通过,应该不会再出现 MySQL 的 VALUES() 函数弃用警告")
102
+ return True
103
+
104
+
105
+ if __name__ == "__main__":
106
+ success = verify_mysql_syntax()
107
+ if success:
108
+ print("\n🎉 MySQL 语法问题已解决!")
109
+ else:
110
110
  print("\n❌ 仍存在 MySQL 语法问题需要修复")
crawlo/logging/async_handler.py DELETED
@@ -1,181 +0,0 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 异步日志处理器
5
- 用于提高日志写入性能
6
- """
7
-
8
- import asyncio
9
- import logging
10
- import threading
11
- import queue
12
- from typing import Optional
13
- from concurrent_log_handler import ConcurrentRotatingFileHandler
14
-
15
-
16
- class AsyncLogHandler(logging.Handler):
17
- """
18
- 异步日志处理器
19
- 将日志记录放入队列中,由后台线程异步处理
20
- """
21
-
22
- def __init__(self, handler: logging.Handler, queue_size: int = 10000):
23
- """
24
- 初始化异步日志处理器
25
-
26
- Args:
27
- handler: 实际的日志处理器
28
- queue_size: 队列大小
29
- """
30
- super().__init__()
31
- self._handler = handler
32
- self._queue = queue.Queue(maxsize=queue_size)
33
- self._thread: Optional[threading.Thread] = None
34
- self._stop_event = threading.Event()
35
- self._started = False
36
-
37
- def start(self):
38
- """启动异步处理线程"""
39
- if self._started:
40
- return
41
-
42
- self._started = True
43
- self._stop_event.clear()
44
- self._thread = threading.Thread(target=self._worker, daemon=True)
45
- self._thread.start()
46
-
47
- def stop(self):
48
- """停止异步处理线程"""
49
- if not self._started:
50
- return
51
-
52
- self._started = False
53
- self._stop_event.set()
54
-
55
- # 发送一个哨兵消息来唤醒工作线程
56
- try:
57
- self._queue.put_nowait(None)
58
- except queue.Full:
59
- pass
60
-
61
- # 等待线程结束
62
- if self._thread and self._thread.is_alive():
63
- self._thread.join(timeout=5.0)
64
-
65
- # 关闭底层处理器
66
- if self._handler:
67
- self._handler.close()
68
-
69
- def _worker(self):
70
- """工作线程函数"""
71
- while not self._stop_event.is_set():
72
- try:
73
- # 从队列中获取日志记录
74
- record = self._queue.get(timeout=1.0)
75
-
76
- # 哨兵消息,表示停止
77
- if record is None:
78
- break
79
-
80
- # 处理日志记录
81
- try:
82
- self._handler.emit(record)
83
- except Exception:
84
- pass # 忽略处理错误
85
-
86
- self._queue.task_done()
87
-
88
- except queue.Empty:
89
- continue
90
- except Exception:
91
- if not self._stop_event.is_set():
92
- continue
93
- else:
94
- break
95
-
96
- def emit(self, record):
97
- """
98
- 发出日志记录
99
-
100
- Args:
101
- record: 日志记录
102
- """
103
- if not self._started:
104
- self.start()
105
-
106
- # 将日志记录放入队列
107
- try:
108
- self._queue.put_nowait(record)
109
- except queue.Full:
110
- # 队列满时丢弃日志记录
111
- pass
112
-
113
- def flush(self):
114
- """刷新日志处理器"""
115
- if self._handler:
116
- self._handler.flush()
117
-
118
- def close(self):
119
- """关闭日志处理器"""
120
- self.stop()
121
- super().close()
122
-
123
-
124
- class AsyncConcurrentRotatingFileHandler(AsyncLogHandler):
125
- """
126
- 异步并发轮转文件处理器
127
- 结合了异步处理和并发轮转文件的功能
128
- """
129
-
130
- def __init__(self, filename, mode='a', maxBytes=0, backupCount=0,
131
- encoding=None, delay=False, queue_size: int = 10000):
132
- """
133
- 初始化异步并发轮转文件处理器
134
-
135
- Args:
136
- filename: 日志文件名
137
- mode: 文件打开模式
138
- maxBytes: 最大文件大小
139
- backupCount: 备份文件数量
140
- encoding: 文件编码
141
- delay: 是否延迟打开文件
142
- queue_size: 队列大小
143
- """
144
- handler = ConcurrentRotatingFileHandler(
145
- filename=filename,
146
- mode=mode,
147
- maxBytes=maxBytes,
148
- backupCount=backupCount,
149
- encoding=encoding,
150
- delay=delay
151
- )
152
- super().__init__(handler, queue_size)
153
-
154
- @property
155
- def baseFilename(self):
156
- """获取基础文件名"""
157
- return self._handler.baseFilename if self._handler else None
158
-
159
- @property
160
- def maxBytes(self):
161
- """获取最大字节数"""
162
- return self._handler.maxBytes if self._handler else 0
163
-
164
- @property
165
- def backupCount(self):
166
- """获取备份计数"""
167
- return self._handler.backupCount if self._handler else 0
168
-
169
-
170
- def wrap_handler_async(handler: logging.Handler, queue_size: int = 10000) -> AsyncLogHandler:
171
- """
172
- 将现有的日志处理器包装为异步处理器
173
-
174
- Args:
175
- handler: 要包装的日志处理器
176
- queue_size: 队列大小
177
-
178
- Returns:
179
- 异步日志处理器
180
- """
181
- return AsyncLogHandler(handler, queue_size)