crawlo 1.3.2__py3-none-any.whl → 1.3.4__py3-none-any.whl

This diff shows the changes between two package versions as they were publicly released to a supported registry. It is provided for informational purposes only.

Files changed (105)
  1. crawlo/__init__.py +24 -0
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/run.py +58 -32
  4. crawlo/core/__init__.py +44 -0
  5. crawlo/core/engine.py +119 -45
  6. crawlo/core/scheduler.py +4 -3
  7. crawlo/crawler.py +603 -1133
  8. crawlo/downloader/aiohttp_downloader.py +4 -2
  9. crawlo/extension/__init__.py +1 -1
  10. crawlo/extension/logging_extension.py +23 -7
  11. crawlo/factories/__init__.py +28 -0
  12. crawlo/factories/base.py +69 -0
  13. crawlo/factories/crawler.py +104 -0
  14. crawlo/factories/registry.py +85 -0
  15. crawlo/filters/aioredis_filter.py +25 -2
  16. crawlo/framework.py +292 -0
  17. crawlo/initialization/__init__.py +40 -0
  18. crawlo/initialization/built_in.py +426 -0
  19. crawlo/initialization/context.py +142 -0
  20. crawlo/initialization/core.py +194 -0
  21. crawlo/initialization/phases.py +149 -0
  22. crawlo/initialization/registry.py +146 -0
  23. crawlo/items/base.py +2 -1
  24. crawlo/logging/__init__.py +38 -0
  25. crawlo/logging/config.py +97 -0
  26. crawlo/logging/factory.py +129 -0
  27. crawlo/logging/manager.py +112 -0
  28. crawlo/middleware/middleware_manager.py +1 -1
  29. crawlo/middleware/offsite.py +1 -1
  30. crawlo/mode_manager.py +26 -1
  31. crawlo/pipelines/pipeline_manager.py +2 -1
  32. crawlo/project.py +76 -46
  33. crawlo/queue/pqueue.py +11 -5
  34. crawlo/queue/queue_manager.py +143 -19
  35. crawlo/queue/redis_priority_queue.py +69 -49
  36. crawlo/settings/default_settings.py +110 -14
  37. crawlo/settings/setting_manager.py +29 -13
  38. crawlo/spider/__init__.py +34 -16
  39. crawlo/stats_collector.py +17 -3
  40. crawlo/task_manager.py +112 -3
  41. crawlo/templates/project/settings.py.tmpl +103 -202
  42. crawlo/templates/project/settings_distributed.py.tmpl +122 -135
  43. crawlo/templates/project/settings_gentle.py.tmpl +149 -43
  44. crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
  45. crawlo/templates/project/settings_minimal.py.tmpl +46 -15
  46. crawlo/templates/project/settings_simple.py.tmpl +138 -75
  47. crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
  48. crawlo/templates/run.py.tmpl +10 -14
  49. crawlo/templates/spiders_init.py.tmpl +10 -0
  50. crawlo/tools/network_diagnostic.py +365 -0
  51. crawlo/utils/class_loader.py +26 -0
  52. crawlo/utils/error_handler.py +76 -35
  53. crawlo/utils/log.py +41 -144
  54. crawlo/utils/redis_connection_pool.py +43 -6
  55. crawlo/utils/request_serializer.py +8 -1
  56. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
  57. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
  58. tests/authenticated_proxy_example.py +2 -2
  59. tests/baidu_performance_test.py +109 -0
  60. tests/baidu_test.py +60 -0
  61. tests/comprehensive_framework_test.py +213 -0
  62. tests/comprehensive_test.py +82 -0
  63. tests/comprehensive_testing_summary.md +187 -0
  64. tests/debug_configure.py +70 -0
  65. tests/debug_framework_logger.py +85 -0
  66. tests/debug_log_levels.py +64 -0
  67. tests/distributed_test.py +67 -0
  68. tests/distributed_test_debug.py +77 -0
  69. tests/final_command_test_report.md +0 -0
  70. tests/final_comprehensive_test.py +152 -0
  71. tests/final_validation_test.py +183 -0
  72. tests/framework_performance_test.py +203 -0
  73. tests/optimized_performance_test.py +212 -0
  74. tests/performance_comparison.py +246 -0
  75. tests/queue_blocking_test.py +114 -0
  76. tests/queue_test.py +90 -0
  77. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  78. tests/scrapy_comparison/scrapy_test.py +134 -0
  79. tests/simple_command_test.py +120 -0
  80. tests/simple_crawlo_test.py +128 -0
  81. tests/simple_log_test.py +58 -0
  82. tests/simple_optimization_test.py +129 -0
  83. tests/simple_spider_test.py +50 -0
  84. tests/simple_test.py +48 -0
  85. tests/test_all_commands.py +231 -0
  86. tests/test_batch_processor.py +179 -0
  87. tests/test_component_factory.py +175 -0
  88. tests/test_controlled_spider_mixin.py +80 -0
  89. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  90. tests/test_factories.py +253 -0
  91. tests/test_framework_logger.py +67 -0
  92. tests/test_framework_startup.py +65 -0
  93. tests/test_large_scale_config.py +113 -0
  94. tests/test_large_scale_helper.py +236 -0
  95. tests/test_mode_change.py +73 -0
  96. tests/test_mode_consistency.py +1 -1
  97. tests/test_performance_monitor.py +116 -0
  98. tests/test_queue_empty_check.py +42 -0
  99. tests/untested_features_report.md +139 -0
  100. tests/verify_debug.py +52 -0
  101. tests/verify_log_fix.py +112 -0
  102. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  103. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
  104. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
  105. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,112 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Verify the effect of the log-level fix.
+ A simplified test that checks the console and log-file levels are consistent.
+ """
+ import sys
+ import os
+ import tempfile
+
+ # Add the project root directory to the Python path
+ sys.path.insert(0, '/')
+
+ from crawlo.utils.log import LoggerManager, get_logger
+
+
+ def main():
+     """Verify the effect of the log-level fix."""
+     print("🔧 Verifying the log-level fix")
+     print("=" * 50)
+
+     # Create a temporary log file
+     temp_log = tempfile.NamedTemporaryFile(mode='w+', suffix='.log', delete=False)
+     temp_log_path = temp_log.name
+     temp_log.close()
+
+     try:
+         # Reset the LoggerManager state
+         LoggerManager.reset()
+
+         # Configure with INFO level
+         LoggerManager.configure(
+             LOG_LEVEL='INFO',
+             LOG_FILE=temp_log_path,
+             LOG_FORMAT='%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
+         )
+
+         print("✅ Configuration complete:")
+         print(f"   Default level: {LoggerManager._default_level}")
+         print(f"   Console level: {LoggerManager._default_console_level}")
+         print(f"   File level: {LoggerManager._default_file_level}")
+         print(f"   Log file: {temp_log_path}")
+
+         # Create a test logger
+         test_logger = get_logger('crawlo.test')
+
+         # Inspect the handler configuration
+         print("\n📋 Handler configuration:")
+         for i, handler in enumerate(test_logger.handlers):
+             handler_type = type(handler).__name__
+             handler_level = handler.level
+             print(f"   Handler {i} ({handler_type}): level {handler_level}")
+
+         # Exercise each log level
+         print("\n📝 Test log output (console):")
+         test_logger.debug("DEBUG-level message - should NOT appear")
+         test_logger.info("INFO-level message - should appear")
+         test_logger.warning("WARNING-level message - should appear")
+         test_logger.error("ERROR-level message - should appear")
+
+         # Check the log file contents
+         print("\n📄 Checking the log file contents:")
+         with open(temp_log_path, 'r', encoding='utf-8') as f:
+             log_content = f.read()
+             if log_content:
+                 print("Log file contents:")
+                 print(log_content)
+             else:
+                 print("❌ The log file is empty")
+
+         # Analyze the results
+         lines = log_content.strip().split('\n') if log_content.strip() else []
+         debug_lines = [line for line in lines if '- DEBUG:' in line]
+         info_lines = [line for line in lines if '- INFO:' in line]
+         warning_lines = [line for line in lines if '- WARNING:' in line]
+         error_lines = [line for line in lines if '- ERROR:' in line]
+
+         print("\n📊 Analysis:")
+         print(f"   DEBUG messages: {len(debug_lines)} {'✅ correct' if len(debug_lines) == 0 else '❌ wrong'}")
+         print(f"   INFO messages: {len(info_lines)} {'✅ correct' if len(info_lines) >= 1 else '❌ wrong'}")
+         print(f"   WARNING messages: {len(warning_lines)} {'✅ correct' if len(warning_lines) >= 1 else '❌ wrong'}")
+         print(f"   ERROR messages: {len(error_lines)} {'✅ correct' if len(error_lines) >= 1 else '❌ wrong'}")
+
+         # Decide whether the fix succeeded
+         success = (len(debug_lines) == 0 and len(info_lines) >= 1 and
+                    len(warning_lines) >= 1 and len(error_lines) >= 1)
+
+         print(f"\n🎯 Fix result: {'✅ success' if success else '❌ failure'}")
+
+         if success:
+             print("📋 The console and the log file now share the same INFO level")
+             print("🎉 The log-level consistency problem is resolved")
+         else:
+             print("❌ Log levels are still inconsistent; further debugging is needed")
+
+     except Exception as e:
+         print(f"❌ An error occurred during verification: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+     finally:
+         # Remove the temporary file
+         try:
+             os.unlink(temp_log_path)
+         except OSError:
+             pass
+
+     return 0 if success else 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main())
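For a quicker check of the same behaviour, the added test can be condensed. The sketch below is hedged: it assumes only the API the test itself exercises (`LoggerManager.reset()`, `LoggerManager.configure(LOG_LEVEL=..., LOG_FILE=..., LOG_FORMAT=...)`, and `get_logger(name)` from `crawlo.utils.log`); `check_level_consistency` is a hypothetical helper name, not part of the package.

```python
# Condensed smoke test distilled from tests/verify_log_fix.py above.
# Assumption: only the calls shown in the diff exist with these signatures.
import os
import tempfile

from crawlo.utils.log import LoggerManager, get_logger


def check_level_consistency() -> bool:
    """Return True when the file handler honours the configured INFO level."""
    fd, log_path = tempfile.mkstemp(suffix='.log')
    os.close(fd)
    try:
        LoggerManager.reset()
        LoggerManager.configure(
            LOG_LEVEL='INFO',
            LOG_FILE=log_path,
            LOG_FORMAT='%(asctime)s - [%(name)s] - %(levelname)s: %(message)s',
        )
        logger = get_logger('crawlo.smoke')
        logger.debug('filtered')  # below INFO: must not reach the file
        logger.info('kept')       # at INFO: must reach the file
        with open(log_path, encoding='utf-8') as f:
            content = f.read()
        return '- DEBUG:' not in content and '- INFO:' in content
    finally:
        os.unlink(log_path)


if __name__ == '__main__':
    print('consistent' if check_level_consistency() else 'inconsistent')
```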
@@ -1,82 +0,0 @@
- # Double crawlo Prefix Fix Report
-
- ## Problem Description
- When running distributed crawls, users found Redis keys with a doubled `crawlo` prefix, e.g. `crawlo:crawlo:queue:processing:data`. This led to inconsistent Redis key naming and potential confusion.
-
- ## Problem Analysis
- Code analysis traced the problem to two places:
- 1. The RedisPriorityQueue class silently rewrote the queue name supplied by the user.
- 2. The QueueManager class did not handle a doubled `crawlo` prefix correctly when extracting the project name.
-
- ## Fix
-
- ### 1. RedisPriorityQueue fix
- File: `crawlo/queue/redis_priority_queue.py`
-
- **Before**:
- ```python
- # If a queue_name was provided, make sure it follows the naming convention
- # and handle a possibly duplicated prefix
- if queue_name.startswith("crawlo:crawlo:"):
-     # Fix the doubled crawlo prefix
-     self.queue_name = queue_name.replace("crawlo:crawlo:", "crawlo:", 1)
- elif not queue_name.startswith("crawlo:"):
-     # No crawlo prefix yet, so add one
-     self.queue_name = f"crawlo:{module_name}:queue:requests"
- else:
-     # Already has the correct crawlo prefix
-     self.queue_name = queue_name
- ```
-
- **After**:
- ```python
- # Keep the user-supplied queue name unchanged
- self.queue_name = queue_name
- ```
-
- ### 2. QueueManager fix
- File: `crawlo/queue/queue_manager.py`
-
- **After**:
- ```python
- # Handle a possibly doubled crawlo prefix
- if parts[0] == "crawlo" and parts[1] == "crawlo":
-     # Doubled crawlo prefix: take the third part as the project name
-     if len(parts) >= 3:
-         project_name = parts[2]
-     else:
-         project_name = "default"
- elif parts[0] == "crawlo":
-     # Normal crawlo prefix: take the second part as the project name
-     project_name = parts[1]
- else:
-     # No crawlo prefix: take the first part as the project name
-     project_name = parts[0]
- ```
-
- ## Test Verification
-
- ### Test 1: Redis queue naming fix
- Verifies that RedisPriorityQueue handles each queue-name format correctly:
- - Normal name: `crawlo:test_project:queue:requests` → `crawlo:test_project:queue:requests`
- - Doubled crawlo prefix: `crawlo:crawlo:queue:requests` → `crawlo:crawlo:queue:requests`
- - Tripled crawlo prefix: `crawlo:crawlo:crawlo:queue:requests` → `crawlo:crawlo:crawlo:queue:requests`
-
- ### Test 2: project-name extraction in the queue manager
- Verifies that QueueManager extracts the project name correctly:
- - Normal name: `crawlo:test_project:queue:requests` → `test_project`
- - Doubled crawlo prefix: `crawlo:crawlo:queue:requests` → `queue`
- - Tripled crawlo prefix: `crawlo:crawlo:crawlo:queue:requests` → `crawlo`
-
- ### Test 3: queue creation through the queue manager
- Verifies the end-to-end flow, making sure the queue name stays the same as it is passed along.
-
- All tests pass, confirming that the doubled `crawlo` prefix problem is resolved.
-
- ## Conclusion
- With the fixes above, the doubled `crawlo` prefix in Redis keys is gone. Queue names now stay exactly as the user configured them, and the processing and failed queues keep the same prefix structure.
-
- ## Recommendations
- 1. Use the standard queue-name format in project configuration, e.g. `crawlo:{project_name}:queue:requests`.
- 2. Use a Redis key validation tool to check and normalize Redis key naming periodically.
- 3. If a uniform naming convention is needed, specify the queue name explicitly during project initialization.
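The removed report's QueueManager snippet references a `parts` list without showing how it is built. Below is a self-contained sketch of the extraction logic the report describes, assuming the queue name is split on `:`. `extract_project_name` is a hypothetical helper name for illustration; the real code lives in `crawlo/queue/queue_manager.py` and may differ in details the diff does not show. The expected values in the assertions come from Test 2 of the report.

```python
# Sketch of the project-name extraction described in the removed report.
# Assumption: the queue name is split on ":" before the prefix checks run.

def extract_project_name(queue_name: str) -> str:
    parts = queue_name.split(":")
    if len(parts) >= 2 and parts[0] == "crawlo" and parts[1] == "crawlo":
        # Doubled crawlo prefix: the third part is the project name
        return parts[2] if len(parts) >= 3 else "default"
    if parts[0] == "crawlo":
        # Normal crawlo prefix: the second part is the project name
        return parts[1] if len(parts) >= 2 else "default"
    # No crawlo prefix: the first part is the project name
    return parts[0]


# Expected values taken from Test 2 of the report:
assert extract_project_name("crawlo:test_project:queue:requests") == "test_project"
assert extract_project_name("crawlo:crawlo:queue:requests") == "queue"
assert extract_project_name("crawlo:crawlo:crawlo:queue:requests") == "crawlo"
```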