crawlo-1.3.2-py3-none-any.whl → crawlo-1.3.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (105)
  1. crawlo/__init__.py +24 -0
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/run.py +58 -32
  4. crawlo/core/__init__.py +44 -0
  5. crawlo/core/engine.py +119 -45
  6. crawlo/core/scheduler.py +4 -3
  7. crawlo/crawler.py +603 -1133
  8. crawlo/downloader/aiohttp_downloader.py +4 -2
  9. crawlo/extension/__init__.py +1 -1
  10. crawlo/extension/logging_extension.py +23 -7
  11. crawlo/factories/__init__.py +28 -0
  12. crawlo/factories/base.py +69 -0
  13. crawlo/factories/crawler.py +104 -0
  14. crawlo/factories/registry.py +85 -0
  15. crawlo/filters/aioredis_filter.py +25 -2
  16. crawlo/framework.py +292 -0
  17. crawlo/initialization/__init__.py +40 -0
  18. crawlo/initialization/built_in.py +426 -0
  19. crawlo/initialization/context.py +142 -0
  20. crawlo/initialization/core.py +194 -0
  21. crawlo/initialization/phases.py +149 -0
  22. crawlo/initialization/registry.py +146 -0
  23. crawlo/items/base.py +2 -1
  24. crawlo/logging/__init__.py +38 -0
  25. crawlo/logging/config.py +97 -0
  26. crawlo/logging/factory.py +129 -0
  27. crawlo/logging/manager.py +112 -0
  28. crawlo/middleware/middleware_manager.py +1 -1
  29. crawlo/middleware/offsite.py +1 -1
  30. crawlo/mode_manager.py +26 -1
  31. crawlo/pipelines/pipeline_manager.py +2 -1
  32. crawlo/project.py +76 -46
  33. crawlo/queue/pqueue.py +11 -5
  34. crawlo/queue/queue_manager.py +143 -19
  35. crawlo/queue/redis_priority_queue.py +69 -49
  36. crawlo/settings/default_settings.py +110 -14
  37. crawlo/settings/setting_manager.py +29 -13
  38. crawlo/spider/__init__.py +34 -16
  39. crawlo/stats_collector.py +17 -3
  40. crawlo/task_manager.py +112 -3
  41. crawlo/templates/project/settings.py.tmpl +103 -202
  42. crawlo/templates/project/settings_distributed.py.tmpl +122 -135
  43. crawlo/templates/project/settings_gentle.py.tmpl +149 -43
  44. crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
  45. crawlo/templates/project/settings_minimal.py.tmpl +46 -15
  46. crawlo/templates/project/settings_simple.py.tmpl +138 -75
  47. crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
  48. crawlo/templates/run.py.tmpl +10 -14
  49. crawlo/templates/spiders_init.py.tmpl +10 -0
  50. crawlo/tools/network_diagnostic.py +365 -0
  51. crawlo/utils/class_loader.py +26 -0
  52. crawlo/utils/error_handler.py +76 -35
  53. crawlo/utils/log.py +41 -144
  54. crawlo/utils/redis_connection_pool.py +43 -6
  55. crawlo/utils/request_serializer.py +8 -1
  56. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
  57. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
  58. tests/authenticated_proxy_example.py +2 -2
  59. tests/baidu_performance_test.py +109 -0
  60. tests/baidu_test.py +60 -0
  61. tests/comprehensive_framework_test.py +213 -0
  62. tests/comprehensive_test.py +82 -0
  63. tests/comprehensive_testing_summary.md +187 -0
  64. tests/debug_configure.py +70 -0
  65. tests/debug_framework_logger.py +85 -0
  66. tests/debug_log_levels.py +64 -0
  67. tests/distributed_test.py +67 -0
  68. tests/distributed_test_debug.py +77 -0
  69. tests/final_command_test_report.md +0 -0
  70. tests/final_comprehensive_test.py +152 -0
  71. tests/final_validation_test.py +183 -0
  72. tests/framework_performance_test.py +203 -0
  73. tests/optimized_performance_test.py +212 -0
  74. tests/performance_comparison.py +246 -0
  75. tests/queue_blocking_test.py +114 -0
  76. tests/queue_test.py +90 -0
  77. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  78. tests/scrapy_comparison/scrapy_test.py +134 -0
  79. tests/simple_command_test.py +120 -0
  80. tests/simple_crawlo_test.py +128 -0
  81. tests/simple_log_test.py +58 -0
  82. tests/simple_optimization_test.py +129 -0
  83. tests/simple_spider_test.py +50 -0
  84. tests/simple_test.py +48 -0
  85. tests/test_all_commands.py +231 -0
  86. tests/test_batch_processor.py +179 -0
  87. tests/test_component_factory.py +175 -0
  88. tests/test_controlled_spider_mixin.py +80 -0
  89. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  90. tests/test_factories.py +253 -0
  91. tests/test_framework_logger.py +67 -0
  92. tests/test_framework_startup.py +65 -0
  93. tests/test_large_scale_config.py +113 -0
  94. tests/test_large_scale_helper.py +236 -0
  95. tests/test_mode_change.py +73 -0
  96. tests/test_mode_consistency.py +1 -1
  97. tests/test_performance_monitor.py +116 -0
  98. tests/test_queue_empty_check.py +42 -0
  99. tests/untested_features_report.md +139 -0
  100. tests/verify_debug.py +52 -0
  101. tests/verify_log_fix.py +112 -0
  102. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  103. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
  104. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
  105. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
tests/test_large_scale_helper.py ADDED
@@ -0,0 +1,236 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Tests for the large-scale crawler helpers
+ Covers LargeScaleHelper, ProgressManager, MemoryOptimizer, DataSourceAdapter, LargeScaleSpiderMixin
+ """
+ import sys
+ import os
+ import unittest
+ from unittest.mock import Mock, patch, MagicMock
+ import asyncio
+ import json
+ import tempfile
+
+ # Add the project root to the Python path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.utils.large_scale_helper import (
+     LargeScaleHelper,
+     ProgressManager,
+     MemoryOptimizer,
+     DataSourceAdapter,
+     LargeScaleSpiderMixin
+ )
+
+
+ class TestLargeScaleHelper(unittest.TestCase):
+     """Tests for the large-scale crawler helper class"""
+
+     def setUp(self):
+         """Prepare before each test"""
+         self.helper = LargeScaleHelper(batch_size=100, checkpoint_interval=500)
+
+     def test_helper_initialization(self):
+         """Test helper initialization"""
+         self.assertEqual(self.helper.batch_size, 100)
+         self.assertEqual(self.helper.checkpoint_interval, 500)
+
+     def test_batch_iterator_with_list(self):
+         """Test the batch iterator with a list data source"""
+         data = list(range(250))  # 250 elements
+         batches = list(self.helper.batch_iterator(data))
+
+         # Verify the number of batches
+         self.assertEqual(len(batches), 3)  # 250/100 = 3 batches (rounded up)
+
+         # Verify the size of each batch
+         self.assertEqual(len(batches[0]), 100)
+         self.assertEqual(len(batches[1]), 100)
+         self.assertEqual(len(batches[2]), 50)  # the last batch
+
+         # Verify data integrity
+         all_data = []
+         for batch in batches:
+             all_data.extend(batch)
+         self.assertEqual(all_data, data)
+
+     def test_batch_iterator_with_offset(self):
+         """Test the batch iterator with a start offset"""
+         data = list(range(250))  # 250 elements
+         batches = list(self.helper.batch_iterator(data, start_offset=50))
+
+         # Verify the number of batches
+         self.assertEqual(len(batches), 2)  # 200 elements remain, giving 2 batches
+
+         # Verify the data is correct
+         all_data = []
+         for batch in batches:
+             all_data.extend(batch)
+         self.assertEqual(all_data, list(range(50, 250)))
+
+     def test_batch_iterator_invalid_source(self):
+         """Test the batch iterator with an invalid data source"""
+         with self.assertRaises(ValueError) as context:
+             list(self.helper.batch_iterator(123))  # an integer is not a valid data source
+         self.assertIn("不支持的数据源类型", str(context.exception))
+
+
+ class TestProgressManager(unittest.TestCase):
+     """Tests for the progress manager"""
+
+     def setUp(self):
+         """Prepare before each test"""
+         # Create a temporary file for the test
+         self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.json')
+         self.temp_file.close()
+         self.progress_manager = ProgressManager(self.temp_file.name)
+
+     def tearDown(self):
+         """Clean up after each test"""
+         try:
+             os.unlink(self.temp_file.name)
+         except OSError:
+             pass
+
+     def test_progress_manager_initialization(self):
+         """Test progress manager initialization"""
+         self.assertEqual(self.progress_manager.progress_file, self.temp_file.name)
+
+     def test_save_and_load_progress(self):
+         """Test saving and loading progress"""
+         # Save progress
+         self.progress_manager.save_progress(
+             batch_num=10,
+             processed_count=1000,
+             skipped_count=50
+         )
+
+         # Load progress
+         progress = self.progress_manager.load_progress()
+
+         # Verify the progress data
+         self.assertEqual(progress['batch_num'], 10)
+         self.assertEqual(progress['processed_count'], 1000)
+         self.assertEqual(progress['skipped_count'], 50)
+         self.assertIn('timestamp', progress)
+         self.assertIn('formatted_time', progress)
+
+     def test_load_progress_file_not_found(self):
+         """Test loading a progress file that does not exist"""
+         # Create a new progress manager pointing at a non-existent file
+         non_existent_file = os.path.join(tempfile.gettempdir(), "non_existent_progress.json")
+         pm = ProgressManager(non_existent_file)
+
+         # Loading progress should return the defaults
+         progress = pm.load_progress()
+         self.assertEqual(progress['batch_num'], 0)
+         self.assertEqual(progress['processed_count'], 0)
+         self.assertEqual(progress['skipped_count'], 0)
+
+
+ class TestMemoryOptimizer(unittest.TestCase):
+     """Tests for the memory optimizer"""
+
+     def setUp(self):
+         """Prepare before each test"""
+         self.optimizer = MemoryOptimizer(max_memory_mb=100)
+
+     def test_optimizer_initialization(self):
+         """Test memory optimizer initialization"""
+         self.assertEqual(self.optimizer.max_memory_mb, 100)
+
+     def test_should_pause_for_memory_without_psutil(self):
+         """Test the memory check when psutil is unavailable"""
+         # Without psutil, should_pause_for_memory should return False
+         result = self.optimizer.should_pause_for_memory()
+         self.assertFalse(result)
+
+     def test_force_garbage_collection(self):
+         """Test forced garbage collection"""
+         # This method should run without raising
+         try:
+             self.optimizer.force_garbage_collection()
+             success = True
+         except Exception:
+             success = False
+         self.assertTrue(success)
+
+
+ class TestDataSourceAdapter(unittest.TestCase):
+     """Tests for the data source adapter"""
+
+     def test_from_file_adapter(self):
+         """Test the file data source adapter"""
+         # Create a temporary file
+         with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
+             for i in range(10):
+                 f.write(f"line {i}\n")
+             temp_file_name = f.name
+
+         try:
+             # Create a file data source adapter
+             adapter = DataSourceAdapter.from_file(temp_file_name, batch_size=5)
+
+             # Fetch the first batch
+             batch = adapter(0, 5)
+             self.assertEqual(len(batch), 5)
+             self.assertEqual(batch[0], "line 0")
+             self.assertEqual(batch[4], "line 4")
+
+             # Fetch the second batch
+             batch = adapter(5, 5)
+             self.assertEqual(len(batch), 5)
+             self.assertEqual(batch[0], "line 5")
+             self.assertEqual(batch[4], "line 9")
+
+         finally:
+             # Clean up the temporary file
+             os.unlink(temp_file_name)
+
+
+ class TestLargeScaleSpiderMixin(unittest.TestCase):
+     """Tests for the large-scale spider mixin"""
+
+     def test_mixin_initialization(self):
+         """Test mixin initialization"""
+         # Create a mock spider class
+         class MockSpider:
+             def __init__(self):
+                 self.name = "test_spider"
+
+         class TestSpider(MockSpider, LargeScaleSpiderMixin):
+             def __init__(self):
+                 MockSpider.__init__(self)
+                 LargeScaleSpiderMixin.__init__(self)
+
+         spider = TestSpider()
+
+         # Verify initialization
+         self.assertEqual(spider.name, "test_spider")
+         self.assertIsNotNone(spider.large_scale_helper)
+         self.assertIsNotNone(spider.progress_manager)
+         self.assertIsNotNone(spider.memory_optimizer)
+
+     def test_mixin_attributes(self):
+         """Test mixin attributes"""
+         # Create a mock spider class
+         class MockSpider:
+             def __init__(self):
+                 self.name = "test_spider"
+
+         class TestSpider(MockSpider, LargeScaleSpiderMixin):
+             def __init__(self):
+                 MockSpider.__init__(self)
+                 LargeScaleSpiderMixin.__init__(self)
+
+         spider = TestSpider()
+
+         # Verify the attribute types
+         self.assertIsInstance(spider.large_scale_helper, LargeScaleHelper)
+         self.assertIsInstance(spider.progress_manager, ProgressManager)
+         self.assertIsInstance(spider.memory_optimizer, MemoryOptimizer)
+
+
+ if __name__ == '__main__':
+     unittest.main()
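The tests above pin down the `batch_iterator` contract (list slicing, a `start_offset`, and a `ValueError` for unsupported sources) without showing the implementation. As a reading aid only, here is a minimal sketch that would satisfy those assertions; the actual code in `crawlo/utils/large_scale_helper.py` may differ:

```python
# Hypothetical sketch, not the shipped crawlo implementation: a batch_iterator
# consistent with the assertions in TestLargeScaleHelper above.
from typing import Any, Callable, Iterator, List, Union


class LargeScaleHelper:
    def __init__(self, batch_size: int = 1000, checkpoint_interval: int = 10000):
        self.batch_size = batch_size
        self.checkpoint_interval = checkpoint_interval

    def batch_iterator(
        self,
        source: Union[List[Any], Callable[[int, int], List[Any]]],
        start_offset: int = 0,
    ) -> Iterator[List[Any]]:
        if isinstance(source, list):
            # Slice the list into batch_size chunks from the offset;
            # the final batch may be short (250 items -> 100, 100, 50).
            for i in range(start_offset, len(source), self.batch_size):
                yield source[i:i + self.batch_size]
        elif callable(source):
            # Callable sources (e.g. DataSourceAdapter.from_file) are
            # polled with (offset, limit) until they return an empty batch.
            offset = start_offset
            while True:
                batch = source(offset, self.batch_size)
                if not batch:
                    break
                yield batch
                offset += len(batch)
        else:
            raise ValueError(f"不支持的数据源类型: {type(source)}")
```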
tests/test_mode_change.py ADDED
@@ -0,0 +1,73 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ Test the run-mode log level change
+ """
+ import sys
+ import os
+ sys.path.insert(0, '/')
+
+ def test_mode_log_level():
+     print("=== Testing the run-mode log level change ===")
+
+     # Remove the old log file
+     test_log_file = '/Users/oscar/projects/Crawlo/test_mode_change.log'
+     if os.path.exists(test_log_file):
+         os.remove(test_log_file)
+
+     # Prepare the test settings
+     test_settings = {
+         'PROJECT_NAME': 'test_mode_change',
+         'LOG_LEVEL': 'INFO',
+         'LOG_FILE': test_log_file,
+         'RUN_MODE': 'standalone'
+     }
+
+     try:
+         # Initialize the framework
+         from crawlo.core.framework_initializer import initialize_framework
+         settings = initialize_framework(test_settings)
+
+         print(f"Settings initialized: {settings.get('PROJECT_NAME')}")
+
+         # Check whether the log file contains run-mode messages
+         if os.path.exists(test_log_file):
+             with open(test_log_file, 'r', encoding='utf-8') as f:
+                 content = f.read()
+             print(f"Log file length: {len(content)} characters")
+
+             # Check whether the run-mode message still appears at INFO level
+             info_lines = [line for line in content.split('\n') if 'INFO' in line and '使用单机模式' in line]
+             debug_lines = [line for line in content.split('\n') if 'DEBUG' in line and '使用单机模式' in line]
+
+             if info_lines:
+                 print("❌ Run-mode messages still found at INFO level:")
+                 for line in info_lines:
+                     print(f"  {line}")
+             else:
+                 print("✅ No run-mode messages at INFO level")
+
+             if debug_lines:
+                 print("✅ Run-mode messages found at DEBUG level:")
+                 for line in debug_lines:
+                     print(f"  {line}")
+             else:
+                 print("❌ No run-mode messages at DEBUG level")
+
+             print("\nFull log contents:")
+             lines = content.split('\n')
+             for i, line in enumerate(lines, 1):
+                 if line.strip():
+                     print(f"{i:3d}: {line}")
+         else:
+             print("❌ Log file was not created")
+
+     except Exception as e:
+         print(f"Error: {e}")
+         import traceback
+         traceback.print_exc()
+
+     print("=== Test complete ===")
+
+ if __name__ == "__main__":
+     test_mode_log_level()
tests/test_mode_consistency.py CHANGED
@@ -22,7 +22,7 @@ class TestSpider(Spider):
          yield Request("https://httpbin.org/get")
 
      def parse(self, response):
-         yield {"url": response.url, "status": response.status}
+         yield {"url": response.url, "status": response.status_code}  # fix: use status_code instead of status
 
 
  async def test_mode_consistency():
tests/test_performance_monitor.py ADDED
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Tests for the performance monitoring utilities
+ Covers PerformanceMonitor, PerformanceTimer, performance_monitor_decorator
+ """
+ import sys
+ import os
+ import unittest
+ from unittest.mock import Mock, patch, MagicMock
+ import asyncio
+
+ # Add the project root to the Python path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ # Try to import the performance monitoring utilities
+ try:
+     from crawlo.utils.performance_monitor import PerformanceMonitor, PerformanceTimer, performance_monitor_decorator
+     PSUTIL_AVAILABLE = True
+ except ImportError:
+     PSUTIL_AVAILABLE = False
+     PerformanceMonitor = None
+     PerformanceTimer = None
+     performance_monitor_decorator = None
+
+
+ class TestPerformanceTimer(unittest.TestCase):
+     """Tests for the performance timer"""
+
+     @unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
+     def setUp(self):
+         """Prepare before each test"""
+         self.timer = PerformanceTimer("test_timer")
+
+     @unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
+     def test_timer_initialization(self):
+         """Test timer initialization"""
+         self.assertEqual(self.timer.name, "test_timer")
+         self.assertIsNone(self.timer.start_time)
+         self.assertIsNone(self.timer.end_time)
+
+     @unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
+     def test_timer_context_manager(self):
+         """Test the timer as a context manager"""
+         with self.timer as t:
+             # Do a little work inside the context
+             result = 1 + 1
+
+         self.assertEqual(result, 2)
+         # Verify the timer started and stopped
+         self.assertIsNotNone(t.start_time)
+         self.assertIsNotNone(t.end_time)
+
+     @unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
+     def test_timer_start_stop(self):
+         """Test starting and stopping the timer"""
+         self.timer.start()
+         self.assertIsNotNone(self.timer.start_time)
+
+         # Wait a short while
+         import time
+         time.sleep(0.01)
+
+         elapsed = self.timer.stop()
+         self.assertIsNotNone(self.timer.end_time)
+         self.assertIsInstance(elapsed, float)
+         self.assertGreater(elapsed, 0)
+
+
+ class TestPerformanceMonitor(unittest.TestCase):
+     """Tests for the performance monitor"""
+
+     @unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
+     def setUp(self):
+         """Prepare before each test"""
+         self.monitor = PerformanceMonitor("test_monitor")
+
+     @unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
+     def test_monitor_initialization(self):
+         """Test monitor initialization"""
+         self.assertIsNotNone(self.monitor.start_time)
+         self.assertIsInstance(self.monitor.metrics, dict)
+         self.assertIn('cpu_usage', self.monitor.metrics)
+         self.assertIn('memory_usage', self.monitor.metrics)
+         self.assertIn('network_io', self.monitor.metrics)
+         self.assertIn('disk_io', self.monitor.metrics)
+
+
+ class TestPerformanceMonitorDecorator(unittest.TestCase):
+     """Tests for the performance monitoring decorator"""
+
+     @unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
+     def test_performance_monitor_decorator_sync(self):
+         """Test the decorator on a synchronous function"""
+         @performance_monitor_decorator(name="test_sync_function")
+         def sync_function():
+             return "test_result"
+
+         result = sync_function()
+         self.assertEqual(result, "test_result")
+
+     @unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
+     def test_performance_monitor_decorator_async(self):
+         """Test the decorator on an asynchronous function"""
+         @performance_monitor_decorator(name="test_async_function")
+         async def async_function():
+             await asyncio.sleep(0.01)  # simulate async work
+             return "async_result"
+
+         # Run the async function on an event loop
+         result = asyncio.run(async_function())
+         self.assertEqual(result, "async_result")
+
+
+ if __name__ == '__main__':
+     unittest.main()
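The two decorator tests require `performance_monitor_decorator` to transparently wrap both synchronous and asynchronous callables. A common way to implement that dual dispatch, shown here as an illustrative sketch rather than crawlo's actual code, is to branch on `asyncio.iscoroutinefunction`:

```python
# Hypothetical sketch: a decorator factory that times sync and async
# functions alike, matching the call shape used in the tests above.
import asyncio
import functools
import time


def performance_monitor_decorator(name: str = None):
    def decorator(func):
        label = name or func.__name__

        if asyncio.iscoroutinefunction(func):
            @functools.wraps(func)
            async def async_wrapper(*args, **kwargs):
                start = time.perf_counter()
                try:
                    return await func(*args, **kwargs)
                finally:
                    # Record elapsed time even if the coroutine raises
                    print(f"{label}: {time.perf_counter() - start:.4f}s")
            return async_wrapper

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs):
            start = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                print(f"{label}: {time.perf_counter() - start:.4f}s")
        return sync_wrapper

    return decorator
```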
tests/test_queue_empty_check.py ADDED
@@ -0,0 +1,42 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Test the queue emptiness check
+ """
+ import asyncio
+ import sys
+ import os
+
+ # Add the project root to the Python path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.queue.pqueue import SpiderPriorityQueue
+
+
+ async def test_queue_empty_check():
+     """Test the queue emptiness check"""
+     print("🚀 Testing the queue emptiness check...")
+
+     # Create a queue instance
+     queue = SpiderPriorityQueue()
+
+     # Check the empty queue
+     print(f"Empty queue size: {queue.qsize()}")
+     print(f"Empty queue is empty: {queue.qsize() == 0}")
+
+     # Add one element
+     await queue.put((1, "test"))
+     print(f"Queue size after put: {queue.qsize()}")
+     print(f"Queue is empty after put: {queue.qsize() == 0}")
+
+     # Take the element back out
+     item = await queue.get()
+     print(f"Got item: {item}")
+     print(f"Queue size after get: {queue.qsize()}")
+     print(f"Queue is empty after get: {queue.qsize() == 0}")
+
+     print("✅ Queue emptiness check complete!")
+
+
+ if __name__ == '__main__':
+     asyncio.run(test_queue_empty_check())
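One caveat worth noting: a `qsize() == 0` check like the one above is only reliable with a single consumer, since another task can take an item between the size check and the `get()`. A hedged sketch using the standard library's `asyncio.PriorityQueue` (not crawlo's `SpiderPriorityQueue`, whose internals are not shown here) illustrates the timeout-based drain pattern that avoids the race:

```python
# Hypothetical sketch: drain a priority queue without relying on qsize(),
# treating "empty for idle_timeout seconds" as the stop signal.
import asyncio


async def drain(queue: asyncio.PriorityQueue, idle_timeout: float = 0.1) -> list:
    items = []
    while True:
        try:
            item = await asyncio.wait_for(queue.get(), timeout=idle_timeout)
        except asyncio.TimeoutError:
            break  # the queue stayed empty long enough; consider it drained
        items.append(item)
    return items


async def main():
    q = asyncio.PriorityQueue()
    await q.put((1, "test"))
    print(await drain(q))  # [(1, 'test')]


if __name__ == "__main__":
    asyncio.run(main())
```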
tests/untested_features_report.md ADDED
@@ -0,0 +1,139 @@
+ # Untested Features Report
+
+ ## Overview
+
+ A full review of the Crawlo framework found that the following modules lack dedicated test cases. Parts of these modules may be covered indirectly by other tests, but they have no targeted unit or integration tests.
+
+ ## Modules with completed tests
+
+ ### 1. Factory-pattern modules
+
+ **Module path**: `crawlo/factories/`
+
+ **Test file**: `tests/test_factories.py`
+
+ **Tested components**:
+ - `ComponentRegistry` - component registry
+ - `ComponentFactory` - component factory base class
+ - `DefaultComponentFactory` - default component factory
+ - `CrawlerComponentFactory` - crawler component factory
+
+ ### 2. Batch-processing utilities
+
+ **Module path**: `crawlo/utils/batch_processor.py`
+
+ **Test file**: `tests/test_batch_processor.py`
+
+ **Tested components**:
+ - `BatchProcessor` - batch processor
+ - `RedisBatchProcessor` - Redis batch processor
+ - `batch_process` - convenience batch-processing function
+
+ ### 3. Controlled spider mixins
+
+ **Module path**: `crawlo/utils/controlled_spider_mixin.py`
+
+ **Test file**: `tests/test_controlled_spider_mixin.py`
+
+ **Tested components**:
+ - `ControlledRequestMixin` - controlled request-generation mixin
+ - `AsyncControlledRequestMixin` - asynchronous controlled request mixin
+
+ ### 4. Large-scale configuration utilities
+
+ **Module path**: `crawlo/utils/large_scale_config.py`
+
+ **Test file**: `tests/test_large_scale_config.py`
+
+ **Tested components**:
+ - `LargeScaleConfig` - large-scale crawler configuration class
+ - `apply_large_scale_config` - function that applies a large-scale configuration
+
+ ### 5. Large-scale crawler helpers
+
+ **Module path**: `crawlo/utils/large_scale_helper.py`
+
+ **Test file**: `tests/test_large_scale_helper.py`
+
+ **Tested components**:
+ - `LargeScaleHelper` - large-scale crawler helper class
+ - `ProgressManager` - progress manager
+ - `MemoryOptimizer` - memory optimizer
+ - `DataSourceAdapter` - data source adapter
+ - `LargeScaleSpiderMixin` - large-scale spider mixin
+
+ ### 6. Enhanced error-handling utilities
+
+ **Module path**: `crawlo/utils/enhanced_error_handler.py`
+
+ **Test files**:
+ - `tests/test_enhanced_error_handler.py` (basic tests)
+ - `tests/test_enhanced_error_handler_comprehensive.py` (comprehensive tests)
+
+ **Tested components**:
+ - `ErrorContext` - error context information
+ - `DetailedException` - detailed exception base class
+ - `EnhancedErrorHandler` - enhanced error handler
+ - `handle_exception` decorator
+
+ ## Untested modules
+
+ ### 1. Performance monitoring utilities
+
+ **Module path**: `crawlo/utils/performance_monitor.py`
+
+ **Test file**: `tests/test_performance_monitor.py` (partial tests; depends on psutil)
+
+ **Insufficiently tested components**:
+ - `PerformanceMonitor` - performance monitor
+ - `PerformanceTimer` - performance timer
+ - `performance_monitor_decorator` - performance monitoring decorator
+
+ **Risk**: performance monitoring is a key tool for optimization and diagnostics; without tests, monitoring data may be inaccurate or the monitoring features may silently fail.
+
+ ## Suggested testing strategy
+
+ ### 1. Prioritization
+
+ **High priority** (directly affects core functionality):
+ - (completed)
+
+ **Medium priority** (affects performance and stability):
+ - performance monitoring utilities
+
+ **Low priority** (auxiliary features):
+ - (completed)
+
+ ### 2. Suggested test types
+
+ **Unit tests**:
+ - Test each class's methods in isolation
+ - Verify boundary conditions and error cases
+ - Validate configuration parameters
+
+ **Integration tests**:
+ - Test cooperation between modules
+ - Verify interaction with external services such as Redis
+ - Test behavior in realistic crawling scenarios
+
+ **Performance tests**:
+ - Verify the performance benefit of the batch-processing utilities
+ - Measure the memory usage of the large-scale processing utilities
+ - Verify the accuracy of the performance monitoring utilities
+
+ ### 3. Suggested coverage
+
+ **Core functionality coverage**:
+ - Happy-path tests
+ - Failure-path tests
+ - Boundary-condition tests
+ - Concurrency-safety tests
+
+ **Configuration coverage**:
+ - Tests across different configuration parameters
+ - Comparison of default versus custom configuration
+ - Dynamic tests of configuration updates
+
+ ## Conclusion
+
+ Test cases now exist for the factory modules, batch-processing utilities, controlled spider mixins, large-scale configuration utilities, large-scale crawler helpers, and enhanced error-handling utilities, giving these core components basic coverage. The remaining gap is the performance monitoring utilities; supplementing those tests (once psutil is installed) is recommended to ensure the framework's completeness and stability.
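
As a rough illustration of the psutil-gated tests the report's conclusion recommends, the following sketch (hypothetical, not part of the package; it assumes only the public psutil API) shows the gating pattern plus a sanity check on the readings a monitor would consume:

```python
# Hypothetical sketch of a psutil-backed sanity test for the performance
# monitoring utilities; skipped automatically when psutil is absent.
import unittest

try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False


@unittest.skipIf(not PSUTIL_AVAILABLE, "psutil not available")
class TestMonitorReadings(unittest.TestCase):
    def test_psutil_readings_are_sane(self):
        process = psutil.Process()
        # Resident memory of a live process is always positive
        self.assertGreater(process.memory_info().rss, 0)
        # cpu_percent's first call primes the counter and may return 0.0
        self.assertGreaterEqual(process.cpu_percent(interval=0.1), 0.0)


if __name__ == '__main__':
    unittest.main()
```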
tests/verify_debug.py ADDED
@@ -0,0 +1,52 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ Simple check that the run-mode message was moved to DEBUG level
+ """
+ import os
+
+ # Remove the old log file
+ log_file = 'verify_debug.log'
+ if os.path.exists(log_file):
+     os.remove(log_file)
+
+ # Simple log-level test
+ from crawlo.utils.log import LoggerManager
+
+ # Configure the logging system
+ LoggerManager.configure(
+     LOG_LEVEL='INFO',
+     LOG_FILE=log_file
+ )
+
+ from crawlo.utils.log import get_logger
+
+ # Create a test logger
+ test_logger = get_logger('crawlo.framework')
+
+ # Emit test output
+ test_logger.info("This is an INFO-level test message")
+ test_logger.debug("This is a DEBUG-level test message (should not appear in an INFO-level log)")
+ test_logger.debug("使用单机模式 - 简单快速,适合开发和中小规模爬取")
+
+ print("Test finished")
+
+ # Inspect the log file
+ if os.path.exists(log_file):
+     with open(log_file, 'r', encoding='utf-8') as f:
+         content = f.read()
+     print(f"Log file contents ({len(content)} characters):")
+     print(content)
+
+     # Check for DEBUG messages that should not be present
+     if "DEBUG" in content:
+         print("❌ Found DEBUG-level messages (should not appear)")
+     else:
+         print("✅ No DEBUG-level messages (correct)")
+
+     if "使用单机模式" in content:
+         print("❌ Found the run-mode message (should not appear at INFO level)")
+     else:
+         print("✅ No run-mode message (correct)")
+ else:
+     print("❌ Log file was not created")