crawlo 1.2.0-py3-none-any.whl → 1.2.2-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release.
- crawlo/__init__.py +61 -61
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +60 -60
- crawlo/cleaners/data_formatter.py +225 -225
- crawlo/cleaners/encoding_converter.py +125 -125
- crawlo/cleaners/text_cleaner.py +232 -232
- crawlo/cli.py +81 -65
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +143 -133
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +292 -292
- crawlo/commands/startproject.py +418 -418
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +312 -312
- crawlo/config_validator.py +252 -252
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +354 -354
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +143 -143
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +266 -266
- crawlo/downloader/aiohttp_downloader.py +220 -220
- crawlo/downloader/cffi_downloader.py +256 -256
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +213 -213
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +37 -37
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +43 -43
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +280 -280
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -32
- crawlo/middleware/download_delay.py +105 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +116 -0
- crawlo/middleware/proxy.py +366 -272
- crawlo/middleware/request_ignore.py +88 -30
- crawlo/middleware/response_code.py +164 -18
- crawlo/middleware/response_filter.py +138 -26
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +211 -211
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +338 -338
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +224 -224
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +316 -316
- crawlo/pipelines/pipeline_manager.py +61 -61
- crawlo/pipelines/redis_dedup_pipeline.py +167 -167
- crawlo/project.py +187 -187
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +337 -337
- crawlo/queue/redis_priority_queue.py +298 -298
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +226 -219
- crawlo/settings/setting_manager.py +122 -122
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +130 -130
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -109
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/run.py.tmpl +45 -45
- crawlo/templates/project/settings.py.tmpl +327 -326
- crawlo/templates/project/settings_distributed.py.tmpl +119 -119
- crawlo/templates/project/settings_gentle.py.tmpl +94 -94
- crawlo/templates/project/settings_high_performance.py.tmpl +151 -151
- crawlo/templates/project/settings_simple.py.tmpl +68 -68
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +143 -141
- crawlo/tools/__init__.py +182 -182
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +35 -35
- crawlo/tools/distributed_coordinator.py +386 -386
- crawlo/tools/retry_mechanism.py +220 -220
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/utils/__init__.py +35 -35
- crawlo/utils/batch_processor.py +260 -260
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +290 -290
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +359 -359
- crawlo/utils/env_config.py +105 -105
- crawlo/utils/error_handler.py +125 -125
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/performance_monitor.py +284 -284
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +334 -334
- crawlo/utils/redis_key_validator.py +199 -199
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/METADATA +692 -697
- crawlo-1.2.2.dist-info/RECORD +220 -0
- examples/__init__.py +7 -7
- examples/aiohttp_settings.py +42 -0
- examples/curl_cffi_settings.py +41 -0
- examples/default_header_middleware_example.py +107 -0
- examples/default_header_spider_example.py +129 -0
- examples/download_delay_middleware_example.py +160 -0
- examples/httpx_settings.py +42 -0
- examples/multi_downloader_proxy_example.py +81 -0
- examples/offsite_middleware_example.py +55 -0
- examples/offsite_spider_example.py +107 -0
- examples/proxy_spider_example.py +166 -0
- examples/request_ignore_middleware_example.py +51 -0
- examples/request_ignore_spider_example.py +99 -0
- examples/response_code_middleware_example.py +52 -0
- examples/response_filter_middleware_example.py +67 -0
- examples/tong_hua_shun_settings.py +62 -0
- examples/tong_hua_shun_spider.py +170 -0
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +236 -236
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +102 -102
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_validator.py +193 -193
- tests/test_crawlo_proxy_integration.py +173 -0
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +159 -0
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +222 -0
- tests/test_downloader_proxy_compatibility.py +269 -0
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +356 -356
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_offsite_middleware.py +222 -0
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +265 -0
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +122 -0
- tests/test_proxy_middleware_enhanced.py +217 -0
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_real_scenario_proxy.py +196 -0
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +183 -0
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +350 -0
- tests/test_response_filter_middleware.py +428 -0
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +242 -0
- tests/test_scheduler.py +241 -241
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +153 -153
- tests/tools_example.py +257 -257
- crawlo-1.2.0.dist-info/RECORD +0 -190
- {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/WHEEL +0 -0
- {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/top_level.txt +0 -0
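Most entries above show identical added and removed line counts, which suggests whole-file rewrites with unchanged content (line-ending or encoding churn); the substantive changes in 1.2.2 are the new crawlo/middleware/offsite.py, the expanded proxy, header, delay, ignore, and response middlewares, and the new example scripts and middleware tests. As a hedged sketch of how the new offsite filter would be switched on (the module path crawlo.middleware.offsite comes from this diff; the OffsiteMiddleware class name is an assumption inferred from the example filenames, not confirmed by the diff):

```python
# Hypothetical settings.py snippet for the middleware added in 1.2.2.
# The class name OffsiteMiddleware is assumed, not confirmed by this diff.
MIDDLEWARES = [
    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
    'crawlo.middleware.offsite.OffsiteMiddleware',  # new module in this release
    'crawlo.middleware.download_delay.DownloadDelayMiddleware',
]
```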
tests/test_config_validator.py
CHANGED
@@ -1,194 +1,194 @@
Every line except the final main() call is removed and re-added with identical text, so the change is most likely line-ending or encoding churn; the resulting 194-line file is shown once:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Configuration validator test script.
Verifies the behaviour of the configuration validator.
"""
import sys
import os
import unittest

# Add the project root directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.config_validator import ConfigValidator, validate_config


class TestConfigValidator(unittest.TestCase):
    """Tests for the configuration validator."""

    def setUp(self):
        """Prepare each test."""
        self.validator = ConfigValidator()

    def test_valid_standalone_config(self):
        """A valid standalone (memory-queue) configuration."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'memory',
            'CONCURRENCY': 8,
            'DOWNLOAD_DELAY': 1.0,
            'DOWNLOAD_TIMEOUT': 30,
            'CONNECTION_POOL_LIMIT': 50,
            'SCHEDULER_MAX_QUEUE_SIZE': 2000,
            'LOG_LEVEL': 'INFO',
            'MIDDLEWARES': [
                'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
                'crawlo.middleware.download_delay.DownloadDelayMiddleware'
            ],
            'PIPELINES': [
                'crawlo.pipelines.console_pipeline.ConsolePipeline'
            ]
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertTrue(is_valid)
        self.assertEqual(len(errors), 0)

    def test_valid_distributed_config(self):
        """A valid distributed (Redis-queue) configuration."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'redis',
            'CONCURRENCY': 16,
            'DOWNLOAD_DELAY': 1.0,
            'DOWNLOAD_TIMEOUT': 30,
            'CONNECTION_POOL_LIMIT': 50,
            'SCHEDULER_MAX_QUEUE_SIZE': 2000,
            'SCHEDULER_QUEUE_NAME': 'crawlo:test_project:queue:requests',
            'REDIS_HOST': '127.0.0.1',
            'REDIS_PORT': 6379,
            'REDIS_URL': 'redis://127.0.0.1:6379/0',
            'LOG_LEVEL': 'INFO',
            'MIDDLEWARES': [
                'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
                'crawlo.middleware.download_delay.DownloadDelayMiddleware'
            ],
            'PIPELINES': [
                'crawlo.pipelines.console_pipeline.ConsolePipeline'
            ]
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertTrue(is_valid)
        self.assertEqual(len(errors), 0)

    def test_invalid_project_name(self):
        """An invalid project name."""
        config = {
            'PROJECT_NAME': '',  # empty string
            'QUEUE_TYPE': 'memory',
            'CONCURRENCY': 8
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertFalse(is_valid)
        # The asserted strings here and below are the validator's literal
        # (Chinese) error messages and must stay verbatim.
        self.assertIn("PROJECT_NAME 必须是非空字符串", errors)

    def test_invalid_concurrency(self):
        """An invalid concurrency value."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'memory',
            'CONCURRENCY': -1  # negative
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertFalse(is_valid)
        self.assertIn("CONCURRENCY 必须是正整数", errors)

    def test_invalid_queue_type(self):
        """An invalid queue type."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'invalid_type',  # invalid type
            'CONCURRENCY': 8
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertFalse(is_valid)
        self.assertIn("QUEUE_TYPE 必须是以下值之一: ['memory', 'redis', 'auto']", errors)

    def test_invalid_redis_queue_name(self):
        """A Redis queue name that violates the naming convention."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'redis',
            'CONCURRENCY': 8,
            'SCHEDULER_QUEUE_NAME': 'invalid_queue_name'  # violates the naming convention
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertTrue(is_valid)  # a bad queue name is a warning, not an error
        self.assertGreater(len(warnings), 0)
        self.assertTrue(any("Redis队列名称" in warning for warning in warnings))

    def test_missing_redis_queue_name(self):
        """A Redis configuration without a queue name."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'redis',
            'CONCURRENCY': 8
            # SCHEDULER_QUEUE_NAME is missing
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertFalse(is_valid)
        self.assertIn("使用Redis队列时,SCHEDULER_QUEUE_NAME 不能为空", errors)

    def test_invalid_redis_port(self):
        """An invalid Redis port."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'redis',
            'CONCURRENCY': 8,
            'SCHEDULER_QUEUE_NAME': 'crawlo:test_project:queue:requests',
            'REDIS_HOST': '127.0.0.1',
            'REDIS_PORT': 99999  # out of range
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertFalse(is_valid)
        self.assertIn("REDIS_PORT 必须是1-65535之间的整数", errors)

    def test_invalid_log_level(self):
        """An invalid log level."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'memory',
            'CONCURRENCY': 8,
            'LOG_LEVEL': 'INVALID_LEVEL'  # invalid log level
        }

        is_valid, errors, warnings = self.validator.validate(config)
        self.assertFalse(is_valid)
        self.assertIn("LOG_LEVEL 必须是以下值之一: ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']", errors)

    def test_convenience_function(self):
        """The module-level convenience function."""
        config = {
            'PROJECT_NAME': 'test_project',
            'QUEUE_TYPE': 'memory',
            'CONCURRENCY': 8,
            'LOG_LEVEL': 'INFO'
        }

        is_valid, errors, warnings = validate_config(config)
        self.assertTrue(is_valid)
        self.assertEqual(len(errors), 0)


def main():
    """Run the test suite."""
    print("🚀 Starting configuration validator tests...")
    print("=" * 50)

    # Run the tests
    unittest.main(argv=['first-arg-is-ignored'], exit=False, verbosity=2)

    print("=" * 50)
    print("✅ Configuration validator tests finished")


if __name__ == "__main__":
    main()
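The tests above pin down the validator's public surface: ConfigValidator().validate(config) and the module-level validate_config(config) both return an (is_valid, errors, warnings) tuple, with hard failures in errors and soft issues (such as a non-conforming Redis queue name) in warnings. A minimal standalone sketch, assuming only that signature:

```python
from crawlo.config_validator import validate_config

# Hypothetical config values for illustration; the keys are the ones
# exercised by the tests above.
config = {'PROJECT_NAME': 'demo', 'QUEUE_TYPE': 'memory', 'CONCURRENCY': 4}

is_valid, errors, warnings = validate_config(config)
if not is_valid:
    for err in errors:       # hard failures, e.g. a negative CONCURRENCY
        print('ERROR:', err)
for warn in warnings:        # soft issues that do not fail validation
    print('WARNING:', warn)
```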
tests/test_crawlo_proxy_integration.py
ADDED
@@ -0,0 +1,173 @@

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Crawlo framework proxy integration test
=======================================
Shows how to integrate and use a given proxy API within the Crawlo framework.
"""

import asyncio
import sys
import os

# Add the project root directory to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo import Spider, Request
from crawlo.middleware.proxy import ProxyMiddleware
from crawlo.settings.setting_manager import SettingManager


class TestProxySpider(Spider):
    """Example spider for exercising the proxy."""
    name = 'test_proxy_spider'

    def __init__(self):
        super().__init__()
        self.test_urls = [
            'https://httpbin.org/ip',       # shows the requesting IP address
            'https://httpbin.org/headers',  # shows the request headers
            'https://stock.10jqka.com.cn/20240315/c655957791.shtml',  # target link under test
        ]

    def start_requests(self):
        """Generate the initial requests."""
        for url in self.test_urls:
            request = Request(url=url, callback=self.parse)
            yield request

    def parse(self, response):
        """Parse a response."""
        print("\n=== Response details ===")
        print(f"URL: {response.url}")
        print(f"Status code: {response.status_code}")
        print(f"Response headers: {dict(response.headers)}")

        # For httpbin.org/ip, show the IP information
        if 'httpbin.org/ip' in response.url:
            print(f"IP info: {response.text[:200]}")

        # For httpbin.org/headers, show the request header information
        elif 'httpbin.org/headers' in response.url:
            print(f"Request header info: {response.text[:200]}")

        # For the target link, show part of the content
        else:
            # Only the first 200 characters
            content_preview = response.text[:200] if response.text else ""
            print(f"Content preview: {content_preview}")

        # Return a simple item
        return {
            'url': response.url,
            'status_code': response.status_code,
            'title': response.css('title::text').get() if response.text else None
        }


def create_proxy_settings():
    """Create the proxy configuration."""
    settings = SettingManager()

    # Base configuration
    settings.set("LOG_LEVEL", "INFO")
    settings.set("CONCURRENCY", 1)  # concurrency of 1 for the test

    # Proxy configuration
    settings.set("PROXY_ENABLED", True)
    settings.set("PROXY_API_URL", "http://test.proxy.api:8080/proxy/getitem/")
    settings.set("PROXY_EXTRACTOR", "proxy")  # adjust to the API response structure
    settings.set("PROXY_REFRESH_INTERVAL", 30)  # refresh every 30 seconds
    settings.set("PROXY_API_TIMEOUT", 10)  # 10-second timeout
    settings.set("PROXY_POOL_SIZE", 3)  # proxy pool size
    settings.set("PROXY_HEALTH_CHECK_THRESHOLD", 0.5)  # health-check threshold

    return settings


async def test_proxy_middleware_integration():
    """Test the proxy middleware integration."""
    print("=== Testing Crawlo proxy middleware integration ===")

    # Create the configuration
    settings = create_proxy_settings()

    # Create the proxy middleware instance
    proxy_middleware = ProxyMiddleware(settings, "INFO")

    # Inspect the proxy API connection settings
    print(f"Proxy API URL: {proxy_middleware.api_url}")
    print(f"Proxy refresh interval: {proxy_middleware.refresh_interval}s")
    print(f"Proxy pool size: {proxy_middleware.proxy_pool_size}")

    # Test fetching a proxy
    print("\n--- Testing proxy fetch ---")
    try:
        # Test the API connection directly rather than a full proxy-pool update
        proxy_data = await proxy_middleware._get_proxy_from_api()
        if proxy_data:
            print(f"✅ Fetched proxy info from the API: {proxy_data}")
        else:
            print("❌ Could not fetch proxy info from the API")
    except Exception as e:
        print(f"❌ Error while fetching a proxy: {e}")

    print("\n=== Proxy middleware integration test finished ===")


def show_proxy_configuration_example():
    """Print a proxy configuration example."""
    print("\n=== Proxy configuration example ===")
    print("""
How to configure a proxy in a Crawlo project:

1. Add the following to settings.py:

```python
# Proxy configuration
PROXY_ENABLED = True
PROXY_API_URL = 'http://test.proxy.api:8080/proxy/getitem/'
PROXY_EXTRACTOR = 'proxy'
PROXY_REFRESH_INTERVAL = 30
PROXY_API_TIMEOUT = 10
PROXY_POOL_SIZE = 5
PROXY_HEALTH_CHECK_THRESHOLD = 0.5
```

2. Make sure the proxy middleware is in the MIDDLEWARES list:

```python
MIDDLEWARES = [
    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
    'crawlo.middleware.download_delay.DownloadDelayMiddleware',
    'crawlo.middleware.default_header.DefaultHeaderMiddleware',
    'crawlo.middleware.proxy.ProxyMiddleware',  # proxy middleware
    'crawlo.middleware.retry.RetryMiddleware',
    'crawlo.middleware.response_code.ResponseCodeMiddleware',
    'crawlo.middleware.response_filter.ResponseFilterMiddleware',
]
```

3. Once the crawler starts, the proxy middleware automatically:
   - fetches proxies from the API on a schedule
   - maintains the proxy pool
   - assigns a proxy to each request
   - monitors proxy health
""")


async def main():
    """Entry point."""
    print("Starting the Crawlo proxy integration test...\n")

    # 1. Test the proxy middleware integration
    await test_proxy_middleware_integration()

    # 2. Show the configuration example
    show_proxy_configuration_example()

    print("\nAll tests finished!")


if __name__ == "__main__":
    asyncio.run(main())
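Note that the script points PROXY_API_URL at test.proxy.api, which looks like a placeholder host, so the fetch step will report an error unless the URL is retargeted at a reachable endpoint first. A hedged sketch (the URL below is an illustration, not part of the diff):

```python
# Hypothetical: retarget the integration test at a reachable proxy API.
settings = create_proxy_settings()
settings.set("PROXY_API_URL", "http://127.0.0.1:8080/proxy/getitem/")  # illustrative URL
```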