crawlo-1.2.0-py3-none-any.whl → crawlo-1.2.2-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.



Files changed (220)
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +81 -65
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +143 -133
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +292 -292
  14. crawlo/commands/startproject.py +418 -418
  15. crawlo/commands/stats.py +188 -188
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +252 -252
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -354
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -143
  23. crawlo/crawler.py +1027 -1027
  24. crawlo/downloader/__init__.py +266 -266
  25. crawlo/downloader/aiohttp_downloader.py +220 -220
  26. crawlo/downloader/cffi_downloader.py +256 -256
  27. crawlo/downloader/httpx_downloader.py +259 -259
  28. crawlo/downloader/hybrid_downloader.py +213 -213
  29. crawlo/downloader/playwright_downloader.py +402 -402
  30. crawlo/downloader/selenium_downloader.py +472 -472
  31. crawlo/event.py +11 -11
  32. crawlo/exceptions.py +81 -81
  33. crawlo/extension/__init__.py +37 -37
  34. crawlo/extension/health_check.py +141 -141
  35. crawlo/extension/log_interval.py +57 -57
  36. crawlo/extension/log_stats.py +81 -81
  37. crawlo/extension/logging_extension.py +43 -43
  38. crawlo/extension/memory_monitor.py +104 -104
  39. crawlo/extension/performance_profiler.py +133 -133
  40. crawlo/extension/request_recorder.py +107 -107
  41. crawlo/filters/__init__.py +154 -154
  42. crawlo/filters/aioredis_filter.py +280 -280
  43. crawlo/filters/memory_filter.py +269 -269
  44. crawlo/items/__init__.py +23 -23
  45. crawlo/items/base.py +21 -21
  46. crawlo/items/fields.py +53 -53
  47. crawlo/items/items.py +104 -104
  48. crawlo/middleware/__init__.py +21 -21
  49. crawlo/middleware/default_header.py +132 -32
  50. crawlo/middleware/download_delay.py +105 -28
  51. crawlo/middleware/middleware_manager.py +135 -135
  52. crawlo/middleware/offsite.py +116 -0
  53. crawlo/middleware/proxy.py +366 -272
  54. crawlo/middleware/request_ignore.py +88 -30
  55. crawlo/middleware/response_code.py +164 -18
  56. crawlo/middleware/response_filter.py +138 -26
  57. crawlo/middleware/retry.py +124 -124
  58. crawlo/mode_manager.py +211 -211
  59. crawlo/network/__init__.py +21 -21
  60. crawlo/network/request.py +338 -338
  61. crawlo/network/response.py +359 -359
  62. crawlo/pipelines/__init__.py +21 -21
  63. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  64. crawlo/pipelines/console_pipeline.py +39 -39
  65. crawlo/pipelines/csv_pipeline.py +316 -316
  66. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  67. crawlo/pipelines/json_pipeline.py +218 -218
  68. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  69. crawlo/pipelines/mongo_pipeline.py +131 -131
  70. crawlo/pipelines/mysql_pipeline.py +316 -316
  71. crawlo/pipelines/pipeline_manager.py +61 -61
  72. crawlo/pipelines/redis_dedup_pipeline.py +167 -167
  73. crawlo/project.py +187 -187
  74. crawlo/queue/pqueue.py +37 -37
  75. crawlo/queue/queue_manager.py +337 -337
  76. crawlo/queue/redis_priority_queue.py +298 -298
  77. crawlo/settings/__init__.py +7 -7
  78. crawlo/settings/default_settings.py +226 -219
  79. crawlo/settings/setting_manager.py +122 -122
  80. crawlo/spider/__init__.py +639 -639
  81. crawlo/stats_collector.py +59 -59
  82. crawlo/subscriber.py +130 -130
  83. crawlo/task_manager.py +30 -30
  84. crawlo/templates/crawlo.cfg.tmpl +10 -10
  85. crawlo/templates/project/__init__.py.tmpl +3 -3
  86. crawlo/templates/project/items.py.tmpl +17 -17
  87. crawlo/templates/project/middlewares.py.tmpl +118 -109
  88. crawlo/templates/project/pipelines.py.tmpl +96 -96
  89. crawlo/templates/project/run.py.tmpl +45 -45
  90. crawlo/templates/project/settings.py.tmpl +327 -326
  91. crawlo/templates/project/settings_distributed.py.tmpl +119 -119
  92. crawlo/templates/project/settings_gentle.py.tmpl +94 -94
  93. crawlo/templates/project/settings_high_performance.py.tmpl +151 -151
  94. crawlo/templates/project/settings_simple.py.tmpl +68 -68
  95. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  96. crawlo/templates/spider/spider.py.tmpl +143 -141
  97. crawlo/tools/__init__.py +182 -182
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/tools/data_validator.py +180 -180
  101. crawlo/tools/date_tools.py +35 -35
  102. crawlo/tools/distributed_coordinator.py +386 -386
  103. crawlo/tools/retry_mechanism.py +220 -220
  104. crawlo/tools/scenario_adapter.py +262 -262
  105. crawlo/utils/__init__.py +35 -35
  106. crawlo/utils/batch_processor.py +260 -260
  107. crawlo/utils/controlled_spider_mixin.py +439 -439
  108. crawlo/utils/date_tools.py +290 -290
  109. crawlo/utils/db_helper.py +343 -343
  110. crawlo/utils/enhanced_error_handler.py +359 -359
  111. crawlo/utils/env_config.py +105 -105
  112. crawlo/utils/error_handler.py +125 -125
  113. crawlo/utils/func_tools.py +82 -82
  114. crawlo/utils/large_scale_config.py +286 -286
  115. crawlo/utils/large_scale_helper.py +343 -343
  116. crawlo/utils/log.py +128 -128
  117. crawlo/utils/performance_monitor.py +284 -284
  118. crawlo/utils/queue_helper.py +175 -175
  119. crawlo/utils/redis_connection_pool.py +334 -334
  120. crawlo/utils/redis_key_validator.py +199 -199
  121. crawlo/utils/request.py +267 -267
  122. crawlo/utils/request_serializer.py +219 -219
  123. crawlo/utils/spider_loader.py +62 -62
  124. crawlo/utils/system.py +11 -11
  125. crawlo/utils/tools.py +4 -4
  126. crawlo/utils/url.py +39 -39
  127. {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/METADATA +692 -697
  128. crawlo-1.2.2.dist-info/RECORD +220 -0
  129. examples/__init__.py +7 -7
  130. examples/aiohttp_settings.py +42 -0
  131. examples/curl_cffi_settings.py +41 -0
  132. examples/default_header_middleware_example.py +107 -0
  133. examples/default_header_spider_example.py +129 -0
  134. examples/download_delay_middleware_example.py +160 -0
  135. examples/httpx_settings.py +42 -0
  136. examples/multi_downloader_proxy_example.py +81 -0
  137. examples/offsite_middleware_example.py +55 -0
  138. examples/offsite_spider_example.py +107 -0
  139. examples/proxy_spider_example.py +166 -0
  140. examples/request_ignore_middleware_example.py +51 -0
  141. examples/request_ignore_spider_example.py +99 -0
  142. examples/response_code_middleware_example.py +52 -0
  143. examples/response_filter_middleware_example.py +67 -0
  144. examples/tong_hua_shun_settings.py +62 -0
  145. examples/tong_hua_shun_spider.py +170 -0
  146. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  147. tests/__init__.py +7 -7
  148. tests/advanced_tools_example.py +275 -275
  149. tests/authenticated_proxy_example.py +236 -236
  150. tests/cleaners_example.py +160 -160
  151. tests/config_validation_demo.py +102 -102
  152. tests/controlled_spider_example.py +205 -205
  153. tests/date_tools_example.py +180 -180
  154. tests/dynamic_loading_example.py +523 -523
  155. tests/dynamic_loading_test.py +104 -104
  156. tests/env_config_example.py +133 -133
  157. tests/error_handling_example.py +171 -171
  158. tests/redis_key_validation_demo.py +130 -130
  159. tests/response_improvements_example.py +144 -144
  160. tests/test_advanced_tools.py +148 -148
  161. tests/test_all_redis_key_configs.py +145 -145
  162. tests/test_authenticated_proxy.py +141 -141
  163. tests/test_cleaners.py +54 -54
  164. tests/test_comprehensive.py +146 -146
  165. tests/test_config_validator.py +193 -193
  166. tests/test_crawlo_proxy_integration.py +173 -0
  167. tests/test_date_tools.py +123 -123
  168. tests/test_default_header_middleware.py +159 -0
  169. tests/test_double_crawlo_fix.py +207 -207
  170. tests/test_double_crawlo_fix_simple.py +124 -124
  171. tests/test_download_delay_middleware.py +222 -0
  172. tests/test_downloader_proxy_compatibility.py +269 -0
  173. tests/test_dynamic_downloaders_proxy.py +124 -124
  174. tests/test_dynamic_proxy.py +92 -92
  175. tests/test_dynamic_proxy_config.py +146 -146
  176. tests/test_dynamic_proxy_real.py +109 -109
  177. tests/test_edge_cases.py +303 -303
  178. tests/test_enhanced_error_handler.py +270 -270
  179. tests/test_env_config.py +121 -121
  180. tests/test_error_handler_compatibility.py +112 -112
  181. tests/test_final_validation.py +153 -153
  182. tests/test_framework_env_usage.py +103 -103
  183. tests/test_integration.py +356 -356
  184. tests/test_item_dedup_redis_key.py +122 -122
  185. tests/test_offsite_middleware.py +222 -0
  186. tests/test_parsel.py +29 -29
  187. tests/test_performance.py +327 -327
  188. tests/test_proxy_api.py +265 -0
  189. tests/test_proxy_health_check.py +32 -32
  190. tests/test_proxy_middleware.py +122 -0
  191. tests/test_proxy_middleware_enhanced.py +217 -0
  192. tests/test_proxy_middleware_integration.py +136 -136
  193. tests/test_proxy_providers.py +56 -56
  194. tests/test_proxy_stats.py +19 -19
  195. tests/test_proxy_strategies.py +59 -59
  196. tests/test_queue_manager_double_crawlo.py +173 -173
  197. tests/test_queue_manager_redis_key.py +176 -176
  198. tests/test_real_scenario_proxy.py +196 -0
  199. tests/test_redis_config.py +28 -28
  200. tests/test_redis_connection_pool.py +294 -294
  201. tests/test_redis_key_naming.py +181 -181
  202. tests/test_redis_key_validator.py +123 -123
  203. tests/test_redis_queue.py +224 -224
  204. tests/test_request_ignore_middleware.py +183 -0
  205. tests/test_request_serialization.py +70 -70
  206. tests/test_response_code_middleware.py +350 -0
  207. tests/test_response_filter_middleware.py +428 -0
  208. tests/test_response_improvements.py +152 -152
  209. tests/test_retry_middleware.py +242 -0
  210. tests/test_scheduler.py +241 -241
  211. tests/test_simple_response.py +61 -61
  212. tests/test_telecom_spider_redis_key.py +205 -205
  213. tests/test_template_content.py +87 -87
  214. tests/test_template_redis_key.py +134 -134
  215. tests/test_tools.py +153 -153
  216. tests/tools_example.py +257 -257
  217. crawlo-1.2.0.dist-info/RECORD +0 -190
  218. {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/WHEEL +0 -0
  219. {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/entry_points.txt +0 -0
  220. {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/top_level.txt +0 -0

tests/test_config_validator.py
@@ -1,194 +1,194 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Configuration validator test script
- Verifies the functionality of the configuration validator
- """
- import sys
- import os
- import unittest
-
- # Add the project root directory to the path
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-
- from crawlo.config_validator import ConfigValidator, validate_config
-
-
- class TestConfigValidator(unittest.TestCase):
-     """Tests for the configuration validator"""
-
-     def setUp(self):
-         """Prepare for each test"""
-         self.validator = ConfigValidator()
-
-     def test_valid_standalone_config(self):
-         """Test a valid standalone configuration"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'memory',
-             'CONCURRENCY': 8,
-             'DOWNLOAD_DELAY': 1.0,
-             'DOWNLOAD_TIMEOUT': 30,
-             'CONNECTION_POOL_LIMIT': 50,
-             'SCHEDULER_MAX_QUEUE_SIZE': 2000,
-             'LOG_LEVEL': 'INFO',
-             'MIDDLEWARES': [
-                 'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
-                 'crawlo.middleware.download_delay.DownloadDelayMiddleware'
-             ],
-             'PIPELINES': [
-                 'crawlo.pipelines.console_pipeline.ConsolePipeline'
-             ]
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertTrue(is_valid)
-         self.assertEqual(len(errors), 0)
-
-     def test_valid_distributed_config(self):
-         """Test a valid distributed configuration"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'redis',
-             'CONCURRENCY': 16,
-             'DOWNLOAD_DELAY': 1.0,
-             'DOWNLOAD_TIMEOUT': 30,
-             'CONNECTION_POOL_LIMIT': 50,
-             'SCHEDULER_MAX_QUEUE_SIZE': 2000,
-             'SCHEDULER_QUEUE_NAME': 'crawlo:test_project:queue:requests',
-             'REDIS_HOST': '127.0.0.1',
-             'REDIS_PORT': 6379,
-             'REDIS_URL': 'redis://127.0.0.1:6379/0',
-             'LOG_LEVEL': 'INFO',
-             'MIDDLEWARES': [
-                 'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
-                 'crawlo.middleware.download_delay.DownloadDelayMiddleware'
-             ],
-             'PIPELINES': [
-                 'crawlo.pipelines.console_pipeline.ConsolePipeline'
-             ]
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertTrue(is_valid)
-         self.assertEqual(len(errors), 0)
-
-     def test_invalid_project_name(self):
-         """Test an invalid project name"""
-         config = {
-             'PROJECT_NAME': '',  # empty string
-             'QUEUE_TYPE': 'memory',
-             'CONCURRENCY': 8
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertFalse(is_valid)
-         self.assertIn("PROJECT_NAME 必须是非空字符串", errors)  # "PROJECT_NAME must be a non-empty string"
-
-     def test_invalid_concurrency(self):
-         """Test an invalid concurrency value"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'memory',
-             'CONCURRENCY': -1  # negative number
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertFalse(is_valid)
-         self.assertIn("CONCURRENCY 必须是正整数", errors)  # "CONCURRENCY must be a positive integer"
-
-     def test_invalid_queue_type(self):
-         """Test an invalid queue type"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'invalid_type',  # invalid type
-             'CONCURRENCY': 8
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertFalse(is_valid)
-         self.assertIn("QUEUE_TYPE 必须是以下值之一: ['memory', 'redis', 'auto']", errors)  # "QUEUE_TYPE must be one of: ['memory', 'redis', 'auto']"
-
-     def test_invalid_redis_queue_name(self):
-         """Test an invalid Redis queue name"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'redis',
-             'CONCURRENCY': 8,
-             'SCHEDULER_QUEUE_NAME': 'invalid_queue_name'  # does not follow the naming convention
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertTrue(is_valid)  # a bad queue name is a warning, not an error
-         self.assertGreater(len(warnings), 0)
-         self.assertTrue(any("Redis队列名称" in warning for warning in warnings))  # "Redis queue name"
-
-     def test_missing_redis_queue_name(self):
-         """Test a missing Redis queue name"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'redis',
-             'CONCURRENCY': 8
-             # SCHEDULER_QUEUE_NAME is missing
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertFalse(is_valid)
-         self.assertIn("使用Redis队列时,SCHEDULER_QUEUE_NAME 不能为空", errors)  # "SCHEDULER_QUEUE_NAME must not be empty when using a Redis queue"
-
-     def test_invalid_redis_port(self):
-         """Test an invalid Redis port"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'redis',
-             'CONCURRENCY': 8,
-             'SCHEDULER_QUEUE_NAME': 'crawlo:test_project:queue:requests',
-             'REDIS_HOST': '127.0.0.1',
-             'REDIS_PORT': 99999  # invalid port
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertFalse(is_valid)
-         self.assertIn("REDIS_PORT 必须是1-65535之间的整数", errors)  # "REDIS_PORT must be an integer between 1 and 65535"
-
-     def test_invalid_log_level(self):
-         """Test an invalid log level"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'memory',
-             'CONCURRENCY': 8,
-             'LOG_LEVEL': 'INVALID_LEVEL'  # invalid log level
-         }
-
-         is_valid, errors, warnings = self.validator.validate(config)
-         self.assertFalse(is_valid)
-         self.assertIn("LOG_LEVEL 必须是以下值之一: ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']", errors)  # "LOG_LEVEL must be one of the listed levels"
-
-     def test_convenience_function(self):
-         """Test the convenience function"""
-         config = {
-             'PROJECT_NAME': 'test_project',
-             'QUEUE_TYPE': 'memory',
-             'CONCURRENCY': 8,
-             'LOG_LEVEL': 'INFO'
-         }
-
-         is_valid, errors, warnings = validate_config(config)
-         self.assertTrue(is_valid)
-         self.assertEqual(len(errors), 0)
-
-
- def main():
-     """Main test entry point"""
-     print("🚀 Starting configuration validator tests...")
-     print("=" * 50)
-
-     # Run the tests
-     unittest.main(argv=['first-arg-is-ignored'], exit=False, verbosity=2)
-
-     print("=" * 50)
-     print("✅ Configuration validator tests finished")
-
-
- if __name__ == "__main__":
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Configuration validator test script
+ Verifies the functionality of the configuration validator
+ """
+ import sys
+ import os
+ import unittest
+
+ # Add the project root directory to the path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.config_validator import ConfigValidator, validate_config
+
+
+ class TestConfigValidator(unittest.TestCase):
+     """Tests for the configuration validator"""
+
+     def setUp(self):
+         """Prepare for each test"""
+         self.validator = ConfigValidator()
+
+     def test_valid_standalone_config(self):
+         """Test a valid standalone configuration"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'memory',
+             'CONCURRENCY': 8,
+             'DOWNLOAD_DELAY': 1.0,
+             'DOWNLOAD_TIMEOUT': 30,
+             'CONNECTION_POOL_LIMIT': 50,
+             'SCHEDULER_MAX_QUEUE_SIZE': 2000,
+             'LOG_LEVEL': 'INFO',
+             'MIDDLEWARES': [
+                 'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+                 'crawlo.middleware.download_delay.DownloadDelayMiddleware'
+             ],
+             'PIPELINES': [
+                 'crawlo.pipelines.console_pipeline.ConsolePipeline'
+             ]
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertTrue(is_valid)
+         self.assertEqual(len(errors), 0)
+
+     def test_valid_distributed_config(self):
+         """Test a valid distributed configuration"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'redis',
+             'CONCURRENCY': 16,
+             'DOWNLOAD_DELAY': 1.0,
+             'DOWNLOAD_TIMEOUT': 30,
+             'CONNECTION_POOL_LIMIT': 50,
+             'SCHEDULER_MAX_QUEUE_SIZE': 2000,
+             'SCHEDULER_QUEUE_NAME': 'crawlo:test_project:queue:requests',
+             'REDIS_HOST': '127.0.0.1',
+             'REDIS_PORT': 6379,
+             'REDIS_URL': 'redis://127.0.0.1:6379/0',
+             'LOG_LEVEL': 'INFO',
+             'MIDDLEWARES': [
+                 'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+                 'crawlo.middleware.download_delay.DownloadDelayMiddleware'
+             ],
+             'PIPELINES': [
+                 'crawlo.pipelines.console_pipeline.ConsolePipeline'
+             ]
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertTrue(is_valid)
+         self.assertEqual(len(errors), 0)
+
+     def test_invalid_project_name(self):
+         """Test an invalid project name"""
+         config = {
+             'PROJECT_NAME': '',  # empty string
+             'QUEUE_TYPE': 'memory',
+             'CONCURRENCY': 8
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertFalse(is_valid)
+         self.assertIn("PROJECT_NAME 必须是非空字符串", errors)  # "PROJECT_NAME must be a non-empty string"
+
+     def test_invalid_concurrency(self):
+         """Test an invalid concurrency value"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'memory',
+             'CONCURRENCY': -1  # negative number
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertFalse(is_valid)
+         self.assertIn("CONCURRENCY 必须是正整数", errors)  # "CONCURRENCY must be a positive integer"
+
+     def test_invalid_queue_type(self):
+         """Test an invalid queue type"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'invalid_type',  # invalid type
+             'CONCURRENCY': 8
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertFalse(is_valid)
+         self.assertIn("QUEUE_TYPE 必须是以下值之一: ['memory', 'redis', 'auto']", errors)  # "QUEUE_TYPE must be one of: ['memory', 'redis', 'auto']"
+
+     def test_invalid_redis_queue_name(self):
+         """Test an invalid Redis queue name"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'redis',
+             'CONCURRENCY': 8,
+             'SCHEDULER_QUEUE_NAME': 'invalid_queue_name'  # does not follow the naming convention
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertTrue(is_valid)  # a bad queue name is a warning, not an error
+         self.assertGreater(len(warnings), 0)
+         self.assertTrue(any("Redis队列名称" in warning for warning in warnings))  # "Redis queue name"
+
+     def test_missing_redis_queue_name(self):
+         """Test a missing Redis queue name"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'redis',
+             'CONCURRENCY': 8
+             # SCHEDULER_QUEUE_NAME is missing
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertFalse(is_valid)
+         self.assertIn("使用Redis队列时,SCHEDULER_QUEUE_NAME 不能为空", errors)  # "SCHEDULER_QUEUE_NAME must not be empty when using a Redis queue"
+
+     def test_invalid_redis_port(self):
+         """Test an invalid Redis port"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'redis',
+             'CONCURRENCY': 8,
+             'SCHEDULER_QUEUE_NAME': 'crawlo:test_project:queue:requests',
+             'REDIS_HOST': '127.0.0.1',
+             'REDIS_PORT': 99999  # invalid port
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertFalse(is_valid)
+         self.assertIn("REDIS_PORT 必须是1-65535之间的整数", errors)  # "REDIS_PORT must be an integer between 1 and 65535"
+
+     def test_invalid_log_level(self):
+         """Test an invalid log level"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'memory',
+             'CONCURRENCY': 8,
+             'LOG_LEVEL': 'INVALID_LEVEL'  # invalid log level
+         }
+
+         is_valid, errors, warnings = self.validator.validate(config)
+         self.assertFalse(is_valid)
+         self.assertIn("LOG_LEVEL 必须是以下值之一: ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']", errors)  # "LOG_LEVEL must be one of the listed levels"
+
+     def test_convenience_function(self):
+         """Test the convenience function"""
+         config = {
+             'PROJECT_NAME': 'test_project',
+             'QUEUE_TYPE': 'memory',
+             'CONCURRENCY': 8,
+             'LOG_LEVEL': 'INFO'
+         }
+
+         is_valid, errors, warnings = validate_config(config)
+         self.assertTrue(is_valid)
+         self.assertEqual(len(errors), 0)
+
+
+ def main():
+     """Main test entry point"""
+     print("🚀 Starting configuration validator tests...")
+     print("=" * 50)
+
+     # Run the tests
+     unittest.main(argv=['first-arg-is-ignored'], exit=False, verbosity=2)
+
+     print("=" * 50)
+     print("✅ Configuration validator tests finished")
+
+
+ if __name__ == "__main__":
      main()

tests/test_crawlo_proxy_integration.py
@@ -0,0 +1,173 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Crawlo framework proxy integration test
+ ====================
+ Shows how to integrate and use a given proxy API within the Crawlo framework
+ """
+
+ import asyncio
+ import sys
+ import os
+
+ # Add the project root directory to the Python path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo import Spider, Request
+ from crawlo.middleware.proxy import ProxyMiddleware
+ from crawlo.settings.setting_manager import SettingManager
+
+
+ class TestProxySpider(Spider):
+     """Example spider for testing proxies"""
+     name = 'test_proxy_spider'
+
+     def __init__(self):
+         super().__init__()
+         self.test_urls = [
+             'https://httpbin.org/ip',  # check the outgoing IP address
+             'https://httpbin.org/headers',  # check the request headers
+             'https://stock.10jqka.com.cn/20240315/c655957791.shtml',  # target link to test
+         ]
+
+     def start_requests(self):
+         """Generate the initial requests"""
+         for url in self.test_urls:
+             request = Request(url=url, callback=self.parse)
+             yield request
+
+     def parse(self, response):
+         """Parse the response"""
+         print(f"\n=== Response details ===")
+         print(f"URL: {response.url}")
+         print(f"Status code: {response.status_code}")
+         print(f"Response headers: {dict(response.headers)}")
+
+         # For httpbin.org/ip, show the IP information
+         if 'httpbin.org/ip' in response.url:
+             print(f"IP info: {response.text[:200]}")
+
+         # For httpbin.org/headers, show the request header information
+         elif 'httpbin.org/headers' in response.url:
+             print(f"Request headers: {response.text[:200]}")
+
+         # For the target link, show part of the content
+         else:
+             # Only show the first 200 characters
+             content_preview = response.text[:200] if response.text else ""
+             print(f"Content preview: {content_preview}")
+
+         # Return a simple item
+         return {
+             'url': response.url,
+             'status_code': response.status_code,
+             'title': response.css('title::text').get() if response.text else None
+         }
+
+
+ def create_proxy_settings():
+     """Create the proxy settings"""
+     settings = SettingManager()
+
+     # Base configuration
+     settings.set("LOG_LEVEL", "INFO")
+     settings.set("CONCURRENCY", 1)  # set concurrency to 1 for the test
+
+     # Proxy configuration
+     settings.set("PROXY_ENABLED", True)
+     settings.set("PROXY_API_URL", "http://test.proxy.api:8080/proxy/getitem/")
+     settings.set("PROXY_EXTRACTOR", "proxy")  # adjust to match the API response structure
+     settings.set("PROXY_REFRESH_INTERVAL", 30)  # refresh every 30 seconds
+     settings.set("PROXY_API_TIMEOUT", 10)  # 10 second timeout
+     settings.set("PROXY_POOL_SIZE", 3)  # proxy pool size
+     settings.set("PROXY_HEALTH_CHECK_THRESHOLD", 0.5)  # health check threshold
+
+     return settings
+
+
+ async def test_proxy_middleware_integration():
+     """Test the proxy middleware integration"""
+     print("=== Testing Crawlo proxy middleware integration ===")
+
+     # Create the settings
+     settings = create_proxy_settings()
+
+     # Create a proxy middleware instance
+     proxy_middleware = ProxyMiddleware(settings, "INFO")
+
+     # Check the proxy API connection settings
+     print(f"Proxy API URL: {proxy_middleware.api_url}")
+     print(f"Proxy refresh interval: {proxy_middleware.refresh_interval}s")
+     print(f"Proxy pool size: {proxy_middleware.proxy_pool_size}")
+
+     # Test fetching a proxy
+     print("\n--- Testing proxy retrieval ---")
+     try:
+         # Test the API connection directly here, rather than a full proxy pool refresh
+         proxy_data = await proxy_middleware._get_proxy_from_api()
+         if proxy_data:
+             print(f"✅ Successfully fetched proxy info from the API: {proxy_data}")
+         else:
+             print("❌ Could not fetch proxy info from the API")
+     except Exception as e:
+         print(f"❌ Error while fetching a proxy: {e}")
+
+     print("\n=== Proxy middleware integration test finished ===")
+
+
+ def show_proxy_configuration_example():
+     """Show a proxy configuration example"""
+     print("\n=== Proxy configuration example ===")
+     print("""
+     How to configure proxies in a Crawlo project:
+
+     1. Add the following settings to settings.py:
+
+     ```python
+     # Proxy configuration
+     PROXY_ENABLED = True
+     PROXY_API_URL = 'http://test.proxy.api:8080/proxy/getitem/'
+     PROXY_EXTRACTOR = 'proxy'
+     PROXY_REFRESH_INTERVAL = 30
+     PROXY_API_TIMEOUT = 10
+     PROXY_POOL_SIZE = 5
+     PROXY_HEALTH_CHECK_THRESHOLD = 0.5
+     ```
+
+     2. Make sure the proxy middleware is in the MIDDLEWARES list:
+
+     ```python
+     MIDDLEWARES = [
+         'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+         'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+         'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+         'crawlo.middleware.proxy.ProxyMiddleware',  # proxy middleware
+         'crawlo.middleware.retry.RetryMiddleware',
+         'crawlo.middleware.response_code.ResponseCodeMiddleware',
+         'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+     ]
+     ```
+
+     3. After starting the spider, the proxy middleware will automatically:
+        - fetch proxies from the API at regular intervals
+        - maintain the proxy pool
+        - assign a proxy to each request
+        - monitor proxy health
+     """)
+
+
+ async def main():
+     """Main entry point"""
+     print("Starting the Crawlo proxy integration test...\n")
+
+     # 1. Test the proxy middleware integration
+     await test_proxy_middleware_integration()
+
+     # 2. Show the configuration example
+     show_proxy_configuration_example()
+
+     print("\nAll tests finished!")
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())