crawlo 1.4.4__py3-none-any.whl → 1.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +8 -7
- crawlo/downloader/__init__.py +5 -2
- crawlo/downloader/cffi_downloader.py +3 -1
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/middleware/proxy.py +171 -348
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +340 -189
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/settings/default_settings.py +42 -30
- crawlo/stats_collector.py +10 -1
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/templates/project/settings.py.tmpl +10 -55
- crawlo/templates/project/settings_distributed.py.tmpl +20 -22
- crawlo/templates/project/settings_gentle.py.tmpl +5 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- crawlo/templates/project/settings_minimal.py.tmpl +25 -1
- crawlo/templates/project/settings_simple.py.tmpl +5 -0
- crawlo/templates/run.py.tmpl +1 -8
- crawlo/templates/spider/spider.py.tmpl +5 -108
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +226 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.6.dist-info/METADATA +329 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/RECORD +110 -69
- tests/authenticated_proxy_example.py +10 -6
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/explain_mysql_update_behavior.py +77 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/simulate_mysql_update_test.py +140 -0
- tests/test_asyncmy_usage.py +57 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_crawlo_proxy_integration.py +8 -2
- tests/test_downloader_proxy_compatibility.py +24 -20
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_proxy_middleware.py +104 -8
- tests/test_proxy_middleware_enhanced.py +1 -5
- tests/test_proxy_middleware_integration.py +7 -2
- tests/test_proxy_middleware_refactored.py +25 -2
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_real_scenario_proxy.py +17 -17
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- tests/verify_mysql_warnings.py +110 -0
- crawlo/middleware/simple_proxy.py +0 -65
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.4.dist-info/METADATA +0 -190
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_proxy_middleware.py
CHANGED
@@ -49,25 +49,27 @@ class TestProxyMiddleware(unittest.TestCase):
     @patch('crawlo.utils.log.get_logger')
     def test_middleware_initialization_without_api_url(self, mock_get_logger):
         """Test middleware initialization when no API URL is configured"""
-
+        # PROXY_ENABLED is no longer needed; the middleware is disabled when PROXY_API_URL is not configured
         self.settings.set('PROXY_API_URL', None)
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = MockLogger('ProxyMiddleware')

-        #
-
-
+        # The instance should be created normally, but disabled
+        middleware = ProxyMiddleware.create_instance(self.crawler)
+        self.assertIsInstance(middleware, ProxyMiddleware)
+        self.assertFalse(middleware.enabled)

     @patch('crawlo.utils.log.get_logger')
     def test_middleware_initialization_with_disabled_proxy(self, mock_get_logger):
         """Test middleware initialization with the proxy disabled"""
-
+        # PROXY_ENABLED is no longer needed; the middleware is disabled when PROXY_API_URL is not configured
+        self.settings.set('PROXY_API_URL', None)
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = MockLogger('ProxyMiddleware')

-        #
+        # The instance should be created normally, but disabled
         middleware = ProxyMiddleware.create_instance(self.crawler)
         self.assertIsInstance(middleware, ProxyMiddleware)
         self.assertFalse(middleware.enabled)

@@ -75,18 +77,48 @@ class TestProxyMiddleware(unittest.TestCase):
     @patch('crawlo.utils.log.get_logger')
     def test_middleware_initialization_with_api_url(self, mock_get_logger):
         """Test middleware initialization with an API URL configured"""
-
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
         self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = MockLogger('ProxyMiddleware')

-        #
+        # The instance should be created normally and enabled
         middleware = ProxyMiddleware.create_instance(self.crawler)
         self.assertIsInstance(middleware, ProxyMiddleware)
         self.assertTrue(middleware.enabled)
         self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')

+    def test_middleware_initialization(self):
+        """Test middleware initialization"""
+        # Configure the proxy API URL to enable the middleware
+        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertIsInstance(middleware, ProxyMiddleware)
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')
+
+    def test_middleware_enabled_with_api_url(self):
+        """Test that the middleware is enabled when a proxy API URL is configured"""
+        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
+        # Explicitly setting PROXY_ENABLED = True is no longer required
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')
+
+    def test_middleware_disabled_without_api_url(self):
+        """Test that the middleware is disabled when no proxy API URL is configured"""
+        # Leave PROXY_API_URL unset or set it to an empty value
+        self.settings.set('PROXY_API_URL', '')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)
+
+    def test_middleware_disabled_explicitly(self):
+        """Test disabling the middleware explicitly (by not configuring the API URL)"""
+        # Do not configure PROXY_API_URL
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)
+
     def test_is_https_with_https_url(self):
         """Test HTTPS URL detection"""
         # Create a middleware instance

@@ -117,6 +149,70 @@ class TestProxyMiddleware(unittest.TestCase):
         # Should return False
         self.assertFalse(middleware._is_https(request))

+    def test_proxy_extractor_field(self):
+        """Test extraction by field name"""
+        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
+        self.settings.set('PROXY_EXTRACTOR', 'data')  # Extract from the 'data' field
+
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertEqual(middleware.proxy_extractor, 'data')
+
+        # Test the extraction logic
+        data = {'data': 'http://proxy-from-data:8080'}
+        proxy = middleware._extract_proxy_from_data(data)
+        self.assertEqual(proxy, 'http://proxy-from-data:8080')
+
+    def test_proxy_extractor_dict_field(self):
+        """Test extraction via a dict-style field config"""
+        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
+        self.settings.set('PROXY_EXTRACTOR', {'type': 'field', 'value': 'result'})
+
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertEqual(middleware.proxy_extractor['type'], 'field')
+        self.assertEqual(middleware.proxy_extractor['value'], 'result')
+
+        # Test the extraction logic
+        data = {'result': 'http://proxy-from-result:8080'}
+        proxy = middleware._extract_proxy_from_data(data)
+        self.assertEqual(proxy, 'http://proxy-from-result:8080')
+
+    def test_proxy_extractor_custom_function(self):
+        """Test extraction via a custom function"""
+        def custom_extractor(data):
+            return data.get('custom_proxy')
+
+        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
+        self.settings.set('PROXY_EXTRACTOR', {'type': 'custom', 'function': custom_extractor})
+
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+
+        # Test the extraction logic
+        data = {'custom_proxy': 'http://proxy-from-custom:8080'}
+        proxy = middleware._extract_proxy_from_data(data)
+        self.assertEqual(proxy, 'http://proxy-from-custom:8080')
+
+    def test_proxy_extractor_callable(self):
+        """Test extraction via a directly passed function"""
+        def direct_extractor(data):
+            return data.get('direct_proxy')
+
+        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
+        self.settings.set('PROXY_EXTRACTOR', direct_extractor)
+
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+
+        # Test the extraction logic
+        data = {'direct_proxy': 'http://proxy-from-direct:8080'}
+        proxy = middleware._extract_proxy_from_data(data)
+        self.assertEqual(proxy, 'http://proxy-from-direct:8080')
+
+    def test_middleware_disabled_without_list(self):
+        """Test that the proxy middleware is disabled when no proxy list is configured"""
+        # Leave PROXY_LIST unset or set it to an empty list
+        self.settings.set('PROXY_LIST', [])
+        from crawlo.middleware.proxy import ProxyMiddleware
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)

 if __name__ == '__main__':
     unittest.main()
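Taken together, these tests pin down two behaviours of the reworked ProxyMiddleware: `enabled` is now derived solely from whether `PROXY_API_URL` is set, and `PROXY_EXTRACTOR` accepts a field-name string, a `{'type': ...}` dict, or a plain callable. A minimal standalone sketch of that contract (illustrative names, not crawlo's actual implementation):

```python
def is_enabled(settings):
    # Enabled iff a non-empty PROXY_API_URL is present; PROXY_ENABLED is gone.
    return bool(settings.get('PROXY_API_URL'))

def extract_proxy(extractor, data):
    if callable(extractor):                    # plain callable
        return extractor(data)
    if isinstance(extractor, str):             # field name, e.g. 'data'
        return data.get(extractor)
    if isinstance(extractor, dict):
        if extractor.get('type') == 'field':   # {'type': 'field', 'value': 'result'}
            return data.get(extractor['value'])
        if extractor.get('type') == 'custom':  # {'type': 'custom', 'function': fn}
            return extractor['function'](data)
    return None

assert is_enabled({'PROXY_API_URL': 'http://proxy-api.example.com'})
assert not is_enabled({'PROXY_API_URL': None}) and not is_enabled({})
assert extract_proxy('data', {'data': 'http://p:8080'}) == 'http://p:8080'
assert extract_proxy({'type': 'field', 'value': 'result'}, {'result': 'http://p:8080'}) == 'http://p:8080'
```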
tests/test_proxy_middleware_enhanced.py
CHANGED

@@ -52,12 +52,8 @@ def test_proxy_class():
 def create_mock_settings():
     """Create mock settings"""
     settings = SettingManager()
-
+    # Explicitly setting PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
     settings.set("PROXY_API_URL", "http://test.proxy.api/get")
-    settings.set("PROXY_EXTRACTOR", "proxy")
-    settings.set("PROXY_REFRESH_INTERVAL", 10)
-    settings.set("PROXY_POOL_SIZE", 3)
-    settings.set("PROXY_HEALTH_CHECK_THRESHOLD", 0.5)
     settings.set("LOG_LEVEL", "DEBUG")
     return settings
tests/test_proxy_middleware_integration.py
CHANGED

@@ -13,7 +13,12 @@ def crawler():
     class MockSettings:
         def get(self, key, default=None):
             defaults = {
-
+                # Configure the proxy middleware
+                custom_settings = {
+                # Advanced proxy configuration (for ProxyMiddleware)
+                # The middleware enables itself automatically once a proxy API URL is configured
+                'PROXY_API_URL': 'http://mock-proxy-service.com/api',
+                }
                 'PROXIES': ['http://p1:8080', 'http://p2:8080'],
                 'PROXY_SELECTION_STRATEGY': 'random',
                 'PROXY_REQUEST_DELAY_ENABLED': False,

@@ -134,4 +139,4 @@ async def test_request_delay(middleware, spider):

     mock_sleep.assert_called_once()
     delay = mock_sleep.call_args[0][0]
-    assert 0.04 <= delay <= 0.06
+    assert 0.04 <= delay <= 0.06
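The delay assertion above checks a jittered sleep through a patched `asyncio.sleep`. A self-contained sketch of the same pattern, assuming a hypothetical `apply_delay` with a 0.05 s base and ±20% jitter (not a crawlo function):

```python
import asyncio
import random
from unittest.mock import patch, AsyncMock

async def apply_delay(base=0.05, jitter=0.2):
    # Sleep for a randomized delay around `base`.
    await asyncio.sleep(base * random.uniform(1 - jitter, 1 + jitter))

with patch('asyncio.sleep', new=AsyncMock()) as mock_sleep:
    asyncio.run(apply_delay())
    mock_sleep.assert_called_once()
    delay = mock_sleep.call_args[0][0]  # the first positional argument passed to sleep
    assert 0.04 <= delay <= 0.06
```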
tests/test_proxy_middleware_refactored.py
CHANGED

@@ -126,9 +126,8 @@ class TestProxyMiddlewareRefactored(unittest.TestCase):
     @patch('crawlo.utils.log.get_logger')
     def test_update_proxy_pool_with_parsed_data(self, mock_get_logger):
         """Test updating the proxy pool with parsed proxy data"""
-
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
         self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
-        self.settings.set('PROXY_POOL_SIZE', 2)
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = Mock()

@@ -181,5 +180,29 @@ class TestProxyMiddlewareRefactored(unittest.TestCase):
         healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
         self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)

+    def test_proxy_middleware_initialization(self):
+        """Test proxy middleware initialization"""
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
+        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertIsInstance(middleware, ProxyMiddleware)
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')
+
+    def test_proxy_middleware_enabled_with_api_url(self):
+        """Test that the middleware is enabled when a proxy API URL is configured"""
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
+        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')
+
+    def test_proxy_middleware_disabled_without_api_url(self):
+        """Test that the middleware is disabled when no proxy API URL is configured"""
+        # PROXY_ENABLED is no longer needed; leaving PROXY_API_URL unset disables the middleware
+        self.settings.set('PROXY_API_URL', None)
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)
+
 if __name__ == '__main__':
     unittest.main()
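The pool tests revolve around `_get_healthy_proxy` picking a proxy that clears a health threshold. A rough standalone sketch of that selection rule (field names and the threshold default are assumptions, not crawlo's API):

```python
from dataclasses import dataclass

@dataclass
class ProxyStat:
    proxy_str: str
    success: int
    failure: int

    @property
    def health(self):
        # Success ratio; an unused proxy counts as fully healthy.
        total = self.success + self.failure
        return self.success / total if total else 1.0

def get_healthy_proxy(pool, threshold=0.5):
    # Keep proxies above the threshold and prefer the healthiest one.
    healthy = [p for p in pool if p.health >= threshold]
    return max(healthy, key=lambda p: p.health) if healthy else None

pool = [ProxyStat('http://p1:8080', 1, 9), ProxyStat('http://p2:8080', 9, 1)]
assert get_healthy_proxy(pool).proxy_str == 'http://p2:8080'
```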
tests/test_proxy_only.py
ADDED
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Proxy middleware test script
+Tests the configured proxy URL functionality
+"""
+
+import asyncio
+import sys
+import os
+
+# Add the project root directory to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.middleware.proxy import ProxyMiddleware
+from crawlo.network import Request
+
+
+async def test_proxy_middleware():
+    """Test the proxy middleware"""
+    print("=== Testing the proxy middleware ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # A SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    print(f"Proxy middleware created")
+    print(f"Mode: {proxy_middleware.mode}")
+    print(f"Enabled: {proxy_middleware.enabled}")
+
+    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
+        # Try fetching a proxy from the API
+        print("\nFetching a proxy from the API...")
+        proxy = await proxy_middleware._fetch_proxy_from_api()
+        print(f"Fetched proxy: {proxy}")
+
+        # Test the proxy extraction
+        if proxy:
+            print(f"Proxy format is valid: {proxy.startswith('http://') or proxy.startswith('https://')}")
+
+            # Test request processing
+            print("\nTesting request processing...")
+            request = Request(url="https://httpbin.org/ip")
+
+            class MockSpider:
+                def __init__(self):
+                    self.name = "test_spider"
+
+            spider = MockSpider()
+
+            await proxy_middleware.process_request(request, spider)
+
+            if request.proxy:
+                print(f"Request proxy set successfully: {request.proxy}")
+            else:
+                print("Failed to set the request proxy")
+        else:
+            print("Could not fetch a valid proxy from the API")
+    else:
+        print("Proxy middleware is not enabled or the mode is incorrect")
+
+    return proxy_middleware
+
+
+async def main():
+    """Main test function"""
+    print("Starting proxy middleware tests...")
+
+    # Test the proxy middleware
+    await test_proxy_middleware()
+
+    print("\nTest complete")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
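The script above needs a live proxy API to do anything useful. For a deterministic dry run, one might stub the fetch step; `patch.object` works here because `unittest.mock` substitutes an `AsyncMock` for async methods. This is a sketch built from the internals the script itself exercises, not a documented crawlo workflow:

```python
import asyncio
from unittest.mock import patch

from crawlo.settings.setting_manager import SettingManager
from crawlo.middleware.proxy import ProxyMiddleware

async def demo():
    settings = SettingManager()
    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
    middleware = ProxyMiddleware(settings, "DEBUG")
    # Replace the API call with a canned proxy so no network is needed.
    with patch.object(middleware, '_fetch_proxy_from_api', return_value='http://127.0.0.1:8080'):
        proxy = await middleware._fetch_proxy_from_api()
        print(proxy)  # -> http://127.0.0.1:8080

asyncio.run(demo())
```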
tests/test_proxy_with_downloader.py
ADDED

@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Proxy middleware and downloader integration test script
+Tests the configured proxy URL's compatibility with the downloader
+"""
+
+import asyncio
+import sys
+import os
+
+# Add the project root directory to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.middleware.proxy import ProxyMiddleware
+from crawlo.downloader.httpx_downloader import HttpXDownloader
+from crawlo.network import Request
+
+
+async def test_proxy_middleware():
+    """Test the proxy middleware"""
+    print("=== Testing the proxy middleware ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # A SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    print(f"Proxy middleware created")
+    print(f"Mode: {proxy_middleware.mode}")
+    print(f"Enabled: {proxy_middleware.enabled}")
+
+    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
+        # Try fetching a proxy from the API
+        print("\nFetching a proxy from the API...")
+        proxy = await proxy_middleware._fetch_proxy_from_api()
+        print(f"Fetched proxy: {proxy}")
+
+    return proxy_middleware
+
+
+async def test_downloader_with_proxy():
+    """Test the downloader together with the proxy"""
+    print("\n=== Testing the downloader with the proxy ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # A SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    # Create the downloader
+    class MockStats:
+        def __init__(self):
+            pass
+
+        def inc_value(self, key, count=1):
+            pass
+
+    class MockSubscriber:
+        def __init__(self):
+            pass
+
+        def subscribe(self, callback, event):
+            pass
+
+    class MockSpider:
+        def __init__(self):
+            self.name = "test_spider"
+
+    class MockEngine:
+        def __init__(self):
+            pass
+
+    class MockCrawler:
+        def __init__(self, settings):
+            self.settings = settings
+            self.spider = MockSpider()  # Add the spider attribute
+            self.stats = MockStats()  # Add the stats attribute
+            self.subscriber = MockSubscriber()  # Add the subscriber attribute
+            self.engine = MockEngine()  # Add the engine attribute
+
+    crawler = MockCrawler(settings)
+    downloader = HttpXDownloader(crawler)
+    downloader.open()
+
+    # Create a test request
+    test_url = "https://httpbin.org/ip"  # Test endpoint that returns the client IP
+    request = Request(url=test_url)
+
+    # Create a mock spider
+    spider = MockSpider()
+
+    try:
+        # Process the request through the proxy middleware
+        print("Processing the request through the proxy middleware...")
+        await proxy_middleware.process_request(request, spider)
+
+        if request.proxy:
+            print(f"Proxy set: {request.proxy}")
+        else:
+            print("No proxy set")
+
+        # Download with the downloader
+        print(f"Starting download: {test_url}")
+        response = await downloader.download(request)
+
+        if response:
+            print(f"Download succeeded, status code: {response.status_code}")
+            print(f"Response body: {response.text[:200]}...")  # Show only the first 200 characters
+        else:
+            print("Download failed: empty response")
+
+    except Exception as e:
+        print(f"Error during download: {e}")
+        import traceback
+        traceback.print_exc()
+
+    finally:
+        # Clean up resources
+        try:
+            await downloader.close()
+        except:
+            pass
+
+
+async def main():
+    """Main test function"""
+    print("Starting proxy middleware and downloader integration tests...")
+
+    # Test the proxy middleware
+    await test_proxy_middleware()
+
+    # Test the downloader with the proxy
+    await test_downloader_with_proxy()
+
+    print("\nTest complete")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
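The MockCrawler scaffolding suggests the downloader only touches a small crawler surface: `settings`, `spider`, `stats`, `subscriber`, and `engine`. The same stand-in can be sketched more compactly with `SimpleNamespace` (attribute names taken from the test above; whether `HttpXDownloader` needs more than this is not guaranteed):

```python
from types import SimpleNamespace

def make_mock_crawler(settings):
    # Duck-typed crawler exposing just the attributes the test exercises.
    return SimpleNamespace(
        settings=settings,
        spider=SimpleNamespace(name="test_spider"),
        stats=SimpleNamespace(inc_value=lambda key, count=1: None),
        subscriber=SimpleNamespace(subscribe=lambda callback, event: None),
        engine=SimpleNamespace(),
    )
```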
tests/test_real_scenario_proxy.py
CHANGED

@@ -39,28 +39,28 @@ COOKIES = {
     "Hm_lvt_929f8b362150b1f77b477230541dbbc2": "1758071793",
     "historystock": "600699",
     "spversion": "20130314",
-    "cid": "
-    "u_ukey": "
+    "cid": "example_cid_value",
+    "u_ukey": "example_u_ukey_value",
     "u_uver": "1.0.0",
-    "u_dpass": "
-    "u_did": "
+    "u_dpass": "example_u_dpass_value",
+    "u_did": "example_u_did_value",
     "u_ttype": "WEB",
     "user_status": "0",
     "ttype": "WEB",
     "log": "",
-    "Hm_lvt_69929b9dce4c22a060bd22d703b2a280": "
-    "HMACCOUNT": "
-    "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1": "
-    "user": "
-    "userid": "
-    "u_name": "
-    "escapename": "
-    "ticket": "
-    "utk": "
-    "sess_tk": "
-    "cuc": "
-    "Hm_lvt_f79b64788a4e377c608617fba4c736e2": "
-    "v": "
+    "Hm_lvt_69929b9dce4c22a060bd22d703b2a280": "example_Hm_lvt_value",
+    "HMACCOUNT": "example_HMACCOUNT_value",
+    "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1": "example_Hm_lvt_value",
+    "user": "example_user_value",
+    "userid": "example_userid_value",
+    "u_name": "example_u_name_value",
+    "escapename": "example_escapename_value",
+    "ticket": "example_ticket_value",
+    "utk": "example_utk_value",
+    "sess_tk": "example_sess_tk_value",
+    "cuc": "example_cuc_value",
+    "Hm_lvt_f79b64788a4e377c608617fba4c736e2": "example_Hm_lvt_value",
+    "v": "example_v_value",
     "Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1": "1758163145",
     "Hm_lpvt_f79b64788a4e377c608617fba4c736e2": "1758163145",
     "Hm_lpvt_69929b9dce4c22a060bd22d703b2a280": "1758163145"
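The new values are placeholders: real session cookies were scrubbed from the test. If live values are ever needed, a common alternative to hardcoding them is reading from the environment at run time, as in this sketch (the `CRAWLO_COOKIE_*` variable names are hypothetical):

```python
import os

SENSITIVE_COOKIES = ["cid", "u_ukey", "u_dpass", "u_did", "ticket", "sess_tk"]

COOKIES = {
    "spversion": "20130314",
    "u_uver": "1.0.0",
    # e.g. export CRAWLO_COOKIE_CID=... before running the test
    **{name: os.environ.get(f"CRAWLO_COOKIE_{name.upper()}", "")
       for name in SENSITIVE_COOKIES},
}
```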
tests/test_response_follow.py
ADDED

@@ -0,0 +1,105 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the Response.follow method
+"""
+import unittest
+from unittest.mock import Mock
+
+# Mock the Request class
+class MockRequest:
+    def __init__(self, url, callback=None, **kwargs):
+        self.url = url
+        self.callback = callback
+        self.kwargs = kwargs
+
+# Mock crawlo.Request
+import sys
+sys.modules['crawlo'] = Mock()
+sys.modules['crawlo'].Request = MockRequest
+
+from crawlo.network.response import Response
+
+
+class TestResponseFollow(unittest.TestCase):
+    """Test class for the Response.follow method"""
+
+    def setUp(self):
+        """Set up the test"""
+        # Create a mock HTML response
+        html_content = """
+        <html>
+        <head>
+            <title>测试页面</title>
+        </head>
+        <body>
+            <div class="content">
+                <h1>主标题</h1>
+                <p class="intro">这是介绍段落</p>
+                <ul class="list">
+                    <li>项目1</li>
+                    <li>项目2</li>
+                    <li>项目3</li>
+                </ul>
+                <a href="https://example.com" class="link">链接文本</a>
+                <a href="/relative/path" class="relative-link">相对链接</a>
+                <img src="image.jpg" alt="图片描述" class="image">
+            </div>
+        </body>
+        </html>
+        """
+
+        # Create a mock request object
+        mock_request = Mock()
+        mock_request.callback = None
+
+        self.response = Response(
+            url="https://example.com/test",
+            body=html_content.encode('utf-8'),
+            headers={"content-type": "text/html; charset=utf-8"},
+            request=mock_request
+        )
+
+    def test_follow_absolute_url(self):
+        """Test handling of absolute URLs"""
+        request = self.response.follow("https://other.com/page", callback=lambda r: None)
+        self.assertEqual(request.url, "https://other.com/page")
+        self.assertIsNotNone(request.callback)
+
+    def test_follow_relative_url(self):
+        """Test handling of relative URLs"""
+        request = self.response.follow("/relative/path", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/relative/path")
+        self.assertIsNotNone(request.callback)
+
+    def test_follow_complex_relative_url(self):
+        """Test handling of more complex relative URLs"""
+        request = self.response.follow("../other/path", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/other/path")
+
+        request2 = self.response.follow("./another/path", callback=lambda r: None)
+        self.assertEqual(request2.url, "https://example.com/another/path")
+
+    def test_follow_with_query_params(self):
+        """Test handling of URLs with query parameters"""
+        request = self.response.follow("/path?param=value", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/path?param=value")
+
+        request2 = self.response.follow("/path#section", callback=lambda r: None)
+        self.assertEqual(request2.url, "https://example.com/path#section")
+
+    def test_follow_with_additional_kwargs(self):
+        """Test passing extra keyword arguments through"""
+        request = self.response.follow(
+            "/path",
+            callback=lambda r: None,
+            method="POST",
+            headers={"User-Agent": "test"}
+        )
+        self.assertEqual(request.url, "https://example.com/path")
+        self.assertEqual(request.kwargs.get("method"), "POST")
+        self.assertEqual(request.kwargs.get("headers"), {"User-Agent": "test"})
+
+
+if __name__ == '__main__':
+    unittest.main()
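The URL expectations in these tests match standard RFC 3986 resolution against `response.url`, i.e. what `urllib.parse.urljoin` from the standard library produces:

```python
from urllib.parse import urljoin

base = "https://example.com/test"
assert urljoin(base, "https://other.com/page") == "https://other.com/page"
assert urljoin(base, "/relative/path") == "https://example.com/relative/path"
assert urljoin(base, "../other/path") == "https://example.com/other/path"
assert urljoin(base, "./another/path") == "https://example.com/another/path"
assert urljoin(base, "/path?param=value") == "https://example.com/path?param=value"
assert urljoin(base, "/path#section") == "https://example.com/path#section"
```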