crawlo 1.4.4__py3-none-any.whl → 1.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (120)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/startproject.py +24 -0
  4. crawlo/core/engine.py +2 -2
  5. crawlo/core/scheduler.py +4 -4
  6. crawlo/crawler.py +8 -7
  7. crawlo/downloader/__init__.py +5 -2
  8. crawlo/downloader/cffi_downloader.py +3 -1
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/middleware/proxy.py +171 -348
  18. crawlo/mode_manager.py +45 -11
  19. crawlo/network/response.py +374 -69
  20. crawlo/pipelines/mysql_pipeline.py +340 -189
  21. crawlo/pipelines/pipeline_manager.py +2 -2
  22. crawlo/project.py +2 -4
  23. crawlo/settings/default_settings.py +42 -30
  24. crawlo/stats_collector.py +10 -1
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +10 -55
  30. crawlo/templates/project/settings_distributed.py.tmpl +20 -22
  31. crawlo/templates/project/settings_gentle.py.tmpl +5 -0
  32. crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
  33. crawlo/templates/project/settings_minimal.py.tmpl +25 -1
  34. crawlo/templates/project/settings_simple.py.tmpl +5 -0
  35. crawlo/templates/run.py.tmpl +1 -8
  36. crawlo/templates/spider/spider.py.tmpl +5 -108
  37. crawlo/tools/__init__.py +0 -11
  38. crawlo/utils/__init__.py +17 -1
  39. crawlo/utils/db_helper.py +226 -319
  40. crawlo/utils/error_handler.py +313 -67
  41. crawlo/utils/fingerprint.py +3 -4
  42. crawlo/utils/misc.py +82 -0
  43. crawlo/utils/request.py +55 -66
  44. crawlo/utils/selector_helper.py +138 -0
  45. crawlo/utils/spider_loader.py +185 -45
  46. crawlo/utils/text_helper.py +95 -0
  47. crawlo-1.4.6.dist-info/METADATA +329 -0
  48. {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/RECORD +110 -69
  49. tests/authenticated_proxy_example.py +10 -6
  50. tests/bug_check_test.py +251 -0
  51. tests/direct_selector_helper_test.py +97 -0
  52. tests/explain_mysql_update_behavior.py +77 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  56. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  57. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  58. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  59. tests/ofweek_scrapy/scrapy.cfg +11 -0
  60. tests/performance_comparison.py +4 -5
  61. tests/simple_crawlo_test.py +1 -2
  62. tests/simple_follow_test.py +39 -0
  63. tests/simple_response_selector_test.py +95 -0
  64. tests/simple_selector_helper_test.py +155 -0
  65. tests/simple_selector_test.py +208 -0
  66. tests/simple_url_test.py +74 -0
  67. tests/simulate_mysql_update_test.py +140 -0
  68. tests/test_asyncmy_usage.py +57 -0
  69. tests/test_crawler_process_import.py +39 -0
  70. tests/test_crawler_process_spider_modules.py +48 -0
  71. tests/test_crawlo_proxy_integration.py +8 -2
  72. tests/test_downloader_proxy_compatibility.py +24 -20
  73. tests/test_edge_cases.py +7 -5
  74. tests/test_encoding_core.py +57 -0
  75. tests/test_encoding_detection.py +127 -0
  76. tests/test_factory_compatibility.py +197 -0
  77. tests/test_mysql_pipeline_config.py +165 -0
  78. tests/test_mysql_pipeline_error.py +99 -0
  79. tests/test_mysql_pipeline_init_log.py +83 -0
  80. tests/test_mysql_pipeline_integration.py +133 -0
  81. tests/test_mysql_pipeline_refactor.py +144 -0
  82. tests/test_mysql_pipeline_refactor_simple.py +86 -0
  83. tests/test_mysql_pipeline_robustness.py +196 -0
  84. tests/test_mysql_pipeline_types.py +89 -0
  85. tests/test_mysql_update_columns.py +94 -0
  86. tests/test_optimized_selector_naming.py +101 -0
  87. tests/test_priority_behavior.py +18 -18
  88. tests/test_proxy_middleware.py +104 -8
  89. tests/test_proxy_middleware_enhanced.py +1 -5
  90. tests/test_proxy_middleware_integration.py +7 -2
  91. tests/test_proxy_middleware_refactored.py +25 -2
  92. tests/test_proxy_only.py +84 -0
  93. tests/test_proxy_with_downloader.py +153 -0
  94. tests/test_real_scenario_proxy.py +17 -17
  95. tests/test_response_follow.py +105 -0
  96. tests/test_response_selector_methods.py +93 -0
  97. tests/test_response_url_methods.py +71 -0
  98. tests/test_response_urljoin.py +87 -0
  99. tests/test_scrapy_style_encoding.py +113 -0
  100. tests/test_selector_helper.py +101 -0
  101. tests/test_selector_optimizations.py +147 -0
  102. tests/test_spider_loader.py +50 -0
  103. tests/test_spider_loader_comprehensive.py +70 -0
  104. tests/test_spiders/__init__.py +1 -0
  105. tests/test_spiders/test_spider.py +10 -0
  106. tests/verify_mysql_warnings.py +110 -0
  107. crawlo/middleware/simple_proxy.py +0 -65
  108. crawlo/tools/anti_crawler.py +0 -269
  109. crawlo/utils/class_loader.py +0 -26
  110. crawlo/utils/enhanced_error_handler.py +0 -357
  111. crawlo-1.4.4.dist-info/METADATA +0 -190
  112. tests/simple_log_test.py +0 -58
  113. tests/simple_test.py +0 -48
  114. tests/test_framework_logger.py +0 -67
  115. tests/test_framework_startup.py +0 -65
  116. tests/test_mode_change.py +0 -73
  117. {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
  118. {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
  119. {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
  120. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
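
Most of the churn sits in crawlo/middleware/proxy.py and its tests: the PROXY_ENABLED switch is dropped, the middleware now keys activation off PROXY_API_URL alone, and PROXY_EXTRACTOR accepts a field name, a dict, or a callable (see the test hunks below). A minimal sketch of the new-style configuration, assuming only the SettingManager/ProxyMiddleware API exercised in those tests; the API URL is a placeholder:

    from crawlo.settings.setting_manager import SettingManager
    from crawlo.middleware.proxy import ProxyMiddleware

    settings = SettingManager()
    # In 1.4.6, configuring an API URL is what enables the middleware;
    # PROXY_ENABLED is no longer consulted.
    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')  # placeholder URL
    settings.set('PROXY_EXTRACTOR', 'proxy')  # field of the API response holding the proxy
    settings.set('LOG_LEVEL', 'INFO')

    middleware = ProxyMiddleware(settings, 'DEBUG')
    assert middleware.enabled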
@@ -49,25 +49,27 @@ class TestProxyMiddleware(unittest.TestCase):
     @patch('crawlo.utils.log.get_logger')
     def test_middleware_initialization_without_api_url(self, mock_get_logger):
         """Middleware initialization when no API URL is configured"""
-        self.settings.set('PROXY_ENABLED', True)
+        # PROXY_ENABLED is no longer needed; leaving PROXY_API_URL unset disables the middleware
         self.settings.set('PROXY_API_URL', None)
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = MockLogger('ProxyMiddleware')

-        # Should raise NotConfiguredError
-        with self.assertRaises(NotConfiguredError):
-            ProxyMiddleware.create_instance(self.crawler)
+        # Should create the instance normally, but leave it disabled
+        middleware = ProxyMiddleware.create_instance(self.crawler)
+        self.assertIsInstance(middleware, ProxyMiddleware)
+        self.assertFalse(middleware.enabled)

     @patch('crawlo.utils.log.get_logger')
     def test_middleware_initialization_with_disabled_proxy(self, mock_get_logger):
         """Middleware initialization when the proxy is disabled"""
-        self.settings.set('PROXY_ENABLED', False)
+        # PROXY_ENABLED is no longer needed; leaving PROXY_API_URL unset disables the middleware
+        self.settings.set('PROXY_API_URL', None)
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = MockLogger('ProxyMiddleware')

-        # Should create the instance normally
+        # Should create the instance normally, but leave it disabled
         middleware = ProxyMiddleware.create_instance(self.crawler)
         self.assertIsInstance(middleware, ProxyMiddleware)
         self.assertFalse(middleware.enabled)
@@ -75,18 +77,48 @@ class TestProxyMiddleware(unittest.TestCase):
     @patch('crawlo.utils.log.get_logger')
     def test_middleware_initialization_with_api_url(self, mock_get_logger):
         """Middleware initialization when an API URL is configured"""
-        self.settings.set('PROXY_ENABLED', True)
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
         self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = MockLogger('ProxyMiddleware')

-        # Should create the instance normally
+        # Should create the instance normally and enable it
         middleware = ProxyMiddleware.create_instance(self.crawler)
         self.assertIsInstance(middleware, ProxyMiddleware)
         self.assertTrue(middleware.enabled)
         self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')

+    def test_middleware_initialization(self):
+        """Basic middleware initialization"""
+        # Configure a proxy API URL so the middleware is enabled
+        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertIsInstance(middleware, ProxyMiddleware)
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')
+
+    def test_middleware_enabled_with_api_url(self):
+        """Middleware is enabled when a proxy API URL is configured"""
+        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
+        # Explicitly setting PROXY_ENABLED = True is no longer required
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')
+
+    def test_middleware_disabled_without_api_url(self):
+        """Middleware is disabled when no proxy API URL is configured"""
+        # Leave PROXY_API_URL unset or set it to an empty string
+        self.settings.set('PROXY_API_URL', '')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)
+
+    def test_middleware_disabled_explicitly(self):
+        """Explicitly disabling the middleware (by not configuring an API URL)"""
+        # PROXY_API_URL is not configured
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)
+
     def test_is_https_with_https_url(self):
         """HTTPS URL detection"""
         # Create a middleware instance
@@ -117,6 +149,70 @@ class TestProxyMiddleware(unittest.TestCase):
         # Should return False
         self.assertFalse(middleware._is_https(request))

+    def test_proxy_extractor_field(self):
+        """Extraction by field name"""
+        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
+        self.settings.set('PROXY_EXTRACTOR', 'data')  # extract from the 'data' field
+
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertEqual(middleware.proxy_extractor, 'data')
+
+        # Exercise the extraction logic
+        data = {'data': 'http://proxy-from-data:8080'}
+        proxy = middleware._extract_proxy_from_data(data)
+        self.assertEqual(proxy, 'http://proxy-from-data:8080')
+
+    def test_proxy_extractor_dict_field(self):
+        """Extraction configured as a dict field"""
+        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
+        self.settings.set('PROXY_EXTRACTOR', {'type': 'field', 'value': 'result'})
+
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertEqual(middleware.proxy_extractor['type'], 'field')
+        self.assertEqual(middleware.proxy_extractor['value'], 'result')
+
+        # Exercise the extraction logic
+        data = {'result': 'http://proxy-from-result:8080'}
+        proxy = middleware._extract_proxy_from_data(data)
+        self.assertEqual(proxy, 'http://proxy-from-result:8080')
+
+    def test_proxy_extractor_custom_function(self):
+        """Extraction via a custom function"""
+        def custom_extractor(data):
+            return data.get('custom_proxy')
+
+        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
+        self.settings.set('PROXY_EXTRACTOR', {'type': 'custom', 'function': custom_extractor})
+
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+
+        # Exercise the extraction logic
+        data = {'custom_proxy': 'http://proxy-from-custom:8080'}
+        proxy = middleware._extract_proxy_from_data(data)
+        self.assertEqual(proxy, 'http://proxy-from-custom:8080')
+
+    def test_proxy_extractor_callable(self):
+        """Extraction via a plain callable"""
+        def direct_extractor(data):
+            return data.get('direct_proxy')
+
+        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
+        self.settings.set('PROXY_EXTRACTOR', direct_extractor)
+
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+
+        # Exercise the extraction logic
+        data = {'direct_proxy': 'http://proxy-from-direct:8080'}
+        proxy = middleware._extract_proxy_from_data(data)
+        self.assertEqual(proxy, 'http://proxy-from-direct:8080')
+
+    def test_middleware_disabled_without_list(self):
+        """Middleware is disabled when no proxy list is configured"""
+        # Leave PROXY_LIST unset or set it to an empty list
+        self.settings.set('PROXY_LIST', [])
+        from crawlo.middleware.proxy import ProxyMiddleware
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)

 if __name__ == '__main__':
     unittest.main()
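
The extractor tests above (apparently tests/test_proxy_middleware.py, per the +104/-8 entry in the file list) exercise three shapes of PROXY_EXTRACTOR: a plain field name, a {'type': ..., 'value': ...} dict, and a callable. The middleware's actual _extract_proxy_from_data is not shown in this diff; the sketch below only mirrors the behaviour the assertions demand, against plain dicts:

    from typing import Callable, Optional, Union

    Extractor = Union[str, dict, Callable[[dict], Optional[str]]]

    def extract_proxy(data: dict, extractor: Extractor = 'proxy') -> Optional[str]:
        """Pull a proxy URL out of an API response, mimicking the three
        PROXY_EXTRACTOR forms exercised by the tests above."""
        if callable(extractor):                       # direct callable
            return extractor(data)
        if isinstance(extractor, dict):
            if extractor.get('type') == 'custom':     # {'type': 'custom', 'function': fn}
                return extractor['function'](data)
            return data.get(extractor.get('value'))   # {'type': 'field', 'value': 'result'}
        return data.get(extractor)                    # plain field name, e.g. 'data'

    # Matches the expectations in the tests:
    assert extract_proxy({'data': 'http://proxy-from-data:8080'}, 'data') == 'http://proxy-from-data:8080'
    assert extract_proxy({'result': 'http://proxy-from-result:8080'},
                         {'type': 'field', 'value': 'result'}) == 'http://proxy-from-result:8080'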
@@ -52,12 +52,8 @@ def test_proxy_class():
 def create_mock_settings():
     """Build mock settings"""
     settings = SettingManager()
-    settings.set("PROXY_ENABLED", True)
+    # Explicitly setting PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
     settings.set("PROXY_API_URL", "http://test.proxy.api/get")
-    settings.set("PROXY_EXTRACTOR", "proxy")
-    settings.set("PROXY_REFRESH_INTERVAL", 10)
-    settings.set("PROXY_POOL_SIZE", 3)
-    settings.set("PROXY_HEALTH_CHECK_THRESHOLD", 0.5)
     settings.set("LOG_LEVEL", "DEBUG")
     return settings

@@ -13,7 +13,12 @@ def crawler():
     class MockSettings:
         def get(self, key, default=None):
             defaults = {
-                'PROXY_ENABLED': True,
+                # Configure the proxy middleware
+                custom_settings = {
+                    # Advanced proxy configuration (for ProxyMiddleware)
+                    # The middleware enables itself as soon as a proxy API URL is configured
+                    'PROXY_API_URL': 'http://mock-proxy-service.com/api',
+                }
                 'PROXIES': ['http://p1:8080', 'http://p2:8080'],
                 'PROXY_SELECTION_STRATEGY': 'random',
                 'PROXY_REQUEST_DELAY_ENABLED': False,
@@ -134,4 +139,4 @@ async def test_request_delay(middleware, spider):

     mock_sleep.assert_called_once()
     delay = mock_sleep.call_args[0][0]
-    assert 0.04 <= delay <= 0.06
+    assert 0.04 <= delay <= 0.06
@@ -126,9 +126,8 @@ class TestProxyMiddlewareRefactored(unittest.TestCase):
     @patch('crawlo.utils.log.get_logger')
     def test_update_proxy_pool_with_parsed_data(self, mock_get_logger):
         """Updating the proxy pool from parsed proxy data"""
-        self.settings.set('PROXY_ENABLED', True)
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
         self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
-        self.settings.set('PROXY_POOL_SIZE', 2)
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = Mock()
@@ -181,5 +180,29 @@ class TestProxyMiddlewareRefactored(unittest.TestCase):
         healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
         self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)

+    def test_proxy_middleware_initialization(self):
+        """Proxy middleware initialization"""
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
+        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertIsInstance(middleware, ProxyMiddleware)
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')
+
+    def test_proxy_middleware_enabled_with_api_url(self):
+        """Middleware is enabled when a proxy API URL is configured"""
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
+        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')
+
+    def test_proxy_middleware_disabled_without_api_url(self):
+        """Middleware is disabled when no proxy API URL is configured"""
+        # PROXY_ENABLED is no longer needed; leaving PROXY_API_URL unset disables the middleware
+        self.settings.set('PROXY_API_URL', None)
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)
+
 if __name__ == '__main__':
     unittest.main()
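
These refactored tests (apparently tests/test_proxy_middleware_refactored.py, +25/-2 in the file list) drive the middleware's async helpers such as _get_healthy_proxy from synchronous unittest methods by wrapping each call in asyncio.run. A self-contained illustration of that pattern, using a stand-in class rather than the real middleware:

    import asyncio
    import unittest

    class FakeMiddleware:
        # Stand-in for ProxyMiddleware: one async helper, like _get_healthy_proxy
        async def _get_healthy_proxy(self):
            await asyncio.sleep(0)  # pretend to do async work
            return 'http://proxy-2:8080'

    class SyncStyleTest(unittest.TestCase):
        def test_async_helper_from_sync_test(self):
            middleware = FakeMiddleware()
            # asyncio.run creates and tears down an event loop per call,
            # which is how the refactored proxy tests invoke async helpers.
            healthy = asyncio.run(middleware._get_healthy_proxy())
            self.assertEqual(healthy, 'http://proxy-2:8080')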
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Proxy middleware test script
+Exercises the configured proxy API URL
+"""
+
+import asyncio
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.middleware.proxy import ProxyMiddleware
+from crawlo.network import Request
+
+
+async def test_proxy_middleware():
+    """Test the proxy middleware"""
+    print("=== Testing the proxy middleware ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # the SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    print("Proxy middleware created")
+    print(f"Mode: {proxy_middleware.mode}")
+    print(f"Enabled: {proxy_middleware.enabled}")
+
+    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
+        # Fetch a proxy from the API
+        print("\nFetching a proxy from the API...")
+        proxy = await proxy_middleware._fetch_proxy_from_api()
+        print(f"Fetched proxy: {proxy}")
+
+        # Check the extracted proxy
+        if proxy:
+            print(f"Proxy format looks valid: {proxy.startswith('http://') or proxy.startswith('https://')}")
+
+            # Process a request
+            print("\nProcessing a request...")
+            request = Request(url="https://httpbin.org/ip")
+
+            class MockSpider:
+                def __init__(self):
+                    self.name = "test_spider"
+
+            spider = MockSpider()
+
+            await proxy_middleware.process_request(request, spider)
+
+            if request.proxy:
+                print(f"Request proxy set successfully: {request.proxy}")
+            else:
+                print("Failed to set the request proxy")
+        else:
+            print("Could not fetch a valid proxy from the API")
+    else:
+        print("Proxy middleware is not enabled or the mode is wrong")
+
+    return proxy_middleware
+
+
+async def main():
+    """Main test entry point"""
+    print("Starting the proxy middleware test...")
+
+    # Test the proxy middleware
+    await test_proxy_middleware()
+
+    print("\nTest finished")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Proxy middleware + downloader integration test script
+Checks that the configured proxy API URL works with the downloader
+"""
+
+import asyncio
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.middleware.proxy import ProxyMiddleware
+from crawlo.downloader.httpx_downloader import HttpXDownloader
+from crawlo.network import Request
+
+
+async def test_proxy_middleware():
+    """Test the proxy middleware"""
+    print("=== Testing the proxy middleware ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # the SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    print("Proxy middleware created")
+    print(f"Mode: {proxy_middleware.mode}")
+    print(f"Enabled: {proxy_middleware.enabled}")
+
+    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
+        # Fetch a proxy from the API
+        print("\nFetching a proxy from the API...")
+        proxy = await proxy_middleware._fetch_proxy_from_api()
+        print(f"Fetched proxy: {proxy}")
+
+    return proxy_middleware
+
+
+async def test_downloader_with_proxy():
+    """Test the downloader together with the proxy"""
+    print("\n=== Testing the downloader with the proxy ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # the SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    # Create the downloader
+    class MockStats:
+        def __init__(self):
+            pass
+
+        def inc_value(self, key, count=1):
+            pass
+
+    class MockSubscriber:
+        def __init__(self):
+            pass
+
+        def subscribe(self, callback, event):
+            pass
+
+    class MockSpider:
+        def __init__(self):
+            self.name = "test_spider"
+
+    class MockEngine:
+        def __init__(self):
+            pass
+
+    class MockCrawler:
+        def __init__(self, settings):
+            self.settings = settings
+            self.spider = MockSpider()  # spider attribute
+            self.stats = MockStats()  # stats attribute
+            self.subscriber = MockSubscriber()  # subscriber attribute
+            self.engine = MockEngine()  # engine attribute
+
+    crawler = MockCrawler(settings)
+    downloader = HttpXDownloader(crawler)
+    downloader.open()
+
+    # Build a test request
+    test_url = "https://httpbin.org/ip"  # test endpoint that returns the client IP
+    request = Request(url=test_url)
+
+    # Create a mock spider
+    spider = MockSpider()
+
+    try:
+        # Run the request through the proxy middleware
+        print("Processing the request through the proxy middleware...")
+        await proxy_middleware.process_request(request, spider)
+
+        if request.proxy:
+            print(f"Proxy set: {request.proxy}")
+        else:
+            print("No proxy set")
+
+        # Download with the downloader
+        print(f"Starting download: {test_url}")
+        response = await downloader.download(request)
+
+        if response:
+            print(f"Download succeeded, status code: {response.status_code}")
+            print(f"Response body: {response.text[:200]}...")  # show only the first 200 characters
+        else:
+            print("Download failed, response is empty")
+
+    except Exception as e:
+        print(f"Error during download: {e}")
+        import traceback
+        traceback.print_exc()
+
+    finally:
+        # Clean up resources
+        try:
+            await downloader.close()
+        except:
+            pass
+
+
+async def main():
+    """Main test entry point"""
+    print("Starting the proxy middleware + downloader test...")
+
+    # Test the proxy middleware
+    await test_proxy_middleware()
+
+    # Test the downloader with the proxy
+    await test_downloader_with_proxy()
+
+    print("\nTest finished")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -39,28 +39,28 @@ COOKIES = {
     "Hm_lvt_929f8b362150b1f77b477230541dbbc2": "1758071793",
     "historystock": "600699",
     "spversion": "20130314",
-    "cid": "f9bc812da2c3a7ddf6d5df1fa2d497091758076438",
-    "u_ukey": "A10702B8689642C6BE607730E11E6E4A",
+    "cid": "example_cid_value",
+    "u_ukey": "example_u_ukey_value",
     "u_uver": "1.0.0",
-    "u_dpass": "Qk3U07X7SHGKa0AcRUg1R1DVWbPioD9Eg270bdikvlwWWXexbsXnRsQNt%2B04iXwdHi80LrSsTFH9a%2B6rtRvqGg%3D%3D",
-    "u_did": "E3ED337393E1429DA56E380DD00B3CCD",
+    "u_dpass": "example_u_dpass_value",
+    "u_did": "example_u_did_value",
     "u_ttype": "WEB",
     "user_status": "0",
     "ttype": "WEB",
     "log": "",
-    "Hm_lvt_69929b9dce4c22a060bd22d703b2a280": "1758079404,1758113068,1758157144",
-    "HMACCOUNT": "08DF0D235A291EAA",
-    "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1": "1758071793,1758113068,1758157144",
-    "user": "MDpteF9lNXRkY3RpdHo6Ok5vbmU6NTAwOjgxNzYyOTAwNDo3LDExMTExMTExMTExLDQwOzQ0LDExLDQwOzYsMSw0MDs1LDEsNDA7MSwxMDEsNDA7MiwxLDQwOzMsMSw0MDs1LDEsNDA7OCwwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMSw0MDsxMDIsMSw0MDoxNjo6OjgwNzYyOTAwNDoxNzU4MTYxNTE0Ojo6MTc1ODA3MjA2MDo2MDQ4MDA6MDoxYTQ0NmFlNDY4M2VmZWY3YmNjYTczY2U3ODZmZTNiODg6ZGVmYXVsdF81OjA%3D",
-    "userid": "807629004",
-    "u_name": "mx_e5tdctitz",
-    "escapename": "mx_e5tdctitz",
-    "ticket": "85eea709becdd924d7eb975351de629e",
-    "utk": "8959c4c6b6f5fb7628864feab15473f4",
-    "sess_tk": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFUzI1NiIsImtpZCI6InNlc3NfdGtfMSIsImJ0eSI6InNlc3NfdGsifQ.eyJqdGkiOiI4ODNiZmU4NmU3M2NhN2NjN2JlZmVmODM0NmFlNDZhNDEiLCJpYXQiOjE3NTgxNjE1MTQsImV4cCI6MTc1ODc2NjMxNCwic3ViIjoiODA3NjI5MDA0IiwiaXNzIjoidXBhc3MuaXdlbmNhaS5jb20iLCJhdWQiOiIyMDIwMTExODUyODg5MDcyIiwiYWN0Ijoib2ZjIiwiY3VocyI6ImIwNTcyZDVjOWNlNDg0MGFlOWYxYTlhYjU3NGZkNjQyYjgzNmExN2E3Y2NhZjk4ZWRiNzI5ZmJkOWFjOGVkYmYifQ.UBNIzxGvQQtXSiIcB_1JJl-EuAc1S9j2LcTLXjwy4ImhDDbh1oJvyRdDUrXdUpwBpIyx5zgYqgt_3FEhY_iayw",
-    "cuc": "ap2eap3gg99g",
-    "Hm_lvt_f79b64788a4e377c608617fba4c736e2": "1758161692",
-    "v": "A1glI4rWhPCQGqh0MvA0ioufKY3vQbzLHqWQT5JJpBNGLfazOlGMW261YNrh",
+    "Hm_lvt_69929b9dce4c22a060bd22d703b2a280": "example_Hm_lvt_value",
+    "HMACCOUNT": "example_HMACCOUNT_value",
+    "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1": "example_Hm_lvt_value",
+    "user": "example_user_value",
+    "userid": "example_userid_value",
+    "u_name": "example_u_name_value",
+    "escapename": "example_escapename_value",
+    "ticket": "example_ticket_value",
+    "utk": "example_utk_value",
+    "sess_tk": "example_sess_tk_value",
+    "cuc": "example_cuc_value",
+    "Hm_lvt_f79b64788a4e377c608617fba4c736e2": "example_Hm_lvt_value",
+    "v": "example_v_value",
     "Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1": "1758163145",
     "Hm_lpvt_f79b64788a4e377c608617fba4c736e2": "1758163145",
     "Hm_lpvt_69929b9dce4c22a060bd22d703b2a280": "1758163145"
@@ -0,0 +1,105 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the Response.follow method
+"""
+import unittest
+from unittest.mock import Mock
+
+# Mock Request class
+class MockRequest:
+    def __init__(self, url, callback=None, **kwargs):
+        self.url = url
+        self.callback = callback
+        self.kwargs = kwargs
+
+# Stub out crawlo.Request
+import sys
+sys.modules['crawlo'] = Mock()
+sys.modules['crawlo'].Request = MockRequest
+
+from crawlo.network.response import Response
+
+
+class TestResponseFollow(unittest.TestCase):
+    """Test case for Response.follow"""
+
+    def setUp(self):
+        """Prepare the test fixtures"""
+        # Build a mock HTML response
+        html_content = """
+        <html>
+        <head>
+            <title>Test page</title>
+        </head>
+        <body>
+            <div class="content">
+                <h1>Main heading</h1>
+                <p class="intro">Intro paragraph</p>
+                <ul class="list">
+                    <li>Item 1</li>
+                    <li>Item 2</li>
+                    <li>Item 3</li>
+                </ul>
+                <a href="https://example.com" class="link">Link text</a>
+                <a href="/relative/path" class="relative-link">Relative link</a>
+                <img src="image.jpg" alt="Image description" class="image">
+            </div>
+        </body>
+        </html>
+        """
+
+        # Build a mock request object
+        mock_request = Mock()
+        mock_request.callback = None
+
+        self.response = Response(
+            url="https://example.com/test",
+            body=html_content.encode('utf-8'),
+            headers={"content-type": "text/html; charset=utf-8"},
+            request=mock_request
+        )
+
+    def test_follow_absolute_url(self):
+        """Absolute URLs are passed through unchanged"""
+        request = self.response.follow("https://other.com/page", callback=lambda r: None)
+        self.assertEqual(request.url, "https://other.com/page")
+        self.assertIsNotNone(request.callback)
+
+    def test_follow_relative_url(self):
+        """Relative URLs are resolved against the response URL"""
+        request = self.response.follow("/relative/path", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/relative/path")
+        self.assertIsNotNone(request.callback)
+
+    def test_follow_complex_relative_url(self):
+        """Dot-segment relative URLs are resolved"""
+        request = self.response.follow("../other/path", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/other/path")
+
+        request2 = self.response.follow("./another/path", callback=lambda r: None)
+        self.assertEqual(request2.url, "https://example.com/another/path")
+
+    def test_follow_with_query_params(self):
+        """URLs with query strings and fragments are preserved"""
+        request = self.response.follow("/path?param=value", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/path?param=value")
+
+        request2 = self.response.follow("/path#section", callback=lambda r: None)
+        self.assertEqual(request2.url, "https://example.com/path#section")
+
+    def test_follow_with_additional_kwargs(self):
+        """Extra keyword arguments are forwarded to the Request"""
+        request = self.response.follow(
+            "/path",
+            callback=lambda r: None,
+            method="POST",
+            headers={"User-Agent": "test"}
+        )
+        self.assertEqual(request.url, "https://example.com/path")
+        self.assertEqual(request.kwargs.get("method"), "POST")
+        self.assertEqual(request.kwargs.get("headers"), {"User-Agent": "test"})
+
+
+if __name__ == '__main__':
+    unittest.main()
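
The expected URLs in these tests follow standard RFC 3986 resolution against response.url, which is what urllib.parse.urljoin implements; presumably Response.follow (and the new Response.urljoin covered by tests/test_response_urljoin.py) delegates to it. A quick check of the same resolutions:

    from urllib.parse import urljoin

    base = "https://example.com/test"  # the response URL used in the tests
    assert urljoin(base, "https://other.com/page") == "https://other.com/page"
    assert urljoin(base, "/relative/path") == "https://example.com/relative/path"
    assert urljoin(base, "../other/path") == "https://example.com/other/path"
    assert urljoin(base, "./another/path") == "https://example.com/another/path"
    assert urljoin(base, "/path?param=value") == "https://example.com/path?param=value"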