crawlo-1.4.5-py3-none-any.whl → crawlo-1.4.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (44)
  1. crawlo/__version__.py +1 -1
  2. crawlo/downloader/cffi_downloader.py +3 -1
  3. crawlo/middleware/proxy.py +171 -348
  4. crawlo/pipelines/mysql_pipeline.py +339 -188
  5. crawlo/settings/default_settings.py +38 -30
  6. crawlo/stats_collector.py +10 -1
  7. crawlo/templates/project/settings.py.tmpl +10 -55
  8. crawlo/templates/project/settings_distributed.py.tmpl +20 -22
  9. crawlo/templates/project/settings_gentle.py.tmpl +5 -0
  10. crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
  11. crawlo/templates/project/settings_minimal.py.tmpl +25 -1
  12. crawlo/templates/project/settings_simple.py.tmpl +5 -0
  13. crawlo/templates/run.py.tmpl +1 -8
  14. crawlo/templates/spider/spider.py.tmpl +5 -108
  15. crawlo/utils/db_helper.py +11 -5
  16. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/METADATA +1 -1
  17. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/RECORD +43 -29
  18. tests/authenticated_proxy_example.py +10 -6
  19. tests/explain_mysql_update_behavior.py +77 -0
  20. tests/simulate_mysql_update_test.py +140 -0
  21. tests/test_asyncmy_usage.py +57 -0
  22. tests/test_crawlo_proxy_integration.py +8 -2
  23. tests/test_downloader_proxy_compatibility.py +24 -20
  24. tests/test_mysql_pipeline_config.py +165 -0
  25. tests/test_mysql_pipeline_error.py +99 -0
  26. tests/test_mysql_pipeline_init_log.py +83 -0
  27. tests/test_mysql_pipeline_integration.py +133 -0
  28. tests/test_mysql_pipeline_refactor.py +144 -0
  29. tests/test_mysql_pipeline_refactor_simple.py +86 -0
  30. tests/test_mysql_pipeline_robustness.py +196 -0
  31. tests/test_mysql_pipeline_types.py +89 -0
  32. tests/test_mysql_update_columns.py +94 -0
  33. tests/test_proxy_middleware.py +104 -8
  34. tests/test_proxy_middleware_enhanced.py +1 -5
  35. tests/test_proxy_middleware_integration.py +7 -2
  36. tests/test_proxy_middleware_refactored.py +25 -2
  37. tests/test_proxy_only.py +84 -0
  38. tests/test_proxy_with_downloader.py +153 -0
  39. tests/test_real_scenario_proxy.py +17 -17
  40. tests/verify_mysql_warnings.py +110 -0
  41. crawlo/middleware/simple_proxy.py +0 -65
  42. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
  43. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
  44. {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
tests/test_proxy_middleware_refactored.py
@@ -126,9 +126,8 @@ class TestProxyMiddlewareRefactored(unittest.TestCase):
     @patch('crawlo.utils.log.get_logger')
     def test_update_proxy_pool_with_parsed_data(self, mock_get_logger):
         """Test updating the proxy pool with parsed proxy data"""
-        self.settings.set('PROXY_ENABLED', True)
+        # PROXY_ENABLED is no longer needed; the middleware is enabled as long as PROXY_API_URL is configured
         self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
-        self.settings.set('PROXY_POOL_SIZE', 2)
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = Mock()
@@ -181,5 +180,29 @@ class TestProxyMiddlewareRefactored(unittest.TestCase):
         healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
         self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)

+    def test_proxy_middleware_initialization(self):
+        """Test proxy middleware initialization"""
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
+        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertIsInstance(middleware, ProxyMiddleware)
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')
+
+    def test_proxy_middleware_enabled_with_api_url(self):
+        """Test that the middleware is enabled when a proxy API URL is configured"""
+        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
+        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')
+
+    def test_proxy_middleware_disabled_without_api_url(self):
+        """Test that the middleware is disabled when no proxy API URL is configured"""
+        # PROXY_ENABLED is no longer needed; leaving PROXY_API_URL unset disables the middleware
+        self.settings.set('PROXY_API_URL', None)
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)
+
 if __name__ == '__main__':
     unittest.main()
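The change these tests exercise is that crawlo/middleware/proxy.py no longer reads a PROXY_ENABLED flag: the middleware switches itself on whenever PROXY_API_URL is set and stays off otherwise. Below is a minimal sketch of the new configuration flow, reusing only the SettingManager and ProxyMiddleware calls that appear in the tests; the API URL is a placeholder.

    from crawlo.settings.setting_manager import SettingManager
    from crawlo.middleware.proxy import ProxyMiddleware

    settings = SettingManager()
    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')  # placeholder URL
    settings.set('LOG_LEVEL', 'INFO')

    middleware = ProxyMiddleware(settings, "INFO")
    assert middleware.enabled    # enabled solely because PROXY_API_URL is configured
    assert middleware.api_url == 'http://proxy-api.example.com/proxy/getitem/'

    # Leaving PROXY_API_URL unset (or None) disables the middleware
    settings.set('PROXY_API_URL', None)
    assert not ProxyMiddleware(settings, "INFO").enabled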
tests/test_proxy_only.py (new file)
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Proxy middleware test script.
+Tests the functionality of the configured proxy URL.
+"""
+
+import asyncio
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.middleware.proxy import ProxyMiddleware
+from crawlo.network import Request
+
+
+async def test_proxy_middleware():
+    """Test the proxy middleware"""
+    print("=== Testing the proxy middleware ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # the SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    print(f"Proxy middleware created")
+    print(f"Mode: {proxy_middleware.mode}")
+    print(f"Enabled: {proxy_middleware.enabled}")
+
+    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
+        # Fetch a proxy from the API
+        print("\nFetching a proxy from the API...")
+        proxy = await proxy_middleware._fetch_proxy_from_api()
+        print(f"Fetched proxy: {proxy}")
+
+        # Check the proxy extraction result
+        if proxy:
+            print(f"Proxy format is valid: {proxy.startswith('http://') or proxy.startswith('https://')}")
+
+            # Process a request
+            print("\nProcessing a request...")
+            request = Request(url="https://httpbin.org/ip")
+
+            class MockSpider:
+                def __init__(self):
+                    self.name = "test_spider"
+
+            spider = MockSpider()
+
+            await proxy_middleware.process_request(request, spider)
+
+            if request.proxy:
+                print(f"Request proxy set successfully: {request.proxy}")
+            else:
+                print("Failed to set the request proxy")
+        else:
+            print("Failed to fetch a valid proxy from the API")
+    else:
+        print("Proxy middleware is not enabled or the mode is incorrect")
+
+    return proxy_middleware
+
+
+async def main():
+    """Main test entry point"""
+    print("Starting the proxy middleware test...")
+
+    # Test the proxy middleware
+    await test_proxy_middleware()
+
+    print("\nTest finished")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
tests/test_proxy_with_downloader.py (new file)
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Test script for the proxy middleware working together with a downloader.
+Tests the compatibility of the configured proxy URL with the downloader.
+"""
+
+import asyncio
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.middleware.proxy import ProxyMiddleware
+from crawlo.downloader.httpx_downloader import HttpXDownloader
+from crawlo.network import Request
+
+
+async def test_proxy_middleware():
+    """Test the proxy middleware"""
+    print("=== Testing the proxy middleware ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # the SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    print(f"Proxy middleware created")
+    print(f"Mode: {proxy_middleware.mode}")
+    print(f"Enabled: {proxy_middleware.enabled}")
+
+    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
+        # Fetch a proxy from the API
+        print("\nFetching a proxy from the API...")
+        proxy = await proxy_middleware._fetch_proxy_from_api()
+        print(f"Fetched proxy: {proxy}")
+
+    return proxy_middleware
+
+
+async def test_downloader_with_proxy():
+    """Test the downloader working with the proxy"""
+    print("\n=== Testing the downloader with the proxy ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # the SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    # Create the downloader
+    class MockStats:
+        def __init__(self):
+            pass
+
+        def inc_value(self, key, count=1):
+            pass
+
+    class MockSubscriber:
+        def __init__(self):
+            pass
+
+        def subscribe(self, callback, event):
+            pass
+
+    class MockSpider:
+        def __init__(self):
+            self.name = "test_spider"
+
+    class MockEngine:
+        def __init__(self):
+            pass
+
+    class MockCrawler:
+        def __init__(self, settings):
+            self.settings = settings
+            self.spider = MockSpider()  # add the spider attribute
+            self.stats = MockStats()  # add the stats attribute
+            self.subscriber = MockSubscriber()  # add the subscriber attribute
+            self.engine = MockEngine()  # add the engine attribute
+
+    crawler = MockCrawler(settings)
+    downloader = HttpXDownloader(crawler)
+    downloader.open()
+
+    # Create a test request
+    test_url = "https://httpbin.org/ip"  # test site that returns the client IP
+    request = Request(url=test_url)
+
+    # Create a mock spider
+    spider = MockSpider()
+
+    try:
+        # Run the request through the proxy middleware
+        print("Processing the request through the proxy middleware...")
+        await proxy_middleware.process_request(request, spider)
+
+        if request.proxy:
+            print(f"Proxy set: {request.proxy}")
+        else:
+            print("No proxy set")
+
+        # Download with the downloader
+        print(f"Starting download: {test_url}")
+        response = await downloader.download(request)
+
+        if response:
+            print(f"Download succeeded, status code: {response.status_code}")
+            print(f"Response body: {response.text[:200]}...")  # show only the first 200 characters
+        else:
+            print("Download failed, the response is empty")
+
+    except Exception as e:
+        print(f"Error during download: {e}")
+        import traceback
+        traceback.print_exc()
+
+    finally:
+        # Clean up resources
+        try:
+            await downloader.close()
+        except:
+            pass
+
+
+async def main():
+    """Main test entry point"""
+    print("Starting the proxy middleware + downloader test...")
+
+    # Test the proxy middleware
+    await test_proxy_middleware()
+
+    # Test the downloader with the proxy
+    await test_downloader_with_proxy()
+
+    print("\nTest finished")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
tests/test_real_scenario_proxy.py
@@ -39,28 +39,28 @@ COOKIES = {
     "Hm_lvt_929f8b362150b1f77b477230541dbbc2": "1758071793",
     "historystock": "600699",
     "spversion": "20130314",
-    "cid": "f9bc812da2c3a7ddf6d5df1fa2d497091758076438",
-    "u_ukey": "A10702B8689642C6BE607730E11E6E4A",
+    "cid": "example_cid_value",
+    "u_ukey": "example_u_ukey_value",
     "u_uver": "1.0.0",
-    "u_dpass": "Qk3U07X7SHGKa0AcRUg1R1DVWbPioD9Eg270bdikvlwWWXexbsXnRsQNt%2B04iXwdHi80LrSsTFH9a%2B6rtRvqGg%3D%3D",
-    "u_did": "E3ED337393E1429DA56E380DD00B3CCD",
+    "u_dpass": "example_u_dpass_value",
+    "u_did": "example_u_did_value",
     "u_ttype": "WEB",
     "user_status": "0",
     "ttype": "WEB",
     "log": "",
-    "Hm_lvt_69929b9dce4c22a060bd22d703b2a280": "1758079404,1758113068,1758157144",
-    "HMACCOUNT": "08DF0D235A291EAA",
-    "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1": "1758071793,1758113068,1758157144",
-    "user": "MDpteF9lNXRkY3RpdHo6Ok5vbmU6NTAwOjgxNzYyOTAwNDo3LDExMTExMTExMTExLDQwOzQ0LDExLDQwOzYsMSw0MDs1LDEsNDA7MSwxMDEsNDA7MiwxLDQwOzMsMSw0MDs1LDEsNDA7OCwwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMSw0MDsxMDIsMSw0MDoxNjo6OjgwNzYyOTAwNDoxNzU4MTYxNTE0Ojo6MTc1ODA3MjA2MDo2MDQ4MDA6MDoxYTQ0NmFlNDY4M2VmZWY3YmNjYTczY2U3ODZmZTNiODg6ZGVmYXVsdF81OjA%3D",
-    "userid": "807629004",
-    "u_name": "mx_e5tdctitz",
-    "escapename": "mx_e5tdctitz",
-    "ticket": "85eea709becdd924d7eb975351de629e",
-    "utk": "8959c4c6b6f5fb7628864feab15473f4",
-    "sess_tk": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFUzI1NiIsImtpZCI6InNlc3NfdGtfMSIsImJ0eSI6InNlc3NfdGsifQ.eyJqdGkiOiI4ODNiZmU4NmU3M2NhN2NjN2JlZmVmODM0NmFlNDZhNDEiLCJpYXQiOjE3NTgxNjE1MTQsImV4cCI6MTc1ODc2NjMxNCwic3ViIjoiODA3NjI5MDA0IiwiaXNzIjoidXBhc3MuaXdlbmNhaS5jb20iLCJhdWQiOiIyMDIwMTExODUyODg5MDcyIiwiYWN0Ijoib2ZjIiwiY3VocyI6ImIwNTcyZDVjOWNlNDg0MGFlOWYxYTlhYjU3NGZkNjQyYjgzNmExN2E3Y2NhZjk4ZWRiNzI5ZmJkOWFjOGVkYmYifQ.UBNIzxGvQQtXSiIcB_1JJl-EuAc1S9j2LcTLXjwy4ImhDDbh1oJvyRdDUrXdUpwBpIyx5zgYqgt_3FEhY_iayw",
-    "cuc": "ap2eap3gg99g",
-    "Hm_lvt_f79b64788a4e377c608617fba4c736e2": "1758161692",
-    "v": "A1glI4rWhPCQGqh0MvA0ioufKY3vQbzLHqWQT5JJpBNGLfazOlGMW261YNrh",
+    "Hm_lvt_69929b9dce4c22a060bd22d703b2a280": "example_Hm_lvt_value",
+    "HMACCOUNT": "example_HMACCOUNT_value",
+    "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1": "example_Hm_lvt_value",
+    "user": "example_user_value",
+    "userid": "example_userid_value",
+    "u_name": "example_u_name_value",
+    "escapename": "example_escapename_value",
+    "ticket": "example_ticket_value",
+    "utk": "example_utk_value",
+    "sess_tk": "example_sess_tk_value",
+    "cuc": "example_cuc_value",
+    "Hm_lvt_f79b64788a4e377c608617fba4c736e2": "example_Hm_lvt_value",
+    "v": "example_v_value",
     "Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1": "1758163145",
     "Hm_lpvt_f79b64788a4e377c608617fba4c736e2": "1758163145",
     "Hm_lpvt_69929b9dce4c22a060bd22d703b2a280": "1758163145"
tests/verify_mysql_warnings.py (new file)
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+"""
+Verify that the MySQL warnings have been resolved
+by simulating the actual runtime environment.
+"""
+import asyncio
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.db_helper import SQLBuilder
+from crawlo.pipelines.mysql_pipeline import BaseMySQLPipeline, AsyncmyMySQLPipeline, AiomysqlMySQLPipeline
+
+
+def verify_mysql_syntax():
+    """Verify that the generated MySQL syntax is correct and does not produce warnings"""
+    print("=== Verifying the MySQL syntax ===\n")
+
+    # Data that mirrors real usage
+    test_data = {
+        'title': '新一代OLED屏下光谱颜色传感技术:解锁显示新密码,重塑视觉新体验',
+        'publish_time': '2025-10-09 09:57',
+        'url': 'https://ee.ofweek.com/2025-10/ART-8460-2806-30671544.html',
+        'source': '',
+        'content': '在全球智能手机市场竞争日趋白热化的当下,消费者对手机屏幕显示效果的要求愈发严苛...'
+    }
+
+    # Configuration mirroring the ofweek_standalone project
+    update_columns = ('title', 'publish_time')
+
+    print("1. Checking the SQL generated by SQLBuilder...")
+    sql = SQLBuilder.make_insert(
+        table="news_items",
+        data=test_data,
+        auto_update=False,
+        update_columns=update_columns,
+        insert_ignore=False
+    )
+
+    print("Generated SQL:")
+    print(sql[:200] + "..." if len(sql) > 200 else sql)
+    print()
+
+    # Check for deprecated VALUES() function usage
+    if "VALUES(`title`)" in sql or "VALUES(`publish_time`)" in sql:
+        print("✗ Found deprecated VALUES() function usage, which produces warnings")
+        return False
+    else:
+        print("✓ No deprecated VALUES() function usage found")
+
+    if "AS `excluded`" in sql and "ON DUPLICATE KEY UPDATE" in sql:
+        print("✓ The new MySQL syntax is used correctly")
+    else:
+        print("✗ The new MySQL syntax is not used correctly")
+        return False
+
+    # Check the update clause
+    if "`title`=`excluded`.`title`" in sql and "`publish_time`=`excluded`.`publish_time`" in sql:
+        print("✓ The update clause correctly uses the excluded alias")
+    else:
+        print("✗ The update clause syntax is incorrect")
+        return False
+
+    print("\n2. Checking the batch insert syntax...")
+    batch_result = SQLBuilder.make_batch(
+        table="news_items",
+        datas=[test_data, test_data],
+        auto_update=False,
+        update_columns=update_columns
+    )
+
+    if batch_result:
+        batch_sql, _ = batch_result
+        print("Generated batch SQL:")
+        print(batch_sql[:200] + "..." if len(batch_sql) > 200 else batch_sql)
+        print()
+
+        # Check the batch insert syntax
+        if "VALUES(`title`)" in batch_sql or "VALUES(`publish_time`)" in batch_sql:
+            print("✗ Found deprecated VALUES() function usage in the batch insert, which produces warnings")
+            return False
+        else:
+            print("✓ No deprecated VALUES() function usage found in the batch insert")
+
+        if "AS `excluded`" in batch_sql and "ON DUPLICATE KEY UPDATE" in batch_sql:
+            print("✓ The batch insert uses the new MySQL syntax correctly")
+        else:
+            print("✗ The batch insert does not use the new MySQL syntax correctly")
+            return False
+
+        # Check the batch update clause
+        if "`title`=`excluded`.`title`" in batch_sql and "`publish_time`=`excluded`.`publish_time`" in batch_sql:
+            print("✓ The batch insert update clause correctly uses the excluded alias")
+        else:
+            print("✗ The batch insert update clause syntax is incorrect")
+            return False
+
+    print("\n=== Verification complete ===")
+    print("✓ All syntax checks passed; the MySQL VALUES() deprecation warning should no longer appear")
+    return True
+
+
+if __name__ == "__main__":
+    success = verify_mysql_syntax()
+    if success:
+        print("\n🎉 The MySQL syntax issue is resolved!")
+    else:
+        print("\n❌ MySQL syntax issues remain and need to be fixed")
crawlo/middleware/simple_proxy.py (removed)
@@ -1,65 +0,0 @@
-#!/usr/bin/python
-# -*- coding: UTF-8 -*-
-"""
-Simplified proxy middleware.
-Provides basic proxy functionality and avoids an overly complex implementation.
-"""
-import random
-from typing import Optional, List
-
-from crawlo import Request, Response
-from crawlo.exceptions import NotConfiguredError
-from crawlo.utils.log import get_logger
-
-
-class SimpleProxyMiddleware:
-    """Simplified proxy middleware"""
-
-    def __init__(self, settings, log_level):
-        self.logger = get_logger(self.__class__.__name__, log_level)
-
-        # Read the proxy list
-        self.proxies: List[str] = settings.get("PROXY_LIST", [])
-        self.enabled = settings.get_bool("PROXY_ENABLED", False)
-
-        if not self.enabled:
-            self.logger.info("SimpleProxyMiddleware disabled")
-            return
-
-        if not self.proxies:
-            raise NotConfiguredError("PROXY_LIST not configured, SimpleProxyMiddleware disabled")
-
-        self.logger.info(f"SimpleProxyMiddleware enabled with {len(self.proxies)} proxies")
-
-    @classmethod
-    def create_instance(cls, crawler):
-        return cls(settings=crawler.settings, log_level=crawler.settings.get("LOG_LEVEL"))
-
-    async def process_request(self, request: Request, spider) -> Optional[Request]:
-        """Assign a proxy to the request"""
-        if not self.enabled:
-            return None
-
-        if request.proxy:
-            # The request already has a proxy; do not override it
-            return None
-
-        if self.proxies:
-            # Pick a random proxy
-            proxy = random.choice(self.proxies)
-            request.proxy = proxy
-            self.logger.debug(f"Assigned proxy {proxy} to {request.url}")
-
-        return None
-
-    def process_response(self, request: Request, response: Response, spider) -> Response:
-        """Handle the response"""
-        if request.proxy:
-            self.logger.debug(f"Proxy request successful: {request.proxy} | {request.url}")
-        return response
-
-    def process_exception(self, request: Request, exception: Exception, spider) -> Optional[Request]:
-        """Handle exceptions"""
-        if request.proxy:
-            self.logger.warning(f"Proxy request failed: {request.proxy} | {request.url} | {repr(exception)}")
-        return None
Files without changes: {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/WHEEL, entry_points.txt, and top_level.txt are unchanged.