crawlo-1.4.5-py3-none-any.whl → crawlo-1.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/downloader/cffi_downloader.py +3 -1
- crawlo/middleware/proxy.py +171 -348
- crawlo/pipelines/mysql_pipeline.py +339 -188
- crawlo/settings/default_settings.py +38 -30
- crawlo/stats_collector.py +10 -1
- crawlo/templates/project/settings.py.tmpl +10 -55
- crawlo/templates/project/settings_distributed.py.tmpl +20 -22
- crawlo/templates/project/settings_gentle.py.tmpl +5 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- crawlo/templates/project/settings_minimal.py.tmpl +25 -1
- crawlo/templates/project/settings_simple.py.tmpl +5 -0
- crawlo/templates/run.py.tmpl +1 -8
- crawlo/templates/spider/spider.py.tmpl +5 -108
- crawlo/utils/db_helper.py +11 -5
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/METADATA +1 -1
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/RECORD +43 -29
- tests/authenticated_proxy_example.py +10 -6
- tests/explain_mysql_update_behavior.py +77 -0
- tests/simulate_mysql_update_test.py +140 -0
- tests/test_asyncmy_usage.py +57 -0
- tests/test_crawlo_proxy_integration.py +8 -2
- tests/test_downloader_proxy_compatibility.py +24 -20
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_proxy_middleware.py +104 -8
- tests/test_proxy_middleware_enhanced.py +1 -5
- tests/test_proxy_middleware_integration.py +7 -2
- tests/test_proxy_middleware_refactored.py +25 -2
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_real_scenario_proxy.py +17 -17
- tests/verify_mysql_warnings.py +110 -0
- crawlo/middleware/simple_proxy.py +0 -65
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
tests/test_proxy_middleware_refactored.py

@@ -126,9 +126,8 @@ class TestProxyMiddlewareRefactored(unittest.TestCase):
     @patch('crawlo.utils.log.get_logger')
     def test_update_proxy_pool_with_parsed_data(self, mock_get_logger):
         """Test updating the proxy pool with parsed proxy data"""
-
+        # PROXY_ENABLED is no longer required; the middleware is enabled whenever PROXY_API_URL is configured
         self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
-        self.settings.set('PROXY_POOL_SIZE', 2)
         self.settings.set('LOG_LEVEL', 'INFO')

         mock_get_logger.return_value = Mock()
@@ -181,5 +180,29 @@ class TestProxyMiddlewareRefactored(unittest.TestCase):
         healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
         self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)

+    def test_proxy_middleware_initialization(self):
+        """Test proxy middleware initialization"""
+        # PROXY_ENABLED is no longer required; the middleware is enabled whenever PROXY_API_URL is configured
+        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertIsInstance(middleware, ProxyMiddleware)
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')
+
+    def test_proxy_middleware_enabled_with_api_url(self):
+        """Test that the middleware is enabled when a proxy API URL is configured"""
+        # PROXY_ENABLED is no longer required; the middleware is enabled whenever PROXY_API_URL is configured
+        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertTrue(middleware.enabled)
+        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')
+
+    def test_proxy_middleware_disabled_without_api_url(self):
+        """Test that the middleware is disabled when no proxy API URL is configured"""
+        # PROXY_ENABLED is no longer required; the middleware is disabled when PROXY_API_URL is not configured
+        self.settings.set('PROXY_API_URL', None)
+        middleware = ProxyMiddleware(self.settings, "DEBUG")
+        self.assertFalse(middleware.enabled)
+
 if __name__ == '__main__':
     unittest.main()
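
The hunks above encode the 1.4.6 behavior change: ProxyMiddleware no longer reads a PROXY_ENABLED flag and is switched on by the mere presence of PROXY_API_URL. A minimal sketch of that contract, reusing only the names and the constructor signature that appear in the tests above (illustrative, not the library's documented API):

    from crawlo.settings.setting_manager import SettingManager
    from crawlo.middleware.proxy import ProxyMiddleware

    settings = SettingManager()
    # Presence of PROXY_API_URL alone enables the middleware in 1.4.6
    settings.set('PROXY_API_URL', 'http://proxy-api.example.com')

    middleware = ProxyMiddleware(settings, "DEBUG")
    assert middleware.enabled
    assert middleware.api_url == 'http://proxy-api.example.com'

    # Unsetting the URL disables it
    settings.set('PROXY_API_URL', None)
    assert not ProxyMiddleware(settings, "DEBUG").enabled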
tests/test_proxy_only.py
ADDED
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Proxy middleware test script
+Tests the configured proxy API URL functionality
+"""
+
+import asyncio
+import sys
+import os
+
+# Add the project root directory to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.middleware.proxy import ProxyMiddleware
+from crawlo.network import Request
+
+
+async def test_proxy_middleware():
+    """Test the proxy middleware"""
+    print("=== Testing the proxy middleware ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # The SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    print("Proxy middleware created")
+    print(f"Mode: {proxy_middleware.mode}")
+    print(f"Enabled: {proxy_middleware.enabled}")
+
+    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
+        # Test fetching a proxy from the API
+        print("\nFetching a proxy from the API...")
+        proxy = await proxy_middleware._fetch_proxy_from_api()
+        print(f"Fetched proxy: {proxy}")
+
+        # Test proxy extraction
+        if proxy:
+            print(f"Proxy format valid: {proxy.startswith('http://') or proxy.startswith('https://')}")
+
+            # Test processing a request
+            print("\nTesting request processing...")
+            request = Request(url="https://httpbin.org/ip")
+
+            class MockSpider:
+                def __init__(self):
+                    self.name = "test_spider"
+
+            spider = MockSpider()
+
+            await proxy_middleware.process_request(request, spider)
+
+            if request.proxy:
+                print(f"Request proxy set successfully: {request.proxy}")
+            else:
+                print("Failed to set the request proxy")
+        else:
+            print("No valid proxy obtained from the API")
+    else:
+        print("Proxy middleware not enabled or mode is incorrect")
+
+    return proxy_middleware
+
+
+async def main():
+    """Main test function"""
+    print("Starting the proxy middleware test...")
+
+    # Test the proxy middleware
+    await test_proxy_middleware()
+
+    print("\nTest complete")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
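
Because the script prepends the project root to sys.path, it can presumably be run straight from a source checkout without installing the wheel:

    python tests/test_proxy_only.py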
tests/test_proxy_with_downloader.py
ADDED

@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Proxy middleware and downloader integration test script
+Tests compatibility between the configured proxy API URL and the downloader
+"""
+
+import asyncio
+import sys
+import os
+
+# Add the project root directory to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.middleware.proxy import ProxyMiddleware
+from crawlo.downloader.httpx_downloader import HttpXDownloader
+from crawlo.network import Request
+
+
+async def test_proxy_middleware():
+    """Test the proxy middleware"""
+    print("=== Testing the proxy middleware ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # The SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    print("Proxy middleware created")
+    print(f"Mode: {proxy_middleware.mode}")
+    print(f"Enabled: {proxy_middleware.enabled}")
+
+    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
+        # Test fetching a proxy from the API
+        print("\nFetching a proxy from the API...")
+        proxy = await proxy_middleware._fetch_proxy_from_api()
+        print(f"Fetched proxy: {proxy}")
+
+    return proxy_middleware
+
+
+async def test_downloader_with_proxy():
+    """Test the downloader together with the proxy"""
+    print("\n=== Testing the downloader with the proxy ===")
+
+    # Create the settings manager
+    settings_manager = SettingManager()
+    settings = settings_manager  # The SettingManager instance is itself the settings object
+
+    # Configure the proxy
+    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
+    settings.set('LOG_LEVEL', 'DEBUG')
+
+    # Create the proxy middleware
+    proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+
+    # Create the downloader
+    class MockStats:
+        def __init__(self):
+            pass
+
+        def inc_value(self, key, count=1):
+            pass
+
+    class MockSubscriber:
+        def __init__(self):
+            pass
+
+        def subscribe(self, callback, event):
+            pass
+
+    class MockSpider:
+        def __init__(self):
+            self.name = "test_spider"
+
+    class MockEngine:
+        def __init__(self):
+            pass
+
+    class MockCrawler:
+        def __init__(self, settings):
+            self.settings = settings
+            self.spider = MockSpider()  # Add the spider attribute
+            self.stats = MockStats()  # Add the stats attribute
+            self.subscriber = MockSubscriber()  # Add the subscriber attribute
+            self.engine = MockEngine()  # Add the engine attribute
+
+    crawler = MockCrawler(settings)
+    downloader = HttpXDownloader(crawler)
+    downloader.open()
+
+    # Create a test request
+    test_url = "https://httpbin.org/ip"  # Test endpoint that returns the client IP
+    request = Request(url=test_url)
+
+    # Create a mock spider
+    spider = MockSpider()
+
+    try:
+        # Process the request through the proxy middleware
+        print("Processing the request through the proxy middleware...")
+        await proxy_middleware.process_request(request, spider)
+
+        if request.proxy:
+            print(f"Proxy set: {request.proxy}")
+        else:
+            print("No proxy set")
+
+        # Download with the downloader
+        print(f"Starting download: {test_url}")
+        response = await downloader.download(request)
+
+        if response:
+            print(f"Download succeeded, status code: {response.status_code}")
+            print(f"Response body: {response.text[:200]}...")  # Show only the first 200 characters
+        else:
+            print("Download failed, response is empty")
+
+    except Exception as e:
+        print(f"Error during download: {e}")
+        import traceback
+        traceback.print_exc()
+
+    finally:
+        # Clean up resources
+        try:
+            await downloader.close()
+        except:
+            pass
+
+
+async def main():
+    """Main test function"""
+    print("Starting the proxy middleware and downloader integration test...")
+
+    # Test the proxy middleware
+    await test_proxy_middleware()
+
+    # Test the downloader with the proxy
+    await test_downloader_with_proxy()
+
+    print("\nTest complete")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
tests/test_real_scenario_proxy.py

@@ -39,28 +39,28 @@ COOKIES = {
     "Hm_lvt_929f8b362150b1f77b477230541dbbc2": "1758071793",
     "historystock": "600699",
     "spversion": "20130314",
-    "cid": "
-    "u_ukey": "
+    "cid": "example_cid_value",
+    "u_ukey": "example_u_ukey_value",
     "u_uver": "1.0.0",
-    "u_dpass": "
-    "u_did": "
+    "u_dpass": "example_u_dpass_value",
+    "u_did": "example_u_did_value",
     "u_ttype": "WEB",
     "user_status": "0",
     "ttype": "WEB",
     "log": "",
-    "Hm_lvt_69929b9dce4c22a060bd22d703b2a280": "
-    "HMACCOUNT": "
-    "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1": "
-    "user": "
-    "userid": "
-    "u_name": "
-    "escapename": "
-    "ticket": "
-    "utk": "
-    "sess_tk": "
-    "cuc": "
-    "Hm_lvt_f79b64788a4e377c608617fba4c736e2": "
-    "v": "
+    "Hm_lvt_69929b9dce4c22a060bd22d703b2a280": "example_Hm_lvt_value",
+    "HMACCOUNT": "example_HMACCOUNT_value",
+    "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1": "example_Hm_lvt_value",
+    "user": "example_user_value",
+    "userid": "example_userid_value",
+    "u_name": "example_u_name_value",
+    "escapename": "example_escapename_value",
+    "ticket": "example_ticket_value",
+    "utk": "example_utk_value",
+    "sess_tk": "example_sess_tk_value",
+    "cuc": "example_cuc_value",
+    "Hm_lvt_f79b64788a4e377c608617fba4c736e2": "example_Hm_lvt_value",
+    "v": "example_v_value",
     "Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1": "1758163145",
     "Hm_lpvt_f79b64788a4e377c608617fba4c736e2": "1758163145",
     "Hm_lpvt_69929b9dce4c22a060bd22d703b2a280": "1758163145"
tests/verify_mysql_warnings.py
ADDED

@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+"""
+Verify that the MySQL warnings have been resolved
+by checking against a simulated production run
+"""
+import asyncio
+import sys
+import os
+
+# Add the project root directory to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.db_helper import SQLBuilder
+from crawlo.pipelines.mysql_pipeline import BaseMySQLPipeline, AsyncmyMySQLPipeline, AiomysqlMySQLPipeline
+
+
+def verify_mysql_syntax():
+    """Verify that the generated MySQL syntax is correct and produces no warnings"""
+    print("=== Verifying MySQL syntax ===\n")
+
+    # Data mimicking real usage
+    test_data = {
+        'title': '新一代OLED屏下光谱颜色传感技术:解锁显示新密码,重塑视觉新体验',
+        'publish_time': '2025-10-09 09:57',
+        'url': 'https://ee.ofweek.com/2025-10/ART-8460-2806-30671544.html',
+        'source': '',
+        'content': '在全球智能手机市场竞争日趋白热化的当下,消费者对手机屏幕显示效果的要求愈发严苛...'
+    }
+
+    # Configuration mimicking the ofweek_standalone project
+    update_columns = ('title', 'publish_time')
+
+    print("1. Checking the syntax generated by SQLBuilder...")
+    sql = SQLBuilder.make_insert(
+        table="news_items",
+        data=test_data,
+        auto_update=False,
+        update_columns=update_columns,
+        insert_ignore=False
+    )
+
+    print("Generated SQL:")
+    print(sql[:200] + "..." if len(sql) > 200 else sql)
+    print()
+
+    # Check for deprecated VALUES() function usage
+    if "VALUES(`title`)" in sql or "VALUES(`publish_time`)" in sql:
+        print("✗ Found deprecated VALUES() function usage; this will produce warnings")
+        return False
+    else:
+        print("✓ No deprecated VALUES() function usage found")
+
+    if "AS `excluded`" in sql and "ON DUPLICATE KEY UPDATE" in sql:
+        print("✓ The new MySQL syntax is used correctly")
+    else:
+        print("✗ The new MySQL syntax is not used correctly")
+        return False
+
+    # Check the update clause
+    if "`title`=`excluded`.`title`" in sql and "`publish_time`=`excluded`.`publish_time`" in sql:
+        print("✓ The update clause correctly uses the excluded alias")
+    else:
+        print("✗ The update clause syntax is incorrect")
+        return False
+
+    print("\n2. Checking batch insert syntax...")
+    batch_result = SQLBuilder.make_batch(
+        table="news_items",
+        datas=[test_data, test_data],
+        auto_update=False,
+        update_columns=update_columns
+    )
+
+    if batch_result:
+        batch_sql, _ = batch_result
+        print("Generated batch SQL:")
+        print(batch_sql[:200] + "..." if len(batch_sql) > 200 else batch_sql)
+        print()
+
+        # Check the batch insert syntax
+        if "VALUES(`title`)" in batch_sql or "VALUES(`publish_time`)" in batch_sql:
+            print("✗ Found deprecated VALUES() function usage in the batch insert; this will produce warnings")
+            return False
+        else:
+            print("✓ No deprecated VALUES() function usage in the batch insert")
+
+        if "AS `excluded`" in batch_sql and "ON DUPLICATE KEY UPDATE" in batch_sql:
+            print("✓ The batch insert uses the new MySQL syntax correctly")
+        else:
+            print("✗ The batch insert does not use the new MySQL syntax correctly")
+            return False
+
+        # Check the batch update clause
+        if "`title`=`excluded`.`title`" in batch_sql and "`publish_time`=`excluded`.`publish_time`" in batch_sql:
+            print("✓ The batch insert update clause correctly uses the excluded alias")
+        else:
+            print("✗ The batch insert update clause syntax is incorrect")
+            return False
+
+    print("\n=== Verification complete ===")
+    print("✓ All syntax checks passed; MySQL's VALUES() function deprecation warning should no longer appear")
+    return True
+
+
+if __name__ == "__main__":
+    success = verify_mysql_syntax()
+    if success:
+        print("\n🎉 The MySQL syntax issue is resolved!")
+    else:
+        print("\n❌ MySQL syntax issues remain and need to be fixed")
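
For context on what this script asserts: MySQL 8.0.20 deprecated the VALUES() function inside ON DUPLICATE KEY UPDATE in favor of a row alias, which is the warning the refactored pipeline avoids. A hedged sketch of the two upsert forms the checks distinguish (table and column names taken from the test data above; the exact SQL emitted by SQLBuilder may differ):

    # Deprecated form — triggers warning 1287 on MySQL >= 8.0.20:
    deprecated_sql = (
        "INSERT INTO `news_items` (`title`, `publish_time`) VALUES (%s, %s) "
        "ON DUPLICATE KEY UPDATE `title`=VALUES(`title`), `publish_time`=VALUES(`publish_time`)"
    )

    # Row-alias form the checks above expect:
    current_sql = (
        "INSERT INTO `news_items` (`title`, `publish_time`) VALUES (%s, %s) AS `excluded` "
        "ON DUPLICATE KEY UPDATE `title`=`excluded`.`title`, `publish_time`=`excluded`.`publish_time`"
    )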
crawlo/middleware/simple_proxy.py
DELETED

@@ -1,65 +0,0 @@
-#!/usr/bin/python
-# -*- coding: UTF-8 -*-
-"""
-Simplified proxy middleware
-Provides basic proxy functionality while avoiding an overly complex implementation
-"""
-import random
-from typing import Optional, List
-
-from crawlo import Request, Response
-from crawlo.exceptions import NotConfiguredError
-from crawlo.utils.log import get_logger
-
-
-class SimpleProxyMiddleware:
-    """Simplified proxy middleware"""
-
-    def __init__(self, settings, log_level):
-        self.logger = get_logger(self.__class__.__name__, log_level)
-
-        # Get the proxy list
-        self.proxies: List[str] = settings.get("PROXY_LIST", [])
-        self.enabled = settings.get_bool("PROXY_ENABLED", False)
-
-        if not self.enabled:
-            self.logger.info("SimpleProxyMiddleware disabled")
-            return
-
-        if not self.proxies:
-            raise NotConfiguredError("PROXY_LIST not configured, SimpleProxyMiddleware disabled")
-
-        self.logger.info(f"SimpleProxyMiddleware enabled with {len(self.proxies)} proxies")
-
-    @classmethod
-    def create_instance(cls, crawler):
-        return cls(settings=crawler.settings, log_level=crawler.settings.get("LOG_LEVEL"))
-
-    async def process_request(self, request: Request, spider) -> Optional[Request]:
-        """Assign a proxy to the request"""
-        if not self.enabled:
-            return None
-
-        if request.proxy:
-            # The request already specifies a proxy; do not override it
-            return None
-
-        if self.proxies:
-            # Pick a random proxy
-            proxy = random.choice(self.proxies)
-            request.proxy = proxy
-            self.logger.debug(f"Assigned proxy {proxy} to {request.url}")
-
-        return None
-
-    def process_response(self, request: Request, response: Response, spider) -> Response:
-        """Handle the response"""
-        if request.proxy:
-            self.logger.debug(f"Proxy request successful: {request.proxy} | {request.url}")
-        return response
-
-    def process_exception(self, request: Request, exception: Exception, spider) -> Optional[Request]:
-        """Handle the exception"""
-        if request.proxy:
-            self.logger.warning(f"Proxy request failed: {request.proxy} | {request.url} | {repr(exception)}")
-        return None
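
With SimpleProxyMiddleware gone, projects that enabled it via PROXY_ENABLED and PROXY_LIST have to move to the remaining ProxyMiddleware. A hedged before/after sketch based only on the settings visible elsewhere in this diff (static-list support in ProxyMiddleware is not shown here, so the API-driven form is the safe migration):

    # 1.4.5, removed in 1.4.6: explicit flag plus a static proxy list
    settings.set('PROXY_ENABLED', True)
    settings.set('PROXY_LIST', ['http://127.0.0.1:8080'])

    # 1.4.6: ProxyMiddleware, driven by a proxy API endpoint
    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')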