crawlo 1.4.4__py3-none-any.whl → 1.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +8 -7
- crawlo/downloader/__init__.py +5 -2
- crawlo/downloader/cffi_downloader.py +3 -1
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/middleware/proxy.py +171 -348
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +340 -189
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/settings/default_settings.py +42 -30
- crawlo/stats_collector.py +10 -1
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/templates/project/settings.py.tmpl +10 -55
- crawlo/templates/project/settings_distributed.py.tmpl +20 -22
- crawlo/templates/project/settings_gentle.py.tmpl +5 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- crawlo/templates/project/settings_minimal.py.tmpl +25 -1
- crawlo/templates/project/settings_simple.py.tmpl +5 -0
- crawlo/templates/run.py.tmpl +1 -8
- crawlo/templates/spider/spider.py.tmpl +5 -108
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +226 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.6.dist-info/METADATA +329 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/RECORD +110 -69
- tests/authenticated_proxy_example.py +10 -6
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/explain_mysql_update_behavior.py +77 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/simulate_mysql_update_test.py +140 -0
- tests/test_asyncmy_usage.py +57 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_crawlo_proxy_integration.py +8 -2
- tests/test_downloader_proxy_compatibility.py +24 -20
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_proxy_middleware.py +104 -8
- tests/test_proxy_middleware_enhanced.py +1 -5
- tests/test_proxy_middleware_integration.py +7 -2
- tests/test_proxy_middleware_refactored.py +25 -2
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_real_scenario_proxy.py +17 -17
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- tests/verify_mysql_warnings.py +110 -0
- crawlo/middleware/simple_proxy.py +0 -65
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.4.dist-info/METADATA +0 -190
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/simple_selector_test.py
ADDED

@@ -0,0 +1,208 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Simplified selector test
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+# Import the required modules directly
+from parsel import Selector, SelectorList
+
+
+class MockResponse:
+    """Mock Response class used for testing"""
+
+    def __init__(self, text):
+        self._text = text
+        self._selector_instance = None
+
+    @property
+    def text(self):
+        return self._text
+
+    @property
+    def _selector(self):
+        if self._selector_instance is None:
+            self._selector_instance = Selector(self.text)
+        return self._selector_instance
+
+    def xpath(self, query):
+        return self._selector.xpath(query)
+
+    def css(self, query):
+        return self._selector.css(query)
+
+    def _is_xpath(self, query):
+        return query.startswith(('/', '//', './'))
+
+    def _extract_text_from_elements(self, elements, join_str=" "):
+        texts = []
+        for element in elements:
+            if hasattr(element, 'xpath'):
+                element_texts = element.xpath('.//text()').getall()
+            else:
+                element_texts = [str(element)]
+            for text in element_texts:
+                cleaned = text.strip()
+                if cleaned:
+                    texts.append(cleaned)
+        return join_str.join(texts)
+
+    def extract_text(self, xpath_or_css, join_str=" ", default=''):
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            return self._extract_text_from_elements(elements, join_str)
+        except Exception:
+            return default
+
+    def extract_texts(self, xpath_or_css, join_str=" ", default=None):
+        if default is None:
+            default = []
+
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+
+            result = []
+            for element in elements:
+                if hasattr(element, 'xpath'):
+                    texts = element.xpath('.//text()').getall()
+                else:
+                    texts = [str(element)]
+
+                clean_texts = [text.strip() for text in texts if text.strip()]
+                if clean_texts:
+                    result.append(join_str.join(clean_texts))
+
+            return result if result else default
+        except Exception:
+            return default
+
+    def extract_attr(self, xpath_or_css, attr_name, default=None):
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            if hasattr(elements, 'attrib'):
+                return elements.attrib.get(attr_name, default)
+            elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
+                return elements[0].attrib.get(attr_name, default)
+            return default
+        except Exception:
+            return default
+
+    def extract_attrs(self, xpath_or_css, attr_name, default=None):
+        if default is None:
+            default = []
+
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+
+            result = []
+            for element in elements:
+                if hasattr(element, 'attrib'):
+                    attr_value = element.attrib.get(attr_name)
+                    if attr_value is not None:
+                        result.append(attr_value)
+
+            return result if result else default
+        except Exception:
+            return default
+
+
+def test_selector_methods():
+    """Test the selector methods"""
+    print("Testing selector methods...")
+    print("=" * 50)
+
+    # Build test HTML
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">介绍段落</p>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    response = MockResponse(html_content)
+
+    # Test extract_text
+    print("1. Testing extract_text:")
+    title = response.extract_text('title')
+    print(f"   title: {title}")
+
+    h1_text = response.extract_text('.content h1')
+    print(f"   h1 text: {h1_text}")
+
+    # Test XPath
+    title_xpath = response.extract_text('//title')
+    print(f"   XPath title: {title_xpath}")
+
+    print()
+
+    # Test extract_texts
+    print("2. Testing extract_texts:")
+    list_items = response.extract_texts('.list li')
+    print(f"   list items: {list_items}")
+
+    # Test XPath
+    list_items_xpath = response.extract_texts('//ul[@class="list"]/li')
+    print(f"   XPath list items: {list_items_xpath}")
+
+    print()
+
+    # Test extract_attr
+    print("3. Testing extract_attr:")
+    link_href = response.extract_attr('.link', 'href')
+    print(f"   link href: {link_href}")
+
+    img_alt = response.extract_attr('.image', 'alt')
+    print(f"   image alt: {img_alt}")
+
+    # Test XPath
+    link_href_xpath = response.extract_attr('//a[@class="link"]', 'href')
+    print(f"   XPath link href: {link_href_xpath}")
+
+    print()
+
+    # Test extract_attrs
+    print("4. Testing extract_attrs:")
+    all_links = response.extract_attrs('a', 'href')
+    print(f"   all links: {all_links}")
+
+    print()
+
+    # Test edge cases
+    print("5. Testing edge cases:")
+    non_exist = response.extract_text('.non-exist', default='default text')
+    print(f"   default for a missing element: {non_exist}")
+
+    non_exist_attr = response.extract_attr('.non-exist', 'href', default='default link')
+    print(f"   default for a missing attribute: {non_exist_attr}")
+
+    print()
+    print("All tests finished!")
+
+
+if __name__ == '__main__':
+    test_selector_methods()
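
Note: the MockResponse above mirrors the selector helpers this release adds to crawlo's Response (see crawlo/utils/selector_helper.py and crawlo/network/response.py in the file list). As a rough sketch of how the same calls would look in a spider callback — assuming the real Response exposes extract_text/extract_texts/extract_attr/extract_attrs with the signatures the mock reproduces, and a urljoin helper as suggested by tests/test_response_urljoin.py:

# Hypothetical parse callback; the extract_* signatures are taken from the
# MockResponse above, and response.urljoin is assumed per tests/test_response_urljoin.py.
def parse(response):
    item = {
        'title': response.extract_text('//title', default=''),
        'tags': response.extract_texts('.list li'),
        'image_alt': response.extract_attr('.image', 'alt'),
    }
    # Resolve every link on the page against the response URL.
    next_urls = [response.urljoin(href) for href in response.extract_attrs('a', 'href')]
    return item, next_urls
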
tests/simple_url_test.py
ADDED

@@ -0,0 +1,74 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Simple test of the Response URL handling methods
+"""
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+# Import the required modules directly
+from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
+
+
+def test_url_methods():
+    """Test the URL handling methods"""
+    print("Testing the Response URL handling methods")
+
+    # Test data
+    test_url = "https://example.com/test?param1=value1&param2=value2#section1"
+    print(f"Test URL: {test_url}")
+
+    # 1. Test urlparse
+    print("\n1. Testing urlparse:")
+    parsed = urlparse(test_url)
+    print(f"   scheme: {parsed.scheme}")
+    print(f"   netloc: {parsed.netloc}")
+    print(f"   path: {parsed.path}")
+    print(f"   query: {parsed.query}")
+    print(f"   fragment: {parsed.fragment}")
+
+    # 2. Test urlsplit
+    print("\n2. Testing urlsplit:")
+    split_result = urlsplit(test_url)
+    print(f"   scheme: {split_result.scheme}")
+    print(f"   netloc: {split_result.netloc}")
+    print(f"   path: {split_result.path}")
+    print(f"   query: {split_result.query}")
+    print(f"   fragment: {split_result.fragment}")
+
+    # 3. Test parse_qs
+    print("\n3. Testing parse_qs:")
+    query_dict = parse_qs(parsed.query)
+    print(f"   parsed result: {query_dict}")
+
+    # 4. Test urlencode
+    print("\n4. Testing urlencode:")
+    test_dict = {"name": "张三", "age": 25, "city": "北京"}
+    encoded = urlencode(test_dict)
+    print(f"   encoded result: {encoded}")
+
+    # 5. Test quote/unquote
+    print("\n5. Testing quote/unquote:")
+    original = "hello world 你好"
+    quoted = quote(original)
+    print(f"   original string: {original}")
+    print(f"   URL-encoded: {quoted}")
+
+    unquoted = unquote(quoted)
+    print(f"   URL-decoded: {unquoted}")
+    print(f"   round-trip matches: {original == unquoted}")
+
+    # 6. Test urldefrag
+    print("\n6. Testing urldefrag:")
+    url_without_frag, fragment = urldefrag(test_url)
+    print(f"   URL without fragment: {url_without_frag}")
+    print(f"   fragment: {fragment}")
+
+    print("\nAll tests finished!")
+
+
+if __name__ == '__main__':
+    test_url_methods()
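
These are plain urllib.parse calls; tests/test_response_url_methods.py and tests/test_response_urljoin.py in the file list suggest the new Response URL helpers wrap the same primitives. A minimal illustrative sketch of such a wrapper (the class and method names here are hypothetical, not necessarily crawlo's):

from urllib.parse import urljoin, urlparse, parse_qs, urldefrag


class UrlHelpers:
    """Illustrative only: a Response-style wrapper over urllib.parse."""

    def __init__(self, url: str):
        self.url = url  # absolute URL of the response

    def join(self, link: str) -> str:
        # Resolve a possibly relative link against the response URL.
        return urljoin(self.url, link)

    def query_params(self) -> dict:
        # Parse the query string into a dict of lists, as parse_qs does above.
        return parse_qs(urlparse(self.url).query)

    def without_fragment(self) -> str:
        # Drop the #fragment, mirroring the urldefrag call above.
        return urldefrag(self.url)[0]
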
tests/simulate_mysql_update_test.py
ADDED

@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+"""
+Simulates MySQL ON DUPLICATE KEY UPDATE behaviour
+and demonstrates the affected-row counts in different situations.
+"""
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.db_helper import SQLBuilder
+
+
+def simulate_mysql_scenarios():
+    """Simulate the different MySQL scenarios"""
+    print("=== MySQL scenario simulation ===\n")
+
+    table = "news_items"
+
+    # Scenario 1: inserting a new record
+    print("Scenario 1: insert a new record")
+    new_data = {
+        'title': '新文章标题',
+        'publish_time': '2025-10-09 10:00',
+        'url': 'https://example.com/new-article',
+        'source': '新来源',
+        'content': '新文章内容'
+    }
+
+    sql1 = SQLBuilder.make_insert(
+        table=table,
+        data=new_data,
+        auto_update=False,
+        update_columns=('title', 'publish_time'),
+        insert_ignore=False
+    )
+
+    print(f"SQL: {sql1[:100]}...")
+    print("Expected behaviour: plain insert, affected rows = 1")
+    print()
+
+    # Scenario 2: key conflict, but the updated columns have identical values
+    print("Scenario 2: key conflict, updated column values unchanged")
+    duplicate_data = {
+        'title': '已有文章标题',  # assume a record with the same title already exists
+        'publish_time': '2025-10-09 09:00',  # same publish time as the stored record
+        'url': 'https://example.com/existing-article',
+        'source': '来源',
+        'content': '文章内容'
+    }
+
+    sql2 = SQLBuilder.make_insert(
+        table=table,
+        data=duplicate_data,
+        auto_update=False,
+        update_columns=('title', 'publish_time'),
+        insert_ignore=False
+    )
+
+    print(f"SQL: {sql2[:100]}...")
+    print("Expected behaviour: ON DUPLICATE KEY UPDATE fires, but no column value changes, affected rows = 0")
+    print()
+
+    # Scenario 3: key conflict and the updated columns have different values
+    print("Scenario 3: key conflict, updated column values differ")
+    updated_data = {
+        'title': '已有文章标题',  # same as the stored record
+        'publish_time': '2025-10-09 11:00',  # publish time differs from the stored record
+        'url': 'https://example.com/existing-article',
+        'source': '来源',
+        'content': '文章内容'
+    }
+
+    sql3 = SQLBuilder.make_insert(
+        table=table,
+        data=updated_data,
+        auto_update=False,
+        update_columns=('title', 'publish_time'),
+        insert_ignore=False
+    )
+
+    print(f"SQL: {sql3[:100]}...")
+    print("Expected behaviour: ON DUPLICATE KEY UPDATE fires, column values change, affected rows = 2")
+    print("(On MySQL 5.7+, an update through this path reports 2 affected rows)")
+    print()
+
+    # Scenario 4: using INSERT IGNORE
+    print("Scenario 4: INSERT IGNORE")
+    ignore_data = {
+        'title': '忽略重复标题',  # assume a record with the same title already exists
+        'publish_time': '2025-10-09 12:00',
+        'url': 'https://example.com/ignore-article',
+        'source': '忽略来源',
+        'content': '忽略内容'
+    }
+
+    sql4 = SQLBuilder.make_insert(
+        table=table,
+        data=ignore_data,
+        auto_update=False,
+        update_columns=(),
+        insert_ignore=True
+    )
+
+    print(f"SQL: {sql4[:100]}...")
+    print("Expected behaviour: the duplicate insert is ignored, affected rows = 0")
+    print()
+
+    # Scenario 5: using REPLACE INTO
+    print("Scenario 5: REPLACE INTO")
+    replace_data = {
+        'title': '替换文章标题',  # assume a record with the same title already exists
+        'publish_time': '2025-10-09 13:00',
+        'url': 'https://example.com/replace-article',
+        'source': '替换来源',
+        'content': '替换内容'
+    }
+
+    sql5 = SQLBuilder.make_insert(
+        table=table,
+        data=replace_data,
+        auto_update=True,  # use REPLACE INTO
+        update_columns=(),
+        insert_ignore=False
+    )
+
+    print(f"SQL: {sql5[:100]}...")
+    print("Expected behaviour: the old record is deleted and the new one inserted, affected rows = 2")
+    print()
+
+    print("=== Summary ===")
+    print("1. With MYSQL_UPDATE_COLUMNS, an affected-row count of 0 does not indicate an error")
+    print("2. It may simply mean the updated column values are identical to the stored record")
+    print("3. To make the update stick, include more fields in update_columns")
+    print("4. To replace the record entirely, set MYSQL_AUTO_UPDATE = True")
+
+
+if __name__ == "__main__":
+    simulate_mysql_scenarios()
tests/test_asyncmy_usage.py
ADDED

@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+import asyncio
+from asyncmy import create_pool
+
+async def test_asyncmy_usage():
+    """Check the correct way to use the asyncmy library"""
+    try:
+        # Create the connection pool
+        pool = await create_pool(
+            host='127.0.0.1',
+            port=3306,
+            user='root',
+            password='123456',
+            db='test',
+            minsize=1,
+            maxsize=5
+        )
+
+        # Acquire a connection
+        conn = await pool.acquire()
+        try:
+            # Get a cursor
+            cursor = await conn.cursor()
+            try:
+                # Execute SQL
+                result = cursor.execute("SELECT 1")
+                print(f"execute return type: {type(result)}")
+                print(f"execute return value: {result}")
+
+                # Check whether it needs to be awaited
+                if hasattr(result, '__await__'):
+                    print("execute returned a coroutine object; it must be awaited")
+                    result = await result
+                else:
+                    print("execute did not return a coroutine object; no await needed")
+
+                # Commit the transaction
+                await conn.commit()
+
+            finally:
+                await cursor.close()
+        finally:
+            pool.release(conn)
+
+        # Close the connection pool
+        pool.close()
+        await pool.wait_closed()
+
+        print("Test finished")
+
+    except Exception as e:
+        print(f"Test failed: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    asyncio.run(test_asyncmy_usage())
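
In practice, the affected-row counts that simulate_mysql_update_test.py predicts (1 for a fresh insert, 0 for a duplicate with unchanged values, 2 for an updated duplicate or a REPLACE) can be read from the cursor after executing the statement. A minimal sketch reusing the connection settings above — it assumes asyncmy's cursor follows the DB-API rowcount convention, that cursor.execute is awaitable (exactly what the test above probes), and that a news_items table matching the hypothetical schema from the simulation exists:

# -*- coding: utf-8 -*-
# Sketch only: connection settings copied from test_asyncmy_usage.py; the SQL and
# the news_items table mirror the hypothetical schema used in simulate_mysql_update_test.py.
import asyncio
from asyncmy import create_pool


async def check_affected_rows():
    pool = await create_pool(host='127.0.0.1', port=3306, user='root',
                             password='123456', db='test', minsize=1, maxsize=1)
    sql = (
        "INSERT INTO news_items (title, url) "
        "VALUES ('新文章标题', 'https://example.com/new-article') "
        "ON DUPLICATE KEY UPDATE title = VALUES(title)"
    )
    conn = await pool.acquire()
    try:
        cursor = await conn.cursor()
        try:
            await cursor.execute(sql)  # assumes execute is a coroutine here
            # 1 = inserted, 0 = duplicate with identical values, 2 = duplicate updated
            print(f"affected rows: {cursor.rowcount}")
            await conn.commit()
        finally:
            await cursor.close()
    finally:
        pool.release(conn)
    pool.close()
    await pool.wait_closed()


if __name__ == "__main__":
    asyncio.run(check_affected_rows())
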
tests/test_crawler_process_import.py
ADDED

@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Test importing CrawlerProcess
+"""
+
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+
+def test_crawler_process_import():
+    """Test importing CrawlerProcess"""
+    print("Testing CrawlerProcess import...")
+
+    try:
+        # Import CrawlerProcess directly from crawlo
+        from crawlo import CrawlerProcess
+        print(f"  Imported CrawlerProcess from crawlo: {CrawlerProcess}")
+
+        # Create an instance
+        process = CrawlerProcess()
+        print(f"  Created a CrawlerProcess instance: {process}")
+
+        print("CrawlerProcess import test passed!")
+
+    except ImportError as e:
+        print(f"  Import failed: {e}")
+        # If the direct import fails, fall back to the crawler module
+        try:
+            from crawlo.crawler import CrawlerProcess
+            print(f"  Imported CrawlerProcess from crawlo.crawler: {CrawlerProcess}")
+        except ImportError as e2:
+            print(f"  Import from the crawler module also failed: {e2}")
+
+
+if __name__ == '__main__':
+    test_crawler_process_import()
tests/test_crawler_process_spider_modules.py
ADDED

@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Test the integration of CrawlerProcess with SPIDER_MODULES
+"""
+
+import sys
+import os
+import asyncio
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.settings.setting_manager import SettingManager
+
+
+def test_crawler_process_spider_modules():
+    """Test the integration of CrawlerProcess with SPIDER_MODULES"""
+    print("Testing CrawlerProcess with SPIDER_MODULES...")
+
+    # Build settings that include SPIDER_MODULES
+    settings = SettingManager({
+        'SPIDER_MODULES': ['tests.test_spiders'],
+        'SPIDER_LOADER_WARN_ONLY': True,
+        'CONCURRENCY': 1,
+        'LOG_LEVEL': 'INFO'
+    })
+
+    # Create the CrawlerProcess instance
+    process = CrawlerProcess(settings=settings)
+
+    # Fetch the spider names
+    spider_names = process.get_spider_names()
+    print(f"Discovered spiders: {spider_names}")
+
+    # Check whether a spider is registered
+    is_registered = process.is_spider_registered('test_spider')
+    print(f"Is 'test_spider' registered: {is_registered}")
+
+    # Fetch the spider class
+    spider_class = process.get_spider_class('test_spider')
+    print(f"Class for 'test_spider': {spider_class}")
+
+    print("Test finished!")
+
+
+if __name__ == '__main__':
+    test_crawler_process_spider_modules()
tests/test_crawlo_proxy_integration.py

@@ -61,8 +61,14 @@ async def test_proxy_integration():
     config = CrawloConfig.standalone(
         concurrency=1,
         download_delay=0.1,
-
-
+        # Proxy configuration
+        # Advanced proxy configuration (for ProxyMiddleware)
+        # The middleware is enabled automatically once a proxy API URL is configured
+        PROXY_API_URL="https://proxy-api.example.com/get",  # mock proxy API
+
+        # Proxy list configuration (for ProxyMiddleware)
+        # The middleware is enabled automatically once a proxy list is configured
+        # PROXY_LIST=["http://proxy1:8080", "http://proxy2:8080"],
         LOG_LEVEL='WARNING'  # reduce log output
     )
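
For comparison, the static proxy-list variant mentioned in the commented-out line would presumably look like the sketch below. It only rearranges settings that already appear in this hunk and assumes CrawloConfig.standalone accepts PROXY_LIST as a keyword setting the same way it accepts PROXY_API_URL; the import path for CrawloConfig is not shown in the hunk and is assumed here.

# Sketch of the proxy-list variant; the CrawloConfig import path is assumed.
from crawlo.config import CrawloConfig

config = CrawloConfig.standalone(
    concurrency=1,
    download_delay=0.1,
    # With a static proxy list the middleware is enabled automatically,
    # mirroring the PROXY_API_URL behaviour described above.
    PROXY_LIST=["http://proxy1:8080", "http://proxy2:8080"],
    LOG_LEVEL='WARNING',
)
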