crawlo-1.4.4-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +8 -7
- crawlo/downloader/__init__.py +5 -2
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +6 -6
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/settings/default_settings.py +4 -0
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +220 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.5.dist-info/METADATA +329 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/RECORD +76 -49
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.4.dist-info/METADATA +0 -190
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_factory_compatibility.py
ADDED
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Test CrawloConfig factory-pattern compatibility
+"""
+
+import sys
+import os
+import traceback
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.config import CrawloConfig
+
+
+def test_standalone_factory():
+    """Test the standalone-mode factory function"""
+    print("Testing the standalone-mode factory function...")
+
+    try:
+        # Create a standalone-mode configuration
+        config = CrawloConfig.standalone(
+            project_name='ofweek_standalone',
+            concurrency=8,
+            download_delay=1.0
+        )
+
+        print("Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+        # Verify the configuration
+        assert config.get('RUN_MODE') == 'standalone'
+        assert config.get('QUEUE_TYPE') == 'memory'
+        assert config.get('PROJECT_NAME') == 'ofweek_standalone'
+        assert config.get('CONCURRENCY') == 8
+        assert config.get('DOWNLOAD_DELAY') == 1.0
+
+        print("✅ Standalone-mode factory test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Standalone-mode factory test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_distributed_factory():
+    """Test the distributed-mode factory function"""
+    print("\nTesting the distributed-mode factory function...")
+
+    try:
+        # Create a distributed-mode configuration
+        config = CrawloConfig.distributed(
+            redis_host='127.0.0.1',
+            redis_port=6379,
+            project_name='ofweek_distributed',
+            concurrency=16,
+            download_delay=0.5
+        )
+
+        print("Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+        print(f"REDIS_HOST: {config.get('REDIS_HOST')}")
+        print(f"REDIS_PORT: {config.get('REDIS_PORT')}")
+
+        # Verify the configuration
+        assert config.get('RUN_MODE') == 'distributed'
+        assert config.get('QUEUE_TYPE') == 'redis'
+        assert config.get('PROJECT_NAME') == 'ofweek_distributed'
+        assert config.get('CONCURRENCY') == 16
+        assert config.get('DOWNLOAD_DELAY') == 0.5
+        assert config.get('REDIS_HOST') == '127.0.0.1'
+        assert config.get('REDIS_PORT') == 6379
+
+        print("✅ Distributed-mode factory test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Distributed-mode factory test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_auto_factory():
+    """Test the auto-mode factory function"""
+    print("\nTesting the auto-mode factory function...")
+
+    try:
+        # Create an auto-mode configuration
+        config = CrawloConfig.auto(
+            project_name='ofweek_auto',
+            concurrency=12,
+            download_delay=0.8
+        )
+
+        print("Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+        # Verify the configuration
+        assert config.get('RUN_MODE') == 'auto'
+        assert config.get('QUEUE_TYPE') == 'auto'
+        assert config.get('PROJECT_NAME') == 'ofweek_auto'
+        assert config.get('CONCURRENCY') == 12
+        assert config.get('DOWNLOAD_DELAY') == 0.8
+
+        print("✅ Auto-mode factory test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Auto-mode factory test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_config_to_dict():
+    """Test converting a configuration to a dict"""
+    print("\nTesting configuration-to-dict conversion...")
+
+    try:
+        # Create a configuration
+        config = CrawloConfig.standalone(
+            project_name='test_project',
+            concurrency=4
+        )
+
+        # Convert to a dict
+        config_dict = config.to_dict()
+
+        print("Dict conversion succeeded")
+        print(f"Number of keys: {len(config_dict)}")
+
+        # Verify the key settings are present
+        assert 'RUN_MODE' in config_dict
+        assert 'QUEUE_TYPE' in config_dict
+        assert 'PROJECT_NAME' in config_dict
+        assert 'CONCURRENCY' in config_dict
+
+        print("✅ Configuration-to-dict test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Configuration-to-dict test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Entry point"""
+    print("Starting CrawloConfig factory-pattern compatibility tests...")
+    print("=" * 50)
+
+    tests = [
+        test_standalone_factory,
+        test_distributed_factory,
+        test_auto_factory,
+        test_config_to_dict,
+    ]
+
+    passed = 0
+    total = len(tests)
+
+    for test_func in tests:
+        try:
+            if test_func():
+                passed += 1
+                print(f"✓ {test_func.__name__} passed")
+            else:
+                print(f"✗ {test_func.__name__} failed")
+        except Exception as e:
+            print(f"✗ {test_func.__name__} raised: {e}")
+        print()
+
+    print("=" * 50)
+    print(f"Results: {passed}/{total} passed")
+
+    if passed == total:
+        print("All tests passed! CrawloConfig factory-pattern compatibility is intact.")
+        return 0
+    else:
+        print("Some tests failed; please check the implementation.")
+        return 1
+
+
+if __name__ == "__main__":
+    exit_code = main()
+    exit(exit_code)
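In outline, the factory pattern this test exercises is a set of classmethods that each return a config object pre-populated with mode-specific defaults plus caller overrides. The sketch below is a hypothetical reduction of that idea for readers of the diff, not Crawlo's implementation; the ConfigSketch name and default values are assumptions, while the mode-to-queue pairings (standalone with 'memory', distributed with 'redis') come from the test's assertions.

# Hypothetical sketch, not Crawlo code: what the asserted factory behavior implies.
class ConfigSketch:
    def __init__(self, **settings):
        self._settings = settings

    @classmethod
    def standalone(cls, project_name, concurrency=8, download_delay=1.0):
        # Standalone mode pairs with the in-memory queue (per the asserts above).
        return cls(RUN_MODE='standalone', QUEUE_TYPE='memory',
                   PROJECT_NAME=project_name, CONCURRENCY=concurrency,
                   DOWNLOAD_DELAY=download_delay)

    @classmethod
    def distributed(cls, redis_host, redis_port, project_name,
                    concurrency=16, download_delay=0.5):
        # Distributed mode pairs with the Redis-backed queue.
        return cls(RUN_MODE='distributed', QUEUE_TYPE='redis',
                   REDIS_HOST=redis_host, REDIS_PORT=redis_port,
                   PROJECT_NAME=project_name, CONCURRENCY=concurrency,
                   DOWNLOAD_DELAY=download_delay)

    def get(self, key, default=None):
        return self._settings.get(key, default)

    def to_dict(self):
        return dict(self._settings)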
tests/test_optimized_selector_naming.py
ADDED
@@ -0,0 +1,101 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the optimized selector helper naming
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils import (
+    extract_text,
+    extract_texts,
+    extract_attr,
+    extract_attrs,
+    is_xpath
+)
+from parsel import Selector
+
+
+def test_optimized_naming():
+    """Test the optimized naming"""
+    print("Testing the optimized selector naming...")
+    print("=" * 50)
+
+    # Build test HTML
+    html_content = """
+    <html>
+    <head>
+        <title>Test Page</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>Main Heading</h1>
+            <p class="intro">Intro paragraph</p>
+            <ul class="list">
+                <li>Item 1</li>
+                <li>Item 2</li>
+                <li>Item 3</li>
+            </ul>
+            <a href="https://example.com" class="link">Link text</a>
+            <img src="image.jpg" alt="Image description" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    selector = Selector(text=html_content)
+
+    # Test is_xpath
+    print("1. Testing is_xpath:")
+    print(f"   starts with '/': {is_xpath('/')}")
+    print(f"   starts with '//': {is_xpath('//title')}")
+    print(f"   starts with './': {is_xpath('./div')}")
+    print(f"   starts with 'title': {is_xpath('title')}")
+    print()
+
+    # Test extract_text
+    print("2. Testing extract_text:")
+    title_elements = selector.css('title')
+    title_text = extract_text(title_elements)
+    print(f"   Title text: {title_text}")
+
+    h1_elements = selector.css('.content h1')
+    h1_text = extract_text(h1_elements)
+    print(f"   H1 text: {h1_text}")
+    print()
+
+    # Test extract_texts
+    print("3. Testing extract_texts:")
+    li_elements = selector.css('.list li')
+    li_texts = extract_texts(li_elements)
+    print(f"   List item texts: {li_texts}")
+    print()
+
+    # Test extract_attr
+    print("4. Testing extract_attr:")
+    link_elements = selector.css('.link')
+    link_href = extract_attr(link_elements, 'href')
+    print(f"   Link href: {link_href}")
+
+    img_elements = selector.css('.image')
+    img_alt = extract_attr(img_elements, 'alt')
+    print(f"   Image alt: {img_alt}")
+    print()
+
+    # Test extract_attrs
+    print("5. Testing extract_attrs:")
+    all_links = selector.css('a')
+    all_hrefs = extract_attrs(all_links, 'href')
+    print(f"   All link hrefs: {all_hrefs}")
+
+    all_images = selector.css('img')
+    all_srcs = extract_attrs(all_images, 'src')
+    print(f"   All image srcs: {all_srcs}")
+    print()
+
+    print("All tests complete!")
+
+
+if __name__ == '__main__':
+    test_optimized_naming()
tests/test_priority_behavior.py
CHANGED
@@ -66,22 +66,22 @@ async def test_redis_queue_priority():
         await queue._redis.delete(f"{queue.queue_name}:data")
 
         # Create requests with different priorities
-        # Note:
-        # so priority
-
-
-
-        request_normal_priority = Request(url="https://normal-priority.com", priority=0)  # normal priority
+        # Note: the Request constructor negates the given priority value before storing it,
+        # so priority=100 is stored as -100 and priority=-100 is stored as 100
+        request_low_priority = Request(url="https://low-priority.com", priority=100)      # stored as -100 (high priority)
+        request_high_priority = Request(url="https://high-priority.com", priority=-100)   # stored as 100 (low priority)
+        request_normal_priority = Request(url="https://normal-priority.com", priority=0)  # stored as 0 (normal priority)
 
         # Enqueue in the order needed to verify priority behavior
-
-        await queue.put(
-        await queue.put(
+        # Use the priority values as actually stored
+        await queue.put(request_low_priority, priority=request_low_priority.priority)        # effective score = -100
+        await queue.put(request_normal_priority, priority=request_normal_priority.priority)  # effective score = 0
+        await queue.put(request_high_priority, priority=request_high_priority.priority)      # effective score = 100
 
         print(f"  Queue size: {await queue.qsize()}")
 
-        # Dequeue order should follow ascending score (priority
-        #
+        # Dequeue order should follow ascending score (ascending priority),
+        # so request_low_priority dequeues first (score=-100), request_normal_priority second (score=0), and request_high_priority last (score=100)
         item1 = await queue.get(timeout=2.0)
         item2 = await queue.get(timeout=2.0)
         item3 = await queue.get(timeout=2.0)
@@ -91,13 +91,13 @@ async def test_redis_queue_priority():
         print(f"  Second dequeued: {item2.url if item2 else None}")
         print(f"  Third dequeued: {item3.url if item3 else None}")
 
-        # In the Redis queue, smaller scores dequeue first, so priority
-        assert item1 is not None and item1.url == "https://low-priority.com", f"
-        assert item2 is not None and item2.url == "https://normal-priority.com", f"
-        assert item3 is not None and item3.url == "https://high-priority.com", f"
+        # In the Redis queue, smaller scores dequeue first, so smaller priority values dequeue first
+        assert item1 is not None and item1.url == "https://low-priority.com", f"the low-priority request should dequeue first, got: {item1.url if item1 else None}"
+        assert item2 is not None and item2.url == "https://normal-priority.com", f"the normal-priority request should dequeue second, got: {item2.url if item2 else None}"
+        assert item3 is not None and item3.url == "https://high-priority.com", f"the high-priority request should dequeue last, got: {item3.url if item3 else None}"
 
         print("  ✅ Redis queue priority test passed (confirmed that smaller scores dequeue first)")
-        print("  Note: in the Redis queue, score =
+        print("  Note: in the Redis queue, score = priority, so requests with smaller priority values have smaller scores and dequeue first")
 
     except Exception as e:
         print(f"  ❌ Redis queue priority test failed: {e}")
@@ -196,8 +196,8 @@ async def main():
     print("\nSummary:")
     print("1. Request priority follows the 'smaller value means higher priority' rule")
    print("2. Memory queue: uses (priority, request) tuples directly; smaller priority dequeues first")
-    print("3. Redis queue: uses score =
-    print("
+    print("3. Redis queue: uses score = priority; smaller scores dequeue first, so smaller priority values dequeue first")
+    print("   Memory and Redis queues now behave consistently")
     print("4. The retry middleware adjusts request priority according to the RETRY_PRIORITY setting")
     print("5. Built-in priority constants: URGENT(-200) < HIGH(-100) < NORMAL(0) < LOW(100) < BACKGROUND(200)")
     print("6. The Request constructor negates the given priority value before storing it")
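The negate-then-dequeue-by-ascending-score rule that these assertions encode can be demonstrated in isolation with the standard library. The snippet below uses heapq as a stand-in for the Redis sorted set and is illustrative only, not Crawlo code; it mirrors the constructor's negation described in the comments above.

import heapq

# Stand-in for the Redis sorted set: heapq also pops the smallest score first.
queue = []
for url, user_priority in [
    ("https://low-priority.com", 100),    # stored as -100, dequeues first
    ("https://normal-priority.com", 0),   # stored as 0, dequeues second
    ("https://high-priority.com", -100),  # stored as 100, dequeues last
]:
    heapq.heappush(queue, (-user_priority, url))  # mimic the constructor's negation

while queue:
    score, url = heapq.heappop(queue)
    print(score, url)
# -100 https://low-priority.com
# 0 https://normal-priority.com
# 100 https://high-priority.com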
tests/test_response_follow.py
ADDED
@@ -0,0 +1,105 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the Response.follow method
+"""
+import unittest
+from unittest.mock import Mock
+
+# Mock Request class
+class MockRequest:
+    def __init__(self, url, callback=None, **kwargs):
+        self.url = url
+        self.callback = callback
+        self.kwargs = kwargs
+
+# Mock out crawlo.Request
+import sys
+sys.modules['crawlo'] = Mock()
+sys.modules['crawlo'].Request = MockRequest
+
+from crawlo.network.response import Response
+
+
+class TestResponseFollow(unittest.TestCase):
+    """Test class for Response.follow"""
+
+    def setUp(self):
+        """Set up the test fixtures"""
+        # Build a mock HTML response
+        html_content = """
+        <html>
+        <head>
+            <title>Test Page</title>
+        </head>
+        <body>
+            <div class="content">
+                <h1>Main Heading</h1>
+                <p class="intro">This is the intro paragraph</p>
+                <ul class="list">
+                    <li>Item 1</li>
+                    <li>Item 2</li>
+                    <li>Item 3</li>
+                </ul>
+                <a href="https://example.com" class="link">Link text</a>
+                <a href="/relative/path" class="relative-link">Relative link</a>
+                <img src="image.jpg" alt="Image description" class="image">
+            </div>
+        </body>
+        </html>
+        """
+
+        # Build a mock request object
+        mock_request = Mock()
+        mock_request.callback = None
+
+        self.response = Response(
+            url="https://example.com/test",
+            body=html_content.encode('utf-8'),
+            headers={"content-type": "text/html; charset=utf-8"},
+            request=mock_request
+        )
+
+    def test_follow_absolute_url(self):
+        """Absolute URLs are passed through unchanged"""
+        request = self.response.follow("https://other.com/page", callback=lambda r: None)
+        self.assertEqual(request.url, "https://other.com/page")
+        self.assertIsNotNone(request.callback)
+
+    def test_follow_relative_url(self):
+        """Relative URLs are resolved against the response URL"""
+        request = self.response.follow("/relative/path", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/relative/path")
+        self.assertIsNotNone(request.callback)
+
+    def test_follow_complex_relative_url(self):
+        """More complex relative URLs"""
+        request = self.response.follow("../other/path", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/other/path")
+
+        request2 = self.response.follow("./another/path", callback=lambda r: None)
+        self.assertEqual(request2.url, "https://example.com/another/path")
+
+    def test_follow_with_query_params(self):
+        """URLs with query parameters and fragments"""
+        request = self.response.follow("/path?param=value", callback=lambda r: None)
+        self.assertEqual(request.url, "https://example.com/path?param=value")
+
+        request2 = self.response.follow("/path#section", callback=lambda r: None)
+        self.assertEqual(request2.url, "https://example.com/path#section")
+
+    def test_follow_with_additional_kwargs(self):
+        """Extra keyword arguments are forwarded"""
+        request = self.response.follow(
+            "/path",
+            callback=lambda r: None,
+            method="POST",
+            headers={"User-Agent": "test"}
+        )
+        self.assertEqual(request.url, "https://example.com/path")
+        self.assertEqual(request.kwargs.get("method"), "POST")
+        self.assertEqual(request.kwargs.get("headers"), {"User-Agent": "test"})
+
+
+if __name__ == '__main__':
+    unittest.main()
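The relative-URL expectations in the test above match the standard urljoin resolution rules, which a follow-style helper typically builds on; the snippet below only demonstrates the stdlib behavior the assertions assume, not Crawlo's internals.

from urllib.parse import urljoin

base = "https://example.com/test"
print(urljoin(base, "https://other.com/page"))  # absolute URLs pass through unchanged
print(urljoin(base, "/relative/path"))          # https://example.com/relative/path
print(urljoin(base, "../other/path"))           # https://example.com/other/path
print(urljoin(base, "/path?param=value"))       # https://example.com/path?param=value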
tests/test_response_selector_methods.py
ADDED
@@ -0,0 +1,93 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the selector methods on the Response class
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.network.response import Response
+
+
+def test_response_selector_methods():
+    """Test the selector methods on the Response class"""
+    print("Testing the selector methods on the Response class...")
+    print("=" * 50)
+
+    # Build a test HTML response
+    html_content = """
+    <html>
+    <head>
+        <title>Test Page</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>Main Heading</h1>
+            <p class="intro">Intro paragraph</p>
+            <ul class="list">
+                <li>Item 1</li>
+                <li>Item 2</li>
+                <li>Item 3</li>
+            </ul>
+            <a href="https://example.com" class="link">Link text</a>
+            <img src="image.jpg" alt="Image description" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    # Create the Response object
+    response = Response(
+        url="https://example.com/test",
+        body=html_content.encode('utf-8'),
+        headers={"content-type": "text/html; charset=utf-8"}
+    )
+
+    # Test extract_text (CSS selector)
+    print("1. Testing extract_text (CSS selector):")
+    title_text = response.extract_text('title')
+    print(f"   Title text: {title_text}")
+
+    h1_text = response.extract_text('.content h1')
+    print(f"   H1 text: {h1_text}")
+    print()
+
+    # Test extract_text (XPath selector)
+    print("2. Testing extract_text (XPath selector):")
+    title_text_xpath = response.extract_text('//title')
+    print(f"   Title text: {title_text_xpath}")
+
+    h1_text_xpath = response.extract_text('//div[@class="content"]/h1')
+    print(f"   H1 text: {h1_text_xpath}")
+    print()
+
+    # Test extract_texts
+    print("3. Testing extract_texts:")
+    li_texts = response.extract_texts('.list li')
+    print(f"   List item texts: {li_texts}")
+    print()
+
+    # Test extract_attr
+    print("4. Testing extract_attr:")
+    link_href = response.extract_attr('.link', 'href')
+    print(f"   Link href: {link_href}")
+
+    img_alt = response.extract_attr('.image', 'alt')
+    print(f"   Image alt: {img_alt}")
+    print()
+
+    # Test extract_attrs
+    print("5. Testing extract_attrs:")
+    all_links = response.extract_attrs('a', 'href')
+    print(f"   All link hrefs: {all_links}")
+
+    all_images = response.extract_attrs('img', 'src')
+    print(f"   All image srcs: {all_images}")
+    print()
+
+    print("All tests complete!")
+
+
+if __name__ == '__main__':
+    test_response_selector_methods()
tests/test_response_url_methods.py
ADDED
@@ -0,0 +1,71 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for Response URL handling methods
+"""
+import unittest
+from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
+
+
+class TestUrlMethods(unittest.TestCase):
+    """Test class for URL handling methods"""
+
+    def setUp(self):
+        """Set up the test fixtures"""
+        self.test_url = "https://example.com/test?param1=value1&param2=value2#section1"
+
+    def test_urlparse(self):
+        """Test urlparse"""
+        parsed = urlparse(self.test_url)
+        self.assertEqual(parsed.scheme, "https")
+        self.assertEqual(parsed.netloc, "example.com")
+        self.assertEqual(parsed.path, "/test")
+        self.assertEqual(parsed.query, "param1=value1&param2=value2")
+        self.assertEqual(parsed.fragment, "section1")
+
+    def test_urlsplit(self):
+        """Test urlsplit"""
+        split_result = urlsplit(self.test_url)
+        self.assertEqual(split_result.scheme, "https")
+        self.assertEqual(split_result.netloc, "example.com")
+        self.assertEqual(split_result.path, "/test")
+        self.assertEqual(split_result.query, "param1=value1&param2=value2")
+        self.assertEqual(split_result.fragment, "section1")
+
+    def test_parse_qs(self):
+        """Test parse_qs"""
+        query_dict = parse_qs("param1=value1&param2=value2&param2=value3")
+        self.assertIn("param1", query_dict)
+        self.assertIn("param2", query_dict)
+        self.assertEqual(query_dict["param1"], ["value1"])
+        self.assertEqual(query_dict["param2"], ["value2", "value3"])
+
+    def test_urlencode(self):
+        """Test urlencode"""
+        query_dict = {"name": "张三", "age": 25, "city": "北京"}
+        encoded = urlencode(query_dict)
+        # Note: urlencode ordering can vary, so check that every key-value pair is present
+        self.assertIn("name=%E5%BC%A0%E4%B8%89", encoded)
+        self.assertIn("age=25", encoded)
+        self.assertIn("city=%E5%8C%97%E4%BA%AC", encoded)
+
+    def test_quote_unquote(self):
+        """Test quote and unquote"""
+        # Test quote
+        original = "hello world 你好"
+        quoted = quote(original)
+        self.assertEqual(quoted, "hello%20world%20%E4%BD%A0%E5%A5%BD")
+
+        # Test unquote
+        unquoted = unquote(quoted)
+        self.assertEqual(unquoted, original)
+
+    def test_urldefrag(self):
+        """Test urldefrag"""
+        url_without_frag, fragment = urldefrag(self.test_url)
+        self.assertEqual(url_without_frag, "https://example.com/test?param1=value1&param2=value2")
+        self.assertEqual(fragment, "section1")
+
+
+if __name__ == '__main__':
+    unittest.main()