crawlo-1.4.3-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +52 -17
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +13 -6
- crawlo/downloader/__init__.py +5 -2
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +6 -6
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/queue/pqueue.py +2 -6
- crawlo/queue/queue_manager.py +1 -2
- crawlo/settings/default_settings.py +15 -30
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/templates/project/settings.py.tmpl +51 -65
- crawlo/templates/project/settings_distributed.py.tmpl +59 -67
- crawlo/templates/project/settings_gentle.py.tmpl +45 -40
- crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
- crawlo/templates/project/settings_minimal.py.tmpl +37 -26
- crawlo/templates/project/settings_simple.py.tmpl +45 -40
- crawlo/templates/run.py.tmpl +3 -7
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +220 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.5.dist-info/METADATA +329 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_multi_directory.py +68 -0
- tests/test_multiple_spider_modules.py +81 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spider_modules.py +85 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.3.dist-info/METADATA +0 -190
- examples/test_project/__init__.py +0 -7
- examples/test_project/run.py +0 -35
- examples/test_project/test_project/__init__.py +0 -4
- examples/test_project/test_project/items.py +0 -18
- examples/test_project/test_project/middlewares.py +0 -119
- examples/test_project/test_project/pipelines.py +0 -97
- examples/test_project/test_project/settings.py +0 -170
- examples/test_project/test_project/spiders/__init__.py +0 -10
- examples/test_project/test_project/spiders/of_week_dis.py +0 -144
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_encoding_detection.py
ADDED
@@ -0,0 +1,127 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Response encoding-detection optimization tests
+"""
+import unittest
+
+# Mock part of the Response class behaviour for testing
+class MockResponse:
+    def __init__(self, body, headers=None, request=None):
+        self.body = body
+        self.headers = headers or {}
+        self.request = request
+        self._DEFAULT_ENCODING = "ascii"
+
+    def _determine_encoding(self):
+        """Simplified encoding detection"""
+        # 1. Prefer the declared encoding
+        declared_encoding = self._declared_encoding()
+        if declared_encoding:
+            return declared_encoding
+
+        # 2. Fall back to utf-8
+        return 'utf-8'
+
+    def _declared_encoding(self):
+        """Return the declared encoding"""
+        # 1. Encoding specified on the Request
+        if self.request and getattr(self.request, 'encoding', None):
+            return self.request.encoding
+
+        # 2. Detect from the Content-Type header
+        content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+        if content_type:
+            import re
+            charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+            if charset_match:
+                return charset_match.group(1).lower()
+
+        return None
+
+
+class TestDetermineEncoding(unittest.TestCase):
+    """Encoding-detection test cases"""
+
+    def test_request_encoding_priority(self):
+        """Request encoding takes priority"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = MockResponse(b'', request=MockRequest())
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'gbk')
+
+    def test_content_type_encoding(self):
+        """Encoding is detected from the Content-Type header"""
+        response = MockResponse(
+            b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'iso-8859-1')
+
+    def test_default_encoding(self):
+        """Default encoding"""
+        response = MockResponse(b'')
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'utf-8')
+
+    def test_case_insensitive_content_type(self):
+        """Content-Type header matching is case-insensitive"""
+        response = MockResponse(
+            b'',
+            headers={"Content-Type": "text/html; CHARSET=UTF-8"}
+        )
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'utf-8')
+
+    def test_declared_encoding_with_request(self):
+        """Declared encoding - Request takes priority"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = MockResponse(b'', request=MockRequest())
+        declared_encoding = response._declared_encoding()
+        self.assertEqual(declared_encoding, 'gbk')
+
+    def test_declared_encoding_with_content_type(self):
+        """Declared encoding - Content-Type"""
+        response = MockResponse(
+            b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        declared_encoding = response._declared_encoding()
+        self.assertEqual(declared_encoding, 'iso-8859-1')
+
+
+def test_encoding_detection():
+    """Quick check of the encoding-detection behaviour"""
+    print("Testing encoding detection...")
+
+    # Request encoding takes priority
+    class MockRequest:
+        encoding = 'gbk'
+
+    response1 = MockResponse(b'', request=MockRequest())
+    encoding1 = response1._determine_encoding()
+    print(f"Request encoding priority: {encoding1}")
+
+    # Content-Type header encoding
+    response2 = MockResponse(
+        b'',
+        headers={"content-type": "text/html; charset=iso-8859-1"}
+    )
+    encoding2 = response2._determine_encoding()
+    print(f"Content-Type encoding: {encoding2}")
+
+    # Default encoding
+    response3 = MockResponse(b'')
+    encoding3 = response3._determine_encoding()
+    print(f"Default encoding: {encoding3}")
+
+    print("Encoding-detection tests finished!")
+
+
+if __name__ == '__main__':
+    test_encoding_detection()
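For readers skimming the hunk above, the lookup order the new test encodes (an encoding set on the Request, then a charset declared in Content-Type, then a utf-8 fallback) can be condensed into a small standalone sketch. The function below is illustrative only; the name pick_encoding is hypothetical and this is not crawlo's actual Response code.

import re
from typing import Mapping, Optional

def pick_encoding(request_encoding: Optional[str], headers: Mapping[str, str]) -> str:
    if request_encoding:                      # 1. an explicit encoding on the Request wins
        return request_encoding
    content_type = headers.get("content-type", "") or headers.get("Content-Type", "")
    match = re.search(r"charset=([\w-]+)", content_type, re.I)
    if match:                                 # 2. charset declared in the Content-Type header
        return match.group(1).lower()
    return "utf-8"                            # 3. default fallback

assert pick_encoding("gbk", {}) == "gbk"
assert pick_encoding(None, {"Content-Type": "text/html; CHARSET=UTF-8"}) == "utf-8"
assert pick_encoding(None, {}) == "utf-8"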
tests/test_factory_compatibility.py
ADDED
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+CrawloConfig factory-pattern compatibility tests
+"""
+
+import sys
+import os
+import traceback
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.config import CrawloConfig
+
+
+def test_standalone_factory():
+    """Standalone-mode factory function"""
+    print("Testing the standalone-mode factory...")
+
+    try:
+        # Create a standalone-mode configuration
+        config = CrawloConfig.standalone(
+            project_name='ofweek_standalone',
+            concurrency=8,
+            download_delay=1.0
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+        # Verify the configuration values
+        assert config.get('RUN_MODE') == 'standalone'
+        assert config.get('QUEUE_TYPE') == 'memory'
+        assert config.get('PROJECT_NAME') == 'ofweek_standalone'
+        assert config.get('CONCURRENCY') == 8
+        assert config.get('DOWNLOAD_DELAY') == 1.0
+
+        print("✅ Standalone-mode factory test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Standalone-mode factory test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_distributed_factory():
+    """Distributed-mode factory function"""
+    print("\nTesting the distributed-mode factory...")
+
+    try:
+        # Create a distributed-mode configuration
+        config = CrawloConfig.distributed(
+            redis_host='127.0.0.1',
+            redis_port=6379,
+            project_name='ofweek_distributed',
+            concurrency=16,
+            download_delay=0.5
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+        print(f"REDIS_HOST: {config.get('REDIS_HOST')}")
+        print(f"REDIS_PORT: {config.get('REDIS_PORT')}")
+
+        # Verify the configuration values
+        assert config.get('RUN_MODE') == 'distributed'
+        assert config.get('QUEUE_TYPE') == 'redis'
+        assert config.get('PROJECT_NAME') == 'ofweek_distributed'
+        assert config.get('CONCURRENCY') == 16
+        assert config.get('DOWNLOAD_DELAY') == 0.5
+        assert config.get('REDIS_HOST') == '127.0.0.1'
+        assert config.get('REDIS_PORT') == 6379
+
+        print("✅ Distributed-mode factory test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Distributed-mode factory test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_auto_factory():
+    """Auto-mode factory function"""
+    print("\nTesting the auto-mode factory...")
+
+    try:
+        # Create an auto-mode configuration
+        config = CrawloConfig.auto(
+            project_name='ofweek_auto',
+            concurrency=12,
+            download_delay=0.8
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+        # Verify the configuration values
+        assert config.get('RUN_MODE') == 'auto'
+        assert config.get('QUEUE_TYPE') == 'auto'
+        assert config.get('PROJECT_NAME') == 'ofweek_auto'
+        assert config.get('CONCURRENCY') == 12
+        assert config.get('DOWNLOAD_DELAY') == 0.8
+
+        print("✅ Auto-mode factory test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Auto-mode factory test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_config_to_dict():
+    """Convert the configuration to a dict"""
+    print("\nTesting configuration-to-dict conversion...")
+
+    try:
+        # Create a configuration
+        config = CrawloConfig.standalone(
+            project_name='test_project',
+            concurrency=4
+        )
+
+        # Convert to a dict
+        config_dict = config.to_dict()
+
+        print(f"Dict conversion succeeded")
+        print(f"Number of keys: {len(config_dict)}")
+
+        # Verify the key configuration entries
+        assert 'RUN_MODE' in config_dict
+        assert 'QUEUE_TYPE' in config_dict
+        assert 'PROJECT_NAME' in config_dict
+        assert 'CONCURRENCY' in config_dict
+
+        print("✅ Configuration-to-dict test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Configuration-to-dict test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Entry point"""
+    print("Starting CrawloConfig factory-pattern compatibility tests...")
+    print("=" * 50)
+
+    tests = [
+        test_standalone_factory,
+        test_distributed_factory,
+        test_auto_factory,
+        test_config_to_dict,
+    ]
+
+    passed = 0
+    total = len(tests)
+
+    for test_func in tests:
+        try:
+            if test_func():
+                passed += 1
+                print(f"✓ {test_func.__name__} passed")
+            else:
+                print(f"✗ {test_func.__name__} failed")
+        except Exception as e:
+            print(f"✗ {test_func.__name__} raised: {e}")
+        print()
+
+    print("=" * 50)
+    print(f"Result: {passed}/{total} passed")
+
+    if passed == total:
+        print("All tests passed! CrawloConfig factory-pattern compatibility is intact.")
+        return 0
+    else:
+        print("Some tests failed, please check the implementation.")
+        return 1
+
+
+if __name__ == "__main__":
+    exit_code = main()
+    exit(exit_code)
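The hunk above exercises classmethod factories (CrawloConfig.standalone, .distributed, .auto) that bundle mode-specific defaults into one settings mapping. Below is a minimal sketch of that factory pattern using only the keys asserted in the test; SketchConfig is a hypothetical stand-in, not the actual CrawloConfig implementation.

from typing import Any, Dict

class SketchConfig:
    def __init__(self, values: Dict[str, Any]):
        self._values = dict(values)

    @classmethod
    def standalone(cls, project_name: str, concurrency: int = 8, download_delay: float = 1.0) -> "SketchConfig":
        # Standalone runs pair with the in-memory queue in the test's assertions.
        return cls({
            "RUN_MODE": "standalone",
            "QUEUE_TYPE": "memory",
            "PROJECT_NAME": project_name,
            "CONCURRENCY": concurrency,
            "DOWNLOAD_DELAY": download_delay,
        })

    @classmethod
    def distributed(cls, redis_host: str, redis_port: int = 6379, **kwargs: Any) -> "SketchConfig":
        # Distributed runs pair with the Redis queue; extra kwargs become upper-case settings keys.
        values = {"RUN_MODE": "distributed", "QUEUE_TYPE": "redis",
                  "REDIS_HOST": redis_host, "REDIS_PORT": redis_port}
        values.update({key.upper(): value for key, value in kwargs.items()})
        return cls(values)

    def get(self, key: str, default: Any = None) -> Any:
        return self._values.get(key, default)

    def to_dict(self) -> Dict[str, Any]:
        return dict(self._values)

config = SketchConfig.standalone(project_name="demo", concurrency=4)
assert config.get("QUEUE_TYPE") == "memory" and "RUN_MODE" in config.to_dict()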
tests/test_multi_directory.py
ADDED
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+Tests for supporting multiple spider directories
+"""
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.dirname(__file__))
+
+# Add ofweek_standalone to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+def test_multiple_spider_directories():
+    """Multiple spider directories are supported"""
+    print("Testing support for multiple spider directories...")
+
+    # Import the settings module
+    import examples.ofweek_standalone.ofweek_standalone.settings as settings_module
+
+    # Create the settings manager
+    from crawlo.settings.setting_manager import SettingManager
+    settings = SettingManager()
+    settings.set_settings(settings_module)
+
+    # Check the SPIDER_MODULES setting
+    spider_modules = settings.get('SPIDER_MODULES')
+    print(f"SPIDER_MODULES setting: {spider_modules}")
+
+    # Create a CrawlerProcess instance
+    from crawlo.crawler import CrawlerProcess
+    process = CrawlerProcess(settings=settings)
+
+    # Check which spiders were registered
+    spider_names = process.get_spider_names()
+    print(f"Registered spiders: {spider_names}")
+
+    # Verify that the expected spiders are registered
+    expected_spiders = ['of_week_standalone', 'test_spider']
+    registered_spiders = []
+
+    for spider_name in expected_spiders:
+        if spider_name in spider_names:
+            print(f"✅ Success: spider '{spider_name}' is registered")
+            registered_spiders.append(spider_name)
+        else:
+            print(f"❌ Failure: spider '{spider_name}' was not found")
+
+    if len(registered_spiders) == len(expected_spiders):
+        print(f"🎉 All spiders were registered successfully!")
+        return True
+    else:
+        print(f"⚠️ Some spiders were not registered: {set(expected_spiders) - set(registered_spiders)}")
+        return False
+
+
+if __name__ == '__main__':
+    print("Starting multiple-spider-directory tests...\n")
+
+    success = test_multiple_spider_directories()
+
+    if success:
+        print("\n🎉 Test passed!")
+        sys.exit(0)
+    else:
+        print("\n❌ Test failed!")
+        sys.exit(1)
tests/test_multiple_spider_modules.py
ADDED
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+Tests for supporting multiple SPIDER_MODULES directories
+"""
+import sys
+import os
+import asyncio
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.dirname(__file__))
+
+# Add ofweek_standalone to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.spider import get_spider_names
+
+
+def test_multiple_spider_modules():
+    """Multiple SPIDER_MODULES directories are supported"""
+    print("Testing support for multiple SPIDER_MODULES directories...")
+
+    # Simulate a SPIDER_MODULES setting containing multiple directories
+    spider_modules = ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
+
+    # Create a CrawlerProcess instance
+    process = CrawlerProcess(spider_modules=spider_modules)
+
+    # Check which spiders were registered
+    spider_names = process.get_spider_names()
+    print(f"Registered spiders: {spider_names}")
+
+    # Verify that the expected spider is registered
+    expected_spider = 'of_week_standalone'
+    if expected_spider in spider_names:
+        print(f"✅ Success: spider '{expected_spider}' is registered")
+        return True
+    else:
+        print(f"❌ Failure: spider '{expected_spider}' was not found")
+        return False
+
+
+def test_settings_with_multiple_spider_modules():
+    """Multiple SPIDER_MODULES directories configured via settings"""
+    print("\nTesting multiple SPIDER_MODULES directories configured in settings...")
+
+    # Create a mock settings object
+    class MockSettings:
+        def get(self, key, default=None):
+            if key == 'SPIDER_MODULES':
+                return ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
+            return default
+
+    settings = MockSettings()
+
+    # Create a CrawlerProcess instance
+    process = CrawlerProcess(settings=settings)
+
+    # Check which spiders were registered
+    spider_names = process.get_spider_names()
+    print(f"Registered spiders: {spider_names}")
+
+    return True
+
+
+if __name__ == '__main__':
+    print("Starting multiple-SPIDER_MODULES tests...\n")
+
+    # Test passing multiple spider_modules explicitly
+    success1 = test_multiple_spider_modules()
+
+    # Test reading multiple spider_modules from settings
+    success2 = test_settings_with_multiple_spider_modules()
+
+    if success1 and success2:
+        print("\n🎉 All tests passed!")
+        sys.exit(0)
+    else:
+        print("\n❌ Some tests failed!")
+        sys.exit(1)
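Both hunks above test spider discovery across several SPIDER_MODULES packages. As a rough, stdlib-only idea of what such discovery typically involves (import each listed package, walk its submodules, collect classes that expose a name attribute), here is a sketch; discover_named_classes and the example package names are hypothetical, and this is not crawlo's own spider_loader.

import importlib
import inspect
import pkgutil
from typing import Dict, Iterable, Type

def discover_named_classes(module_paths: Iterable[str]) -> Dict[str, Type]:
    found: Dict[str, Type] = {}
    for path in module_paths:
        package = importlib.import_module(path)
        # Walk submodules when the path is a package; a plain module is inspected directly.
        submodule_names = [path]
        if hasattr(package, "__path__"):
            submodule_names += [f"{path}.{info.name}" for info in pkgutil.iter_modules(package.__path__)]
        for submodule_name in submodule_names:
            module = importlib.import_module(submodule_name)
            for _, obj in inspect.getmembers(module, inspect.isclass):
                name = getattr(obj, "name", None)
                if isinstance(name, str) and name:
                    found[name] = obj
    return found

# Hypothetical usage: discover_named_classes(["myproject.spiders", "myproject.extra_spiders"])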
tests/test_optimized_selector_naming.py
ADDED
@@ -0,0 +1,101 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the optimized selector helper naming
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils import (
+    extract_text,
+    extract_texts,
+    extract_attr,
+    extract_attrs,
+    is_xpath
+)
+from parsel import Selector
+
+
+def test_optimized_naming():
+    """Exercise the optimized helper names"""
+    print("Testing the optimized selector naming...")
+    print("=" * 50)
+
+    # Build the test HTML
+    html_content = """
+    <html>
+    <head>
+        <title>Test page</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>Main heading</h1>
+            <p class="intro">Intro paragraph</p>
+            <ul class="list">
+                <li>Item 1</li>
+                <li>Item 2</li>
+                <li>Item 3</li>
+            </ul>
+            <a href="https://example.com" class="link">Link text</a>
+            <img src="image.jpg" alt="Image description" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    selector = Selector(text=html_content)
+
+    # Test is_xpath
+    print("1. Testing is_xpath:")
+    print(f" starts with '/': {is_xpath('/')}")
+    print(f" starts with '//': {is_xpath('//title')}")
+    print(f" starts with './': {is_xpath('./div')}")
+    print(f" starts with 'title': {is_xpath('title')}")
+    print()
+
+    # Test extract_text
+    print("2. Testing extract_text:")
+    title_elements = selector.css('title')
+    title_text = extract_text(title_elements)
+    print(f" title text: {title_text}")
+
+    h1_elements = selector.css('.content h1')
+    h1_text = extract_text(h1_elements)
+    print(f" h1 text: {h1_text}")
+    print()
+
+    # Test extract_texts
+    print("3. Testing extract_texts:")
+    li_elements = selector.css('.list li')
+    li_texts = extract_texts(li_elements)
+    print(f" list item texts: {li_texts}")
+    print()
+
+    # Test extract_attr
+    print("4. Testing extract_attr:")
+    link_elements = selector.css('.link')
+    link_href = extract_attr(link_elements, 'href')
+    print(f" link href: {link_href}")
+
+    img_elements = selector.css('.image')
+    img_alt = extract_attr(img_elements, 'alt')
+    print(f" image alt: {img_alt}")
+    print()
+
+    # Test extract_attrs
+    print("5. Testing extract_attrs:")
+    all_links = selector.css('a')
+    all_hrefs = extract_attrs(all_links, 'href')
+    print(f" all link hrefs: {all_hrefs}")
+
+    all_images = selector.css('img')
+    all_srcs = extract_attrs(all_images, 'src')
+    print(f" all image srcs: {all_srcs}")
+    print()
+
+    print("All tests finished!")
+
+
+if __name__ == '__main__':
+    test_optimized_naming()
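The helpers imported above (extract_text, extract_texts, extract_attr, extract_attrs) are exercised against parsel selectors. As a rough idea of what helpers with these call shapes usually wrap, here is a sketch built directly on parsel; sketch_extract_text and sketch_extract_attr are illustrative names, not crawlo's implementation in crawlo/utils/selector_helper.py.

from parsel import Selector, SelectorList

def sketch_extract_text(elements: SelectorList, default: str = "") -> str:
    # Join the text nodes of the first matched element, collapsing whitespace.
    if not elements:
        return default
    parts = [t.strip() for t in elements[0].css("::text").getall() if t.strip()]
    return " ".join(parts) or default

def sketch_extract_attr(elements: SelectorList, attr: str, default=None):
    # Return the requested attribute of the first matched element.
    if not elements:
        return default
    return elements[0].attrib.get(attr, default)

if __name__ == "__main__":
    sel = Selector(text='<a href="https://example.com" class="link">Link text</a>')
    links = sel.css(".link")
    print(sketch_extract_text(links))           # Link text
    print(sketch_extract_attr(links, "href"))   # https://example.com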
tests/test_priority_behavior.py
CHANGED
@@ -66,22 +66,22 @@ async def test_redis_queue_priority():
         await queue._redis.delete(f"{queue.queue_name}:data")

         # Create requests with different priorities
-        # Note:
-        # so priority
-
-
-
-        request_normal_priority = Request(url="https://normal-priority.com", priority=0)  # normal priority
+        # Note: the Request constructor stores the negated value of the priority passed in,
+        # so a request with priority=100 is actually stored as -100 and one with priority=-100 is stored as 100
+        request_low_priority = Request(url="https://low-priority.com", priority=100)  # stored as -100 (high priority)
+        request_high_priority = Request(url="https://high-priority.com", priority=-100)  # stored as 100 (low priority)
+        request_normal_priority = Request(url="https://normal-priority.com", priority=0)  # stored as 0 (normal priority)

         # Enqueue in the right order to verify the priority behaviour
-
-        await queue.put(
-        await queue.put(
+        # Use the actually stored priority values
+        await queue.put(request_low_priority, priority=request_low_priority.priority)  # actual score=-100
+        await queue.put(request_normal_priority, priority=request_normal_priority.priority)  # actual score=0
+        await queue.put(request_high_priority, priority=request_high_priority.priority)  # actual score=100

         print(f" Queue size: {await queue.qsize()}")

-        # Dequeue order should follow score ascending (priority
-        #
+        # Dequeue order follows score ascending (priority ascending),
+        # so request_low_priority dequeues first (score=-100), request_normal_priority second (score=0), request_high_priority last (score=100)
         item1 = await queue.get(timeout=2.0)
         item2 = await queue.get(timeout=2.0)
         item3 = await queue.get(timeout=2.0)

@@ -91,13 +91,13 @@ async def test_redis_queue_priority():
         print(f" Second dequeued: {item2.url if item2 else None}")
         print(f" Third dequeued: {item3.url if item3 else None}")

-        # In the Redis queue the smaller score dequeues first, so priority
-        assert item1 is not None and item1.url == "https://low-priority.com", f"
-        assert item2 is not None and item2.url == "https://normal-priority.com", f"
-        assert item3 is not None and item3.url == "https://high-priority.com", f"
+        # In the Redis queue the smaller score dequeues first, so the smaller priority dequeues first
+        assert item1 is not None and item1.url == "https://low-priority.com", f"the low-priority request should dequeue first, got: {item1.url if item1 else None}"
+        assert item2 is not None and item2.url == "https://normal-priority.com", f"the normal-priority request should dequeue second, got: {item2.url if item2 else None}"
+        assert item3 is not None and item3.url == "https://high-priority.com", f"the high-priority request should dequeue last, got: {item3.url if item3 else None}"

         print(" ✅ Redis queue priority test passed (confirmed that a smaller score dequeues first)")
-        print(" Note: in the Redis queue score =
+        print(" Note: in the Redis queue score = priority, so a request with a smaller priority gets a smaller score and dequeues first")

     except Exception as e:
         print(f" ❌ Redis queue priority test failed: {e}")

@@ -196,8 +196,8 @@ async def main():
     print("\nSummary:")
     print("1. Request priority follows the 'smaller value wins' rule")
     print("2. Memory queue: uses (priority, request) tuples directly; the smaller priority dequeues first")
-    print("3. Redis queue: uses score =
-    print("
+    print("3. Redis queue: uses score = priority; the smaller score dequeues first, so the smaller priority dequeues first")
+    print("   The memory queue and the Redis queue now behave consistently")
     print("4. The retry middleware adjusts request priority according to the RETRY_PRIORITY setting")
     print("5. Built-in priority constants: URGENT(-200) < HIGH(-100) < NORMAL(0) < LOW(100) < BACKGROUND(200)")
     print("6. The Request constructor stores the negated value of the priority passed in")
|