crawlo 1.4.3__py3-none-any.whl → 1.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (107)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +52 -17
  4. crawlo/commands/startproject.py +24 -0
  5. crawlo/core/engine.py +2 -2
  6. crawlo/core/scheduler.py +4 -4
  7. crawlo/crawler.py +13 -6
  8. crawlo/downloader/__init__.py +5 -2
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/mode_manager.py +45 -11
  18. crawlo/network/response.py +374 -69
  19. crawlo/pipelines/mysql_pipeline.py +6 -6
  20. crawlo/pipelines/pipeline_manager.py +2 -2
  21. crawlo/project.py +2 -4
  22. crawlo/queue/pqueue.py +2 -6
  23. crawlo/queue/queue_manager.py +1 -2
  24. crawlo/settings/default_settings.py +15 -30
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +51 -65
  30. crawlo/templates/project/settings_distributed.py.tmpl +59 -67
  31. crawlo/templates/project/settings_gentle.py.tmpl +45 -40
  32. crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
  33. crawlo/templates/project/settings_minimal.py.tmpl +37 -26
  34. crawlo/templates/project/settings_simple.py.tmpl +45 -40
  35. crawlo/templates/run.py.tmpl +3 -7
  36. crawlo/tools/__init__.py +0 -11
  37. crawlo/utils/__init__.py +17 -1
  38. crawlo/utils/db_helper.py +220 -319
  39. crawlo/utils/error_handler.py +313 -67
  40. crawlo/utils/fingerprint.py +3 -4
  41. crawlo/utils/misc.py +82 -0
  42. crawlo/utils/request.py +55 -66
  43. crawlo/utils/selector_helper.py +138 -0
  44. crawlo/utils/spider_loader.py +185 -45
  45. crawlo/utils/text_helper.py +95 -0
  46. crawlo-1.4.5.dist-info/METADATA +329 -0
  47. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
  48. tests/bug_check_test.py +251 -0
  49. tests/direct_selector_helper_test.py +97 -0
  50. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  51. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  52. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  56. tests/ofweek_scrapy/scrapy.cfg +11 -0
  57. tests/performance_comparison.py +4 -5
  58. tests/simple_crawlo_test.py +1 -2
  59. tests/simple_follow_test.py +39 -0
  60. tests/simple_response_selector_test.py +95 -0
  61. tests/simple_selector_helper_test.py +155 -0
  62. tests/simple_selector_test.py +208 -0
  63. tests/simple_url_test.py +74 -0
  64. tests/test_crawler_process_import.py +39 -0
  65. tests/test_crawler_process_spider_modules.py +48 -0
  66. tests/test_edge_cases.py +7 -5
  67. tests/test_encoding_core.py +57 -0
  68. tests/test_encoding_detection.py +127 -0
  69. tests/test_factory_compatibility.py +197 -0
  70. tests/test_multi_directory.py +68 -0
  71. tests/test_multiple_spider_modules.py +81 -0
  72. tests/test_optimized_selector_naming.py +101 -0
  73. tests/test_priority_behavior.py +18 -18
  74. tests/test_response_follow.py +105 -0
  75. tests/test_response_selector_methods.py +93 -0
  76. tests/test_response_url_methods.py +71 -0
  77. tests/test_response_urljoin.py +87 -0
  78. tests/test_scrapy_style_encoding.py +113 -0
  79. tests/test_selector_helper.py +101 -0
  80. tests/test_selector_optimizations.py +147 -0
  81. tests/test_spider_loader.py +50 -0
  82. tests/test_spider_loader_comprehensive.py +70 -0
  83. tests/test_spider_modules.py +85 -0
  84. tests/test_spiders/__init__.py +1 -0
  85. tests/test_spiders/test_spider.py +10 -0
  86. crawlo/tools/anti_crawler.py +0 -269
  87. crawlo/utils/class_loader.py +0 -26
  88. crawlo/utils/enhanced_error_handler.py +0 -357
  89. crawlo-1.4.3.dist-info/METADATA +0 -190
  90. examples/test_project/__init__.py +0 -7
  91. examples/test_project/run.py +0 -35
  92. examples/test_project/test_project/__init__.py +0 -4
  93. examples/test_project/test_project/items.py +0 -18
  94. examples/test_project/test_project/middlewares.py +0 -119
  95. examples/test_project/test_project/pipelines.py +0 -97
  96. examples/test_project/test_project/settings.py +0 -170
  97. examples/test_project/test_project/spiders/__init__.py +0 -10
  98. examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  99. tests/simple_log_test.py +0 -58
  100. tests/simple_test.py +0 -48
  101. tests/test_framework_logger.py +0 -67
  102. tests/test_framework_startup.py +0 -65
  103. tests/test_mode_change.py +0 -73
  104. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  105. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  106. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  107. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_encoding_detection.py
@@ -0,0 +1,127 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Response encoding detection optimization tests
+ """
+ import unittest
+
+ # Mock part of the Response class behaviour for testing
+ class MockResponse:
+     def __init__(self, body, headers=None, request=None):
+         self.body = body
+         self.headers = headers or {}
+         self.request = request
+         self._DEFAULT_ENCODING = "ascii"
+
+     def _determine_encoding(self):
+         """Simplified encoding detection"""
+         # 1. Prefer the declared encoding
+         declared_encoding = self._declared_encoding()
+         if declared_encoding:
+             return declared_encoding
+
+         # 2. Fall back to utf-8
+         return 'utf-8'
+
+     def _declared_encoding(self):
+         """Return the declared encoding"""
+         # 1. Encoding specified on the Request
+         if self.request and getattr(self.request, 'encoding', None):
+             return self.request.encoding
+
+         # 2. Detect from the Content-Type header
+         content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+         if content_type:
+             import re
+             charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+             if charset_match:
+                 return charset_match.group(1).lower()
+
+         return None
+
+
+ class TestDetermineEncoding(unittest.TestCase):
+     """Encoding detection test case"""
+
+     def test_request_encoding_priority(self):
+         """The Request encoding takes priority"""
+         class MockRequest:
+             encoding = 'gbk'
+
+         response = MockResponse(b'', request=MockRequest())
+         encoding = response._determine_encoding()
+         self.assertEqual(encoding, 'gbk')
+
+     def test_content_type_encoding(self):
+         """Encoding detected from the Content-Type header"""
+         response = MockResponse(
+             b'',
+             headers={"content-type": "text/html; charset=iso-8859-1"}
+         )
+         encoding = response._determine_encoding()
+         self.assertEqual(encoding, 'iso-8859-1')
+
+     def test_default_encoding(self):
+         """Default encoding"""
+         response = MockResponse(b'')
+         encoding = response._determine_encoding()
+         self.assertEqual(encoding, 'utf-8')
+
+     def test_case_insensitive_content_type(self):
+         """Content-Type header matching is case-insensitive"""
+         response = MockResponse(
+             b'',
+             headers={"Content-Type": "text/html; CHARSET=UTF-8"}
+         )
+         encoding = response._determine_encoding()
+         self.assertEqual(encoding, 'utf-8')
+
+     def test_declared_encoding_with_request(self):
+         """Declared encoding: the Request takes priority"""
+         class MockRequest:
+             encoding = 'gbk'
+
+         response = MockResponse(b'', request=MockRequest())
+         declared_encoding = response._declared_encoding()
+         self.assertEqual(declared_encoding, 'gbk')
+
+     def test_declared_encoding_with_content_type(self):
+         """Declared encoding: Content-Type"""
+         response = MockResponse(
+             b'',
+             headers={"content-type": "text/html; charset=iso-8859-1"}
+         )
+         declared_encoding = response._declared_encoding()
+         self.assertEqual(declared_encoding, 'iso-8859-1')
+
+
+ def test_encoding_detection():
+     """Simple smoke test of encoding detection"""
+     print("Testing encoding detection...")
+
+     # Request encoding takes priority
+     class MockRequest:
+         encoding = 'gbk'
+
+     response1 = MockResponse(b'', request=MockRequest())
+     encoding1 = response1._determine_encoding()
+     print(f"Request encoding priority: {encoding1}")
+
+     # Encoding from the Content-Type header
+     response2 = MockResponse(
+         b'',
+         headers={"content-type": "text/html; charset=iso-8859-1"}
+     )
+     encoding2 = response2._determine_encoding()
+     print(f"Content-Type encoding: {encoding2}")
+
+     # Default encoding
+     response3 = MockResponse(b'')
+     encoding3 = response3._determine_encoding()
+     print(f"Default encoding: {encoding3}")
+
+     print("Encoding detection tests finished!")
+
+
+ if __name__ == '__main__':
+     test_encoding_detection()
tests/test_factory_compatibility.py
@@ -0,0 +1,197 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ CrawloConfig factory-mode compatibility tests
+ """
+
+ import sys
+ import os
+ import traceback
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.config import CrawloConfig
+
+
+ def test_standalone_factory():
+     """Test the standalone-mode factory function"""
+     print("Testing the standalone-mode factory function...")
+
+     try:
+         # Create a standalone-mode configuration
+         config = CrawloConfig.standalone(
+             project_name='ofweek_standalone',
+             concurrency=8,
+             download_delay=1.0
+         )
+
+         print("Configuration created successfully")
+         print(f"RUN_MODE: {config.get('RUN_MODE')}")
+         print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+         print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+         print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+         print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+         # Verify the configuration
+         assert config.get('RUN_MODE') == 'standalone'
+         assert config.get('QUEUE_TYPE') == 'memory'
+         assert config.get('PROJECT_NAME') == 'ofweek_standalone'
+         assert config.get('CONCURRENCY') == 8
+         assert config.get('DOWNLOAD_DELAY') == 1.0
+
+         print("✅ Standalone-mode factory test passed")
+         return True
+
+     except Exception as e:
+         print(f"❌ Standalone-mode factory test failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ def test_distributed_factory():
+     """Test the distributed-mode factory function"""
+     print("\nTesting the distributed-mode factory function...")
+
+     try:
+         # Create a distributed-mode configuration
+         config = CrawloConfig.distributed(
+             redis_host='127.0.0.1',
+             redis_port=6379,
+             project_name='ofweek_distributed',
+             concurrency=16,
+             download_delay=0.5
+         )
+
+         print("Configuration created successfully")
+         print(f"RUN_MODE: {config.get('RUN_MODE')}")
+         print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+         print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+         print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+         print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+         print(f"REDIS_HOST: {config.get('REDIS_HOST')}")
+         print(f"REDIS_PORT: {config.get('REDIS_PORT')}")
+
+         # Verify the configuration
+         assert config.get('RUN_MODE') == 'distributed'
+         assert config.get('QUEUE_TYPE') == 'redis'
+         assert config.get('PROJECT_NAME') == 'ofweek_distributed'
+         assert config.get('CONCURRENCY') == 16
+         assert config.get('DOWNLOAD_DELAY') == 0.5
+         assert config.get('REDIS_HOST') == '127.0.0.1'
+         assert config.get('REDIS_PORT') == 6379
+
+         print("✅ Distributed-mode factory test passed")
+         return True
+
+     except Exception as e:
+         print(f"❌ Distributed-mode factory test failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ def test_auto_factory():
+     """Test the auto-mode factory function"""
+     print("\nTesting the auto-mode factory function...")
+
+     try:
+         # Create an auto-mode configuration
+         config = CrawloConfig.auto(
+             project_name='ofweek_auto',
+             concurrency=12,
+             download_delay=0.8
+         )
+
+         print("Configuration created successfully")
+         print(f"RUN_MODE: {config.get('RUN_MODE')}")
+         print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+         print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+         print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+         print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+         # Verify the configuration
+         assert config.get('RUN_MODE') == 'auto'
+         assert config.get('QUEUE_TYPE') == 'auto'
+         assert config.get('PROJECT_NAME') == 'ofweek_auto'
+         assert config.get('CONCURRENCY') == 12
+         assert config.get('DOWNLOAD_DELAY') == 0.8
+
+         print("✅ Auto-mode factory test passed")
+         return True
+
+     except Exception as e:
+         print(f"❌ Auto-mode factory test failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ def test_config_to_dict():
+     """Test converting a configuration to a dict"""
+     print("\nTesting conversion of a configuration to a dict...")
+
+     try:
+         # Create a configuration
+         config = CrawloConfig.standalone(
+             project_name='test_project',
+             concurrency=4
+         )
+
+         # Convert to a dict
+         config_dict = config.to_dict()
+
+         print("Dict conversion succeeded")
+         print(f"Number of keys: {len(config_dict)}")
+
+         # Verify the key configuration entries
+         assert 'RUN_MODE' in config_dict
+         assert 'QUEUE_TYPE' in config_dict
+         assert 'PROJECT_NAME' in config_dict
+         assert 'CONCURRENCY' in config_dict
+
+         print("✅ Config-to-dict test passed")
+         return True
+
+     except Exception as e:
+         print(f"❌ Config-to-dict test failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ def main():
+     """Entry point"""
+     print("Testing CrawloConfig factory-mode compatibility...")
+     print("=" * 50)
+
+     tests = [
+         test_standalone_factory,
+         test_distributed_factory,
+         test_auto_factory,
+         test_config_to_dict,
+     ]
+
+     passed = 0
+     total = len(tests)
+
+     for test_func in tests:
+         try:
+             if test_func():
+                 passed += 1
+                 print(f"✓ {test_func.__name__} passed")
+             else:
+                 print(f"✗ {test_func.__name__} failed")
+         except Exception as e:
+             print(f"✗ {test_func.__name__} raised: {e}")
+         print()
+
+     print("=" * 50)
+     print(f"Results: {passed}/{total} passed")
+
+     if passed == total:
+         print("All tests passed! CrawloConfig factory-mode compatibility looks good.")
+         return 0
+     else:
+         print("Some tests failed; please check the implementation.")
+         return 1
+
+
+ if __name__ == "__main__":
+     exit_code = main()
+     exit(exit_code)
tests/test_multi_directory.py
@@ -0,0 +1,68 @@
+ #!/usr/bin/env python3
+ # -*- coding: UTF-8 -*-
+ """
+ Test support for multiple spider directories
+ """
+ import sys
+ import os
+
+ # Add the project root to the Python path
+ sys.path.insert(0, os.path.dirname(__file__))
+
+ # Add ofweek_standalone to the Python path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+ def test_multiple_spider_directories():
+     """Test support for multiple spider directories"""
+     print("Testing support for multiple spider directories...")
+
+     # Import the settings module
+     import examples.ofweek_standalone.ofweek_standalone.settings as settings_module
+
+     # Create the settings manager
+     from crawlo.settings.setting_manager import SettingManager
+     settings = SettingManager()
+     settings.set_settings(settings_module)
+
+     # Check the SPIDER_MODULES setting
+     spider_modules = settings.get('SPIDER_MODULES')
+     print(f"SPIDER_MODULES: {spider_modules}")
+
+     # Create a CrawlerProcess instance
+     from crawlo.crawler import CrawlerProcess
+     process = CrawlerProcess(settings=settings)
+
+     # Check which spiders were registered
+     spider_names = process.get_spider_names()
+     print(f"Registered spiders: {spider_names}")
+
+     # Verify that the expected spiders are registered
+     expected_spiders = ['of_week_standalone', 'test_spider']
+     registered_spiders = []
+
+     for spider_name in expected_spiders:
+         if spider_name in spider_names:
+             print(f"✅ Success: spider '{spider_name}' is registered")
+             registered_spiders.append(spider_name)
+         else:
+             print(f"❌ Failure: spider '{spider_name}' was not found")
+
+     if len(registered_spiders) == len(expected_spiders):
+         print("🎉 All spiders registered successfully!")
+         return True
+     else:
+         print(f"⚠️ Some spiders were not registered: {set(expected_spiders) - set(registered_spiders)}")
+         return False
+
+
+ if __name__ == '__main__':
+     print("Testing support for multiple spider directories...\n")
+
+     success = test_multiple_spider_directories()
+
+     if success:
+         print("\n🎉 Test passed!")
+         sys.exit(0)
+     else:
+         print("\n❌ Test failed!")
+         sys.exit(1)
tests/test_multiple_spider_modules.py
@@ -0,0 +1,81 @@
+ #!/usr/bin/env python3
+ # -*- coding: UTF-8 -*-
+ """
+ Test support for multiple SPIDER_MODULES directories
+ """
+ import sys
+ import os
+ import asyncio
+
+ # Add the project root to the Python path
+ sys.path.insert(0, os.path.dirname(__file__))
+
+ # Add ofweek_standalone to the Python path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.spider import get_spider_names
+
+
+ def test_multiple_spider_modules():
+     """Test support for multiple SPIDER_MODULES directories"""
+     print("Testing support for multiple SPIDER_MODULES directories...")
+
+     # Simulate a SPIDER_MODULES setting with multiple directories
+     spider_modules = ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
+
+     # Create a CrawlerProcess instance
+     process = CrawlerProcess(spider_modules=spider_modules)
+
+     # Check which spiders were registered
+     spider_names = process.get_spider_names()
+     print(f"Registered spiders: {spider_names}")
+
+     # Verify that the expected spider is registered
+     expected_spider = 'of_week_standalone'
+     if expected_spider in spider_names:
+         print(f"✅ Success: spider '{expected_spider}' is registered")
+         return True
+     else:
+         print(f"❌ Failure: spider '{expected_spider}' was not found")
+         return False
+
+
+ def test_settings_with_multiple_spider_modules():
+     """Test multiple SPIDER_MODULES directories configured via settings"""
+     print("\nTesting multiple SPIDER_MODULES directories configured via settings...")
+
+     # Create a mock settings object
+     class MockSettings:
+         def get(self, key, default=None):
+             if key == 'SPIDER_MODULES':
+                 return ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
+             return default
+
+     settings = MockSettings()
+
+     # Create a CrawlerProcess instance
+     process = CrawlerProcess(settings=settings)
+
+     # Check which spiders were registered
+     spider_names = process.get_spider_names()
+     print(f"Registered spiders: {spider_names}")
+
+     return True
+
+
+ if __name__ == '__main__':
+     print("Testing support for multiple SPIDER_MODULES directories...\n")
+
+     # Test passing multiple spider_modules explicitly
+     success1 = test_multiple_spider_modules()
+
+     # Test reading multiple spider_modules from settings
+     success2 = test_settings_with_multiple_spider_modules()
+
+     if success1 and success2:
+         print("\n🎉 All tests passed!")
+         sys.exit(0)
+     else:
+         print("\n❌ Some tests failed!")
+         sys.exit(1)
tests/test_optimized_selector_naming.py
@@ -0,0 +1,101 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for the renamed selector helper functions
+ """
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.utils import (
+     extract_text,
+     extract_texts,
+     extract_attr,
+     extract_attrs,
+     is_xpath
+ )
+ from parsel import Selector
+
+
+ def test_optimized_naming():
+     """Exercise the renamed selector helpers"""
+     print("Testing the renamed selector helpers...")
+     print("=" * 50)
+
+     # Build test HTML
+     html_content = """
+     <html>
+     <head>
+         <title>Test page</title>
+     </head>
+     <body>
+         <div class="content">
+             <h1>Main heading</h1>
+             <p class="intro">Intro paragraph</p>
+             <ul class="list">
+                 <li>Item 1</li>
+                 <li>Item 2</li>
+                 <li>Item 3</li>
+             </ul>
+             <a href="https://example.com" class="link">Link text</a>
+             <img src="image.jpg" alt="Image description" class="image">
+         </div>
+     </body>
+     </html>
+     """
+
+     selector = Selector(text=html_content)
+
+     # Test is_xpath
+     print("1. Testing is_xpath:")
+     print(f"   starts with '/': {is_xpath('/')}")
+     print(f"   starts with '//': {is_xpath('//title')}")
+     print(f"   starts with './': {is_xpath('./div')}")
+     print(f"   starts with 'title': {is_xpath('title')}")
+     print()
+
+     # Test extract_text
+     print("2. Testing extract_text:")
+     title_elements = selector.css('title')
+     title_text = extract_text(title_elements)
+     print(f"   title text: {title_text}")
+
+     h1_elements = selector.css('.content h1')
+     h1_text = extract_text(h1_elements)
+     print(f"   h1 text: {h1_text}")
+     print()
+
+     # Test extract_texts
+     print("3. Testing extract_texts:")
+     li_elements = selector.css('.list li')
+     li_texts = extract_texts(li_elements)
+     print(f"   list item texts: {li_texts}")
+     print()
+
+     # Test extract_attr
+     print("4. Testing extract_attr:")
+     link_elements = selector.css('.link')
+     link_href = extract_attr(link_elements, 'href')
+     print(f"   link href: {link_href}")
+
+     img_elements = selector.css('.image')
+     img_alt = extract_attr(img_elements, 'alt')
+     print(f"   image alt: {img_alt}")
+     print()
+
+     # Test extract_attrs
+     print("5. Testing extract_attrs:")
+     all_links = selector.css('a')
+     all_hrefs = extract_attrs(all_links, 'href')
+     print(f"   all link hrefs: {all_hrefs}")
+
+     all_images = selector.css('img')
+     all_srcs = extract_attrs(all_images, 'src')
+     print(f"   all image srcs: {all_srcs}")
+     print()
+
+     print("All tests finished!")
+
+
+ if __name__ == '__main__':
+     test_optimized_naming()
tests/test_priority_behavior.py
@@ -66,22 +66,22 @@ async def test_redis_queue_priority():
          await queue._redis.delete(f"{queue.queue_name}:data")
  
          # Create requests with different priorities
-         # Note: in the Redis queue, score = -priority,
-         # so a request with priority=-100 gets score=100 and one with priority=100 gets score=-100.
-         # zpopmin pops the element with the smallest score, so the priority=100 request is dequeued first.
-         request_low_priority = Request(url="https://low-priority.com", priority=100)      # low priority (large value)
-         request_high_priority = Request(url="https://high-priority.com", priority=-100)   # high priority (small value)
-         request_normal_priority = Request(url="https://normal-priority.com", priority=0)  # normal priority
+         # Note: the Request constructor negates the priority value it receives,
+         # so priority=100 is stored as -100 and priority=-100 is stored as 100.
+         request_low_priority = Request(url="https://low-priority.com", priority=100)      # stored as -100 (high priority)
+         request_high_priority = Request(url="https://high-priority.com", priority=-100)   # stored as 100 (low priority)
+         request_normal_priority = Request(url="https://normal-priority.com", priority=0)  # stored as 0 (normal priority)
  
          # Enqueue in a fixed order to verify the priority behaviour
-         await queue.put(request_high_priority, priority=-100)   # high priority, score=100
-         await queue.put(request_normal_priority, priority=0)    # normal priority, score=0
-         await queue.put(request_low_priority, priority=100)     # low priority, score=-100
+         # Use the priority as actually stored on each request
+         await queue.put(request_low_priority, priority=request_low_priority.priority)        # effective score=-100
+         await queue.put(request_normal_priority, priority=request_normal_priority.priority)  # effective score=0
+         await queue.put(request_high_priority, priority=request_high_priority.priority)      # effective score=100
  
          print(f"  Queue size: {await queue.qsize()}")
  
-         # Items should be dequeued in ascending score order (descending priority),
-         # so the low-priority request comes out first and the high-priority request last.
+         # Items should be dequeued in ascending score order (ascending priority), so request_low_priority
+         # comes out first (score=-100), request_normal_priority second (score=0), and request_high_priority last (score=100).
          item1 = await queue.get(timeout=2.0)
          item2 = await queue.get(timeout=2.0)
          item3 = await queue.get(timeout=2.0)
@@ -91,13 +91,13 @@ async def test_redis_queue_priority():
          print(f"  Second dequeued: {item2.url if item2 else None}")
          print(f"  Third dequeued: {item3.url if item3 else None}")
  
-         # In the Redis queue the smallest score is popped first, so the largest priority value is dequeued first
-         assert item1 is not None and item1.url == "https://low-priority.com", f"Low priority should be dequeued first, got: {item1.url if item1 else None}"
-         assert item2 is not None and item2.url == "https://normal-priority.com", f"Normal priority should be dequeued second, got: {item2.url if item2 else None}"
-         assert item3 is not None and item3.url == "https://high-priority.com", f"High priority should be dequeued last, got: {item3.url if item3 else None}"
+         # In the Redis queue the smallest score is popped first, so the smallest priority value is dequeued first
+         assert item1 is not None and item1.url == "https://low-priority.com", f"The low-priority request should be dequeued first, got: {item1.url if item1 else None}"
+         assert item2 is not None and item2.url == "https://normal-priority.com", f"The normal-priority request should be dequeued second, got: {item2.url if item2 else None}"
+         assert item3 is not None and item3.url == "https://high-priority.com", f"The high-priority request should be dequeued last, got: {item3.url if item3 else None}"
  
          print("  ✅ Redis queue priority test passed (confirmed the rule that smaller scores are served first)")
-         print("  Note: in the Redis queue score = -priority, so requests with larger priority values get smaller scores and are dequeued first")
+         print("  Note: in the Redis queue score = priority, so requests with smaller priority values get smaller scores and are dequeued first")
  
      except Exception as e:
          print(f"  ❌ Redis queue priority test failed: {e}")
@@ -196,8 +196,8 @@ async def main():
      print("\nSummary:")
      print("1. Request priority follows the rule 'smaller values run first'")
      print("2. Memory queue: uses (priority, request) tuples directly; requests with smaller priority are dequeued first")
-     print("3. Redis queue: uses score = -priority; the smallest score is popped first, so the largest priority value is dequeued first")
-     print("   This is a known behavioural difference to keep in mind when using it")
+     print("3. Redis queue: uses score = priority; the smallest score is popped first, so the smallest priority value is dequeued first")
+     print("   The memory queue and the Redis queue now behave consistently")
      print("4. The retry middleware adjusts request priority according to the RETRY_PRIORITY setting")
      print("5. Built-in priority constants: URGENT(-200) < HIGH(-100) < NORMAL(0) < LOW(100) < BACKGROUND(200)")
      print("6. The Request constructor negates the priority value passed in before storing it")