crawlo-1.4.4-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (85)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/startproject.py +24 -0
  4. crawlo/core/engine.py +2 -2
  5. crawlo/core/scheduler.py +4 -4
  6. crawlo/crawler.py +8 -7
  7. crawlo/downloader/__init__.py +5 -2
  8. crawlo/extension/__init__.py +2 -2
  9. crawlo/filters/aioredis_filter.py +8 -1
  10. crawlo/filters/memory_filter.py +8 -1
  11. crawlo/initialization/built_in.py +13 -4
  12. crawlo/initialization/core.py +5 -4
  13. crawlo/interfaces.py +24 -0
  14. crawlo/middleware/__init__.py +7 -4
  15. crawlo/middleware/middleware_manager.py +15 -8
  16. crawlo/mode_manager.py +45 -11
  17. crawlo/network/response.py +374 -69
  18. crawlo/pipelines/mysql_pipeline.py +6 -6
  19. crawlo/pipelines/pipeline_manager.py +2 -2
  20. crawlo/project.py +2 -4
  21. crawlo/settings/default_settings.py +4 -0
  22. crawlo/task_manager.py +2 -2
  23. crawlo/templates/project/items.py.tmpl +2 -2
  24. crawlo/templates/project/middlewares.py.tmpl +9 -89
  25. crawlo/templates/project/pipelines.py.tmpl +8 -68
  26. crawlo/tools/__init__.py +0 -11
  27. crawlo/utils/__init__.py +17 -1
  28. crawlo/utils/db_helper.py +220 -319
  29. crawlo/utils/error_handler.py +313 -67
  30. crawlo/utils/fingerprint.py +3 -4
  31. crawlo/utils/misc.py +82 -0
  32. crawlo/utils/request.py +55 -66
  33. crawlo/utils/selector_helper.py +138 -0
  34. crawlo/utils/spider_loader.py +185 -45
  35. crawlo/utils/text_helper.py +95 -0
  36. crawlo-1.4.5.dist-info/METADATA +329 -0
  37. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/RECORD +76 -49
  38. tests/bug_check_test.py +251 -0
  39. tests/direct_selector_helper_test.py +97 -0
  40. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  41. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  42. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  43. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  44. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  45. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  46. tests/ofweek_scrapy/scrapy.cfg +11 -0
  47. tests/performance_comparison.py +4 -5
  48. tests/simple_crawlo_test.py +1 -2
  49. tests/simple_follow_test.py +39 -0
  50. tests/simple_response_selector_test.py +95 -0
  51. tests/simple_selector_helper_test.py +155 -0
  52. tests/simple_selector_test.py +208 -0
  53. tests/simple_url_test.py +74 -0
  54. tests/test_crawler_process_import.py +39 -0
  55. tests/test_crawler_process_spider_modules.py +48 -0
  56. tests/test_edge_cases.py +7 -5
  57. tests/test_encoding_core.py +57 -0
  58. tests/test_encoding_detection.py +127 -0
  59. tests/test_factory_compatibility.py +197 -0
  60. tests/test_optimized_selector_naming.py +101 -0
  61. tests/test_priority_behavior.py +18 -18
  62. tests/test_response_follow.py +105 -0
  63. tests/test_response_selector_methods.py +93 -0
  64. tests/test_response_url_methods.py +71 -0
  65. tests/test_response_urljoin.py +87 -0
  66. tests/test_scrapy_style_encoding.py +113 -0
  67. tests/test_selector_helper.py +101 -0
  68. tests/test_selector_optimizations.py +147 -0
  69. tests/test_spider_loader.py +50 -0
  70. tests/test_spider_loader_comprehensive.py +70 -0
  71. tests/test_spiders/__init__.py +1 -0
  72. tests/test_spiders/test_spider.py +10 -0
  73. crawlo/tools/anti_crawler.py +0 -269
  74. crawlo/utils/class_loader.py +0 -26
  75. crawlo/utils/enhanced_error_handler.py +0 -357
  76. crawlo-1.4.4.dist-info/METADATA +0 -190
  77. tests/simple_log_test.py +0 -58
  78. tests/simple_test.py +0 -48
  79. tests/test_framework_logger.py +0 -67
  80. tests/test_framework_startup.py +0 -65
  81. tests/test_mode_change.py +0 -73
  82. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  83. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  84. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  85. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
@@ -0,0 +1,197 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ Test CrawloConfig factory-pattern compatibility.
+ """
+
+ import sys
+ import os
+ import traceback
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.config import CrawloConfig
+
+
+ def test_standalone_factory():
+     """Test the standalone-mode factory function."""
+     print("Testing the standalone-mode factory function...")
+
+     try:
+         # Create a standalone-mode configuration
+         config = CrawloConfig.standalone(
+             project_name='ofweek_standalone',
+             concurrency=8,
+             download_delay=1.0
+         )
+
+         print("Configuration created successfully")
+         print(f"RUN_MODE: {config.get('RUN_MODE')}")
+         print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+         print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+         print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+         print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+         # Verify the configuration is correct
+         assert config.get('RUN_MODE') == 'standalone'
+         assert config.get('QUEUE_TYPE') == 'memory'
+         assert config.get('PROJECT_NAME') == 'ofweek_standalone'
+         assert config.get('CONCURRENCY') == 8
+         assert config.get('DOWNLOAD_DELAY') == 1.0
+
+         print("✅ Standalone-mode factory function test passed")
+         return True
+
+     except Exception as e:
+         print(f"❌ Standalone-mode factory function test failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ def test_distributed_factory():
+     """Test the distributed-mode factory function."""
+     print("\nTesting the distributed-mode factory function...")
+
+     try:
+         # Create a distributed-mode configuration
+         config = CrawloConfig.distributed(
+             redis_host='127.0.0.1',
+             redis_port=6379,
+             project_name='ofweek_distributed',
+             concurrency=16,
+             download_delay=0.5
+         )
+
+         print("Configuration created successfully")
+         print(f"RUN_MODE: {config.get('RUN_MODE')}")
+         print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+         print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+         print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+         print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+         print(f"REDIS_HOST: {config.get('REDIS_HOST')}")
+         print(f"REDIS_PORT: {config.get('REDIS_PORT')}")
+
+         # Verify the configuration is correct
+         assert config.get('RUN_MODE') == 'distributed'
+         assert config.get('QUEUE_TYPE') == 'redis'
+         assert config.get('PROJECT_NAME') == 'ofweek_distributed'
+         assert config.get('CONCURRENCY') == 16
+         assert config.get('DOWNLOAD_DELAY') == 0.5
+         assert config.get('REDIS_HOST') == '127.0.0.1'
+         assert config.get('REDIS_PORT') == 6379
+
+         print("✅ Distributed-mode factory function test passed")
+         return True
+
+     except Exception as e:
+         print(f"❌ Distributed-mode factory function test failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ def test_auto_factory():
+     """Test the auto-mode factory function."""
+     print("\nTesting the auto-mode factory function...")
+
+     try:
+         # Create an auto-mode configuration
+         config = CrawloConfig.auto(
+             project_name='ofweek_auto',
+             concurrency=12,
+             download_delay=0.8
+         )
+
+         print("Configuration created successfully")
+         print(f"RUN_MODE: {config.get('RUN_MODE')}")
+         print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+         print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+         print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+         print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+         # Verify the configuration is correct
+         assert config.get('RUN_MODE') == 'auto'
+         assert config.get('QUEUE_TYPE') == 'auto'
+         assert config.get('PROJECT_NAME') == 'ofweek_auto'
+         assert config.get('CONCURRENCY') == 12
+         assert config.get('DOWNLOAD_DELAY') == 0.8
+
+         print("✅ Auto-mode factory function test passed")
+         return True
+
+     except Exception as e:
+         print(f"❌ Auto-mode factory function test failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ def test_config_to_dict():
+     """Test converting a configuration to a dict."""
+     print("\nTesting configuration-to-dict conversion...")
+
+     try:
+         # Create a configuration
+         config = CrawloConfig.standalone(
+             project_name='test_project',
+             concurrency=4
+         )
+
+         # Convert it to a dict
+         config_dict = config.to_dict()
+
+         print("Dict conversion succeeded")
+         print(f"Number of keys: {len(config_dict)}")
+
+         # Verify the key configuration entries
+         assert 'RUN_MODE' in config_dict
+         assert 'QUEUE_TYPE' in config_dict
+         assert 'PROJECT_NAME' in config_dict
+         assert 'CONCURRENCY' in config_dict
+
+         print("✅ Configuration-to-dict conversion test passed")
+         return True
+
+     except Exception as e:
+         print(f"❌ Configuration-to-dict conversion test failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ def main():
+     """Main entry point."""
+     print("Starting CrawloConfig factory-pattern compatibility tests...")
+     print("=" * 50)
+
+     tests = [
+         test_standalone_factory,
+         test_distributed_factory,
+         test_auto_factory,
+         test_config_to_dict,
+     ]
+
+     passed = 0
+     total = len(tests)
+
+     for test_func in tests:
+         try:
+             if test_func():
+                 passed += 1
+                 print(f"✓ {test_func.__name__} passed")
+             else:
+                 print(f"✗ {test_func.__name__} failed")
+         except Exception as e:
+             print(f"✗ {test_func.__name__} raised an exception: {e}")
+         print()
+
+     print("=" * 50)
+     print(f"Results: {passed}/{total} tests passed")
+
+     if passed == total:
+         print("All tests passed! CrawloConfig factory-pattern compatibility is intact.")
+         return 0
+     else:
+         print("Some tests failed; please check the implementation.")
+         return 1
+
+
+ if __name__ == "__main__":
+     exit_code = main()
+     exit(exit_code)
@@ -0,0 +1,101 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for the optimized selector helper naming.
+ """
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.utils import (
+     extract_text,
+     extract_texts,
+     extract_attr,
+     extract_attrs,
+     is_xpath
+ )
+ from parsel import Selector
+
+
+ def test_optimized_naming():
+     """Test the optimized helper naming."""
+     print("Testing the optimized selector naming...")
+     print("=" * 50)
+
+     # Build test HTML
+     html_content = """
+     <html>
+     <head>
+         <title>Test Page</title>
+     </head>
+     <body>
+         <div class="content">
+             <h1>Main Heading</h1>
+             <p class="intro">Intro paragraph</p>
+             <ul class="list">
+                 <li>Item 1</li>
+                 <li>Item 2</li>
+                 <li>Item 3</li>
+             </ul>
+             <a href="https://example.com" class="link">Link text</a>
+             <img src="image.jpg" alt="Image description" class="image">
+         </div>
+     </body>
+     </html>
+     """
+
+     selector = Selector(text=html_content)
+
+     # Test is_xpath
+     print("1. Testing is_xpath:")
+     print(f"  starts with '/': {is_xpath('/')}")
+     print(f"  starts with '//': {is_xpath('//title')}")
+     print(f"  starts with './': {is_xpath('./div')}")
+     print(f"  starts with 'title': {is_xpath('title')}")
+     print()
+
+     # Test extract_text
+     print("2. Testing extract_text:")
+     title_elements = selector.css('title')
+     title_text = extract_text(title_elements)
+     print(f"  Title text: {title_text}")
+
+     h1_elements = selector.css('.content h1')
+     h1_text = extract_text(h1_elements)
+     print(f"  H1 text: {h1_text}")
+     print()
+
+     # Test extract_texts
+     print("3. Testing extract_texts:")
+     li_elements = selector.css('.list li')
+     li_texts = extract_texts(li_elements)
+     print(f"  List item texts: {li_texts}")
+     print()
+
+     # Test extract_attr
+     print("4. Testing extract_attr:")
+     link_elements = selector.css('.link')
+     link_href = extract_attr(link_elements, 'href')
+     print(f"  Link href: {link_href}")
+
+     img_elements = selector.css('.image')
+     img_alt = extract_attr(img_elements, 'alt')
+     print(f"  Image alt: {img_alt}")
+     print()
+
+     # Test extract_attrs
+     print("5. Testing extract_attrs:")
+     all_links = selector.css('a')
+     all_hrefs = extract_attrs(all_links, 'href')
+     print(f"  All link hrefs: {all_hrefs}")
+
+     all_images = selector.css('img')
+     all_srcs = extract_attrs(all_images, 'src')
+     print(f"  All image srcs: {all_srcs}")
+     print()
+
+     print("All tests completed!")
+
+
+ if __name__ == '__main__':
+     test_optimized_naming()
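
For readers unfamiliar with these helpers, the sketch below illustrates the behaviour the test above exercises. It is a minimal illustration written directly against parsel and is only an assumption about the semantics; the actual crawlo.utils implementations (extract_text, extract_attrs, and the rest) may normalise text or handle defaults differently.

# Minimal sketch of the assumed helper semantics, not crawlo's actual code.
from parsel import Selector, SelectorList


def extract_text(elements: SelectorList, default: str = "") -> str:
    """Return the joined, stripped text of the first matched element."""
    if not elements:
        return default
    return " ".join(elements[0].xpath(".//text()").getall()).strip() or default


def extract_attrs(elements: SelectorList, attr: str) -> list:
    """Return the value of `attr` for every matched element that defines it."""
    return [el.attrib[attr] for el in elements if attr in el.attrib]


sel = Selector(text='<a href="/a">A</a> <a href="/b">B</a>')
print(extract_text(sel.css("a")))            # "A"
print(extract_attrs(sel.css("a"), "href"))   # ["/a", "/b"]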
@@ -66,22 +66,22 @@ async def test_redis_queue_priority():
          await queue._redis.delete(f"{queue.queue_name}:data")
  
          # Create requests with different priorities
-         # Note: in the Redis queue, score = -priority,
-         # so a priority=-100 request gets score=100 and a priority=100 request gets score=-100.
-         # zpopmin pops the element with the smallest score, so the priority=100 request is dequeued first.
-         request_low_priority = Request(url="https://low-priority.com", priority=100)  # low priority (large value)
-         request_high_priority = Request(url="https://high-priority.com", priority=-100)  # high priority (small value)
-         request_normal_priority = Request(url="https://normal-priority.com", priority=0)  # normal priority
+         # Note: the Request constructor stores the negated value of the priority passed in,
+         # so a priority=100 request is actually stored as -100 and a priority=-100 request as 100.
+         request_low_priority = Request(url="https://low-priority.com", priority=100)  # stored as -100 (high priority)
+         request_high_priority = Request(url="https://high-priority.com", priority=-100)  # stored as 100 (low priority)
+         request_normal_priority = Request(url="https://normal-priority.com", priority=0)  # stored as 0 (normal priority)
  
          # Enqueue in the right order to verify the priority behaviour
-         await queue.put(request_high_priority, priority=-100)  # high priority, score=100
-         await queue.put(request_normal_priority, priority=0)  # normal priority, score=0
-         await queue.put(request_low_priority, priority=100)  # low priority, score=-100
+         # Use the priority actually stored on each request
+         await queue.put(request_low_priority, priority=request_low_priority.priority)  # actual score=-100
+         await queue.put(request_normal_priority, priority=request_normal_priority.priority)  # actual score=0
+         await queue.put(request_high_priority, priority=request_high_priority.priority)  # actual score=100
  
          print(f"  Queue size: {await queue.qsize()}")
  
-         # Dequeue order should follow ascending score (descending priority),
-         # so the low-priority request is dequeued first and the high-priority one last
+         # Dequeue order should follow ascending score (ascending priority),
+         # so request_low_priority is dequeued first (score=-100), request_normal_priority second (score=0), and request_high_priority last (score=100)
          item1 = await queue.get(timeout=2.0)
          item2 = await queue.get(timeout=2.0)
          item3 = await queue.get(timeout=2.0)
@@ -91,13 +91,13 @@ async def test_redis_queue_priority():
          print(f"  Second dequeued: {item2.url if item2 else None}")
          print(f"  Third dequeued: {item3.url if item3 else None}")
  
-         # In the Redis queue the smallest score is dequeued first, so the largest priority value goes first
-         assert item1 is not None and item1.url == "https://low-priority.com", f"Low priority should be dequeued first, got: {item1.url if item1 else None}"
-         assert item2 is not None and item2.url == "https://normal-priority.com", f"Normal priority should be dequeued second, got: {item2.url if item2 else None}"
-         assert item3 is not None and item3.url == "https://high-priority.com", f"High priority should be dequeued last, got: {item3.url if item3 else None}"
+         # In the Redis queue the smallest score is dequeued first, so the smallest priority value goes first
+         assert item1 is not None and item1.url == "https://low-priority.com", f"The low-priority request should be dequeued first, got: {item1.url if item1 else None}"
+         assert item2 is not None and item2.url == "https://normal-priority.com", f"The normal-priority request should be dequeued second, got: {item2.url if item2 else None}"
+         assert item3 is not None and item3.url == "https://high-priority.com", f"The high-priority request should be dequeued last, got: {item3.url if item3 else None}"
  
          print("  ✅ Redis queue priority test passed (confirms that a smaller score is dequeued first)")
-         print("  Note: in the Redis queue score = -priority, so requests with larger priority values get smaller scores and are dequeued first")
+         print("  Note: in the Redis queue score = priority, so requests with smaller priority values get smaller scores and are dequeued first")
  
      except Exception as e:
          print(f"  ❌ Redis queue priority test failed: {e}")
@@ -196,8 +196,8 @@ async def main():
      print("\nSummary:")
      print("1. Request priority follows the 'smaller value = higher priority' rule")
      print("2. Memory queue: uses (priority, request) tuples directly; the smallest priority is dequeued first")
-     print("3. Redis queue: uses score = -priority; the smallest score is dequeued first, so the largest priority value goes first")
-     print("   This is a known behavioural difference to keep in mind when using it")
+     print("3. Redis queue: uses score = priority; the smallest score is dequeued first, so the smallest priority value goes first")
+     print("   The memory queue and the Redis queue now behave consistently")
      print("4. The retry middleware adjusts request priority according to the RETRY_PRIORITY setting")
      print("5. Built-in priority constants: URGENT(-200) < HIGH(-100) < NORMAL(0) < LOW(100) < BACKGROUND(200)")
      print("6. The Request constructor stores the negated value of the priority passed in")
@@ -0,0 +1,105 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for the Response.follow method.
+ """
+ import unittest
+ from unittest.mock import Mock
+
+ # Mock Request class
+ class MockRequest:
+     def __init__(self, url, callback=None, **kwargs):
+         self.url = url
+         self.callback = callback
+         self.kwargs = kwargs
+
+ # Stub out crawlo.Request with the mock class
+ import sys
+ sys.modules['crawlo'] = Mock()
+ sys.modules['crawlo'].Request = MockRequest
+
+ from crawlo.network.response import Response
+
+
+ class TestResponseFollow(unittest.TestCase):
+     """Test cases for the Response.follow method."""
+
+     def setUp(self):
+         """Prepare fixtures before each test."""
+         # Build a mock HTML response
+         html_content = """
+         <html>
+         <head>
+             <title>Test Page</title>
+         </head>
+         <body>
+             <div class="content">
+                 <h1>Main Heading</h1>
+                 <p class="intro">This is the intro paragraph</p>
+                 <ul class="list">
+                     <li>Item 1</li>
+                     <li>Item 2</li>
+                     <li>Item 3</li>
+                 </ul>
+                 <a href="https://example.com" class="link">Link text</a>
+                 <a href="/relative/path" class="relative-link">Relative link</a>
+                 <img src="image.jpg" alt="Image description" class="image">
+             </div>
+         </body>
+         </html>
+         """
+
+         # Create a mock request object
+         mock_request = Mock()
+         mock_request.callback = None
+
+         self.response = Response(
+             url="https://example.com/test",
+             body=html_content.encode('utf-8'),
+             headers={"content-type": "text/html; charset=utf-8"},
+             request=mock_request
+         )
+
+     def test_follow_absolute_url(self):
+         """Test following an absolute URL."""
+         request = self.response.follow("https://other.com/page", callback=lambda r: None)
+         self.assertEqual(request.url, "https://other.com/page")
+         self.assertIsNotNone(request.callback)
+
+     def test_follow_relative_url(self):
+         """Test following a relative URL."""
+         request = self.response.follow("/relative/path", callback=lambda r: None)
+         self.assertEqual(request.url, "https://example.com/relative/path")
+         self.assertIsNotNone(request.callback)
+
+     def test_follow_complex_relative_url(self):
+         """Test following more complex relative URLs."""
+         request = self.response.follow("../other/path", callback=lambda r: None)
+         self.assertEqual(request.url, "https://example.com/other/path")
+
+         request2 = self.response.follow("./another/path", callback=lambda r: None)
+         self.assertEqual(request2.url, "https://example.com/another/path")
+
+     def test_follow_with_query_params(self):
+         """Test following URLs with query parameters and fragments."""
+         request = self.response.follow("/path?param=value", callback=lambda r: None)
+         self.assertEqual(request.url, "https://example.com/path?param=value")
+
+         request2 = self.response.follow("/path#section", callback=lambda r: None)
+         self.assertEqual(request2.url, "https://example.com/path#section")
+
+     def test_follow_with_additional_kwargs(self):
+         """Test passing extra keyword arguments through follow."""
+         request = self.response.follow(
+             "/path",
+             callback=lambda r: None,
+             method="POST",
+             headers={"User-Agent": "test"}
+         )
+         self.assertEqual(request.url, "https://example.com/path")
+         self.assertEqual(request.kwargs.get("method"), "POST")
+         self.assertEqual(request.kwargs.get("headers"), {"User-Agent": "test"})
+
+
+ if __name__ == '__main__':
+     unittest.main()
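
The URL resolution these tests expect matches what urllib.parse.urljoin produces for the same base URL. The snippet below only illustrates that expectation; whether Response.follow actually delegates to urljoin is an assumption, and the real method additionally builds and returns a Request object.

# Illustration of the expected URL resolution, assuming urljoin semantics.
from urllib.parse import urljoin

base = "https://example.com/test"
links = [
    "https://other.com/page",   # absolute URL is kept as-is
    "/relative/path",           # root-relative
    "../other/path",            # parent-relative
    "./another/path",           # current-directory relative
    "/path?param=value",        # query string preserved
    "/path#section",            # fragment preserved
]
for link in links:
    print(urljoin(base, link))
# https://other.com/page
# https://example.com/relative/path
# https://example.com/other/path
# https://example.com/another/path
# https://example.com/path?param=value
# https://example.com/path#section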
@@ -0,0 +1,93 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Test the selector methods on the Response class.
+ """
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.network.response import Response
+
+
+ def test_response_selector_methods():
+     """Test the selector methods on the Response class."""
+     print("Testing the selector methods on the Response class...")
+     print("=" * 50)
+
+     # Build a test HTML response
+     html_content = """
+     <html>
+     <head>
+         <title>Test Page</title>
+     </head>
+     <body>
+         <div class="content">
+             <h1>Main Heading</h1>
+             <p class="intro">Intro paragraph</p>
+             <ul class="list">
+                 <li>Item 1</li>
+                 <li>Item 2</li>
+                 <li>Item 3</li>
+             </ul>
+             <a href="https://example.com" class="link">Link text</a>
+             <img src="image.jpg" alt="Image description" class="image">
+         </div>
+     </body>
+     </html>
+     """
+
+     # Create the Response object
+     response = Response(
+         url="https://example.com/test",
+         body=html_content.encode('utf-8'),
+         headers={"content-type": "text/html; charset=utf-8"}
+     )
+
+     # Test extract_text (CSS selector)
+     print("1. Testing extract_text (CSS selector):")
+     title_text = response.extract_text('title')
+     print(f"  Title text: {title_text}")
+
+     h1_text = response.extract_text('.content h1')
+     print(f"  H1 text: {h1_text}")
+     print()
+
+     # Test extract_text (XPath selector)
+     print("2. Testing extract_text (XPath selector):")
+     title_text_xpath = response.extract_text('//title')
+     print(f"  Title text: {title_text_xpath}")
+
+     h1_text_xpath = response.extract_text('//div[@class="content"]/h1')
+     print(f"  H1 text: {h1_text_xpath}")
+     print()
+
+     # Test extract_texts
+     print("3. Testing extract_texts:")
+     li_texts = response.extract_texts('.list li')
+     print(f"  List item texts: {li_texts}")
+     print()
+
+     # Test extract_attr
+     print("4. Testing extract_attr:")
+     link_href = response.extract_attr('.link', 'href')
+     print(f"  Link href: {link_href}")
+
+     img_alt = response.extract_attr('.image', 'alt')
+     print(f"  Image alt: {img_alt}")
+     print()
+
+     # Test extract_attrs
+     print("5. Testing extract_attrs:")
+     all_links = response.extract_attrs('a', 'href')
+     print(f"  All link hrefs: {all_links}")
+
+     all_images = response.extract_attrs('img', 'src')
+     print(f"  All image srcs: {all_images}")
+     print()
+
+     print("All tests completed!")
+
+
+ if __name__ == '__main__':
+     test_response_selector_methods()
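
Note that the same query string is accepted as either CSS or XPath above. A plausible dispatch rule, mirroring the is_xpath helper tested earlier, is sketched below; this is an assumption about how Response might route queries, not its actual implementation.

# Assumed CSS/XPath dispatch, mirroring is_xpath; the real Response may differ.
from parsel import Selector


def is_xpath(query: str) -> bool:
    """Treat queries that start with '/', '//' or './' as XPath."""
    return query.startswith(("/", "//", "./"))


def select(selector: Selector, query: str):
    """Route the query to .xpath() or .css() based on its prefix."""
    return selector.xpath(query) if is_xpath(query) else selector.css(query)


sel = Selector(text="<div class='content'><h1>Main Heading</h1></div>")
print(select(sel, ".content h1").xpath("string(.)").get())      # CSS query
print(select(sel, "//div[@class='content']/h1/text()").get())   # XPath query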
@@ -0,0 +1,71 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for the URL handling helpers used by Response.
+ """
+ import unittest
+ from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
+
+
+ class TestUrlMethods(unittest.TestCase):
+     """Test cases for the URL handling helpers."""
+
+     def setUp(self):
+         """Prepare fixtures before each test."""
+         self.test_url = "https://example.com/test?param1=value1&param2=value2#section1"
+
+     def test_urlparse(self):
+         """Test urlparse."""
+         parsed = urlparse(self.test_url)
+         self.assertEqual(parsed.scheme, "https")
+         self.assertEqual(parsed.netloc, "example.com")
+         self.assertEqual(parsed.path, "/test")
+         self.assertEqual(parsed.query, "param1=value1&param2=value2")
+         self.assertEqual(parsed.fragment, "section1")
+
+     def test_urlsplit(self):
+         """Test urlsplit."""
+         split_result = urlsplit(self.test_url)
+         self.assertEqual(split_result.scheme, "https")
+         self.assertEqual(split_result.netloc, "example.com")
+         self.assertEqual(split_result.path, "/test")
+         self.assertEqual(split_result.query, "param1=value1&param2=value2")
+         self.assertEqual(split_result.fragment, "section1")
+
+     def test_parse_qs(self):
+         """Test parse_qs."""
+         query_dict = parse_qs("param1=value1&param2=value2&param2=value3")
+         self.assertIn("param1", query_dict)
+         self.assertIn("param2", query_dict)
+         self.assertEqual(query_dict["param1"], ["value1"])
+         self.assertEqual(query_dict["param2"], ["value2", "value3"])
+
+     def test_urlencode(self):
+         """Test urlencode."""
+         query_dict = {"name": "张三", "age": 25, "city": "北京"}
+         encoded = urlencode(query_dict)
+         # Note: the ordering produced by urlencode may vary, so just check that every key/value pair is present
+         self.assertIn("name=%E5%BC%A0%E4%B8%89", encoded)
+         self.assertIn("age=25", encoded)
+         self.assertIn("city=%E5%8C%97%E4%BA%AC", encoded)
+
+     def test_quote_unquote(self):
+         """Test quote and unquote."""
+         # Test quote
+         original = "hello world 你好"
+         quoted = quote(original)
+         self.assertEqual(quoted, "hello%20world%20%E4%BD%A0%E5%A5%BD")
+
+         # Test unquote
+         unquoted = unquote(quoted)
+         self.assertEqual(unquoted, original)
+
+     def test_urldefrag(self):
+         """Test urldefrag."""
+         url_without_frag, fragment = urldefrag(self.test_url)
+         self.assertEqual(url_without_frag, "https://example.com/test?param1=value1&param2=value2")
+         self.assertEqual(fragment, "section1")
+
+
+ if __name__ == '__main__':
+     unittest.main()