crawlo-1.4.3-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of crawlo has been flagged as potentially problematic.

Files changed (107)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +52 -17
  4. crawlo/commands/startproject.py +24 -0
  5. crawlo/core/engine.py +2 -2
  6. crawlo/core/scheduler.py +4 -4
  7. crawlo/crawler.py +13 -6
  8. crawlo/downloader/__init__.py +5 -2
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/mode_manager.py +45 -11
  18. crawlo/network/response.py +374 -69
  19. crawlo/pipelines/mysql_pipeline.py +6 -6
  20. crawlo/pipelines/pipeline_manager.py +2 -2
  21. crawlo/project.py +2 -4
  22. crawlo/queue/pqueue.py +2 -6
  23. crawlo/queue/queue_manager.py +1 -2
  24. crawlo/settings/default_settings.py +15 -30
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +51 -65
  30. crawlo/templates/project/settings_distributed.py.tmpl +59 -67
  31. crawlo/templates/project/settings_gentle.py.tmpl +45 -40
  32. crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
  33. crawlo/templates/project/settings_minimal.py.tmpl +37 -26
  34. crawlo/templates/project/settings_simple.py.tmpl +45 -40
  35. crawlo/templates/run.py.tmpl +3 -7
  36. crawlo/tools/__init__.py +0 -11
  37. crawlo/utils/__init__.py +17 -1
  38. crawlo/utils/db_helper.py +220 -319
  39. crawlo/utils/error_handler.py +313 -67
  40. crawlo/utils/fingerprint.py +3 -4
  41. crawlo/utils/misc.py +82 -0
  42. crawlo/utils/request.py +55 -66
  43. crawlo/utils/selector_helper.py +138 -0
  44. crawlo/utils/spider_loader.py +185 -45
  45. crawlo/utils/text_helper.py +95 -0
  46. crawlo-1.4.5.dist-info/METADATA +329 -0
  47. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
  48. tests/bug_check_test.py +251 -0
  49. tests/direct_selector_helper_test.py +97 -0
  50. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  51. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  52. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  56. tests/ofweek_scrapy/scrapy.cfg +11 -0
  57. tests/performance_comparison.py +4 -5
  58. tests/simple_crawlo_test.py +1 -2
  59. tests/simple_follow_test.py +39 -0
  60. tests/simple_response_selector_test.py +95 -0
  61. tests/simple_selector_helper_test.py +155 -0
  62. tests/simple_selector_test.py +208 -0
  63. tests/simple_url_test.py +74 -0
  64. tests/test_crawler_process_import.py +39 -0
  65. tests/test_crawler_process_spider_modules.py +48 -0
  66. tests/test_edge_cases.py +7 -5
  67. tests/test_encoding_core.py +57 -0
  68. tests/test_encoding_detection.py +127 -0
  69. tests/test_factory_compatibility.py +197 -0
  70. tests/test_multi_directory.py +68 -0
  71. tests/test_multiple_spider_modules.py +81 -0
  72. tests/test_optimized_selector_naming.py +101 -0
  73. tests/test_priority_behavior.py +18 -18
  74. tests/test_response_follow.py +105 -0
  75. tests/test_response_selector_methods.py +93 -0
  76. tests/test_response_url_methods.py +71 -0
  77. tests/test_response_urljoin.py +87 -0
  78. tests/test_scrapy_style_encoding.py +113 -0
  79. tests/test_selector_helper.py +101 -0
  80. tests/test_selector_optimizations.py +147 -0
  81. tests/test_spider_loader.py +50 -0
  82. tests/test_spider_loader_comprehensive.py +70 -0
  83. tests/test_spider_modules.py +85 -0
  84. tests/test_spiders/__init__.py +1 -0
  85. tests/test_spiders/test_spider.py +10 -0
  86. crawlo/tools/anti_crawler.py +0 -269
  87. crawlo/utils/class_loader.py +0 -26
  88. crawlo/utils/enhanced_error_handler.py +0 -357
  89. crawlo-1.4.3.dist-info/METADATA +0 -190
  90. examples/test_project/__init__.py +0 -7
  91. examples/test_project/run.py +0 -35
  92. examples/test_project/test_project/__init__.py +0 -4
  93. examples/test_project/test_project/items.py +0 -18
  94. examples/test_project/test_project/middlewares.py +0 -119
  95. examples/test_project/test_project/pipelines.py +0 -97
  96. examples/test_project/test_project/settings.py +0 -170
  97. examples/test_project/test_project/spiders/__init__.py +0 -10
  98. examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  99. tests/simple_log_test.py +0 -58
  100. tests/simple_test.py +0 -48
  101. tests/test_framework_logger.py +0 -67
  102. tests/test_framework_startup.py +0 -65
  103. tests/test_mode_change.py +0 -73
  104. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  105. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  106. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  107. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_response_follow.py (new file)
@@ -0,0 +1,105 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for Response.follow
+ """
+ import unittest
+ from unittest.mock import Mock
+
+ # Minimal stand-in for the Request class
+ class MockRequest:
+     def __init__(self, url, callback=None, **kwargs):
+         self.url = url
+         self.callback = callback
+         self.kwargs = kwargs
+
+ # Stub out crawlo.Request before importing Response
+ import sys
+ sys.modules['crawlo'] = Mock()
+ sys.modules['crawlo'].Request = MockRequest
+
+ from crawlo.network.response import Response
+
+
+ class TestResponseFollow(unittest.TestCase):
+     """Test cases for Response.follow"""
+
+     def setUp(self):
+         """Set up test fixtures"""
+         # Build a mock HTML response
+         html_content = """
+         <html>
+         <head>
+             <title>Test page</title>
+         </head>
+         <body>
+             <div class="content">
+                 <h1>Main title</h1>
+                 <p class="intro">This is the intro paragraph</p>
+                 <ul class="list">
+                     <li>Item 1</li>
+                     <li>Item 2</li>
+                     <li>Item 3</li>
+                 </ul>
+                 <a href="https://example.com" class="link">Link text</a>
+                 <a href="/relative/path" class="relative-link">Relative link</a>
+                 <img src="image.jpg" alt="Image description" class="image">
+             </div>
+         </body>
+         </html>
+         """
+
+         # Mock request object attached to the response
+         mock_request = Mock()
+         mock_request.callback = None
+
+         self.response = Response(
+             url="https://example.com/test",
+             body=html_content.encode('utf-8'),
+             headers={"content-type": "text/html; charset=utf-8"},
+             request=mock_request
+         )
+
+     def test_follow_absolute_url(self):
+         """Absolute URLs are passed through unchanged"""
+         request = self.response.follow("https://other.com/page", callback=lambda r: None)
+         self.assertEqual(request.url, "https://other.com/page")
+         self.assertIsNotNone(request.callback)
+
+     def test_follow_relative_url(self):
+         """Relative URLs are resolved against the response URL"""
+         request = self.response.follow("/relative/path", callback=lambda r: None)
+         self.assertEqual(request.url, "https://example.com/relative/path")
+         self.assertIsNotNone(request.callback)
+
+     def test_follow_complex_relative_url(self):
+         """Relative URLs containing ../ and ./ segments"""
+         request = self.response.follow("../other/path", callback=lambda r: None)
+         self.assertEqual(request.url, "https://example.com/other/path")
+
+         request2 = self.response.follow("./another/path", callback=lambda r: None)
+         self.assertEqual(request2.url, "https://example.com/another/path")
+
+     def test_follow_with_query_params(self):
+         """URLs carrying query strings or fragments"""
+         request = self.response.follow("/path?param=value", callback=lambda r: None)
+         self.assertEqual(request.url, "https://example.com/path?param=value")
+
+         request2 = self.response.follow("/path#section", callback=lambda r: None)
+         self.assertEqual(request2.url, "https://example.com/path#section")
+
+     def test_follow_with_additional_kwargs(self):
+         """Extra keyword arguments are forwarded to the Request"""
+         request = self.response.follow(
+             "/path",
+             callback=lambda r: None,
+             method="POST",
+             headers={"User-Agent": "test"}
+         )
+         self.assertEqual(request.url, "https://example.com/path")
+         self.assertEqual(request.kwargs.get("method"), "POST")
+         self.assertEqual(request.kwargs.get("headers"), {"User-Agent": "test"})
+
+
+ if __name__ == '__main__':
+     unittest.main()
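A quick note on what these cases pin down: a spider callback can hand relative hrefs straight to response.follow and let the response resolve them against its own URL. The sketch below is illustrative only; the crawlo.Spider base class, the name/start_urls attributes, and the parse callback signature are assumed Scrapy-style conventions and are not part of this diff.

from crawlo import Spider

class ExampleSpider(Spider):
    # Hypothetical spider: Spider, name, start_urls and the parse signature are
    # assumed Scrapy-style conventions, not verified against crawlo 1.4.5.
    name = "example"
    start_urls = ["https://example.com/list"]

    def parse(self, response):
        # Relative hrefs such as "/relative/path" are resolved against
        # response.url, as the test cases above assert.
        for href in response.extract_attrs("a", "href"):
            yield response.follow(href, callback=self.parse_detail)

    def parse_detail(self, response):
        yield {"url": response.url, "title": response.extract_text("title")}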
tests/test_response_selector_methods.py (new file)
@@ -0,0 +1,93 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for the selector methods on the Response class
+ """
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.network.response import Response
+
+
+ def test_response_selector_methods():
+     """Exercise the selector methods on the Response class"""
+     print("Testing the Response selector methods...")
+     print("=" * 50)
+
+     # Build a test HTML response
+     html_content = """
+     <html>
+     <head>
+         <title>Test page</title>
+     </head>
+     <body>
+         <div class="content">
+             <h1>Main title</h1>
+             <p class="intro">Intro paragraph</p>
+             <ul class="list">
+                 <li>Item 1</li>
+                 <li>Item 2</li>
+                 <li>Item 3</li>
+             </ul>
+             <a href="https://example.com" class="link">Link text</a>
+             <img src="image.jpg" alt="Image description" class="image">
+         </div>
+     </body>
+     </html>
+     """
+
+     # Create the Response object
+     response = Response(
+         url="https://example.com/test",
+         body=html_content.encode('utf-8'),
+         headers={"content-type": "text/html; charset=utf-8"}
+     )
+
+     # extract_text with a CSS selector
+     print("1. extract_text (CSS selector):")
+     title_text = response.extract_text('title')
+     print(f"   Title text: {title_text}")
+
+     h1_text = response.extract_text('.content h1')
+     print(f"   H1 text: {h1_text}")
+     print()
+
+     # extract_text with an XPath selector
+     print("2. extract_text (XPath selector):")
+     title_text_xpath = response.extract_text('//title')
+     print(f"   Title text: {title_text_xpath}")
+
+     h1_text_xpath = response.extract_text('//div[@class="content"]/h1')
+     print(f"   H1 text: {h1_text_xpath}")
+     print()
+
+     # extract_texts
+     print("3. extract_texts:")
+     li_texts = response.extract_texts('.list li')
+     print(f"   List item texts: {li_texts}")
+     print()
+
+     # extract_attr
+     print("4. extract_attr:")
+     link_href = response.extract_attr('.link', 'href')
+     print(f"   Link href: {link_href}")
+
+     img_alt = response.extract_attr('.image', 'alt')
+     print(f"   Image alt: {img_alt}")
+     print()
+
+     # extract_attrs
+     print("5. extract_attrs:")
+     all_links = response.extract_attrs('a', 'href')
+     print(f"   All link hrefs: {all_links}")
+
+     all_images = response.extract_attrs('img', 'src')
+     print(f"   All image srcs: {all_images}")
+     print()
+
+     print("All tests completed!")
+
+
+ if __name__ == '__main__':
+     test_response_selector_methods()
tests/test_response_url_methods.py (new file)
@@ -0,0 +1,71 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for the URL handling helpers used by Response
+ """
+ import unittest
+ from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
+
+
+ class TestUrlMethods(unittest.TestCase):
+     """Test cases for the URL handling helpers"""
+
+     def setUp(self):
+         """Set up test fixtures"""
+         self.test_url = "https://example.com/test?param1=value1&param2=value2#section1"
+
+     def test_urlparse(self):
+         """urlparse splits a URL into its components"""
+         parsed = urlparse(self.test_url)
+         self.assertEqual(parsed.scheme, "https")
+         self.assertEqual(parsed.netloc, "example.com")
+         self.assertEqual(parsed.path, "/test")
+         self.assertEqual(parsed.query, "param1=value1&param2=value2")
+         self.assertEqual(parsed.fragment, "section1")
+
+     def test_urlsplit(self):
+         """urlsplit splits a URL into its components"""
+         split_result = urlsplit(self.test_url)
+         self.assertEqual(split_result.scheme, "https")
+         self.assertEqual(split_result.netloc, "example.com")
+         self.assertEqual(split_result.path, "/test")
+         self.assertEqual(split_result.query, "param1=value1&param2=value2")
+         self.assertEqual(split_result.fragment, "section1")
+
+     def test_parse_qs(self):
+         """parse_qs turns a query string into a dict of value lists"""
+         query_dict = parse_qs("param1=value1&param2=value2&param2=value3")
+         self.assertIn("param1", query_dict)
+         self.assertIn("param2", query_dict)
+         self.assertEqual(query_dict["param1"], ["value1"])
+         self.assertEqual(query_dict["param2"], ["value2", "value3"])
+
+     def test_urlencode(self):
+         """urlencode percent-encodes a mapping"""
+         query_dict = {"name": "张三", "age": 25, "city": "北京"}
+         encoded = urlencode(query_dict)
+         # Note: the key order in the output may vary, so check each pair individually
+         self.assertIn("name=%E5%BC%A0%E4%B8%89", encoded)
+         self.assertIn("age=25", encoded)
+         self.assertIn("city=%E5%8C%97%E4%BA%AC", encoded)
+
+     def test_quote_unquote(self):
+         """quote and unquote round-trip a string"""
+         # quote
+         original = "hello world 你好"
+         quoted = quote(original)
+         self.assertEqual(quoted, "hello%20world%20%E4%BD%A0%E5%A5%BD")
+
+         # unquote
+         unquoted = unquote(quoted)
+         self.assertEqual(unquoted, original)
+
+     def test_urldefrag(self):
+         """urldefrag strips the fragment from a URL"""
+         url_without_frag, fragment = urldefrag(self.test_url)
+         self.assertEqual(url_without_frag, "https://example.com/test?param1=value1&param2=value2")
+         self.assertEqual(fragment, "section1")
+
+
+ if __name__ == '__main__':
+     unittest.main()
tests/test_response_urljoin.py (new file)
@@ -0,0 +1,87 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for Response.urljoin
+ """
+ import unittest
+ from crawlo.network.response import Response
+
+
+ class TestResponseUrljoin(unittest.TestCase):
+     """Test cases for Response.urljoin"""
+
+     def setUp(self):
+         """Set up test fixtures"""
+         # Build a mock HTML response
+         html_content = """
+         <html>
+         <head>
+             <title>Test page</title>
+         </head>
+         <body>
+             <div class="content">
+                 <h1>Main title</h1>
+                 <p class="intro">This is the intro paragraph</p>
+                 <ul class="list">
+                     <li>Item 1</li>
+                     <li>Item 2</li>
+                     <li>Item 3</li>
+                 </ul>
+                 <a href="https://example.com" class="link">Link text</a>
+                 <a href="/relative/path" class="relative-link">Relative link</a>
+                 <img src="image.jpg" alt="Image description" class="image">
+             </div>
+         </body>
+         </html>
+         """
+
+         self.response = Response(
+             url="https://example.com/test",
+             body=html_content.encode('utf-8'),
+             headers={"content-type": "text/html; charset=utf-8"}
+         )
+
+     def test_urljoin_absolute_url(self):
+         """Absolute URLs are passed through unchanged"""
+         absolute_url = self.response.urljoin("https://other.com/page")
+         self.assertEqual(absolute_url, "https://other.com/page")
+
+     def test_urljoin_relative_url(self):
+         """Relative URLs are resolved against the response URL"""
+         relative_url = self.response.urljoin("/relative/path")
+         self.assertEqual(relative_url, "https://example.com/relative/path")
+
+         relative_url2 = self.response.urljoin("relative/path")
+         self.assertEqual(relative_url2, "https://example.com/relative/path")
+
+     def test_urljoin_complex_relative_url(self):
+         """Relative URLs containing ../ and ./ segments"""
+         relative_url = self.response.urljoin("../other/path")
+         self.assertEqual(relative_url, "https://example.com/other/path")
+
+         relative_url2 = self.response.urljoin("./another/path")
+         self.assertEqual(relative_url2, "https://example.com/another/path")
+
+     def test_urljoin_with_query_params(self):
+         """URLs carrying query strings or fragments"""
+         url_with_params = self.response.urljoin("/path?param=value")
+         self.assertEqual(url_with_params, "https://example.com/path?param=value")
+
+         url_with_fragment = self.response.urljoin("/path#section")
+         self.assertEqual(url_with_fragment, "https://example.com/path#section")
+
+     def test_urljoin_empty_url(self):
+         """An empty URL resolves to the response URL itself"""
+         empty_url = self.response.urljoin("")
+         self.assertEqual(empty_url, "https://example.com/test")
+
+     def test_urljoin_none_url(self):
+         """A None URL falls back to the response URL"""
+         # urllib.parse.urljoin returns the base URL for a falsy target, so we verify that behaviour
+         none_url = self.response.urljoin(None)
+         # Observed result: urljoin(None) returns the base URL,
+         # which is consistent with urllib.parse.urljoin
+         self.assertEqual(none_url, "https://example.com/test")
+
+ if __name__ == '__main__':
+     unittest.main()
tests/test_scrapy_style_encoding.py (new file)
@@ -0,0 +1,113 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for Scrapy-style encoding detection
+ """
+ import unittest
+ from crawlo.network.response import Response
+
+
+ class TestScrapyStyleEncoding(unittest.TestCase):
+     """Test cases for Scrapy-style encoding detection"""
+
+     def test_request_encoding_priority(self):
+         """The Request encoding takes priority"""
+         class MockRequest:
+             encoding = 'gbk'
+
+         response = Response(
+             url="https://example.com",
+             body=b'',
+             request=MockRequest()
+         )
+         self.assertEqual(response.encoding, 'gbk')
+
+     def test_declared_encoding_method(self):
+         """The _declared_encoding method reports the request encoding"""
+         class MockRequest:
+             encoding = 'gbk'
+
+         response = Response(
+             url="https://example.com",
+             body=b'',
+             request=MockRequest()
+         )
+         self.assertEqual(response._declared_encoding(), 'gbk')
+
+     def test_content_type_encoding(self):
+         """The charset is detected from the Content-Type header"""
+         response = Response(
+             url="https://example.com",
+             body=b'',
+             headers={"content-type": "text/html; charset=iso-8859-1"}
+         )
+         self.assertEqual(response.encoding, 'iso-8859-1')
+
+     def test_case_insensitive_content_type(self):
+         """Content-Type header detection is case-insensitive"""
+         response = Response(
+             url="https://example.com",
+             body=b'',
+             headers={"Content-Type": "text/html; CHARSET=UTF-8"}
+         )
+         self.assertEqual(response.encoding, 'utf-8')
+
+     def test_default_encoding(self):
+         """UTF-8 is the default encoding"""
+         response = Response(
+             url="https://example.com",
+             body=b''
+         )
+         self.assertEqual(response.encoding, 'utf-8')
+
+     def test_declared_encoding_priority(self):
+         """Priority of the declared encoding"""
+         # Simulate a response without a request encoding
+         response = Response(
+             url="https://example.com",
+             body=b'',
+             headers={"content-type": "text/html; charset=iso-8859-1"}
+         )
+         # The encoding from the Content-Type header should be returned
+         self.assertEqual(response._declared_encoding(), 'iso-8859-1')
+
+
+ def test_scrapy_style_encoding():
+     """Walk through Scrapy-style encoding detection"""
+     print("Testing Scrapy-style encoding detection...")
+
+     # Request encoding priority
+     class MockRequest:
+         encoding = 'gbk'
+
+     response1 = Response(
+         url="https://example.com",
+         body=b'',
+         request=MockRequest()
+     )
+     print(f"Request encoding priority: {response1.encoding}")
+
+     # Content-Type header encoding
+     response2 = Response(
+         url="https://example.com",
+         body=b'',
+         headers={"content-type": "text/html; charset=iso-8859-1"}
+     )
+     print(f"Content-Type encoding: {response2.encoding}")
+
+     # Declared encoding method
+     declared_enc = response2._declared_encoding()
+     print(f"Declared encoding: {declared_enc}")
+
+     # Default encoding
+     response3 = Response(
+         url="https://example.com",
+         body=b''
+     )
+     print(f"Default encoding: {response3.encoding}")
+
+     print("Scrapy-style encoding detection test completed!")
+
+
+ if __name__ == '__main__':
+     test_scrapy_style_encoding()
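Taken together, these cases imply a resolution order for Response.encoding: an explicit request.encoding wins, then a charset found in the Content-Type header, then a UTF-8 default. Below is a minimal standalone sketch of that ordering using only the Response constructor arguments exercised above; combining a request encoding with a conflicting header in a single call is an extrapolation from the individual tests, not something this diff asserts.

from crawlo.network.response import Response

class GbkRequest:
    encoding = "gbk"

# request.encoding should win over the Content-Type charset ...
r_request = Response(url="https://example.com", body=b"", request=GbkRequest(),
                     headers={"content-type": "text/html; charset=iso-8859-1"})
# ... the Content-Type charset over the default ...
r_header = Response(url="https://example.com", body=b"",
                    headers={"content-type": "text/html; charset=iso-8859-1"})
# ... and UTF-8 is the fallback when nothing is declared.
r_default = Response(url="https://example.com", body=b"")

print(r_request.encoding, r_header.encoding, r_default.encoding)
# expected per the tests above: gbk iso-8859-1 utf-8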
tests/test_selector_helper.py (new file)
@@ -0,0 +1,101 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Tests for the selector helper utilities
+ """
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.utils import (
+     extract_text,
+     extract_texts,
+     extract_attr,
+     extract_attrs,
+     is_xpath
+ )
+ from parsel import Selector, SelectorList
+
+
+ def test_selector_helper():
+     """Exercise the selector helper utilities"""
+     print("Testing the selector helper utilities...")
+     print("=" * 50)
+
+     # Build test HTML
+     html_content = """
+     <html>
+     <head>
+         <title>Test page</title>
+     </head>
+     <body>
+         <div class="content">
+             <h1>Main title</h1>
+             <p class="intro">Intro paragraph</p>
+             <ul class="list">
+                 <li>Item 1</li>
+                 <li>Item 2</li>
+                 <li>Item 3</li>
+             </ul>
+             <a href="https://example.com" class="link">Link text</a>
+             <img src="image.jpg" alt="Image description" class="image">
+         </div>
+     </body>
+     </html>
+     """
+
+     selector = Selector(text=html_content)
+
+     # is_xpath
+     print("1. is_xpath:")
+     print(f"   starts with '/': {is_xpath('/')}")
+     print(f"   starts with '//': {is_xpath('//title')}")
+     print(f"   starts with './': {is_xpath('./div')}")
+     print(f"   starts with 'title': {is_xpath('title')}")
+     print()
+
+     # extract_text
+     print("2. extract_text:")
+     title_elements = selector.css('title')
+     title_text = extract_text(title_elements)
+     print(f"   Title text: {title_text}")
+
+     h1_elements = selector.css('.content h1')
+     h1_text = extract_text(h1_elements)
+     print(f"   H1 text: {h1_text}")
+     print()
+
+     # extract_texts
+     print("3. extract_texts:")
+     li_elements = selector.css('.list li')
+     li_texts = extract_texts(li_elements)
+     print(f"   List item texts: {li_texts}")
+     print()
+
+     # extract_attr
+     print("4. extract_attr:")
+     link_elements = selector.css('.link')
+     link_href = extract_attr(link_elements, 'href')
+     print(f"   Link href: {link_href}")
+
+     img_elements = selector.css('.image')
+     img_alt = extract_attr(img_elements, 'alt')
+     print(f"   Image alt: {img_alt}")
+     print()
+
+     # extract_attrs
+     print("5. extract_attrs:")
+     all_links = selector.css('a')
+     all_hrefs = extract_attrs(all_links, 'href')
+     print(f"   All link hrefs: {all_hrefs}")
+
+     all_images = selector.css('img')
+     all_srcs = extract_attrs(all_images, 'src')
+     print(f"   All image srcs: {all_srcs}")
+     print()
+
+     print("All tests completed!")
+
+
+ if __name__ == '__main__':
+     test_selector_helper()