crawlo 1.4.4-py3-none-any.whl → 1.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +8 -7
- crawlo/downloader/__init__.py +5 -2
- crawlo/downloader/cffi_downloader.py +3 -1
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/middleware/proxy.py +171 -348
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +340 -189
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/settings/default_settings.py +42 -30
- crawlo/stats_collector.py +10 -1
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/templates/project/settings.py.tmpl +10 -55
- crawlo/templates/project/settings_distributed.py.tmpl +20 -22
- crawlo/templates/project/settings_gentle.py.tmpl +5 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- crawlo/templates/project/settings_minimal.py.tmpl +25 -1
- crawlo/templates/project/settings_simple.py.tmpl +5 -0
- crawlo/templates/run.py.tmpl +1 -8
- crawlo/templates/spider/spider.py.tmpl +5 -108
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +226 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.6.dist-info/METADATA +329 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/RECORD +110 -69
- tests/authenticated_proxy_example.py +10 -6
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/explain_mysql_update_behavior.py +77 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/simulate_mysql_update_test.py +140 -0
- tests/test_asyncmy_usage.py +57 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_crawlo_proxy_integration.py +8 -2
- tests/test_downloader_proxy_compatibility.py +24 -20
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_proxy_middleware.py +104 -8
- tests/test_proxy_middleware_enhanced.py +1 -5
- tests/test_proxy_middleware_integration.py +7 -2
- tests/test_proxy_middleware_refactored.py +25 -2
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_real_scenario_proxy.py +17 -17
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- tests/verify_mysql_warnings.py +110 -0
- crawlo/middleware/simple_proxy.py +0 -65
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.4.dist-info/METADATA +0 -190
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_response_selector_methods.py (new file)
@@ -0,0 +1,93 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the selector methods on the Response class
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.network.response import Response
+
+
+def test_response_selector_methods():
+    """Test the selector methods on the Response class"""
+    print("Testing the selector methods on the Response class...")
+    print("=" * 50)
+
+    # Build a test HTML response
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">介绍段落</p>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    # Create the Response object
+    response = Response(
+        url="https://example.com/test",
+        body=html_content.encode('utf-8'),
+        headers={"content-type": "text/html; charset=utf-8"}
+    )
+
+    # Test extract_text (CSS selector)
+    print("1. Testing extract_text (CSS selector):")
+    title_text = response.extract_text('title')
+    print(f"   Title text: {title_text}")
+
+    h1_text = response.extract_text('.content h1')
+    print(f"   H1 text: {h1_text}")
+    print()
+
+    # Test extract_text (XPath selector)
+    print("2. Testing extract_text (XPath selector):")
+    title_text_xpath = response.extract_text('//title')
+    print(f"   Title text: {title_text_xpath}")
+
+    h1_text_xpath = response.extract_text('//div[@class="content"]/h1')
+    print(f"   H1 text: {h1_text_xpath}")
+    print()
+
+    # Test extract_texts
+    print("3. Testing extract_texts:")
+    li_texts = response.extract_texts('.list li')
+    print(f"   List item texts: {li_texts}")
+    print()
+
+    # Test extract_attr
+    print("4. Testing extract_attr:")
+    link_href = response.extract_attr('.link', 'href')
+    print(f"   Link href: {link_href}")
+
+    img_alt = response.extract_attr('.image', 'alt')
+    print(f"   Image alt: {img_alt}")
+    print()
+
+    # Test extract_attrs
+    print("5. Testing extract_attrs:")
+    all_links = response.extract_attrs('a', 'href')
+    print(f"   All link hrefs: {all_links}")
+
+    all_images = response.extract_attrs('img', 'src')
+    print(f"   All image srcs: {all_images}")
+    print()
+
+    print("All tests completed!")
+
+
+if __name__ == '__main__':
+    test_response_selector_methods()
tests/test_response_url_methods.py (new file)
@@ -0,0 +1,71 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for Response URL handling methods
+"""
+import unittest
+from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
+
+
+class TestUrlMethods(unittest.TestCase):
+    """Test cases for URL handling methods"""
+
+    def setUp(self):
+        """Prepare test fixtures"""
+        self.test_url = "https://example.com/test?param1=value1&param2=value2#section1"
+
+    def test_urlparse(self):
+        """Test urlparse"""
+        parsed = urlparse(self.test_url)
+        self.assertEqual(parsed.scheme, "https")
+        self.assertEqual(parsed.netloc, "example.com")
+        self.assertEqual(parsed.path, "/test")
+        self.assertEqual(parsed.query, "param1=value1&param2=value2")
+        self.assertEqual(parsed.fragment, "section1")
+
+    def test_urlsplit(self):
+        """Test urlsplit"""
+        split_result = urlsplit(self.test_url)
+        self.assertEqual(split_result.scheme, "https")
+        self.assertEqual(split_result.netloc, "example.com")
+        self.assertEqual(split_result.path, "/test")
+        self.assertEqual(split_result.query, "param1=value1&param2=value2")
+        self.assertEqual(split_result.fragment, "section1")
+
+    def test_parse_qs(self):
+        """Test parse_qs"""
+        query_dict = parse_qs("param1=value1&param2=value2&param2=value3")
+        self.assertIn("param1", query_dict)
+        self.assertIn("param2", query_dict)
+        self.assertEqual(query_dict["param1"], ["value1"])
+        self.assertEqual(query_dict["param2"], ["value2", "value3"])
+
+    def test_urlencode(self):
+        """Test urlencode"""
+        query_dict = {"name": "张三", "age": 25, "city": "北京"}
+        encoded = urlencode(query_dict)
+        # Note: the ordering produced by urlencode may vary, so check that every key/value pair is present
+        self.assertIn("name=%E5%BC%A0%E4%B8%89", encoded)
+        self.assertIn("age=25", encoded)
+        self.assertIn("city=%E5%8C%97%E4%BA%AC", encoded)
+
+    def test_quote_unquote(self):
+        """Test quote and unquote"""
+        # Test quote
+        original = "hello world 你好"
+        quoted = quote(original)
+        self.assertEqual(quoted, "hello%20world%20%E4%BD%A0%E5%A5%BD")
+
+        # Test unquote
+        unquoted = unquote(quoted)
+        self.assertEqual(unquoted, original)
+
+    def test_urldefrag(self):
+        """Test urldefrag"""
+        url_without_frag, fragment = urldefrag(self.test_url)
+        self.assertEqual(url_without_frag, "https://example.com/test?param1=value1&param2=value2")
+        self.assertEqual(fragment, "section1")
+
+
+if __name__ == '__main__':
+    unittest.main()
tests/test_response_urljoin.py (new file)
@@ -0,0 +1,87 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the Response.urljoin method
+"""
+import unittest
+from crawlo.network.response import Response
+
+
+class TestResponseUrljoin(unittest.TestCase):
+    """Test cases for Response.urljoin"""
+
+    def setUp(self):
+        """Prepare test fixtures"""
+        # Build a mock HTML response
+        html_content = """
+        <html>
+        <head>
+            <title>测试页面</title>
+        </head>
+        <body>
+            <div class="content">
+                <h1>主标题</h1>
+                <p class="intro">这是介绍段落</p>
+                <ul class="list">
+                    <li>项目1</li>
+                    <li>项目2</li>
+                    <li>项目3</li>
+                </ul>
+                <a href="https://example.com" class="link">链接文本</a>
+                <a href="/relative/path" class="relative-link">相对链接</a>
+                <img src="image.jpg" alt="图片描述" class="image">
+            </div>
+        </body>
+        </html>
+        """
+
+        self.response = Response(
+            url="https://example.com/test",
+            body=html_content.encode('utf-8'),
+            headers={"content-type": "text/html; charset=utf-8"}
+        )
+
+    def test_urljoin_absolute_url(self):
+        """Test handling of absolute URLs"""
+        absolute_url = self.response.urljoin("https://other.com/page")
+        self.assertEqual(absolute_url, "https://other.com/page")
+
+    def test_urljoin_relative_url(self):
+        """Test handling of relative URLs"""
+        relative_url = self.response.urljoin("/relative/path")
+        self.assertEqual(relative_url, "https://example.com/relative/path")
+
+        relative_url2 = self.response.urljoin("relative/path")
+        self.assertEqual(relative_url2, "https://example.com/relative/path")
+
+    def test_urljoin_complex_relative_url(self):
+        """Test handling of complex relative URLs"""
+        relative_url = self.response.urljoin("../other/path")
+        self.assertEqual(relative_url, "https://example.com/other/path")
+
+        relative_url2 = self.response.urljoin("./another/path")
+        self.assertEqual(relative_url2, "https://example.com/another/path")
+
+    def test_urljoin_with_query_params(self):
+        """Test handling of URLs with query parameters"""
+        url_with_params = self.response.urljoin("/path?param=value")
+        self.assertEqual(url_with_params, "https://example.com/path?param=value")
+
+        url_with_fragment = self.response.urljoin("/path#section")
+        self.assertEqual(url_with_fragment, "https://example.com/path#section")
+
+    def test_urljoin_empty_url(self):
+        """Test handling of an empty URL"""
+        empty_url = self.response.urljoin("")
+        self.assertEqual(empty_url, "https://example.com/test")
+
+    def test_urljoin_none_url(self):
+        """Test handling of a None URL"""
+        # urllib.parse.urljoin would coerce None to the string "None", so test the actual behaviour
+        none_url = self.response.urljoin(None)
+        # Observed behaviour: urljoin(None) returns the base URL here;
+        # we accept this because it is consistent with urllib.parse.urljoin
+        self.assertEqual(none_url, "https://example.com/test")
+
+if __name__ == '__main__':
+    unittest.main()
tests/test_scrapy_style_encoding.py (new file)
@@ -0,0 +1,113 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for Scrapy-style encoding detection
+"""
+import unittest
+from crawlo.network.response import Response
+
+
+class TestScrapyStyleEncoding(unittest.TestCase):
+    """Test cases for Scrapy-style encoding detection"""
+
+    def test_request_encoding_priority(self):
+        """Test that the Request encoding takes priority"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            request=MockRequest()
+        )
+        self.assertEqual(response.encoding, 'gbk')
+
+    def test_declared_encoding_method(self):
+        """Test the _declared_encoding method"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            request=MockRequest()
+        )
+        self.assertEqual(response._declared_encoding(), 'gbk')
+
+    def test_content_type_encoding(self):
+        """Test encoding detection from the Content-Type header"""
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        self.assertEqual(response.encoding, 'iso-8859-1')
+
+    def test_case_insensitive_content_type(self):
+        """Test that the Content-Type header is handled case-insensitively"""
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            headers={"Content-Type": "text/html; CHARSET=UTF-8"}
+        )
+        self.assertEqual(response.encoding, 'utf-8')
+
+    def test_default_encoding(self):
+        """Test the default encoding"""
+        response = Response(
+            url="https://example.com",
+            body=b''
+        )
+        self.assertEqual(response.encoding, 'utf-8')
+
+    def test_declared_encoding_priority(self):
+        """Test the priority of the declared encoding"""
+        # Simulate the case where no request encoding is available
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        # The encoding from Content-Type should be returned
+        self.assertEqual(response._declared_encoding(), 'iso-8859-1')
+
+
+def test_scrapy_style_encoding():
+    """Exercise Scrapy-style encoding detection"""
+    print("Testing Scrapy-style encoding detection...")
+
+    # Request encoding priority
+    class MockRequest:
+        encoding = 'gbk'
+
+    response1 = Response(
+        url="https://example.com",
+        body=b'',
+        request=MockRequest()
+    )
+    print(f"Request encoding priority: {response1.encoding}")
+
+    # Content-Type header encoding
+    response2 = Response(
+        url="https://example.com",
+        body=b'',
+        headers={"content-type": "text/html; charset=iso-8859-1"}
+    )
+    print(f"Content-Type encoding: {response2.encoding}")
+
+    # Declared encoding method
+    declared_enc = response2._declared_encoding()
+    print(f"Declared encoding: {declared_enc}")
+
+    # Default encoding
+    response3 = Response(
+        url="https://example.com",
+        body=b''
+    )
+    print(f"Default encoding: {response3.encoding}")
+
+    print("Scrapy-style encoding detection tests completed!")
+
+
+if __name__ == '__main__':
+    test_scrapy_style_encoding()
tests/test_selector_helper.py (new file)
@@ -0,0 +1,101 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the selector helper utilities
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils import (
+    extract_text,
+    extract_texts,
+    extract_attr,
+    extract_attrs,
+    is_xpath
+)
+from parsel import Selector, SelectorList
+
+
+def test_selector_helper():
+    """Test the selector helper utilities"""
+    print("Testing the selector helper utilities...")
+    print("=" * 50)
+
+    # Build test HTML
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">介绍段落</p>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    selector = Selector(text=html_content)
+
+    # Test is_xpath
+    print("1. Testing is_xpath:")
+    print(f"   starts with '/': {is_xpath('/')}")
+    print(f"   starts with '//': {is_xpath('//title')}")
+    print(f"   starts with './': {is_xpath('./div')}")
+    print(f"   starts with 'title': {is_xpath('title')}")
+    print()
+
+    # Test extract_text
+    print("2. Testing extract_text:")
+    title_elements = selector.css('title')
+    title_text = extract_text(title_elements)
+    print(f"   Title text: {title_text}")
+
+    h1_elements = selector.css('.content h1')
+    h1_text = extract_text(h1_elements)
+    print(f"   H1 text: {h1_text}")
+    print()
+
+    # Test extract_texts
+    print("3. Testing extract_texts:")
+    li_elements = selector.css('.list li')
+    li_texts = extract_texts(li_elements)
+    print(f"   List item texts: {li_texts}")
+    print()
+
+    # Test extract_attr
+    print("4. Testing extract_attr:")
+    link_elements = selector.css('.link')
+    link_href = extract_attr(link_elements, 'href')
+    print(f"   Link href: {link_href}")
+
+    img_elements = selector.css('.image')
+    img_alt = extract_attr(img_elements, 'alt')
+    print(f"   Image alt: {img_alt}")
+    print()
+
+    # Test extract_attrs
+    print("5. Testing extract_attrs:")
+    all_links = selector.css('a')
+    all_hrefs = extract_attrs(all_links, 'href')
+    print(f"   All link hrefs: {all_hrefs}")
+
+    all_images = selector.css('img')
+    all_srcs = extract_attrs(all_images, 'src')
+    print(f"   All image srcs: {all_srcs}")
+    print()
+
+    print("All tests completed!")
+
+
+if __name__ == '__main__':
+    test_selector_helper()
tests/test_selector_optimizations.py (new file)
@@ -0,0 +1,147 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the optimized selector methods
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.network.response import Response
+
+
+def test_selector_optimizations():
+    """Test the optimized selector methods"""
+    print("Testing the optimized selector methods...")
+    print("=" * 50)
+
+    # Build a more complex HTML response
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面标题</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">这是介绍段落</p>
+            <div class="article">
+                <p>第一段内容 <strong>粗体文本</strong> 普通文本</p>
+                <p>第二段内容 <em>斜体文本</em></p>
+            </div>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+            <div class="products">
+                <div class="product" data-id="1">
+                    <h2>产品A</h2>
+                    <p class="price">¥99.99</p>
+                </div>
+                <div class="product" data-id="2">
+                    <h2>产品B</h2>
+                    <p class="price">¥149.99</p>
+                </div>
+            </div>
+        </div>
+    </body>
+    </html>
+    """
+
+    response = Response(
+        url="https://example.com/test",
+        body=html_content.encode('utf-8'),
+        headers={"content-type": "text/html; charset=utf-8"}
+    )
+
+    # Test the extract_text method
+    print("1. Testing the extract_text method:")
+    title = response.extract_text('title')
+    print(f"   Title: {title}")
+
+    h1_text = response.extract_text('.content h1')
+    print(f"   H1 text: {h1_text}")
+
+    # XPath
+    title_xpath = response.extract_text('//title')
+    print(f"   XPath title: {title_xpath}")
+
+    # Extracting mixed/nested text
+    complex_text = response.extract_text('.article p', join_str=' ')
+    print(f"   Combined text: {complex_text}")
+
+    print()
+
+    # Test the extract_texts method
+    print("2. Testing the extract_texts method:")
+    list_items = response.extract_texts('.list li')
+    print(f"   List items: {list_items}")
+
+    # XPath
+    list_items_xpath = response.extract_texts('//ul[@class="list"]/li')
+    print(f"   XPath list items: {list_items_xpath}")
+
+    # Multiple elements
+    product_names = response.extract_texts('.product h2')
+    print(f"   Product names: {product_names}")
+
+    product_prices = response.extract_texts('.price')
+    print(f"   Product prices: {product_prices}")
+
+    print()
+
+    # Test the extract_attr method
+    print("3. Testing the extract_attr method:")
+    link_href = response.extract_attr('.link', 'href')
+    print(f"   Link href: {link_href}")
+
+    img_alt = response.extract_attr('.image', 'alt')
+    print(f"   Image alt: {img_alt}")
+
+    # XPath
+    link_href_xpath = response.extract_attr('//a[@class="link"]', 'href')
+    print(f"   XPath link href: {link_href_xpath}")
+
+    print()
+
+    # Test the extract_attrs method
+    print("4. Testing the extract_attrs method:")
+    product_ids = response.extract_attrs('.product', 'data-id')
+    print(f"   Product IDs: {product_ids}")
+
+    # XPath
+    product_ids_xpath = response.extract_attrs('//div[@class="product"]', 'data-id')
+    print(f"   XPath product IDs: {product_ids_xpath}")
+
+    # All links
+    all_links = response.extract_attrs('a', 'href')
+    print(f"   All links: {all_links}")
+
+    print()
+
+    # Edge cases
+    print("5. Testing edge cases:")
+    # Default values
+    non_exist = response.extract_text('.non-exist', default='默认文本')
+    print(f"   Default for a missing element: {non_exist}")
+
+    non_exist_attr = response.extract_attr('.non-exist', 'href', default='默认链接')
+    print(f"   Default for a missing attribute: {non_exist_attr}")
+
+    print()
+
+    # Empty response
+    print("6. Testing an empty response:")
+    empty_response = Response(url="https://example.com/empty", body=b"")
+    empty_text = empty_response.extract_text('title', default='默认标题')
+    print(f"   Default for an empty response: {empty_text}")
+
+    print()
+    print("All tests completed!")
+
+
+if __name__ == '__main__':
+    test_selector_optimizations()
tests/test_spider_loader.py (new file)
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Tests for the SpiderLoader functionality
+"""
+
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.spider_loader import SpiderLoader
+from crawlo.settings.setting_manager import SettingManager
+
+
+def test_spider_loader():
+    """Test the basic SpiderLoader functionality"""
+    print("Testing the basic SpiderLoader functionality...")
+
+    # Create simple settings
+    settings = SettingManager({
+        'SPIDER_MODULES': ['tests.test_spiders'],
+        'SPIDER_LOADER_WARN_ONLY': True
+    })
+
+    # Create a SpiderLoader instance
+    loader = SpiderLoader.from_settings(settings)
+
+    # Test the list method
+    spider_names = loader.list()
+    print(f"Discovered spiders: {spider_names}")
+
+    # Test the load method
+    if spider_names:
+        spider_name = spider_names[0]
+        try:
+            spider_class = loader.load(spider_name)
+            print(f"Successfully loaded spider: {spider_name} -> {spider_class}")
+        except KeyError as e:
+            print(f"Failed to load spider: {e}")
+
+    # Test the get_all method
+    all_spiders = loader.get_all()
+    print(f"All spiders: {list(all_spiders.keys())}")
+
+    print("Test completed!")
+
+
+if __name__ == '__main__':
+    test_spider_loader()