crawlo-1.4.3-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +52 -17
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +13 -6
- crawlo/downloader/__init__.py +5 -2
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +6 -6
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/queue/pqueue.py +2 -6
- crawlo/queue/queue_manager.py +1 -2
- crawlo/settings/default_settings.py +15 -30
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/templates/project/settings.py.tmpl +51 -65
- crawlo/templates/project/settings_distributed.py.tmpl +59 -67
- crawlo/templates/project/settings_gentle.py.tmpl +45 -40
- crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
- crawlo/templates/project/settings_minimal.py.tmpl +37 -26
- crawlo/templates/project/settings_simple.py.tmpl +45 -40
- crawlo/templates/run.py.tmpl +3 -7
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +220 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.5.dist-info/METADATA +329 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_multi_directory.py +68 -0
- tests/test_multiple_spider_modules.py +81 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spider_modules.py +85 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.3.dist-info/METADATA +0 -190
- examples/test_project/__init__.py +0 -7
- examples/test_project/run.py +0 -35
- examples/test_project/test_project/__init__.py +0 -4
- examples/test_project/test_project/items.py +0 -18
- examples/test_project/test_project/middlewares.py +0 -119
- examples/test_project/test_project/pipelines.py +0 -97
- examples/test_project/test_project/settings.py +0 -170
- examples/test_project/test_project/spiders/__init__.py +0 -10
- examples/test_project/test_project/spiders/of_week_dis.py +0 -144
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/simple_selector_helper_test.py
ADDED
@@ -0,0 +1,155 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+简化选择器辅助工具测试
+"""
+from parsel import Selector
+
+
+# 直接复制工具函数用于测试
+def extract_text(elements, join_str=" "):
+    """
+    从元素列表中提取文本并拼接
+    """
+    texts = []
+    for element in elements:
+        if hasattr(element, 'xpath'):
+            element_texts = element.xpath('.//text()').getall()
+        else:
+            element_texts = [str(element)]
+        for text in element_texts:
+            cleaned = text.strip()
+            if cleaned:
+                texts.append(cleaned)
+    return join_str.join(texts)
+
+
+def extract_texts(elements, join_str=" "):
+    """
+    从元素列表中提取多个文本列表
+    """
+    result = []
+    for element in elements:
+        if hasattr(element, 'xpath'):
+            texts = element.xpath('.//text()').getall()
+        else:
+            texts = [str(element)]
+        clean_texts = [text.strip() for text in texts if text.strip()]
+        if clean_texts:
+            result.append(join_str.join(clean_texts))
+    return result
+
+
+def extract_attr(elements, attr_name, default=None):
+    """
+    从元素列表中提取单个元素的属性值
+    """
+    if hasattr(elements, 'attrib'):
+        return elements.attrib.get(attr_name, default)
+    elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
+        return elements[0].attrib.get(attr_name, default)
+    return default
+
+
+def extract_attrs(elements, attr_name):
+    """
+    从元素列表中提取多个元素的属性值列表
+    """
+    result = []
+    for element in elements:
+        if hasattr(element, 'attrib'):
+            attr_value = element.attrib.get(attr_name)
+            if attr_value is not None:
+                result.append(attr_value)
+    return result
+
+
+def is_xpath(query):
+    """
+    判断查询语句是否为XPath
+    """
+    return query.startswith(('/', '//', './'))
+
+
+def test_selector_helper():
+    """测试选择器辅助工具"""
+    print("测试选择器辅助工具...")
+    print("=" * 50)
+
+    # 创建测试HTML
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">介绍段落</p>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    selector = Selector(text=html_content)
+
+    # 测试 is_xpath
+    print("1. 测试 is_xpath:")
+    print(f" '/' 开头: {is_xpath('/')}")
+    print(f" '//' 开头: {is_xpath('//title')}")
+    print(f" './' 开头: {is_xpath('./div')}")
+    print(f" 'title' 开头: {is_xpath('title')}")
+    print()
+
+    # 测试 extract_text
+    print("2. 测试 extract_text:")
+    title_elements = selector.css('title')
+    title_text = extract_text(title_elements)
+    print(f" 标题文本: {title_text}")
+
+    h1_elements = selector.css('.content h1')
+    h1_text = extract_text(h1_elements)
+    print(f" H1文本: {h1_text}")
+    print()
+
+    # 测试 extract_texts
+    print("3. 测试 extract_texts:")
+    li_elements = selector.css('.list li')
+    li_texts = extract_texts(li_elements)
+    print(f" 列表项文本: {li_texts}")
+    print()
+
+    # 测试 extract_attr
+    print("4. 测试 extract_attr:")
+    link_elements = selector.css('.link')
+    link_href = extract_attr(link_elements, 'href')
+    print(f" 链接href: {link_href}")
+
+    img_elements = selector.css('.image')
+    img_alt = extract_attr(img_elements, 'alt')
+    print(f" 图片alt: {img_alt}")
+    print()
+
+    # 测试 extract_attrs
+    print("5. 测试 extract_attrs:")
+    all_links = selector.css('a')
+    all_hrefs = extract_attrs(all_links, 'href')
+    print(f" 所有链接href: {all_hrefs}")
+
+    all_images = selector.css('img')
+    all_srcs = extract_attrs(all_images, 'src')
+    print(f" 所有图片src: {all_srcs}")
+    print()
+
+    print("所有测试完成!")
+
+
+if __name__ == '__main__':
+    test_selector_helper()
tests/simple_selector_test.py
ADDED
@@ -0,0 +1,208 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+简化选择器测试
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+# 直接导入需要的模块
+from parsel import Selector, SelectorList
+
+
+class MockResponse:
+    """模拟Response类用于测试"""
+
+    def __init__(self, text):
+        self._text = text
+        self._selector_instance = None
+
+    @property
+    def text(self):
+        return self._text
+
+    @property
+    def _selector(self):
+        if self._selector_instance is None:
+            self._selector_instance = Selector(self.text)
+        return self._selector_instance
+
+    def xpath(self, query):
+        return self._selector.xpath(query)
+
+    def css(self, query):
+        return self._selector.css(query)
+
+    def _is_xpath(self, query):
+        return query.startswith(('/', '//', './'))
+
+    def _extract_text_from_elements(self, elements, join_str=" "):
+        texts = []
+        for element in elements:
+            if hasattr(element, 'xpath'):
+                element_texts = element.xpath('.//text()').getall()
+            else:
+                element_texts = [str(element)]
+            for text in element_texts:
+                cleaned = text.strip()
+                if cleaned:
+                    texts.append(cleaned)
+        return join_str.join(texts)
+
+    def extract_text(self, xpath_or_css, join_str=" ", default=''):
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            return self._extract_text_from_elements(elements, join_str)
+        except Exception:
+            return default
+
+    def extract_texts(self, xpath_or_css, join_str=" ", default=None):
+        if default is None:
+            default = []
+
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+
+            result = []
+            for element in elements:
+                if hasattr(element, 'xpath'):
+                    texts = element.xpath('.//text()').getall()
+                else:
+                    texts = [str(element)]
+
+                clean_texts = [text.strip() for text in texts if text.strip()]
+                if clean_texts:
+                    result.append(join_str.join(clean_texts))
+
+            return result if result else default
+        except Exception:
+            return default
+
+    def extract_attr(self, xpath_or_css, attr_name, default=None):
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            if hasattr(elements, 'attrib'):
+                return elements.attrib.get(attr_name, default)
+            elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
+                return elements[0].attrib.get(attr_name, default)
+            return default
+        except Exception:
+            return default
+
+    def extract_attrs(self, xpath_or_css, attr_name, default=None):
+        if default is None:
+            default = []
+
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+
+            result = []
+            for element in elements:
+                if hasattr(element, 'attrib'):
+                    attr_value = element.attrib.get(attr_name)
+                    if attr_value is not None:
+                        result.append(attr_value)
+
+            return result if result else default
+        except Exception:
+            return default
+
+
+def test_selector_methods():
+    """测试选择器方法"""
+    print("测试选择器方法...")
+    print("=" * 50)
+
+    # 创建测试HTML
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">介绍段落</p>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    response = MockResponse(html_content)
+
+    # 测试 extract_text
+    print("1. 测试 extract_text:")
+    title = response.extract_text('title')
+    print(f" 标题: {title}")
+
+    h1_text = response.extract_text('.content h1')
+    print(f" H1文本: {h1_text}")
+
+    # 测试XPath
+    title_xpath = response.extract_text('//title')
+    print(f" XPath标题: {title_xpath}")
+
+    print()
+
+    # 测试 extract_texts
+    print("2. 测试 extract_texts:")
+    list_items = response.extract_texts('.list li')
+    print(f" 列表项: {list_items}")
+
+    # 测试XPath
+    list_items_xpath = response.extract_texts('//ul[@class="list"]/li')
+    print(f" XPath列表项: {list_items_xpath}")
+
+    print()
+
+    # 测试 extract_attr
+    print("3. 测试 extract_attr:")
+    link_href = response.extract_attr('.link', 'href')
+    print(f" 链接href: {link_href}")
+
+    img_alt = response.extract_attr('.image', 'alt')
+    print(f" 图片alt: {img_alt}")
+
+    # 测试XPath
+    link_href_xpath = response.extract_attr('//a[@class="link"]', 'href')
+    print(f" XPath链接href: {link_href_xpath}")
+
+    print()
+
+    # 测试 extract_attrs
+    print("4. 测试 extract_attrs:")
+    all_links = response.extract_attrs('a', 'href')
+    print(f" 所有链接: {all_links}")
+
+    print()
+
+    # 测试边界情况
+    print("5. 测试边界情况:")
+    non_exist = response.extract_text('.non-exist', default='默认文本')
+    print(f" 不存在元素的默认值: {non_exist}")
+
+    non_exist_attr = response.extract_attr('.non-exist', 'href', default='默认链接')
+    print(f" 不存在属性的默认值: {non_exist_attr}")
+
+    print()
+    print("所有测试完成!")
+
+
+if __name__ == '__main__':
+    test_selector_methods()
tests/simple_url_test.py
ADDED
@@ -0,0 +1,74 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Response URL 处理方法简单测试
+"""
+import sys
+import os
+
+# 添加项目根目录到Python路径
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+# 直接导入需要的模块
+from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
+
+
+def test_url_methods():
+    """测试 URL 处理方法"""
+    print("测试 Response URL 处理方法")
+
+    # 测试数据
+    test_url = "https://example.com/test?param1=value1&param2=value2#section1"
+    print(f"测试URL: {test_url}")
+
+    # 1. 测试 urlparse
+    print("\n1. 测试 urlparse:")
+    parsed = urlparse(test_url)
+    print(f" scheme: {parsed.scheme}")
+    print(f" netloc: {parsed.netloc}")
+    print(f" path: {parsed.path}")
+    print(f" query: {parsed.query}")
+    print(f" fragment: {parsed.fragment}")
+
+    # 2. 测试 urlsplit
+    print("\n2. 测试 urlsplit:")
+    split_result = urlsplit(test_url)
+    print(f" scheme: {split_result.scheme}")
+    print(f" netloc: {split_result.netloc}")
+    print(f" path: {split_result.path}")
+    print(f" query: {split_result.query}")
+    print(f" fragment: {split_result.fragment}")
+
+    # 3. 测试 parse_qs
+    print("\n3. 测试 parse_qs:")
+    query_dict = parse_qs(parsed.query)
+    print(f" 解析结果: {query_dict}")
+
+    # 4. 测试 urlencode
+    print("\n4. 测试 urlencode:")
+    test_dict = {"name": "张三", "age": 25, "city": "北京"}
+    encoded = urlencode(test_dict)
+    print(f" 编码结果: {encoded}")
+
+    # 5. 测试 quote/unquote
+    print("\n5. 测试 quote/unquote:")
+    original = "hello world 你好"
+    quoted = quote(original)
+    print(f" 原始字符串: {original}")
+    print(f" URL编码: {quoted}")
+
+    unquoted = unquote(quoted)
+    print(f" URL解码: {unquoted}")
+    print(f" 编码解码是否一致: {original == unquoted}")
+
+    # 6. 测试 urldefrag
+    print("\n6. 测试 urldefrag:")
+    url_without_frag, fragment = urldefrag(test_url)
+    print(f" 去除片段的URL: {url_without_frag}")
+    print(f" 片段: {fragment}")
+
+    print("\n所有测试完成!")
+
+
+if __name__ == '__main__':
+    test_url_methods()
tests/test_crawler_process_import.py
ADDED
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+测试CrawlerProcess导入功能
+"""
+
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+
+def test_crawler_process_import():
+    """测试CrawlerProcess导入功能"""
+    print("测试CrawlerProcess导入功能...")
+
+    try:
+        # 测试直接从crawlo导入CrawlerProcess
+        from crawlo import CrawlerProcess
+        print(f" 成功从crawlo导入CrawlerProcess: {CrawlerProcess}")
+
+        # 测试创建实例
+        process = CrawlerProcess()
+        print(f" 成功创建CrawlerProcess实例: {process}")
+
+        print("CrawlerProcess导入测试通过!")
+
+    except ImportError as e:
+        print(f" 导入失败: {e}")
+        # 如果直接导入失败,尝试从crawler模块导入
+        try:
+            from crawlo.crawler import CrawlerProcess
+            print(f" 成功从crawlo.crawler导入CrawlerProcess: {CrawlerProcess}")
+        except ImportError as e2:
+            print(f" 从crawler模块导入也失败: {e2}")
+
+
+if __name__ == '__main__':
+    test_crawler_process_import()
tests/test_crawler_process_spider_modules.py
ADDED
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+测试CrawlerProcess与SPIDER_MODULES的集成
+"""
+
+import sys
+import os
+import asyncio
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.settings.setting_manager import SettingManager
+
+
+def test_crawler_process_spider_modules():
+    """测试CrawlerProcess与SPIDER_MODULES的集成"""
+    print("测试CrawlerProcess与SPIDER_MODULES的集成...")
+
+    # 创建一个包含SPIDER_MODULES的设置
+    settings = SettingManager({
+        'SPIDER_MODULES': ['tests.test_spiders'],
+        'SPIDER_LOADER_WARN_ONLY': True,
+        'CONCURRENCY': 1,
+        'LOG_LEVEL': 'INFO'
+    })
+
+    # 创建CrawlerProcess实例
+    process = CrawlerProcess(settings=settings)
+
+    # 测试获取爬虫名称
+    spider_names = process.get_spider_names()
+    print(f"发现的爬虫: {spider_names}")
+
+    # 测试检查爬虫是否已注册
+    is_registered = process.is_spider_registered('test_spider')
+    print(f"爬虫'test_spider'是否已注册: {is_registered}")
+
+    # 测试获取爬虫类
+    spider_class = process.get_spider_class('test_spider')
+    print(f"爬虫'test_spider'的类: {spider_class}")
+
+    print("测试完成!")
+
+
+if __name__ == '__main__':
+    test_crawler_process_spider_modules()
tests/test_edge_cases.py
CHANGED
@@ -112,13 +112,15 @@ async def test_redis_queue_edge_cases():
     print(" 特殊字符 URL 测试通过")
 
     # 4. 测试优先级(高优先级值应该先出队)
-
-
+    # 注意:Request构造函数会将传入的priority值取反存储
+    # 所以priority=1000的请求实际存储为-1000,priority=-1000的请求实际存储为1000
+    high_priority_request = Request(url="https://high-priority.com", priority=1000)  # 实际存储为-1000
+    low_priority_request = Request(url="https://low-priority.com", priority=-1000)  # 实际存储为1000
 
-    await queue.put(high_priority_request)  #
-    await queue.put(low_priority_request)
+    await queue.put(high_priority_request, priority=high_priority_request.priority)  # 使用实际存储的priority值
+    await queue.put(low_priority_request, priority=low_priority_request.priority)  # 使用实际存储的priority值
 
-    #
+    # 高优先级值应该先出队(因为score = priority,score小的先出队)
     first = await queue.get(timeout=1.0)
     assert first is not None and first.url == "https://high-priority.com", "高优先级值应该先出队"
     print(" 优先级测试通过")
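The change above depends on the convention the new comments spell out: the Request constructor stores the negation of the priority it is given, and the queue uses that stored value as its score, dequeuing the smallest score first, so a request created with priority=1000 comes out before one created with priority=-1000. The following self-contained sketch only illustrates that ordering; FakeRequest and FakePriorityQueue are hypothetical stand-ins written for this note, not crawlo classes.

import heapq


class FakeRequest:
    """Hypothetical stand-in: stores the negated priority, as the comments above describe."""
    def __init__(self, url, priority=0):
        self.url = url
        self.priority = -priority  # priority=1000 is stored as -1000


class FakePriorityQueue:
    """Hypothetical min-heap queue: the smallest score is dequeued first."""
    def __init__(self):
        self._heap = []
        self._counter = 0  # tie-breaker so heapq never compares request objects

    def put(self, request, priority):
        heapq.heappush(self._heap, (priority, self._counter, request))
        self._counter += 1

    def get(self):
        return heapq.heappop(self._heap)[2]


high = FakeRequest("https://high-priority.com", priority=1000)  # stored priority: -1000
low = FakeRequest("https://low-priority.com", priority=-1000)   # stored priority: 1000

queue = FakePriorityQueue()
queue.put(high, priority=high.priority)
queue.put(low, priority=low.priority)

assert queue.get().url == "https://high-priority.com"  # smaller score (-1000) pops first
assert queue.get().url == "https://low-priority.com"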
tests/test_encoding_core.py
ADDED
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+编码检测核心功能测试
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.network.response import Response
+
+
+def test_encoding_detection():
+    """测试编码检测核心功能"""
+    print("测试编码检测核心功能...")
+
+    # 测试 Request 编码优先级
+    class MockRequest:
+        encoding = 'gbk'
+
+    response1 = Response(
+        url="https://example.com",
+        body=b'',
+        request=MockRequest()
+    )
+    print(f"Request 编码优先级: {response1.encoding}")
+
+    # 测试 Content-Type 头部编码
+    response2 = Response(
+        url="https://example.com",
+        body=b'',
+        headers={"content-type": "text/html; charset=iso-8859-1"}
+    )
+    print(f"Content-Type 编码: {response2.encoding}")
+
+    # 测试声明编码方法
+    declared_enc = response2._declared_encoding()
+    print(f"声明编码: {declared_enc}")
+
+    # 测试默认编码
+    response3 = Response(
+        url="https://example.com",
+        body=b''
+    )
+    print(f"默认编码: {response3.encoding}")
+
+    # 验证结果
+    assert response1.encoding == 'gbk', f"Expected 'gbk', got {response1.encoding}"
+    assert response2.encoding == 'iso-8859-1', f"Expected 'iso-8859-1', got {response2.encoding}"
+    assert declared_enc == 'iso-8859-1', f"Expected 'iso-8859-1', got {declared_enc}"
+    assert response3.encoding == 'utf-8', f"Expected 'utf-8', got {response3.encoding}"
+
+    print("所有测试通过!")
+
+
+if __name__ == '__main__':
+    test_encoding_detection()