crawlo 1.4.4__py3-none-any.whl → 1.4.6__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +8 -7
- crawlo/downloader/__init__.py +5 -2
- crawlo/downloader/cffi_downloader.py +3 -1
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/middleware/proxy.py +171 -348
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +340 -189
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/settings/default_settings.py +42 -30
- crawlo/stats_collector.py +10 -1
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/templates/project/settings.py.tmpl +10 -55
- crawlo/templates/project/settings_distributed.py.tmpl +20 -22
- crawlo/templates/project/settings_gentle.py.tmpl +5 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- crawlo/templates/project/settings_minimal.py.tmpl +25 -1
- crawlo/templates/project/settings_simple.py.tmpl +5 -0
- crawlo/templates/run.py.tmpl +1 -8
- crawlo/templates/spider/spider.py.tmpl +5 -108
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +226 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.6.dist-info/METADATA +329 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/RECORD +110 -69
- tests/authenticated_proxy_example.py +10 -6
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/explain_mysql_update_behavior.py +77 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/simulate_mysql_update_test.py +140 -0
- tests/test_asyncmy_usage.py +57 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_crawlo_proxy_integration.py +8 -2
- tests/test_downloader_proxy_compatibility.py +24 -20
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_proxy_middleware.py +104 -8
- tests/test_proxy_middleware_enhanced.py +1 -5
- tests/test_proxy_middleware_integration.py +7 -2
- tests/test_proxy_middleware_refactored.py +25 -2
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_real_scenario_proxy.py +17 -17
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- tests/verify_mysql_warnings.py +110 -0
- crawlo/middleware/simple_proxy.py +0 -65
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.4.dist-info/METADATA +0 -190
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_downloader_proxy_compatibility.py CHANGED

@@ -35,7 +35,7 @@ class MockCrawler:
         self.spider = MockSpider(self)  # add a spider attribute
 
 
-def create_test_settings(proxy_url=None):
+def create_test_settings(proxy_url=None, proxy_list=None):
     """Create test settings"""
     settings = SettingManager()
     settings.set("LOG_LEVEL", "DEBUG")
@@ -47,12 +47,13 @@ def create_test_settings(proxy_url=None):
 
     # proxy-related settings
     if proxy_url:
-
+        # advanced proxy configuration (used by ProxyMiddleware)
+        # the middleware is enabled automatically once a proxy API URL is configured
         settings.set("PROXY_API_URL", proxy_url)
-
-
-
-        settings.set("
+    elif proxy_list:
+        # proxy configuration (used by ProxyMiddleware)
+        # the middleware is enabled automatically once a proxy list is configured
+        settings.set("PROXY_LIST", proxy_list)
 
     return settings
 
@@ -65,7 +66,7 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
 
     try:
         # create settings
-        settings = create_test_settings(proxy_url)
+        settings = create_test_settings(proxy_url=proxy_url)
         crawler = MockCrawler(settings)
 
         # create the downloader
@@ -73,6 +74,7 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
         downloader.open()
 
         # create the proxy middleware
+        from crawlo.middleware.proxy import ProxyMiddleware
         proxy_middleware = ProxyMiddleware(settings, "DEBUG")
 
         # create the request
@@ -115,15 +117,15 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
             pass
 
 
-async def test_httpx_with_proxy_async(
+async def test_httpx_with_proxy_async(proxy_list, target_url):
     """Test httpx downloader compatibility with proxies"""
     print(f"\n=== Testing the httpx downloader with a proxy ===")
-    print(f"
+    print(f"Proxy list: {proxy_list}")
     print(f"Target URL: {target_url}")
 
     try:
         # create settings
-        settings = create_test_settings(
+        settings = create_test_settings(proxy_list=proxy_list)
         crawler = MockCrawler(settings)
 
         # create the downloader
@@ -131,7 +133,8 @@ async def test_httpx_with_proxy_async(proxy_url, target_url):
         downloader.open()
 
         # create the proxy middleware
-
+        from crawlo.middleware.simple_proxy import SimpleProxyMiddleware
+        proxy_middleware = SimpleProxyMiddleware(settings, "DEBUG")
 
         # create the request
         request = Request(url=target_url)
@@ -168,7 +171,6 @@ async def test_httpx_with_proxy_async(proxy_url, target_url):
         # clean up resources
         try:
             await downloader.close()
-            await proxy_middleware.close()
         except:
             pass
 
@@ -181,7 +183,7 @@ async def test_curl_cffi_with_proxy_async(proxy_url, target_url):
 
     try:
         # create settings
-        settings = create_test_settings(proxy_url)
+        settings = create_test_settings(proxy_url=proxy_url)
         crawler = MockCrawler(settings)
 
         # create the downloader
@@ -238,26 +240,28 @@ async def main():
     # use a test proxy URL (a public test proxy is used here)
     # note: in real use you need to replace this with a valid proxy URL
     test_proxy_url = "http://test.proxy.api:8080/proxy/getitem/"
+    test_proxy_list = ["http://proxy1:8080", "http://proxy2:8080"]
     test_target_url = "https://httpbin.org/ip"  # a test site that returns IP information
 
     print(f"Test proxy API: {test_proxy_url}")
+    print(f"Test proxy list: {test_proxy_list}")
     print(f"Test target URL: {test_target_url}")
 
-    # test aiohttp
+    # test the aiohttp downloader (advanced proxy)
    aiohttp_result = await test_aiohttp_with_proxy(test_proxy_url, test_target_url)
 
-    # test httpx
-    httpx_result = await test_httpx_with_proxy_async(
+    # test the httpx downloader (simple proxy)
+    httpx_result = await test_httpx_with_proxy_async(test_proxy_list, test_target_url)
 
-    # test curl-cffi
+    # test the curl-cffi downloader (advanced proxy)
     curl_cffi_result = await test_curl_cffi_with_proxy_async(test_proxy_url, test_target_url)
 
     # summarize the results
     print("\n" + "="*50)
     print("Test result summary:")
-    print(f"aiohttp
-    print(f"httpx
-    print(f"curl-cffi
+    print(f"aiohttp downloader (advanced proxy): {'✓ passed' if aiohttp_result else '✗ failed'}")
+    print(f"httpx downloader (simple proxy): {'✓ passed' if httpx_result else '✗ failed'}")
+    print(f"curl-cffi downloader (advanced proxy): {'✓ passed' if curl_cffi_result else '✗ failed'}")
 
     overall_result = all([aiohttp_result, httpx_result, curl_cffi_result])
     print(f"\nOverall result: {'✓ all downloaders work with the proxy middleware' if overall_result else '✗ some downloaders are incompatible'}")
tests/test_edge_cases.py CHANGED

@@ -112,13 +112,15 @@ async def test_redis_queue_edge_cases():
     print(" special-character URL test passed")
 
     # 4. test priority (higher priority values should be dequeued first)
-
-
+    # note: the Request constructor negates the incoming priority value when storing it,
+    # so a request with priority=1000 is actually stored as -1000, and priority=-1000 as 1000
+    high_priority_request = Request(url="https://high-priority.com", priority=1000)  # stored as -1000
+    low_priority_request = Request(url="https://low-priority.com", priority=-1000)  # stored as 1000
 
-    await queue.put(high_priority_request) #
-    await queue.put(low_priority_request)
+    await queue.put(high_priority_request, priority=high_priority_request.priority)  # use the stored priority value
+    await queue.put(low_priority_request, priority=low_priority_request.priority)  # use the stored priority value
 
-    #
+    # the higher-priority value should be dequeued first (score = priority, and the smallest score is dequeued first)
     first = await queue.get(timeout=1.0)
     assert first is not None and first.url == "https://high-priority.com", "the higher-priority value should be dequeued first"
     print(" priority test passed")
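The comments in this hunk encode the queue's ordering rule: the Request constructor negates the user-supplied priority, the stored value is used directly as the queue score, and the smallest score is dequeued first. A framework-free illustration of that rule using heapq (not crawlo's actual queue implementation):

import heapq

# score = stored priority; the smallest score is dequeued first.
# A user-facing priority of 1000 is stored as -1000, so it wins the queue.
queue = []
heapq.heappush(queue, (-1000, "https://high-priority.com"))  # user priority 1000
heapq.heappush(queue, (1000, "https://low-priority.com"))    # user priority -1000

score, url = heapq.heappop(queue)
assert url == "https://high-priority.com"
print(score, url)  # -> -1000 https://high-priority.com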
tests/test_encoding_core.py ADDED

@@ -0,0 +1,57 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Core tests for encoding detection
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.network.response import Response
+
+
+def test_encoding_detection():
+    """Test the core encoding-detection functionality"""
+    print("Testing core encoding detection...")
+
+    # test Request encoding priority
+    class MockRequest:
+        encoding = 'gbk'
+
+    response1 = Response(
+        url="https://example.com",
+        body=b'',
+        request=MockRequest()
+    )
+    print(f"Request encoding priority: {response1.encoding}")
+
+    # test Content-Type header encoding
+    response2 = Response(
+        url="https://example.com",
+        body=b'',
+        headers={"content-type": "text/html; charset=iso-8859-1"}
+    )
+    print(f"Content-Type encoding: {response2.encoding}")
+
+    # test the declared-encoding method
+    declared_enc = response2._declared_encoding()
+    print(f"Declared encoding: {declared_enc}")
+
+    # test the default encoding
+    response3 = Response(
+        url="https://example.com",
+        body=b''
+    )
+    print(f"Default encoding: {response3.encoding}")
+
+    # verify the results
+    assert response1.encoding == 'gbk', f"Expected 'gbk', got {response1.encoding}"
+    assert response2.encoding == 'iso-8859-1', f"Expected 'iso-8859-1', got {response2.encoding}"
+    assert declared_enc == 'iso-8859-1', f"Expected 'iso-8859-1', got {declared_enc}"
+    assert response3.encoding == 'utf-8', f"Expected 'utf-8', got {response3.encoding}"
+
+    print("All tests passed!")
+
+
+if __name__ == '__main__':
+    test_encoding_detection()
tests/test_encoding_detection.py ADDED

@@ -0,0 +1,127 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the optimized Response encoding detection
+"""
+import unittest
+
+# mock part of the Response class's behaviour for testing
+class MockResponse:
+    def __init__(self, body, headers=None, request=None):
+        self.body = body
+        self.headers = headers or {}
+        self.request = request
+        self._DEFAULT_ENCODING = "ascii"
+
+    def _determine_encoding(self):
+        """Simplified encoding detection"""
+        # 1. prefer the declared encoding
+        declared_encoding = self._declared_encoding()
+        if declared_encoding:
+            return declared_encoding
+
+        # 2. fall back to utf-8
+        return 'utf-8'
+
+    def _declared_encoding(self):
+        """Return the declared encoding"""
+        # 1. encoding specified on the Request
+        if self.request and getattr(self.request, 'encoding', None):
+            return self.request.encoding
+
+        # 2. detect it from the Content-Type header
+        content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+        if content_type:
+            import re
+            charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+            if charset_match:
+                return charset_match.group(1).lower()
+
+        return None
+
+
+class TestDetermineEncoding(unittest.TestCase):
+    """Encoding-detection test cases"""
+
+    def test_request_encoding_priority(self):
+        """Test Request encoding priority"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = MockResponse(b'', request=MockRequest())
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'gbk')
+
+    def test_content_type_encoding(self):
+        """Test encoding detection from the Content-Type header"""
+        response = MockResponse(
+            b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'iso-8859-1')
+
+    def test_default_encoding(self):
+        """Test the default encoding"""
+        response = MockResponse(b'')
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'utf-8')
+
+    def test_case_insensitive_content_type(self):
+        """Test that the Content-Type header is case-insensitive"""
+        response = MockResponse(
+            b'',
+            headers={"Content-Type": "text/html; CHARSET=UTF-8"}
+        )
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'utf-8')
+
+    def test_declared_encoding_with_request(self):
+        """Test declared encoding - Request priority"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = MockResponse(b'', request=MockRequest())
+        declared_encoding = response._declared_encoding()
+        self.assertEqual(declared_encoding, 'gbk')
+
+    def test_declared_encoding_with_content_type(self):
+        """Test declared encoding - Content-Type"""
+        response = MockResponse(
+            b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        declared_encoding = response._declared_encoding()
+        self.assertEqual(declared_encoding, 'iso-8859-1')
+
+
+def test_encoding_detection():
+    """Simple check of the encoding-detection functionality"""
+    print("Testing encoding detection...")
+
+    # test Request encoding priority
+    class MockRequest:
+        encoding = 'gbk'
+
+    response1 = MockResponse(b'', request=MockRequest())
+    encoding1 = response1._determine_encoding()
+    print(f"Request encoding priority: {encoding1}")
+
+    # test Content-Type header encoding
+    response2 = MockResponse(
+        b'',
+        headers={"content-type": "text/html; charset=iso-8859-1"}
+    )
+    encoding2 = response2._determine_encoding()
+    print(f"Content-Type encoding: {encoding2}")
+
+    # test the default encoding
+    response3 = MockResponse(b'')
+    encoding3 = response3._determine_encoding()
+    print(f"Default encoding: {encoding3}")
+
+    print("Encoding detection test finished!")
+
+
+if __name__ == '__main__':
+    test_encoding_detection()
tests/test_factory_compatibility.py ADDED

@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Tests for CrawloConfig factory-mode compatibility
+"""
+
+import sys
+import os
+import traceback
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.config import CrawloConfig
+
+
+def test_standalone_factory():
+    """Test the standalone-mode factory function"""
+    print("Testing the standalone-mode factory function...")
+
+    try:
+        # create a standalone-mode configuration
+        config = CrawloConfig.standalone(
+            project_name='ofweek_standalone',
+            concurrency=8,
+            download_delay=1.0
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+        # verify the configuration
+        assert config.get('RUN_MODE') == 'standalone'
+        assert config.get('QUEUE_TYPE') == 'memory'
+        assert config.get('PROJECT_NAME') == 'ofweek_standalone'
+        assert config.get('CONCURRENCY') == 8
+        assert config.get('DOWNLOAD_DELAY') == 1.0
+
+        print("✅ standalone-mode factory function test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ standalone-mode factory function test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_distributed_factory():
+    """Test the distributed-mode factory function"""
+    print("\nTesting the distributed-mode factory function...")
+
+    try:
+        # create a distributed-mode configuration
+        config = CrawloConfig.distributed(
+            redis_host='127.0.0.1',
+            redis_port=6379,
+            project_name='ofweek_distributed',
+            concurrency=16,
+            download_delay=0.5
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+        print(f"REDIS_HOST: {config.get('REDIS_HOST')}")
+        print(f"REDIS_PORT: {config.get('REDIS_PORT')}")
+
+        # verify the configuration
+        assert config.get('RUN_MODE') == 'distributed'
+        assert config.get('QUEUE_TYPE') == 'redis'
+        assert config.get('PROJECT_NAME') == 'ofweek_distributed'
+        assert config.get('CONCURRENCY') == 16
+        assert config.get('DOWNLOAD_DELAY') == 0.5
+        assert config.get('REDIS_HOST') == '127.0.0.1'
+        assert config.get('REDIS_PORT') == 6379
+
+        print("✅ distributed-mode factory function test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ distributed-mode factory function test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_auto_factory():
+    """Test the auto-mode factory function"""
+    print("\nTesting the auto-mode factory function...")
+
+    try:
+        # create an auto-mode configuration
+        config = CrawloConfig.auto(
+            project_name='ofweek_auto',
+            concurrency=12,
+            download_delay=0.8
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+        # verify the configuration
+        assert config.get('RUN_MODE') == 'auto'
+        assert config.get('QUEUE_TYPE') == 'auto'
+        assert config.get('PROJECT_NAME') == 'ofweek_auto'
+        assert config.get('CONCURRENCY') == 12
+        assert config.get('DOWNLOAD_DELAY') == 0.8
+
+        print("✅ auto-mode factory function test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ auto-mode factory function test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_config_to_dict():
+    """Test converting the configuration to a dict"""
+    print("\nTesting configuration-to-dict conversion...")
+
+    try:
+        # create a configuration
+        config = CrawloConfig.standalone(
+            project_name='test_project',
+            concurrency=4
+        )
+
+        # convert it to a dict
+        config_dict = config.to_dict()
+
+        print(f"Dict conversion succeeded")
+        print(f"Number of dict keys: {len(config_dict)}")
+
+        # verify the key configuration entries
+        assert 'RUN_MODE' in config_dict
+        assert 'QUEUE_TYPE' in config_dict
+        assert 'PROJECT_NAME' in config_dict
+        assert 'CONCURRENCY' in config_dict
+
+        print("✅ configuration-to-dict test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ configuration-to-dict test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Main entry point"""
+    print("Starting CrawloConfig factory-mode compatibility tests...")
+    print("=" * 50)
+
+    tests = [
+        test_standalone_factory,
+        test_distributed_factory,
+        test_auto_factory,
+        test_config_to_dict,
+    ]
+
+    passed = 0
+    total = len(tests)
+
+    for test_func in tests:
+        try:
+            if test_func():
+                passed += 1
+                print(f"✓ {test_func.__name__} passed")
+            else:
+                print(f"✗ {test_func.__name__} failed")
+        except Exception as e:
+            print(f"✗ {test_func.__name__} raised an exception: {e}")
+        print()
+
+    print("=" * 50)
+    print(f"Test results: {passed}/{total} passed")
+
+    if passed == total:
+        print("All tests passed! CrawloConfig factory-mode compatibility is intact.")
+        return 0
+    else:
+        print("Some tests failed; please check the implementation.")
+        return 1
+
+
+if __name__ == "__main__":
+    exit_code = main()
+    exit(exit_code)