crawlo 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlo/__init__.py +9 -4
- crawlo/__version__.py +1 -1
- crawlo/commands/run.py +1 -1
- crawlo/core/__init__.py +8 -2
- crawlo/core/scheduler.py +2 -2
- crawlo/extension/log_interval.py +44 -7
- crawlo/initialization/__init__.py +6 -2
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/mode_manager.py +13 -7
- crawlo/pipelines/bloom_dedup_pipeline.py +5 -15
- crawlo/pipelines/database_dedup_pipeline.py +5 -8
- crawlo/pipelines/memory_dedup_pipeline.py +5 -15
- crawlo/pipelines/redis_dedup_pipeline.py +2 -15
- crawlo/project.py +18 -7
- crawlo/settings/default_settings.py +114 -150
- crawlo/settings/setting_manager.py +14 -9
- crawlo/tools/distributed_coordinator.py +4 -8
- crawlo/utils/fingerprint.py +123 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/METADATA +1 -1
- {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/RECORD +45 -29
- examples/test_project/__init__.py +7 -0
- examples/test_project/run.py +35 -0
- examples/test_project/test_project/__init__.py +4 -0
- examples/test_project/test_project/items.py +18 -0
- examples/test_project/test_project/middlewares.py +119 -0
- examples/test_project/test_project/pipelines.py +97 -0
- examples/test_project/test_project/settings.py +170 -0
- examples/test_project/test_project/spiders/__init__.py +10 -0
- examples/test_project/test_project/spiders/of_week_dis.py +144 -0
- tests/debug_framework_logger.py +1 -1
- tests/debug_log_levels.py +1 -1
- tests/test_all_pipeline_fingerprints.py +134 -0
- tests/test_default_header_middleware.py +242 -87
- tests/test_fingerprint_consistency.py +136 -0
- tests/test_fingerprint_simple.py +52 -0
- tests/test_framework_logger.py +1 -1
- tests/test_framework_startup.py +1 -1
- tests/test_hash_performance.py +100 -0
- tests/test_mode_change.py +1 -1
- tests/test_offsite_middleware.py +185 -162
- tests/test_offsite_middleware_simple.py +204 -0
- tests/test_pipeline_fingerprint_consistency.py +87 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/WHEEL +0 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/top_level.txt +0 -0
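Editorial note: most of the churn above centers on fingerprinting. crawlo/utils/fingerprint.py is new (+123 lines), all four dedup pipelines lose more code than they gain, and several fingerprint-consistency test files appear, which suggests the pipelines now share a single fingerprint helper. As a rough, hedged sketch of the construction those tests exercise (visible in tests/test_hash_performance.py below) — the function name and signature here are illustrative, not crawlo's actual API:

import hashlib

def item_fingerprint(data, algorithm="sha256"):
    """Illustrative re-creation of the fingerprint scheme the 1.4.1 tests
    exercise: sort the item's fields, join them as 'k=v' pairs with '|',
    and hash the result. Not crawlo's actual helper."""
    # namedtuple-like objects expose _asdict(); anything else is stringified
    data_dict = data._asdict() if hasattr(data, "_asdict") else {"__data__": str(data)}
    pairs = (f"{k}={v}" for k, v in sorted(data_dict.items()) if v is not None)
    return hashlib.new(algorithm, "|".join(pairs).encode("utf-8")).hexdigest()

Because fields are sorted before joining, two items with equal values yield the same digest regardless of field order.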
tests/test_hash_performance.py
ADDED

@@ -0,0 +1,100 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+SHA256 vs MD5 performance comparison
+=====================
+Measures how the two hash algorithms compare in a crawler scenario
+"""
+
+import hashlib
+import time
+from collections import namedtuple
+
+
+# Test data model
+TestItem = namedtuple('TestItem', ['title', 'url', 'content', 'price', 'tags'])
+
+def create_test_items(count=10000):
+    """Create test data items"""
+    items = []
+    for i in range(count):
+        item = TestItem(
+            title=f"Test Title {i}",
+            url=f"https://example.com/page/{i}",
+            content=f"This is test content number {i} with some additional text to make it longer",
+            price=99.99 + i,
+            tags=[f"tag{j}" for j in range(5)]
+        )
+        items.append(item)
+    return items
+
+
+def md5_fingerprint(data):
+    """Generate a fingerprint with MD5"""
+    if hasattr(data, '_asdict'):
+        data_dict = data._asdict()
+    else:
+        data_dict = {'__data__': str(data)}
+
+    sorted_items = sorted(data_dict.items())
+    fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
+    return hashlib.md5(fingerprint_string.encode('utf-8')).hexdigest()
+
+
+def sha256_fingerprint(data):
+    """Generate a fingerprint with SHA256"""
+    if hasattr(data, '_asdict'):
+        data_dict = data._asdict()
+    else:
+        data_dict = {'__data__': str(data)}
+
+    sorted_items = sorted(data_dict.items())
+    fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
+    return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+
+
+def performance_test():
+    """Run the benchmark"""
+    print("Starting hash algorithm performance test...")
+    print("=" * 50)
+
+    # Build the test data
+    test_items = create_test_items(10000)
+
+    # Time MD5
+    start_time = time.time()
+    md5_results = []
+    for item in test_items:
+        fingerprint = md5_fingerprint(item)
+        md5_results.append(fingerprint)
+    md5_time = time.time() - start_time
+
+    # Time SHA256
+    start_time = time.time()
+    sha256_results = []
+    for item in test_items:
+        fingerprint = sha256_fingerprint(item)
+        sha256_results.append(fingerprint)
+    sha256_time = time.time() - start_time
+
+    # Report the results
+    print(f"Items tested: {len(test_items)}")
+    print(f"MD5 time: {md5_time:.4f} s")
+    print(f"SHA256 time: {sha256_time:.4f} s")
+    print(f"Performance difference: {((sha256_time - md5_time) / md5_time * 100):.2f}%")
+
+    # Verify fingerprint lengths
+    print("\nFingerprint lengths:")
+    print(f"MD5 fingerprint: {len(md5_results[0])} characters")
+    print(f"SHA256 fingerprint: {len(sha256_results[0])} characters")
+
+    # Check for duplicate fingerprints (there should be none)
+    md5_unique = len(set(md5_results))
+    sha256_unique = len(set(sha256_results))
+    print(f"\nUnique fingerprints:")
+    print(f"MD5: {md5_unique}/{len(test_items)} ({md5_unique/len(test_items)*100:.2f}%)")
+    print(f"SHA256: {sha256_unique}/{len(test_items)} ({sha256_unique/len(test_items)*100:.2f}%)")
+
+
+if __name__ == '__main__':
+    performance_test()
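One editorial caveat on the benchmark above: time.time() is a wall clock with coarse resolution, so for sub-second deltas like these, time.perf_counter() is the safer choice. A minimal variant of the timing loop, assuming the create_test_items/md5_fingerprint/sha256_fingerprint functions from the file above are in scope:

import time

def timed(fn, items):
    """Time fn over items with a monotonic, high-resolution clock."""
    start = time.perf_counter()
    results = [fn(item) for item in items]
    return time.perf_counter() - start, results

# items = create_test_items(10000)
# md5_time, _ = timed(md5_fingerprint, items)
# sha256_time, _ = timed(sha256_fingerprint, items)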
tests/test_mode_change.py
CHANGED

@@ -25,7 +25,7 @@ def test_mode_log_level():
 
     try:
         # Initialize the framework
-        from crawlo.
+        from crawlo.initialization import initialize_framework
         settings = initialize_framework(test_settings)
 
         print(f"Settings initialized: {settings.get('PROJECT_NAME')}")
tests/test_offsite_middleware.py
CHANGED

@@ -2,16 +2,27 @@
 # -*- coding:UTF-8 -*-
 """
 OffsiteMiddleware test file
-
+Tests the offsite-filtering middleware, especially the multiple-domain case
 """
 
-import asyncio
 import unittest
 from unittest.mock import Mock, patch
 
 from crawlo.middleware.offsite import OffsiteMiddleware
-from crawlo.exceptions import NotConfiguredError, IgnoreRequestError
 from crawlo.settings.setting_manager import SettingManager
+from crawlo.exceptions import IgnoreRequestError, NotConfiguredError
+
+
+class MockStats:
+    """Mock Stats class for verifying statistics"""
+    def __init__(self):
+        self.stats = {}
+
+    def inc_value(self, key, value=1):
+        if key in self.stats:
+            self.stats[key] += value
+        else:
+            self.stats[key] = value
 
 
 class MockLogger:

@@ -34,18 +45,6 @@ class MockLogger:
         self.logs.append(('error', msg))
 
 
-class MockStats:
-    """Mock Stats class for verifying statistics"""
-    def __init__(self):
-        self.stats = {}
-
-    def inc_value(self, key, value=1):
-        if key in self.stats:
-            self.stats[key] += value
-        else:
-            self.stats[key] = value
-
-
 class TestOffsiteMiddleware(unittest.TestCase):
     """OffsiteMiddleware test class"""
 

@@ -58,165 +57,189 @@ class TestOffsiteMiddleware(unittest.TestCase):
         self.crawler = Mock()
         self.crawler.settings = self.settings
         self.crawler.stats = MockStats()
-        self.crawler.logger = Mock()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.settings.set('
-
-        mock_get_logger.return_value = MockLogger('OffsiteMiddleware')
-
-        # Should create the instance normally
-        middleware = OffsiteMiddleware.create_instance(self.crawler)
-        self.assertIsInstance(middleware, OffsiteMiddleware)
-        self.assertEqual(len(middleware.allowed_domains), 2)
-        self.assertIn('example.com', middleware.allowed_domains)
-        self.assertIn('test.com', middleware.allowed_domains)
-
-    def test_is_offsite_request_with_valid_domain(self):
-        """Offsite check for a request in an allowed domain"""
-        # Configure the allowed domains
-        self.settings.set('ALLOWED_DOMAINS', ['example.com'])
-        self.settings.set('LOG_LEVEL', 'INFO')
-
-        # Create the middleware instance
-        middleware = OffsiteMiddleware(
-            stats=MockStats(),
-            log_level='INFO',
-            allowed_domains=['example.com']
-        )
-        middleware._compile_domains()
-
-        # Build a request
-        request = Mock()
-        request.url = 'http://example.com/page'
-
-        # Should not be offsite
-        self.assertFalse(middleware._is_offsite_request(request))
-
-    def test_is_offsite_request_with_subdomain(self):
-        """Offsite check for a subdomain request"""
-        # Configure the allowed domains
-        self.settings.set('ALLOWED_DOMAINS', ['example.com'])
-        self.settings.set('LOG_LEVEL', 'INFO')
-
-        # Create the middleware instance
-        middleware = OffsiteMiddleware(
-            stats=MockStats(),
-            log_level='INFO',
-            allowed_domains=['example.com']
-        )
-        middleware._compile_domains()
-
-        # Build a request (subdomain)
-        request = Mock()
-        request.url = 'http://sub.example.com/page'
-
-        # Should not be offsite (subdomains are allowed)
-        self.assertFalse(middleware._is_offsite_request(request))
-
-    def test_is_offsite_request_with_invalid_domain(self):
-        """Offsite check for a disallowed domain"""
-        # Configure the allowed domains
-        self.settings.set('ALLOWED_DOMAINS', ['example.com'])
-        self.settings.set('LOG_LEVEL', 'INFO')
-
-        # Create the middleware instance
-        middleware = OffsiteMiddleware(
-            stats=MockStats(),
-            log_level='INFO',
-            allowed_domains=['example.com']
-        )
-        middleware._compile_domains()
-
-        # Build a request
-        request = Mock()
-        request.url = 'http://other.com/page'
-
-        # Should be offsite
-        self.assertTrue(middleware._is_offsite_request(request))
-
-    def test_is_offsite_request_with_invalid_url(self):
-        """Offsite check for an invalid URL"""
-        # Configure the allowed domains
-        self.settings.set('ALLOWED_DOMAINS', ['example.com'])
-        self.settings.set('LOG_LEVEL', 'INFO')
-
-        # Create the middleware instance
-        middleware = OffsiteMiddleware(
-            stats=MockStats(),
-            log_level='INFO',
-            allowed_domains=['example.com']
-        )
-        middleware._compile_domains()
-
-        # Build a request (invalid URL)
-        request = Mock()
-        request.url = 'invalid-url'
-
-        # Should be offsite
-        self.assertTrue(middleware._is_offsite_request(request))
-
-    @patch('crawlo.utils.log.get_logger')
-    def test_process_request_with_offsite_request(self, mock_get_logger):
-        """Handle an offsite request"""
-        # Configure the allowed domains
-        self.settings.set('ALLOWED_DOMAINS', ['example.com'])
+    def test_middleware_initialization_without_domains(self):
+        """Middleware initialization when ALLOWED_DOMAINS is not set"""
+        # ALLOWED_DOMAINS deliberately left unset
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            # Should raise NotConfiguredError
+            with self.assertRaises(NotConfiguredError) as context:
+                OffsiteMiddleware.create_instance(self.crawler)
+
+            self.assertIn("未配置ALLOWED_DOMAINS,OffsiteMiddleware已禁用", str(context.exception))
+
+    def test_middleware_initialization_with_global_domains(self):
+        """Middleware initialization from the global ALLOWED_DOMAINS setting"""
+        # Set the global ALLOWED_DOMAINS
+        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
         self.settings.set('LOG_LEVEL', 'DEBUG')
 
-
-
-
-
-
-
-
-
-
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            # Should create the instance normally
+            middleware = OffsiteMiddleware.create_instance(self.crawler)
+            self.assertIsInstance(middleware, OffsiteMiddleware)
+            self.assertEqual(len(middleware.allowed_domains), 2)
+            self.assertIn('ee.ofweek.com', middleware.allowed_domains)
+            self.assertIn('www.baidu.com', middleware.allowed_domains)
+
+    def test_middleware_initialization_with_spider_domains(self):
+        """Middleware initialization from the spider's allowed_domains attribute"""
+        # Set allowed_domains on the spider instance
+        spider = Mock()
+        spider.allowed_domains = ['ee.ofweek.com', 'www.baidu.com']
+
+        self.crawler.spider = spider
+        self.settings.set('LOG_LEVEL', 'DEBUG')
 
-
-        with
-
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            # Should create the instance normally, using the spider's allowed_domains
+            middleware = OffsiteMiddleware.create_instance(self.crawler)
+            self.assertIsInstance(middleware, OffsiteMiddleware)
+            self.assertEqual(len(middleware.allowed_domains), 2)
+            self.assertIn('ee.ofweek.com', middleware.allowed_domains)
+            self.assertIn('www.baidu.com', middleware.allowed_domains)
+
+    def test_is_offsite_request_with_allowed_domains(self):
+        """Requests inside the allowed domains"""
+        # Set ALLOWED_DOMAINS
+        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+        self.settings.set('LOG_LEVEL', 'DEBUG')
 
-
-
-
-
-
-
-
-
-
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+            # Build allowed requests
+            request1 = Mock()
+            request1.url = 'https://ee.ofweek.com/news/article1.html'
+
+            request2 = Mock()
+            request2.url = 'https://www.baidu.com/s?wd=test'
+
+            # These requests should not be treated as offsite
+            self.assertFalse(middleware._is_offsite_request(request1))
+            self.assertFalse(middleware._is_offsite_request(request2))
+
+    def test_is_offsite_request_with_subdomains(self):
+        """Requests to subdomains"""
+        # Set ALLOWED_DOMAINS
+        self.settings.set('ALLOWED_DOMAINS', ['ofweek.com', 'baidu.com'])
         self.settings.set('LOG_LEVEL', 'DEBUG')
 
-
-
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+            # Build subdomain requests
+            request1 = Mock()
+            request1.url = 'https://news.ofweek.com/article1.html'
+
+            request2 = Mock()
+            request2.url = 'https://map.baidu.com/location'
+
+            # These should not be treated as offsite (the root domains are allowed)
+            self.assertFalse(middleware._is_offsite_request(request1))
+            self.assertFalse(middleware._is_offsite_request(request2))
+
+    def test_is_offsite_request_with_disallowed_domains(self):
+        """Requests to disallowed domains"""
+        # Set ALLOWED_DOMAINS
+        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+        self.settings.set('LOG_LEVEL', 'DEBUG')
 
-
-        middleware =
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+            # Build disallowed requests
+            request1 = Mock()
+            request1.url = 'https://www.google.com/search?q=test'
+
+            request2 = Mock()
+            request2.url = 'https://github.com/user/repo'
+
+            # These requests should be treated as offsite
+            self.assertTrue(middleware._is_offsite_request(request1))
+            self.assertTrue(middleware._is_offsite_request(request2))
+
+    def test_process_request_with_allowed_domain(self):
+        """process_request with a request inside the allowed domains"""
+        # Set ALLOWED_DOMAINS
+        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+        self.settings.set('LOG_LEVEL', 'DEBUG')
 
-
-
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+            # Build an allowed request
+            request = Mock()
+            request.url = 'https://ee.ofweek.com/news/article1.html'
+            spider = Mock()
+
+            # Processing the request should not raise
+            result = middleware.process_request(request, spider)
+            self.assertIsNone(result)  # None means the request is allowed through
+
+            # No statistics counter should have been incremented
+            self.assertNotIn('offsite_request_count', self.crawler.stats.stats)
+
+    def test_process_request_with_disallowed_domain(self):
+        """process_request with a request to a disallowed domain"""
+        # Set ALLOWED_DOMAINS
+        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+        self.settings.set('LOG_LEVEL', 'DEBUG')
 
-
-
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+            # Build a disallowed request
+            request = Mock()
+            request.url = 'https://www.google.com/search?q=test'
+            spider = Mock()
+
+            # Processing the request should raise IgnoreRequestError
+            with self.assertRaises(IgnoreRequestError) as context:
+                middleware.process_request(request, spider)
+
+            self.assertIn("站外请求被过滤", str(context.exception))
+
+            # The statistics counters should have been incremented
+            self.assertIn('offsite_request_count', self.crawler.stats.stats)
+            self.assertEqual(self.crawler.stats.stats['offsite_request_count'], 1)
+            self.assertIn('offsite_request_count/www.google.com', self.crawler.stats.stats)
+
+    def test_process_request_with_invalid_url(self):
+        """process_request with an invalid URL"""
+        # Set ALLOWED_DOMAINS
+        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+        self.settings.set('LOG_LEVEL', 'DEBUG')
 
-
-
+        logger = MockLogger('OffsiteMiddleware')
+        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+            middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+            # Build a request with an invalid URL
+            request = Mock()
+            request.url = 'not_a_valid_url'
+            spider = Mock()
+
+            # Processing the request should raise IgnoreRequestError
+            with self.assertRaises(IgnoreRequestError) as context:
+                middleware.process_request(request, spider)
+
+            self.assertIn("站外请求被过滤", str(context.exception))
+
+            # The statistics counters should have been incremented
+            self.assertIn('offsite_request_count', self.crawler.stats.stats)
+            self.assertEqual(self.crawler.stats.stats['offsite_request_count'], 1)
+            self.assertIn('offsite_request_count/invalid_url', self.crawler.stats.stats)
 
 
 if __name__ == '__main__':
+    # Create an OffsiteMiddleware instance directly for testing, bypassing create_instance's extra logic
     unittest.main()
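Read together, the rewritten tests pin down the matching rule OffsiteMiddleware is expected to implement: a request is on-site when its host equals an allowed domain or ends with '.' plus an allowed domain (so news.ofweek.com passes under ofweek.com but not under ee.ofweek.com), and a URL with no parseable host is always offsite. A standalone sketch of that rule, assuming nothing about the middleware's internals:

from urllib.parse import urlparse

def is_offsite(url, allowed_domains):
    """True if url falls outside allowed_domains, per the rule the tests encode."""
    host = urlparse(url).hostname
    if not host:  # unparseable or schemeless URLs count as offsite
        return True
    return not any(host == d or host.endswith("." + d) for d in allowed_domains)

# is_offsite('https://news.ofweek.com/a.html', ['ofweek.com'])     -> False
# is_offsite('https://news.ofweek.com/a.html', ['ee.ofweek.com'])  -> True
# is_offsite('not_a_valid_url', ['ee.ofweek.com'])                 -> True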