crawlo 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +9 -4
- crawlo/__version__.py +1 -1
- crawlo/core/__init__.py +8 -2
- crawlo/core/scheduler.py +2 -2
- crawlo/downloader/aiohttp_downloader.py +7 -2
- crawlo/extension/log_interval.py +44 -7
- crawlo/initialization/__init__.py +6 -2
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/mode_manager.py +13 -7
- crawlo/pipelines/bloom_dedup_pipeline.py +5 -15
- crawlo/pipelines/database_dedup_pipeline.py +5 -8
- crawlo/pipelines/memory_dedup_pipeline.py +5 -15
- crawlo/pipelines/redis_dedup_pipeline.py +2 -15
- crawlo/project.py +18 -7
- crawlo/settings/default_settings.py +114 -150
- crawlo/settings/setting_manager.py +14 -9
- crawlo/tools/distributed_coordinator.py +4 -8
- crawlo/utils/fingerprint.py +123 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/METADATA +1 -1
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/RECORD +45 -29
- examples/test_project/__init__.py +7 -0
- examples/test_project/run.py +35 -0
- examples/test_project/test_project/__init__.py +4 -0
- examples/test_project/test_project/items.py +18 -0
- examples/test_project/test_project/middlewares.py +119 -0
- examples/test_project/test_project/pipelines.py +97 -0
- examples/test_project/test_project/settings.py +170 -0
- examples/test_project/test_project/spiders/__init__.py +10 -0
- examples/test_project/test_project/spiders/of_week_dis.py +144 -0
- tests/debug_framework_logger.py +1 -1
- tests/debug_log_levels.py +1 -1
- tests/test_all_pipeline_fingerprints.py +134 -0
- tests/test_default_header_middleware.py +242 -87
- tests/test_fingerprint_consistency.py +136 -0
- tests/test_fingerprint_simple.py +52 -0
- tests/test_framework_logger.py +1 -1
- tests/test_framework_startup.py +1 -1
- tests/test_hash_performance.py +100 -0
- tests/test_mode_change.py +1 -1
- tests/test_offsite_middleware.py +185 -162
- tests/test_offsite_middleware_simple.py +204 -0
- tests/test_pipeline_fingerprint_consistency.py +87 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/WHEEL +0 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/top_level.txt +0 -0
|
@@ -2,15 +2,15 @@
|
|
|
2
2
|
# -*- coding:UTF-8 -*-
|
|
3
3
|
"""
|
|
4
4
|
DefaultHeaderMiddleware 测试文件
|
|
5
|
-
|
|
5
|
+
用于测试默认请求头中间件的功能,包括随机更换header功能
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import unittest
|
|
9
9
|
from unittest.mock import Mock, patch
|
|
10
10
|
|
|
11
11
|
from crawlo.middleware.default_header import DefaultHeaderMiddleware
|
|
12
|
-
from crawlo.exceptions import NotConfiguredError
|
|
13
12
|
from crawlo.settings.setting_manager import SettingManager
|
|
13
|
+
from crawlo.exceptions import NotConfiguredError
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class MockLogger:
|
|
@@ -32,6 +32,9 @@ class MockLogger:
|
|
|
32
32
|
def error(self, msg):
|
|
33
33
|
self.logs.append(('error', msg))
|
|
34
34
|
|
|
35
|
+
def isEnabledFor(self, level):
|
|
36
|
+
return True
|
|
37
|
+
|
|
35
38
|
|
|
36
39
|
class TestDefaultHeaderMiddleware(unittest.TestCase):
|
|
37
40
|
"""DefaultHeaderMiddleware 测试类"""
|
|
@@ -40,119 +43,271 @@ class TestDefaultHeaderMiddleware(unittest.TestCase):
|
|
|
40
43
|
"""测试前准备"""
|
|
41
44
|
# 创建设置管理器
|
|
42
45
|
self.settings = SettingManager()
|
|
46
|
+
|
|
47
|
+
def test_middleware_initialization_without_config(self):
|
|
48
|
+
"""测试没有配置时中间件初始化"""
|
|
49
|
+
# 创建一个模拟的crawler对象
|
|
50
|
+
crawler = Mock()
|
|
51
|
+
crawler.settings = self.settings
|
|
43
52
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
self.settings.set('USER_AGENTS', [])
|
|
55
|
-
self.settings.set('RANDOM_HEADERS', {})
|
|
56
|
-
self.settings.set('LOG_LEVEL', 'INFO')
|
|
57
|
-
|
|
58
|
-
mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')
|
|
59
|
-
|
|
60
|
-
# 应该抛出NotConfiguredError异常
|
|
61
|
-
with self.assertRaises(NotConfiguredError):
|
|
62
|
-
DefaultHeaderMiddleware.create_instance(self.crawler)
|
|
63
|
-
|
|
64
|
-
@patch('crawlo.utils.log.get_logger')
|
|
65
|
-
def test_middleware_initialization_with_default_headers(self, mock_get_logger):
|
|
66
|
-
"""测试配置默认请求头时中间件初始化"""
|
|
53
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
54
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
55
|
+
# 应该抛出NotConfiguredError异常
|
|
56
|
+
with self.assertRaises(NotConfiguredError) as context:
|
|
57
|
+
DefaultHeaderMiddleware.create_instance(crawler)
|
|
58
|
+
|
|
59
|
+
self.assertIn("未配置DEFAULT_REQUEST_HEADERS、USER_AGENT或随机头部配置,DefaultHeaderMiddleware已禁用", str(context.exception))
|
|
60
|
+
|
|
61
|
+
def test_middleware_initialization_with_default_headers(self):
|
|
62
|
+
"""测试使用默认请求头配置时中间件初始化"""
|
|
67
63
|
# 设置默认请求头
|
|
68
64
|
self.settings.set('DEFAULT_REQUEST_HEADERS', {
|
|
69
|
-
'
|
|
70
|
-
'Accept': '
|
|
65
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
66
|
+
'Accept-Language': 'en-US,en;q=0.5',
|
|
67
|
+
'Accept-Encoding': 'gzip, deflate',
|
|
71
68
|
})
|
|
72
|
-
self.settings.set('LOG_LEVEL', '
|
|
69
|
+
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
73
70
|
|
|
74
|
-
|
|
71
|
+
# 创建一个模拟的crawler对象
|
|
72
|
+
crawler = Mock()
|
|
73
|
+
crawler.settings = self.settings
|
|
75
74
|
|
|
76
|
-
|
|
77
|
-
middleware =
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
75
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
76
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
77
|
+
# 应该正常创建实例
|
|
78
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
79
|
+
|
|
80
|
+
self.assertIsInstance(middleware, DefaultHeaderMiddleware)
|
|
81
|
+
self.assertEqual(len(middleware.headers), 3)
|
|
82
|
+
self.assertIn('Accept', middleware.headers)
|
|
83
|
+
self.assertIn('Accept-Language', middleware.headers)
|
|
84
|
+
self.assertIn('Accept-Encoding', middleware.headers)
|
|
81
85
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
"""测试配置User-Agent时中间件初始化"""
|
|
85
|
-
# 清除默认的请求头配置
|
|
86
|
-
self.settings.set('DEFAULT_REQUEST_HEADERS', {})
|
|
86
|
+
def test_middleware_initialization_with_user_agent(self):
|
|
87
|
+
"""测试使用User-Agent配置时中间件初始化"""
|
|
87
88
|
# 设置User-Agent
|
|
88
|
-
self.settings.set('USER_AGENT', '
|
|
89
|
-
self.settings.set('LOG_LEVEL', '
|
|
89
|
+
self.settings.set('USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
|
|
90
|
+
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
90
91
|
|
|
91
|
-
|
|
92
|
+
# 创建一个模拟的crawler对象
|
|
93
|
+
crawler = Mock()
|
|
94
|
+
crawler.settings = self.settings
|
|
92
95
|
|
|
93
|
-
|
|
94
|
-
middleware =
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
97
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
98
|
+
# 应该正常创建实例
|
|
99
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
100
|
+
|
|
101
|
+
self.assertIsInstance(middleware, DefaultHeaderMiddleware)
|
|
102
|
+
self.assertIn('User-Agent', middleware.headers)
|
|
103
|
+
self.assertEqual(middleware.headers['User-Agent'], 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
|
|
98
104
|
|
|
99
|
-
|
|
100
|
-
|
|
105
|
+
def test_middleware_initialization_with_random_user_agent_enabled(self):
|
|
106
|
+
"""测试启用随机User-Agent时中间件初始化"""
|
|
107
|
+
# 启用随机User-Agent并提供一个User-Agent
|
|
108
|
+
self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
|
|
109
|
+
self.settings.set('USER_AGENTS', ['Test-Agent/1.0']) # 提供一个User-Agent以通过初始化检查
|
|
110
|
+
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
111
|
+
|
|
112
|
+
# 创建一个模拟的crawler对象
|
|
113
|
+
crawler = Mock()
|
|
114
|
+
crawler.settings = self.settings
|
|
115
|
+
|
|
116
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
117
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
118
|
+
# 应该正常创建实例,使用内置User-Agent列表
|
|
119
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
120
|
+
|
|
121
|
+
self.assertIsInstance(middleware, DefaultHeaderMiddleware)
|
|
122
|
+
self.assertTrue(middleware.random_user_agent_enabled)
|
|
123
|
+
# 注意:这里user_agents会被get_user_agents覆盖,所以长度可能不为1
|
|
124
|
+
|
|
125
|
+
def test_middleware_initialization_with_custom_user_agents(self):
|
|
126
|
+
"""测试使用自定义User-Agent列表时中间件初始化"""
|
|
127
|
+
# 设置自定义User-Agent列表
|
|
128
|
+
custom_user_agents = [
|
|
129
|
+
'Custom-Agent/1.0',
|
|
130
|
+
'Custom-Agent/2.0',
|
|
131
|
+
'Custom-Agent/3.0'
|
|
132
|
+
]
|
|
133
|
+
self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
|
|
134
|
+
self.settings.set('USER_AGENTS', custom_user_agents)
|
|
135
|
+
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
136
|
+
|
|
137
|
+
# 创建一个模拟的crawler对象
|
|
138
|
+
crawler = Mock()
|
|
139
|
+
crawler.settings = self.settings
|
|
140
|
+
|
|
141
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
142
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
143
|
+
# 应该正常创建实例,使用自定义User-Agent列表
|
|
144
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
145
|
+
|
|
146
|
+
self.assertIsInstance(middleware, DefaultHeaderMiddleware)
|
|
147
|
+
self.assertTrue(middleware.random_user_agent_enabled)
|
|
148
|
+
self.assertEqual(middleware.user_agents, custom_user_agents)
|
|
149
|
+
|
|
150
|
+
def test_process_request_with_default_headers(self):
|
|
101
151
|
"""测试处理请求时添加默认请求头"""
|
|
102
152
|
# 设置默认请求头
|
|
103
153
|
self.settings.set('DEFAULT_REQUEST_HEADERS', {
|
|
104
|
-
'
|
|
105
|
-
'Accept': '
|
|
154
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
155
|
+
'Accept-Language': 'en-US,en;q=0.5',
|
|
106
156
|
})
|
|
107
157
|
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
108
158
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
|
|
113
|
-
|
|
114
|
-
# 创建请求对象
|
|
115
|
-
request = Mock()
|
|
116
|
-
request.headers = {}
|
|
117
|
-
request.url = 'http://example.com'
|
|
118
|
-
|
|
119
|
-
# 处理请求
|
|
120
|
-
middleware.process_request(request, Mock())
|
|
159
|
+
# 创建一个模拟的crawler对象
|
|
160
|
+
crawler = Mock()
|
|
161
|
+
crawler.settings = self.settings
|
|
121
162
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
163
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
164
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
165
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
166
|
+
|
|
167
|
+
# 创建请求
|
|
168
|
+
request = Mock()
|
|
169
|
+
request.headers = {}
|
|
170
|
+
request.url = 'https://example.com'
|
|
171
|
+
|
|
172
|
+
spider = Mock()
|
|
173
|
+
|
|
174
|
+
# 处理请求
|
|
175
|
+
middleware.process_request(request, spider)
|
|
176
|
+
|
|
177
|
+
# 检查默认请求头是否添加
|
|
178
|
+
self.assertIn('Accept', request.headers)
|
|
179
|
+
self.assertIn('Accept-Language', request.headers)
|
|
180
|
+
self.assertEqual(request.headers['Accept'], 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
|
|
181
|
+
self.assertEqual(request.headers['Accept-Language'], 'en-US,en;q=0.5')
|
|
127
182
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
"""测试处理请求时不覆盖已存在的请求头"""
|
|
183
|
+
def test_process_request_with_existing_headers(self):
|
|
184
|
+
"""测试处理已有请求头的请求"""
|
|
131
185
|
# 设置默认请求头
|
|
132
186
|
self.settings.set('DEFAULT_REQUEST_HEADERS', {
|
|
133
|
-
'
|
|
134
|
-
'Accept': '
|
|
187
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
188
|
+
'Accept-Language': 'en-US,en;q=0.5',
|
|
135
189
|
})
|
|
136
190
|
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
137
191
|
|
|
138
|
-
|
|
139
|
-
|
|
192
|
+
# 创建一个模拟的crawler对象
|
|
193
|
+
crawler = Mock()
|
|
194
|
+
crawler.settings = self.settings
|
|
195
|
+
|
|
196
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
197
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
198
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
199
|
+
|
|
200
|
+
# 创建已有请求头的请求
|
|
201
|
+
request = Mock()
|
|
202
|
+
request.headers = {
|
|
203
|
+
'Accept': 'application/json', # 已存在的请求头
|
|
204
|
+
}
|
|
205
|
+
request.url = 'https://example.com'
|
|
206
|
+
|
|
207
|
+
spider = Mock()
|
|
208
|
+
|
|
209
|
+
# 处理请求
|
|
210
|
+
middleware.process_request(request, spider)
|
|
211
|
+
|
|
212
|
+
# 检查已存在的请求头不被覆盖,新请求头被添加
|
|
213
|
+
self.assertEqual(request.headers['Accept'], 'application/json') # 保持原值
|
|
214
|
+
self.assertIn('Accept-Language', request.headers) # 新添加的请求头
|
|
215
|
+
|
|
216
|
+
def test_process_request_with_random_user_agent(self):
|
|
217
|
+
"""测试处理请求时添加随机User-Agent"""
|
|
218
|
+
# 启用随机User-Agent并设置自定义列表
|
|
219
|
+
custom_user_agents = [
|
|
220
|
+
'Custom-Agent/1.0',
|
|
221
|
+
'Custom-Agent/2.0',
|
|
222
|
+
'Custom-Agent/3.0'
|
|
223
|
+
]
|
|
224
|
+
self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
|
|
225
|
+
self.settings.set('USER_AGENTS', custom_user_agents)
|
|
226
|
+
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
140
227
|
|
|
141
|
-
|
|
228
|
+
# 创建一个模拟的crawler对象
|
|
229
|
+
crawler = Mock()
|
|
230
|
+
crawler.settings = self.settings
|
|
142
231
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
232
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
233
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
234
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
235
|
+
|
|
236
|
+
# 创建没有User-Agent的请求
|
|
237
|
+
request = Mock()
|
|
238
|
+
request.headers = {}
|
|
239
|
+
request.url = 'https://example.com'
|
|
240
|
+
|
|
241
|
+
spider = Mock()
|
|
242
|
+
|
|
243
|
+
# 处理请求
|
|
244
|
+
middleware.process_request(request, spider)
|
|
245
|
+
|
|
246
|
+
# 检查随机User-Agent是否添加
|
|
247
|
+
self.assertIn('User-Agent', request.headers)
|
|
248
|
+
self.assertIn(request.headers['User-Agent'], custom_user_agents)
|
|
249
|
+
|
|
250
|
+
def test_process_request_with_existing_user_agent(self):
|
|
251
|
+
"""测试处理已有User-Agent的请求"""
|
|
252
|
+
# 启用随机User-Agent并设置自定义列表
|
|
253
|
+
custom_user_agents = [
|
|
254
|
+
'Custom-Agent/1.0',
|
|
255
|
+
'Custom-Agent/2.0',
|
|
256
|
+
'Custom-Agent/3.0'
|
|
257
|
+
]
|
|
258
|
+
self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
|
|
259
|
+
self.settings.set('USER_AGENTS', custom_user_agents)
|
|
260
|
+
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
261
|
+
|
|
262
|
+
# 创建一个模拟的crawler对象
|
|
263
|
+
crawler = Mock()
|
|
264
|
+
crawler.settings = self.settings
|
|
265
|
+
|
|
266
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
267
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
268
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
269
|
+
|
|
270
|
+
# 创建已有User-Agent的请求
|
|
271
|
+
existing_ua = 'Existing-Agent/1.0'
|
|
272
|
+
request = Mock()
|
|
273
|
+
request.headers = {
|
|
274
|
+
'User-Agent': existing_ua,
|
|
275
|
+
}
|
|
276
|
+
request.url = 'https://example.com'
|
|
277
|
+
|
|
278
|
+
spider = Mock()
|
|
279
|
+
|
|
280
|
+
# 处理请求
|
|
281
|
+
middleware.process_request(request, spider)
|
|
282
|
+
|
|
283
|
+
# 检查已存在的User-Agent不被覆盖
|
|
284
|
+
self.assertEqual(request.headers['User-Agent'], existing_ua)
|
|
285
|
+
|
|
286
|
+
def test_get_random_user_agent(self):
|
|
287
|
+
"""测试获取随机User-Agent功能"""
|
|
288
|
+
# 设置自定义User-Agent列表
|
|
289
|
+
custom_user_agents = [
|
|
290
|
+
'Custom-Agent/1.0',
|
|
291
|
+
'Custom-Agent/2.0',
|
|
292
|
+
'Custom-Agent/3.0'
|
|
293
|
+
]
|
|
294
|
+
self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
|
|
295
|
+
self.settings.set('USER_AGENTS', custom_user_agents)
|
|
296
|
+
self.settings.set('LOG_LEVEL', 'DEBUG')
|
|
147
297
|
|
|
148
|
-
#
|
|
149
|
-
|
|
298
|
+
# 创建一个模拟的crawler对象
|
|
299
|
+
crawler = Mock()
|
|
300
|
+
crawler.settings = self.settings
|
|
150
301
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
302
|
+
logger = MockLogger('DefaultHeaderMiddleware')
|
|
303
|
+
with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
|
|
304
|
+
middleware = DefaultHeaderMiddleware.create_instance(crawler)
|
|
305
|
+
|
|
306
|
+
# 获取随机User-Agent
|
|
307
|
+
random_ua = middleware._get_random_user_agent()
|
|
308
|
+
|
|
309
|
+
# 检查返回的User-Agent在列表中
|
|
310
|
+
self.assertIn(random_ua, custom_user_agents)
|
|
156
311
|
|
|
157
312
|
|
|
158
313
|
if __name__ == '__main__':
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding:UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
指纹一致性测试
|
|
5
|
+
==============
|
|
6
|
+
验证框架中各组件对相同数据生成一致的指纹
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import unittest
|
|
10
|
+
from unittest.mock import Mock
|
|
11
|
+
|
|
12
|
+
from crawlo import Item
|
|
13
|
+
from crawlo.pipelines.memory_dedup_pipeline import MemoryDedupPipeline
|
|
14
|
+
from crawlo.pipelines.redis_dedup_pipeline import RedisDedupPipeline
|
|
15
|
+
from crawlo.pipelines.bloom_dedup_pipeline import BloomDedupPipeline
|
|
16
|
+
from crawlo.pipelines.database_dedup_pipeline import DatabaseDedupPipeline
|
|
17
|
+
from crawlo.tools.distributed_coordinator import DeduplicationTool
|
|
18
|
+
from crawlo.utils.fingerprint import FingerprintGenerator
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestItem(Item):
|
|
22
|
+
"""测试用数据项类"""
|
|
23
|
+
|
|
24
|
+
def __init__(self, **kwargs):
|
|
25
|
+
super().__init__()
|
|
26
|
+
for key, value in kwargs.items():
|
|
27
|
+
setattr(self, key, value)
|
|
28
|
+
|
|
29
|
+
def to_dict(self):
|
|
30
|
+
"""转换为字典"""
|
|
31
|
+
return {k: v for k, v in self.__dict__.items() if not k.startswith('_')}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class FingerprintConsistencyTest(unittest.TestCase):
|
|
35
|
+
"""指纹一致性测试"""
|
|
36
|
+
|
|
37
|
+
def setUp(self):
|
|
38
|
+
"""测试初始化"""
|
|
39
|
+
# 创建测试数据项
|
|
40
|
+
self.test_item = TestItem(
|
|
41
|
+
title="Test Title",
|
|
42
|
+
url="https://example.com",
|
|
43
|
+
content="Test content",
|
|
44
|
+
price=99.99
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# 创建各去重管道实例
|
|
48
|
+
self.memory_pipeline = MemoryDedupPipeline()
|
|
49
|
+
self.redis_pipeline = RedisDedupPipeline(
|
|
50
|
+
redis_host='localhost',
|
|
51
|
+
redis_port=6379,
|
|
52
|
+
redis_db=0,
|
|
53
|
+
redis_key='test:fingerprints'
|
|
54
|
+
)
|
|
55
|
+
self.bloom_pipeline = BloomDedupPipeline()
|
|
56
|
+
self.database_pipeline = DatabaseDedupPipeline()
|
|
57
|
+
|
|
58
|
+
# 创建去重工具实例
|
|
59
|
+
self.dedup_tool = DeduplicationTool()
|
|
60
|
+
|
|
61
|
+
def test_item_fingerprint_consistency(self):
|
|
62
|
+
"""测试数据项指纹一致性"""
|
|
63
|
+
# 使用各管道生成指纹
|
|
64
|
+
memory_fingerprint = self.memory_pipeline._generate_item_fingerprint(self.test_item)
|
|
65
|
+
redis_fingerprint = self.redis_pipeline._generate_item_fingerprint(self.test_item)
|
|
66
|
+
bloom_fingerprint = self.bloom_pipeline._generate_item_fingerprint(self.test_item)
|
|
67
|
+
database_fingerprint = self.database_pipeline._generate_item_fingerprint(self.test_item)
|
|
68
|
+
|
|
69
|
+
# 使用指纹生成器直接生成指纹
|
|
70
|
+
direct_fingerprint = FingerprintGenerator.item_fingerprint(self.test_item)
|
|
71
|
+
|
|
72
|
+
# 验证所有指纹一致
|
|
73
|
+
self.assertEqual(memory_fingerprint, redis_fingerprint)
|
|
74
|
+
self.assertEqual(memory_fingerprint, bloom_fingerprint)
|
|
75
|
+
self.assertEqual(memory_fingerprint, database_fingerprint)
|
|
76
|
+
self.assertEqual(memory_fingerprint, direct_fingerprint)
|
|
77
|
+
|
|
78
|
+
print(f"Memory Pipeline Fingerprint: {memory_fingerprint}")
|
|
79
|
+
print(f"Redis Pipeline Fingerprint: {redis_fingerprint}")
|
|
80
|
+
print(f"Bloom Pipeline Fingerprint: {bloom_fingerprint}")
|
|
81
|
+
print(f"Database Pipeline Fingerprint: {database_fingerprint}")
|
|
82
|
+
print(f"Direct Fingerprint: {direct_fingerprint}")
|
|
83
|
+
|
|
84
|
+
def test_data_fingerprint_consistency(self):
|
|
85
|
+
"""测试通用数据指纹一致性"""
|
|
86
|
+
# 测试字典数据
|
|
87
|
+
test_data = {
|
|
88
|
+
"name": "test",
|
|
89
|
+
"value": 123,
|
|
90
|
+
"nested": {
|
|
91
|
+
"inner": "value"
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
# 使用去重工具生成指纹
|
|
96
|
+
tool_fingerprint = self.dedup_tool.generate_fingerprint(test_data)
|
|
97
|
+
|
|
98
|
+
# 使用指纹生成器生成指纹
|
|
99
|
+
generator_fingerprint = FingerprintGenerator.data_fingerprint(test_data)
|
|
100
|
+
|
|
101
|
+
# 验证指纹一致
|
|
102
|
+
self.assertEqual(tool_fingerprint, generator_fingerprint)
|
|
103
|
+
|
|
104
|
+
print(f"DeduplicationTool Fingerprint: {tool_fingerprint}")
|
|
105
|
+
print(f"FingerprintGenerator Fingerprint: {generator_fingerprint}")
|
|
106
|
+
|
|
107
|
+
def test_fingerprint_stability(self):
|
|
108
|
+
"""测试指纹稳定性"""
|
|
109
|
+
# 创建相同的测试数据项多次
|
|
110
|
+
item1 = TestItem(
|
|
111
|
+
title="Test Title",
|
|
112
|
+
url="https://example.com",
|
|
113
|
+
content="Test content",
|
|
114
|
+
price=99.99
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
item2 = TestItem(
|
|
118
|
+
title="Test Title",
|
|
119
|
+
url="https://example.com",
|
|
120
|
+
content="Test content",
|
|
121
|
+
price=99.99
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
# 生成指纹
|
|
125
|
+
fingerprint1 = FingerprintGenerator.item_fingerprint(item1)
|
|
126
|
+
fingerprint2 = FingerprintGenerator.item_fingerprint(item2)
|
|
127
|
+
|
|
128
|
+
# 验证相同数据生成相同指纹
|
|
129
|
+
self.assertEqual(fingerprint1, fingerprint2)
|
|
130
|
+
|
|
131
|
+
print(f"First fingerprint: {fingerprint1}")
|
|
132
|
+
print(f"Second fingerprint: {fingerprint2}")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == '__main__':
|
|
136
|
+
unittest.main()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding:UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
简化版指纹一致性测试
|
|
5
|
+
==============
|
|
6
|
+
验证框架中各组件对相同数据生成一致的指纹
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
# 添加项目根目录到Python路径
|
|
13
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
14
|
+
|
|
15
|
+
from crawlo.utils.fingerprint import FingerprintGenerator
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_fingerprint_consistency():
|
|
19
|
+
"""测试指纹一致性"""
|
|
20
|
+
# 测试数据
|
|
21
|
+
test_data = {
|
|
22
|
+
"title": "Test Title",
|
|
23
|
+
"url": "https://example.com",
|
|
24
|
+
"content": "Test content",
|
|
25
|
+
"price": 99.99
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
# 使用指纹生成器生成指纹
|
|
29
|
+
fingerprint1 = FingerprintGenerator.data_fingerprint(test_data)
|
|
30
|
+
fingerprint2 = FingerprintGenerator.data_fingerprint(test_data)
|
|
31
|
+
|
|
32
|
+
# 验证相同数据生成相同指纹
|
|
33
|
+
print(f"First fingerprint: {fingerprint1}")
|
|
34
|
+
print(f"Second fingerprint: {fingerprint2}")
|
|
35
|
+
print(f"指纹一致: {fingerprint1 == fingerprint2}")
|
|
36
|
+
|
|
37
|
+
# 测试请求指纹
|
|
38
|
+
method = "GET"
|
|
39
|
+
url = "https://example.com"
|
|
40
|
+
body = b""
|
|
41
|
+
headers = {"User-Agent": "test-agent"}
|
|
42
|
+
|
|
43
|
+
request_fingerprint1 = FingerprintGenerator.request_fingerprint(method, url, body, headers)
|
|
44
|
+
request_fingerprint2 = FingerprintGenerator.request_fingerprint(method, url, body, headers)
|
|
45
|
+
|
|
46
|
+
print(f"\nRequest fingerprint 1: {request_fingerprint1}")
|
|
47
|
+
print(f"Request fingerprint 2: {request_fingerprint2}")
|
|
48
|
+
print(f"请求指纹一致: {request_fingerprint1 == request_fingerprint2}")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
if __name__ == '__main__':
|
|
52
|
+
test_fingerprint_consistency()
|
tests/test_framework_logger.py
CHANGED
|
@@ -7,7 +7,7 @@ import sys
|
|
|
7
7
|
import os
|
|
8
8
|
sys.path.insert(0, '/')
|
|
9
9
|
|
|
10
|
-
from crawlo.
|
|
10
|
+
from crawlo.initialization import initialize_framework, get_framework_initializer
|
|
11
11
|
from crawlo.utils.log import get_logger, LoggerManager
|
|
12
12
|
|
|
13
13
|
def test_framework_logger():
|
tests/test_framework_startup.py
CHANGED
|
@@ -24,7 +24,7 @@ def test_framework_startup():
|
|
|
24
24
|
}
|
|
25
25
|
|
|
26
26
|
# 初始化框架
|
|
27
|
-
from crawlo.
|
|
27
|
+
from crawlo.initialization import initialize_framework
|
|
28
28
|
settings = initialize_framework(test_settings)
|
|
29
29
|
|
|
30
30
|
print(f"设置初始化完成: {settings.get('PROJECT_NAME')}")
|