crawlo 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (45)
  1. crawlo/__init__.py +9 -4
  2. crawlo/__version__.py +1 -1
  3. crawlo/core/__init__.py +8 -2
  4. crawlo/core/scheduler.py +2 -2
  5. crawlo/downloader/aiohttp_downloader.py +7 -2
  6. crawlo/extension/log_interval.py +44 -7
  7. crawlo/initialization/__init__.py +6 -2
  8. crawlo/middleware/middleware_manager.py +1 -1
  9. crawlo/mode_manager.py +13 -7
  10. crawlo/pipelines/bloom_dedup_pipeline.py +5 -15
  11. crawlo/pipelines/database_dedup_pipeline.py +5 -8
  12. crawlo/pipelines/memory_dedup_pipeline.py +5 -15
  13. crawlo/pipelines/redis_dedup_pipeline.py +2 -15
  14. crawlo/project.py +18 -7
  15. crawlo/settings/default_settings.py +114 -150
  16. crawlo/settings/setting_manager.py +14 -9
  17. crawlo/tools/distributed_coordinator.py +4 -8
  18. crawlo/utils/fingerprint.py +123 -0
  19. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/METADATA +1 -1
  20. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/RECORD +45 -29
  21. examples/test_project/__init__.py +7 -0
  22. examples/test_project/run.py +35 -0
  23. examples/test_project/test_project/__init__.py +4 -0
  24. examples/test_project/test_project/items.py +18 -0
  25. examples/test_project/test_project/middlewares.py +119 -0
  26. examples/test_project/test_project/pipelines.py +97 -0
  27. examples/test_project/test_project/settings.py +170 -0
  28. examples/test_project/test_project/spiders/__init__.py +10 -0
  29. examples/test_project/test_project/spiders/of_week_dis.py +144 -0
  30. tests/debug_framework_logger.py +1 -1
  31. tests/debug_log_levels.py +1 -1
  32. tests/test_all_pipeline_fingerprints.py +134 -0
  33. tests/test_default_header_middleware.py +242 -87
  34. tests/test_fingerprint_consistency.py +136 -0
  35. tests/test_fingerprint_simple.py +52 -0
  36. tests/test_framework_logger.py +1 -1
  37. tests/test_framework_startup.py +1 -1
  38. tests/test_hash_performance.py +100 -0
  39. tests/test_mode_change.py +1 -1
  40. tests/test_offsite_middleware.py +185 -162
  41. tests/test_offsite_middleware_simple.py +204 -0
  42. tests/test_pipeline_fingerprint_consistency.py +87 -0
  43. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/WHEEL +0 -0
  44. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/entry_points.txt +0 -0
  45. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/top_level.txt +0 -0
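
The headline change in this release is the new crawlo/utils/fingerprint.py (+123 lines): a shared FingerprintGenerator that the dedup pipelines, the DeduplicationTool, and the new tests below all call, so every component derives the same fingerprint from the same data. The diff does not show the module itself; the following is a minimal sketch of such a generator, inferred only from the API the tests exercise (item_fingerprint, data_fingerprint, request_fingerprint). The canonical-JSON serialization and SHA-256 choice are assumptions, not crawlo's confirmed implementation.

```python
# Hypothetical sketch of a centralized fingerprint generator, inferred from
# the calls exercised in the new tests; crawlo's real implementation may
# differ in serialization details and hash choice.
import hashlib
import json


class FingerprintGenerator:
    @staticmethod
    def data_fingerprint(data) -> str:
        # Canonical JSON (sorted keys, stable separators) so that logically
        # equal dicts hash identically regardless of insertion order.
        canonical = json.dumps(data, sort_keys=True, separators=(',', ':'), default=str)
        return hashlib.sha256(canonical.encode('utf-8')).hexdigest()

    @classmethod
    def item_fingerprint(cls, item) -> str:
        # Items are reduced to a plain dict first; private attributes skipped.
        data = item.to_dict() if hasattr(item, 'to_dict') else vars(item)
        return cls.data_fingerprint(data)

    @staticmethod
    def request_fingerprint(method: str, url: str, body: bytes, headers: dict) -> str:
        # Method is normalized and headers are sorted so that header order
        # cannot change the fingerprint.
        h = hashlib.sha256()
        h.update(method.upper().encode('utf-8'))
        h.update(url.encode('utf-8'))
        h.update(body or b'')
        for key in sorted(headers):
            h.update(f'{key.lower()}:{headers[key]}'.encode('utf-8'))
        return h.hexdigest()
```

Sorting keys and headers makes the digest independent of insertion order, which is exactly what the stability tests added in this release assert.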
tests/test_default_header_middleware.py
@@ -2,15 +2,15 @@
 # -*- coding:UTF-8 -*-
 """
 DefaultHeaderMiddleware test file
-Tests the default request header middleware functionality
+Tests the default request header middleware functionality, including random header rotation
 """

 import unittest
 from unittest.mock import Mock, patch

 from crawlo.middleware.default_header import DefaultHeaderMiddleware
-from crawlo.exceptions import NotConfiguredError
 from crawlo.settings.setting_manager import SettingManager
+from crawlo.exceptions import NotConfiguredError


 class MockLogger:
@@ -32,6 +32,9 @@ class MockLogger:
     def error(self, msg):
         self.logs.append(('error', msg))

+    def isEnabledFor(self, level):
+        return True
+

 class TestDefaultHeaderMiddleware(unittest.TestCase):
     """DefaultHeaderMiddleware test class"""
@@ -40,119 +43,271 @@ class TestDefaultHeaderMiddleware(unittest.TestCase):
         """Set-up before each test"""
         # Create the settings manager
         self.settings = SettingManager()
+
+    def test_middleware_initialization_without_config(self):
+        """Middleware initialization without configuration"""
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings

-        # Create a mock crawler object
-        self.crawler = Mock()
-        self.crawler.settings = self.settings
-
-    @patch('crawlo.utils.log.get_logger')
-    def test_middleware_initialization_without_config(self, mock_get_logger):
-        """Middleware initialization without configuration (defaults cleared)"""
-        # Clear the default request header configuration
-        self.settings.set('DEFAULT_REQUEST_HEADERS', {})
-        self.settings.set('USER_AGENT', None)
-        self.settings.set('USER_AGENTS', [])
-        self.settings.set('RANDOM_HEADERS', {})
-        self.settings.set('LOG_LEVEL', 'INFO')
-
-        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')
-
-        # Should raise NotConfiguredError
-        with self.assertRaises(NotConfiguredError):
-            DefaultHeaderMiddleware.create_instance(self.crawler)
-
-    @patch('crawlo.utils.log.get_logger')
-    def test_middleware_initialization_with_default_headers(self, mock_get_logger):
-        """Middleware initialization with default request headers configured"""
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # Should raise NotConfiguredError
+            with self.assertRaises(NotConfiguredError) as context:
+                DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIn("未配置DEFAULT_REQUEST_HEADERS、USER_AGENT或随机头部配置,DefaultHeaderMiddleware已禁用", str(context.exception))
+
+    def test_middleware_initialization_with_default_headers(self):
+        """Middleware initialization with default request headers configured"""
         # Set default request headers
         self.settings.set('DEFAULT_REQUEST_HEADERS', {
-            'User-Agent': 'Test-Agent',
-            'Accept': 'text/html'
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
         })
-        self.settings.set('LOG_LEVEL', 'INFO')
+        self.settings.set('LOG_LEVEL', 'DEBUG')

-        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings

-        # Instance should be created normally
-        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
-        self.assertIsInstance(middleware, DefaultHeaderMiddleware)
-        self.assertIn('User-Agent', middleware.headers)
-        self.assertIn('Accept', middleware.headers)
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # Instance should be created normally
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIsInstance(middleware, DefaultHeaderMiddleware)
+            self.assertEqual(len(middleware.headers), 3)
+            self.assertIn('Accept', middleware.headers)
+            self.assertIn('Accept-Language', middleware.headers)
+            self.assertIn('Accept-Encoding', middleware.headers)

-    @patch('crawlo.utils.log.get_logger')
-    def test_middleware_initialization_with_user_agent(self, mock_get_logger):
-        """Middleware initialization with a User-Agent configured"""
-        # Clear the default request header configuration
-        self.settings.set('DEFAULT_REQUEST_HEADERS', {})
+    def test_middleware_initialization_with_user_agent(self):
+        """Middleware initialization with a User-Agent configured"""
         # Set the User-Agent
-        self.settings.set('USER_AGENT', 'Custom-Agent')
-        self.settings.set('LOG_LEVEL', 'INFO')
+        self.settings.set('USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
+        self.settings.set('LOG_LEVEL', 'DEBUG')

-        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings

-        # Instance should be created normally
-        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
-        self.assertIsInstance(middleware, DefaultHeaderMiddleware)
-        self.assertIn('User-Agent', middleware.headers)
-        self.assertEqual(middleware.headers['User-Agent'], 'Custom-Agent')
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # Instance should be created normally
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIsInstance(middleware, DefaultHeaderMiddleware)
+            self.assertIn('User-Agent', middleware.headers)
+            self.assertEqual(middleware.headers['User-Agent'], 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

-    @patch('crawlo.utils.log.get_logger')
-    def test_process_request_with_default_headers(self, mock_get_logger):
+    def test_middleware_initialization_with_random_user_agent_enabled(self):
+        """Middleware initialization with random User-Agent enabled"""
+        # Enable random User-Agent and provide one User-Agent
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', ['Test-Agent/1.0'])  # provide one User-Agent to pass the initialization check
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # Instance should be created normally, using the built-in User-Agent list
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIsInstance(middleware, DefaultHeaderMiddleware)
+            self.assertTrue(middleware.random_user_agent_enabled)
+            # Note: user_agents is overridden by get_user_agents here, so its length may not be 1
+
+    def test_middleware_initialization_with_custom_user_agents(self):
+        """Middleware initialization with a custom User-Agent list"""
+        # Set a custom User-Agent list
+        custom_user_agents = [
+            'Custom-Agent/1.0',
+            'Custom-Agent/2.0',
+            'Custom-Agent/3.0'
+        ]
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', custom_user_agents)
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # Instance should be created normally, using the custom User-Agent list
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIsInstance(middleware, DefaultHeaderMiddleware)
+            self.assertTrue(middleware.random_user_agent_enabled)
+            self.assertEqual(middleware.user_agents, custom_user_agents)
+
+    def test_process_request_with_default_headers(self):
         """Adding default request headers when processing a request"""
         # Set default request headers
         self.settings.set('DEFAULT_REQUEST_HEADERS', {
-            'User-Agent': 'Test-Agent',
-            'Accept': 'text/html'
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
         })
         self.settings.set('LOG_LEVEL', 'DEBUG')

-        mock_logger = MockLogger('DefaultHeaderMiddleware')
-        mock_get_logger.return_value = mock_logger
-
-        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
-
-        # Create a request object
-        request = Mock()
-        request.headers = {}
-        request.url = 'http://example.com'
-
-        # Process the request
-        middleware.process_request(request, Mock())
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings

-        # Verify the request headers were added
-        self.assertIn('User-Agent', request.headers)
-        self.assertEqual(request.headers['User-Agent'], 'Test-Agent')
-        self.assertIn('Accept', request.headers)
-        self.assertEqual(request.headers['Accept'], 'text/html')
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Create a request
+            request = Mock()
+            request.headers = {}
+            request.url = 'https://example.com'
+
+            spider = Mock()
+
+            # Process the request
+            middleware.process_request(request, spider)
+
+            # Check that the default request headers were added
+            self.assertIn('Accept', request.headers)
+            self.assertIn('Accept-Language', request.headers)
+            self.assertEqual(request.headers['Accept'], 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
+            self.assertEqual(request.headers['Accept-Language'], 'en-US,en;q=0.5')

-    @patch('crawlo.utils.log.get_logger')
-    def test_process_request_without_overwriting_existing_headers(self, mock_get_logger):
-        """Existing request headers are not overwritten"""
+    def test_process_request_with_existing_headers(self):
+        """Processing a request that already has headers"""
         # Set default request headers
         self.settings.set('DEFAULT_REQUEST_HEADERS', {
-            'User-Agent': 'Test-Agent',
-            'Accept': 'text/html'
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
        })
         self.settings.set('LOG_LEVEL', 'DEBUG')

-        mock_logger = MockLogger('DefaultHeaderMiddleware')
-        mock_get_logger.return_value = mock_logger
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Create a request that already has headers
+            request = Mock()
+            request.headers = {
+                'Accept': 'application/json',  # pre-existing header
+            }
+            request.url = 'https://example.com'
+
+            spider = Mock()
+
+            # Process the request
+            middleware.process_request(request, spider)
+
+            # Existing headers must not be overwritten; new headers are added
+            self.assertEqual(request.headers['Accept'], 'application/json')  # original value kept
+            self.assertIn('Accept-Language', request.headers)  # newly added header
+
+    def test_process_request_with_random_user_agent(self):
+        """Adding a random User-Agent when processing a request"""
+        # Enable random User-Agent with a custom list
+        custom_user_agents = [
+            'Custom-Agent/1.0',
+            'Custom-Agent/2.0',
+            'Custom-Agent/3.0'
+        ]
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', custom_user_agents)
+        self.settings.set('LOG_LEVEL', 'DEBUG')

-        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings

-        # Create a request object that already contains a User-Agent
-        request = Mock()
-        request.headers = {'User-Agent': 'Existing-Agent'}
-        request.url = 'http://example.com'
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Create a request without a User-Agent
+            request = Mock()
+            request.headers = {}
+            request.url = 'https://example.com'
+
+            spider = Mock()
+
+            # Process the request
+            middleware.process_request(request, spider)
+
+            # Check that a random User-Agent was added
+            self.assertIn('User-Agent', request.headers)
+            self.assertIn(request.headers['User-Agent'], custom_user_agents)
+
+    def test_process_request_with_existing_user_agent(self):
+        """Processing a request that already has a User-Agent"""
+        # Enable random User-Agent with a custom list
+        custom_user_agents = [
+            'Custom-Agent/1.0',
+            'Custom-Agent/2.0',
+            'Custom-Agent/3.0'
+        ]
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', custom_user_agents)
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Create a request that already has a User-Agent
+            existing_ua = 'Existing-Agent/1.0'
+            request = Mock()
+            request.headers = {
+                'User-Agent': existing_ua,
+            }
+            request.url = 'https://example.com'
+
+            spider = Mock()
+
+            # Process the request
+            middleware.process_request(request, spider)
+
+            # Check that the existing User-Agent is not overwritten
+            self.assertEqual(request.headers['User-Agent'], existing_ua)
+
+    def test_get_random_user_agent(self):
+        """Fetching a random User-Agent"""
+        # Set a custom User-Agent list
+        custom_user_agents = [
+            'Custom-Agent/1.0',
+            'Custom-Agent/2.0',
+            'Custom-Agent/3.0'
+        ]
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', custom_user_agents)
+        self.settings.set('LOG_LEVEL', 'DEBUG')

-        # Process the request
-        middleware.process_request(request, Mock())
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings

-        # Verify the existing request header was not overwritten
-        self.assertEqual(request.headers['User-Agent'], 'Existing-Agent')
-        # Verify other request headers were added
-        self.assertIn('Accept', request.headers)
-        self.assertEqual(request.headers['Accept'], 'text/html')
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Get a random User-Agent
+            random_ua = middleware._get_random_user_agent()
+
+            # Check that the returned User-Agent is in the list
+            self.assertIn(random_ua, custom_user_agents)


 if __name__ == '__main__':
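
A notable mechanical change in the rewrite above: the old tests patched crawlo.utils.log.get_logger, while the new ones patch crawlo.middleware.default_header.get_logger. unittest.mock.patch must target the name where it is looked up, not where it is defined; since the middleware binds get_logger into its own namespace at import time, patching the defining module leaves the already-bound reference untouched. A self-contained illustration of that rule (the lib and client module names are invented for the demo, standing in for crawlo.utils.log and crawlo.middleware.default_header):

```python
import sys
import types
from unittest.mock import patch

# Stand-in for crawlo.utils.log: defines get_logger.
lib = types.ModuleType('lib')
lib.get_logger = lambda name: f'real logger: {name}'
sys.modules['lib'] = lib

# Stand-in for crawlo.middleware.default_header: copies get_logger into
# its own namespace at import time, like "from crawlo.utils.log import get_logger".
client = types.ModuleType('client')
exec('from lib import get_logger\n'
     'def make():\n'
     '    return get_logger("mw")\n', client.__dict__)
sys.modules['client'] = client

# Patching the defining module misses the copy client already holds.
with patch('lib.get_logger', return_value='patched'):
    print(client.make())   # -> real logger: mw

# Patching the use site intercepts the call, as the new tests do.
with patch('client.get_logger', return_value='patched'):
    print(client.make())   # -> patched
```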
tests/test_fingerprint_consistency.py (new file)
@@ -0,0 +1,136 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Fingerprint consistency tests
+==============
+Verify that the framework's components generate identical fingerprints for the same data
+"""
+
+import unittest
+from unittest.mock import Mock
+
+from crawlo import Item
+from crawlo.pipelines.memory_dedup_pipeline import MemoryDedupPipeline
+from crawlo.pipelines.redis_dedup_pipeline import RedisDedupPipeline
+from crawlo.pipelines.bloom_dedup_pipeline import BloomDedupPipeline
+from crawlo.pipelines.database_dedup_pipeline import DatabaseDedupPipeline
+from crawlo.tools.distributed_coordinator import DeduplicationTool
+from crawlo.utils.fingerprint import FingerprintGenerator
+
+
+class TestItem(Item):
+    """Item class used for testing"""
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    def to_dict(self):
+        """Convert to a dict"""
+        return {k: v for k, v in self.__dict__.items() if not k.startswith('_')}
+
+
+class FingerprintConsistencyTest(unittest.TestCase):
+    """Fingerprint consistency tests"""
+
+    def setUp(self):
+        """Test initialization"""
+        # Create a test item
+        self.test_item = TestItem(
+            title="Test Title",
+            url="https://example.com",
+            content="Test content",
+            price=99.99
+        )
+
+        # Create an instance of each dedup pipeline
+        self.memory_pipeline = MemoryDedupPipeline()
+        self.redis_pipeline = RedisDedupPipeline(
+            redis_host='localhost',
+            redis_port=6379,
+            redis_db=0,
+            redis_key='test:fingerprints'
+        )
+        self.bloom_pipeline = BloomDedupPipeline()
+        self.database_pipeline = DatabaseDedupPipeline()
+
+        # Create a dedup tool instance
+        self.dedup_tool = DeduplicationTool()
+
+    def test_item_fingerprint_consistency(self):
+        """Item fingerprints are consistent across pipelines"""
+        # Generate a fingerprint with each pipeline
+        memory_fingerprint = self.memory_pipeline._generate_item_fingerprint(self.test_item)
+        redis_fingerprint = self.redis_pipeline._generate_item_fingerprint(self.test_item)
+        bloom_fingerprint = self.bloom_pipeline._generate_item_fingerprint(self.test_item)
+        database_fingerprint = self.database_pipeline._generate_item_fingerprint(self.test_item)
+
+        # Generate a fingerprint directly with the fingerprint generator
+        direct_fingerprint = FingerprintGenerator.item_fingerprint(self.test_item)
+
+        # Verify all fingerprints match
+        self.assertEqual(memory_fingerprint, redis_fingerprint)
+        self.assertEqual(memory_fingerprint, bloom_fingerprint)
+        self.assertEqual(memory_fingerprint, database_fingerprint)
+        self.assertEqual(memory_fingerprint, direct_fingerprint)
+
+        print(f"Memory Pipeline Fingerprint: {memory_fingerprint}")
+        print(f"Redis Pipeline Fingerprint: {redis_fingerprint}")
+        print(f"Bloom Pipeline Fingerprint: {bloom_fingerprint}")
+        print(f"Database Pipeline Fingerprint: {database_fingerprint}")
+        print(f"Direct Fingerprint: {direct_fingerprint}")
+
+    def test_data_fingerprint_consistency(self):
+        """Generic data fingerprints are consistent"""
+        # Test with dict data
+        test_data = {
+            "name": "test",
+            "value": 123,
+            "nested": {
+                "inner": "value"
+            }
+        }
+
+        # Generate a fingerprint with the dedup tool
+        tool_fingerprint = self.dedup_tool.generate_fingerprint(test_data)
+
+        # Generate a fingerprint with the fingerprint generator
+        generator_fingerprint = FingerprintGenerator.data_fingerprint(test_data)
+
+        # Verify the fingerprints match
+        self.assertEqual(tool_fingerprint, generator_fingerprint)
+
+        print(f"DeduplicationTool Fingerprint: {tool_fingerprint}")
+        print(f"FingerprintGenerator Fingerprint: {generator_fingerprint}")
+
+    def test_fingerprint_stability(self):
+        """Fingerprints are stable"""
+        # Create identical test items multiple times
+        item1 = TestItem(
+            title="Test Title",
+            url="https://example.com",
+            content="Test content",
+            price=99.99
+        )
+
+        item2 = TestItem(
+            title="Test Title",
+            url="https://example.com",
+            content="Test content",
+            price=99.99
+        )
+
+        # Generate fingerprints
+        fingerprint1 = FingerprintGenerator.item_fingerprint(item1)
+        fingerprint2 = FingerprintGenerator.item_fingerprint(item2)
+
+        # Verify identical data yields identical fingerprints
+        self.assertEqual(fingerprint1, fingerprint2)
+
+        print(f"First fingerprint: {fingerprint1}")
+        print(f"Second fingerprint: {fingerprint2}")
+
+
+if __name__ == '__main__':
+    unittest.main()
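
Each dedup pipeline in this release shrinks (roughly 5 lines added, 8 to 15 removed per pipeline in the file list), and the test above calls the same _generate_item_fingerprint on all four backends. Together these suggest the pipelines now delegate to the shared generator instead of hashing items themselves. A hedged sketch of that delegation pattern follows; the class body is illustrative, not crawlo's actual pipeline code:

```python
from crawlo.utils.fingerprint import FingerprintGenerator


class MemoryDedupPipelineSketch:
    """Illustrative only: shows the delegation shape, not crawlo's real class."""

    def __init__(self):
        self.seen = set()  # fingerprints observed in this process

    def _generate_item_fingerprint(self, item) -> str:
        # Single source of truth: memory, Redis, Bloom, and database backends
        # all hash through the same generator, which is what
        # test_item_fingerprint_consistency asserts.
        return FingerprintGenerator.item_fingerprint(item)

    def process_item(self, item, spider):
        fp = self._generate_item_fingerprint(item)
        if fp in self.seen:
            return None  # duplicate; the real pipeline drops it per framework convention
        self.seen.add(fp)
        return item
```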
tests/test_fingerprint_simple.py (new file)
@@ -0,0 +1,52 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Simplified fingerprint consistency test
+==============
+Verify that the framework's components generate identical fingerprints for the same data
+"""
+
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.fingerprint import FingerprintGenerator
+
+
+def test_fingerprint_consistency():
+    """Test fingerprint consistency"""
+    # Test data
+    test_data = {
+        "title": "Test Title",
+        "url": "https://example.com",
+        "content": "Test content",
+        "price": 99.99
+    }
+
+    # Generate fingerprints with the fingerprint generator
+    fingerprint1 = FingerprintGenerator.data_fingerprint(test_data)
+    fingerprint2 = FingerprintGenerator.data_fingerprint(test_data)
+
+    # Verify identical data yields identical fingerprints
+    print(f"First fingerprint: {fingerprint1}")
+    print(f"Second fingerprint: {fingerprint2}")
+    print(f"Fingerprints match: {fingerprint1 == fingerprint2}")
+
+    # Test request fingerprints
+    method = "GET"
+    url = "https://example.com"
+    body = b""
+    headers = {"User-Agent": "test-agent"}
+
+    request_fingerprint1 = FingerprintGenerator.request_fingerprint(method, url, body, headers)
+    request_fingerprint2 = FingerprintGenerator.request_fingerprint(method, url, body, headers)
+
+    print(f"\nRequest fingerprint 1: {request_fingerprint1}")
+    print(f"Request fingerprint 2: {request_fingerprint2}")
+    print(f"Request fingerprints match: {request_fingerprint1 == request_fingerprint2}")
+
+
+if __name__ == '__main__':
+    test_fingerprint_consistency()
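
This simplified script repeats the stability checks of the unittest version but imports only FingerprintGenerator, so it can be run directly (it prepends the project root to sys.path itself) without the Redis, Bloom, or database backends that the full consistency suite pulls in.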
tests/test_framework_logger.py
@@ -7,7 +7,7 @@ import sys
 import os
 sys.path.insert(0, '/')

-from crawlo.core.framework_initializer import initialize_framework, get_framework_initializer
+from crawlo.initialization import initialize_framework, get_framework_initializer
 from crawlo.utils.log import get_logger, LoggerManager

 def test_framework_logger():
tests/test_framework_startup.py
@@ -24,7 +24,7 @@ def test_framework_startup():
     }

     # Initialize the framework
-    from crawlo.core.framework_initializer import initialize_framework
+    from crawlo.initialization import initialize_framework
     settings = initialize_framework(test_settings)

     print(f"Settings initialized: {settings.get('PROJECT_NAME')}")