crawlo 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to a supported registry. The information is provided for informational purposes only.

Potentially problematic release: this version of crawlo might be problematic.
Files changed (45)
  1. crawlo/__init__.py +9 -4
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/run.py +1 -1
  4. crawlo/core/__init__.py +8 -2
  5. crawlo/core/scheduler.py +2 -2
  6. crawlo/extension/log_interval.py +44 -7
  7. crawlo/initialization/__init__.py +6 -2
  8. crawlo/middleware/middleware_manager.py +1 -1
  9. crawlo/mode_manager.py +13 -7
  10. crawlo/pipelines/bloom_dedup_pipeline.py +5 -15
  11. crawlo/pipelines/database_dedup_pipeline.py +5 -8
  12. crawlo/pipelines/memory_dedup_pipeline.py +5 -15
  13. crawlo/pipelines/redis_dedup_pipeline.py +2 -15
  14. crawlo/project.py +18 -7
  15. crawlo/settings/default_settings.py +114 -150
  16. crawlo/settings/setting_manager.py +14 -9
  17. crawlo/tools/distributed_coordinator.py +4 -8
  18. crawlo/utils/fingerprint.py +123 -0
  19. {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/METADATA +1 -1
  20. {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/RECORD +45 -29
  21. examples/test_project/__init__.py +7 -0
  22. examples/test_project/run.py +35 -0
  23. examples/test_project/test_project/__init__.py +4 -0
  24. examples/test_project/test_project/items.py +18 -0
  25. examples/test_project/test_project/middlewares.py +119 -0
  26. examples/test_project/test_project/pipelines.py +97 -0
  27. examples/test_project/test_project/settings.py +170 -0
  28. examples/test_project/test_project/spiders/__init__.py +10 -0
  29. examples/test_project/test_project/spiders/of_week_dis.py +144 -0
  30. tests/debug_framework_logger.py +1 -1
  31. tests/debug_log_levels.py +1 -1
  32. tests/test_all_pipeline_fingerprints.py +134 -0
  33. tests/test_default_header_middleware.py +242 -87
  34. tests/test_fingerprint_consistency.py +136 -0
  35. tests/test_fingerprint_simple.py +52 -0
  36. tests/test_framework_logger.py +1 -1
  37. tests/test_framework_startup.py +1 -1
  38. tests/test_hash_performance.py +100 -0
  39. tests/test_mode_change.py +1 -1
  40. tests/test_offsite_middleware.py +185 -162
  41. tests/test_offsite_middleware_simple.py +204 -0
  42. tests/test_pipeline_fingerprint_consistency.py +87 -0
  43. {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/WHEEL +0 -0
  44. {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/entry_points.txt +0 -0
  45. {crawlo-1.4.0.dist-info → crawlo-1.4.1.dist-info}/top_level.txt +0 -0
tests/test_hash_performance.py ADDED
@@ -0,0 +1,100 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ SHA256 vs MD5 performance comparison test
+ =====================
+ Measures the performance difference between the two hash algorithms in a crawler scenario
+ """
+
+ import hashlib
+ import time
+ from collections import namedtuple
+
+
+ # Create test data
+ TestItem = namedtuple('TestItem', ['title', 'url', 'content', 'price', 'tags'])
+
+ def create_test_items(count=10000):
+     """Create test data items"""
+     items = []
+     for i in range(count):
+         item = TestItem(
+             title=f"Test Title {i}",
+             url=f"https://example.com/page/{i}",
+             content=f"This is test content number {i} with some additional text to make it longer",
+             price=99.99 + i,
+             tags=[f"tag{j}" for j in range(5)]
+         )
+         items.append(item)
+     return items
+
+
+ def md5_fingerprint(data):
+     """Generate a fingerprint using MD5"""
+     if hasattr(data, '_asdict'):
+         data_dict = data._asdict()
+     else:
+         data_dict = {'__data__': str(data)}
+
+     sorted_items = sorted(data_dict.items())
+     fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
+     return hashlib.md5(fingerprint_string.encode('utf-8')).hexdigest()
+
+
+ def sha256_fingerprint(data):
+     """Generate a fingerprint using SHA256"""
+     if hasattr(data, '_asdict'):
+         data_dict = data._asdict()
+     else:
+         data_dict = {'__data__': str(data)}
+
+     sorted_items = sorted(data_dict.items())
+     fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
+     return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+
+
+ def performance_test():
+     """Performance test"""
+     print("Starting hash algorithm performance test...")
+     print("=" * 50)
+
+     # Create test data
+     test_items = create_test_items(10000)
+
+     # Time MD5
+     start_time = time.time()
+     md5_results = []
+     for item in test_items:
+         fingerprint = md5_fingerprint(item)
+         md5_results.append(fingerprint)
+     md5_time = time.time() - start_time
+
+     # Time SHA256
+     start_time = time.time()
+     sha256_results = []
+     for item in test_items:
+         fingerprint = sha256_fingerprint(item)
+         sha256_results.append(fingerprint)
+     sha256_time = time.time() - start_time
+
+     # Print the results
+     print(f"Test data size: {len(test_items)} items")
+     print(f"MD5 time: {md5_time:.4f} s")
+     print(f"SHA256 time: {sha256_time:.4f} s")
+     print(f"Performance difference: {((sha256_time - md5_time) / md5_time * 100):.2f}%")
+
+     # Verify result consistency
+     print("\nFingerprint lengths:")
+     print(f"MD5 fingerprint length: {len(md5_results[0])} characters")
+     print(f"SHA256 fingerprint length: {len(sha256_results[0])} characters")
+
+     # Check for duplicate fingerprints (in theory there should be none)
+     md5_unique = len(set(md5_results))
+     sha256_unique = len(set(sha256_results))
+     print(f"\nUnique fingerprint counts:")
+     print(f"MD5: {md5_unique}/{len(test_items)} ({md5_unique/len(test_items)*100:.2f}%)")
+     print(f"SHA256: {sha256_unique}/{len(test_items)} ({sha256_unique/len(test_items)*100:.2f}%)")
+
+
+ if __name__ == '__main__':
+     performance_test()
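
The benchmark above times a single pass with time.time(), which is sensitive to scheduler noise. For a steadier comparison of the two digests, the standard-library timeit module can repeat the hash call many times; the sketch below is standalone and assumes nothing from crawlo itself (the payload string is illustrative):

    # Standalone sketch: MD5 vs SHA256 over one pre-built fingerprint string,
    # repeated 100k times with timeit for more stable numbers.
    import hashlib
    import timeit

    payload = "price=100.99|title=Test Title 1|url=https://example.com/page/1".encode("utf-8")

    md5_t = timeit.timeit(lambda: hashlib.md5(payload).hexdigest(), number=100_000)
    sha_t = timeit.timeit(lambda: hashlib.sha256(payload).hexdigest(), number=100_000)
    print(f"MD5:    {md5_t:.4f} s for 100k hashes")
    print(f"SHA256: {sha_t:.4f} s for 100k hashes")
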
tests/test_mode_change.py CHANGED
@@ -25,7 +25,7 @@ def test_mode_log_level():
 
      try:
          # Initialize the framework
-         from crawlo.core.framework_initializer import initialize_framework
+         from crawlo.initialization import initialize_framework
          settings = initialize_framework(test_settings)
 
          print(f"Settings initialized: {settings.get('PROJECT_NAME')}")
tests/test_offsite_middleware.py CHANGED
@@ -2,16 +2,27 @@
  # -*- coding:UTF-8 -*-
  """
  OffsiteMiddleware test file
- Tests the off-site request filtering middleware
+ Tests the site filtering middleware, in particular the multi-domain case
  """
 
- import asyncio
  import unittest
  from unittest.mock import Mock, patch
 
  from crawlo.middleware.offsite import OffsiteMiddleware
- from crawlo.exceptions import NotConfiguredError, IgnoreRequestError
  from crawlo.settings.setting_manager import SettingManager
+ from crawlo.exceptions import IgnoreRequestError, NotConfiguredError
+
+
+ class MockStats:
+     """Mock Stats class for recording statistics in tests"""
+     def __init__(self):
+         self.stats = {}
+
+     def inc_value(self, key, value=1):
+         if key in self.stats:
+             self.stats[key] += value
+         else:
+             self.stats[key] = value
 
 
  class MockLogger:
@@ -34,18 +45,6 @@ class MockLogger:
          self.logs.append(('error', msg))
 
 
- class MockStats:
-     """Mock Stats class for recording statistics in tests"""
-     def __init__(self):
-         self.stats = {}
-
-     def inc_value(self, key, value=1):
-         if key in self.stats:
-             self.stats[key] += value
-         else:
-             self.stats[key] = value
-
-
  class TestOffsiteMiddleware(unittest.TestCase):
      """OffsiteMiddleware test class"""
 
@@ -58,165 +57,189 @@ class TestOffsiteMiddleware(unittest.TestCase):
          self.crawler = Mock()
          self.crawler.settings = self.settings
          self.crawler.stats = MockStats()
-         self.crawler.logger = Mock()
 
-     @patch('crawlo.utils.log.get_logger')
-     def test_middleware_initialization_without_allowed_domains(self, mock_get_logger):
-         """Test middleware initialization when no allowed domains are configured"""
-         mock_get_logger.return_value = MockLogger('OffsiteMiddleware')
-
-         # Should raise NotConfiguredError
-         with self.assertRaises(NotConfiguredError):
-             OffsiteMiddleware.create_instance(self.crawler)
-
-     @patch('crawlo.utils.log.get_logger')
-     def test_middleware_initialization_with_allowed_domains(self, mock_get_logger):
-         """Test middleware initialization with allowed domains configured"""
-         # Configure the allowed domains
-         self.settings.set('ALLOWED_DOMAINS', ['example.com', 'test.com'])
-         self.settings.set('LOG_LEVEL', 'INFO')
-
-         mock_get_logger.return_value = MockLogger('OffsiteMiddleware')
-
-         # The instance should be created normally
-         middleware = OffsiteMiddleware.create_instance(self.crawler)
-         self.assertIsInstance(middleware, OffsiteMiddleware)
-         self.assertEqual(len(middleware.allowed_domains), 2)
-         self.assertIn('example.com', middleware.allowed_domains)
-         self.assertIn('test.com', middleware.allowed_domains)
-
-     def test_is_offsite_request_with_valid_domain(self):
-         """Test offsite detection for an allowed domain"""
-         # Configure the allowed domains
-         self.settings.set('ALLOWED_DOMAINS', ['example.com'])
-         self.settings.set('LOG_LEVEL', 'INFO')
-
-         # Create the middleware instance
-         middleware = OffsiteMiddleware(
-             stats=MockStats(),
-             log_level='INFO',
-             allowed_domains=['example.com']
-         )
-         middleware._compile_domains()
-
-         # Create a request object
-         request = Mock()
-         request.url = 'http://example.com/page'
-
-         # Should not be an offsite request
-         self.assertFalse(middleware._is_offsite_request(request))
-
-     def test_is_offsite_request_with_subdomain(self):
-         """Test offsite detection for a subdomain"""
-         # Configure the allowed domains
-         self.settings.set('ALLOWED_DOMAINS', ['example.com'])
-         self.settings.set('LOG_LEVEL', 'INFO')
-
-         # Create the middleware instance
-         middleware = OffsiteMiddleware(
-             stats=MockStats(),
-             log_level='INFO',
-             allowed_domains=['example.com']
-         )
-         middleware._compile_domains()
-
-         # Create a request object (subdomain)
-         request = Mock()
-         request.url = 'http://sub.example.com/page'
-
-         # Should not be offsite (subdomains are allowed)
-         self.assertFalse(middleware._is_offsite_request(request))
-
-     def test_is_offsite_request_with_invalid_domain(self):
-         """Test offsite detection for a disallowed domain"""
-         # Configure the allowed domains
-         self.settings.set('ALLOWED_DOMAINS', ['example.com'])
-         self.settings.set('LOG_LEVEL', 'INFO')
-
-         # Create the middleware instance
-         middleware = OffsiteMiddleware(
-             stats=MockStats(),
-             log_level='INFO',
-             allowed_domains=['example.com']
-         )
-         middleware._compile_domains()
-
-         # Create a request object
-         request = Mock()
-         request.url = 'http://other.com/page'
-
-         # Should be an offsite request
-         self.assertTrue(middleware._is_offsite_request(request))
-
-     def test_is_offsite_request_with_invalid_url(self):
-         """Test offsite detection for an invalid URL"""
-         # Configure the allowed domains
-         self.settings.set('ALLOWED_DOMAINS', ['example.com'])
-         self.settings.set('LOG_LEVEL', 'INFO')
-
-         # Create the middleware instance
-         middleware = OffsiteMiddleware(
-             stats=MockStats(),
-             log_level='INFO',
-             allowed_domains=['example.com']
-         )
-         middleware._compile_domains()
-
-         # Create a request object (invalid URL)
-         request = Mock()
-         request.url = 'invalid-url'
-
-         # Should be an offsite request
-         self.assertTrue(middleware._is_offsite_request(request))
-
-     @patch('crawlo.utils.log.get_logger')
-     def test_process_request_with_offsite_request(self, mock_get_logger):
-         """Test processing an offsite request"""
-         # Configure the allowed domains
-         self.settings.set('ALLOWED_DOMAINS', ['example.com'])
+     def test_middleware_initialization_without_domains(self):
+         """Test middleware initialization when ALLOWED_DOMAINS is not set"""
+         # Do not set ALLOWED_DOMAINS
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             # Should raise NotConfiguredError
+             with self.assertRaises(NotConfiguredError) as context:
+                 OffsiteMiddleware.create_instance(self.crawler)
+
+             self.assertIn("未配置ALLOWED_DOMAINS,OffsiteMiddleware已禁用", str(context.exception))
+
+     def test_middleware_initialization_with_global_domains(self):
+         """Test middleware initialization with the global ALLOWED_DOMAINS setting"""
+         # Set the global ALLOWED_DOMAINS
+         self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
          self.settings.set('LOG_LEVEL', 'DEBUG')
 
-         mock_logger = MockLogger('OffsiteMiddleware')
-         mock_get_logger.return_value = mock_logger
-
-         # Create the middleware instance
-         middleware = OffsiteMiddleware.create_instance(self.crawler)
-
-         # Create a request object (offsite request)
-         request = Mock()
-         request.url = 'http://other.com/page'
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             # The instance should be created normally
+             middleware = OffsiteMiddleware.create_instance(self.crawler)
+             self.assertIsInstance(middleware, OffsiteMiddleware)
+             self.assertEqual(len(middleware.allowed_domains), 2)
+             self.assertIn('ee.ofweek.com', middleware.allowed_domains)
+             self.assertIn('www.baidu.com', middleware.allowed_domains)
+
+     def test_middleware_initialization_with_spider_domains(self):
+         """Test middleware initialization using the Spider instance's allowed_domains attribute"""
+         # Set allowed_domains on the Spider instance
+         spider = Mock()
+         spider.allowed_domains = ['ee.ofweek.com', 'www.baidu.com']
+
+         self.crawler.spider = spider
+         self.settings.set('LOG_LEVEL', 'DEBUG')
 
-         # Should raise IgnoreRequestError
-         with self.assertRaises(IgnoreRequestError):
-             asyncio.run(middleware.process_request(request, Mock()))
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             # The instance should be created normally, using the Spider's allowed_domains
+             middleware = OffsiteMiddleware.create_instance(self.crawler)
+             self.assertIsInstance(middleware, OffsiteMiddleware)
+             self.assertEqual(len(middleware.allowed_domains), 2)
+             self.assertIn('ee.ofweek.com', middleware.allowed_domains)
+             self.assertIn('www.baidu.com', middleware.allowed_domains)
+
+     def test_is_offsite_request_with_allowed_domains(self):
+         """Test requests within allowed domains"""
+         # Set ALLOWED_DOMAINS
+         self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+         self.settings.set('LOG_LEVEL', 'DEBUG')
 
-         # Verify the statistics
-         self.assertIn('offsite_request_count', self.crawler.stats.stats)
-         self.assertEqual(self.crawler.stats.stats['offsite_request_count'], 1)
-
-     @patch('crawlo.utils.log.get_logger')
-     def test_process_request_with_valid_request(self, mock_get_logger):
-         """Test processing a valid request"""
-         # Configure the allowed domains
-         self.settings.set('ALLOWED_DOMAINS', ['example.com'])
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+             # Create allowed requests
+             request1 = Mock()
+             request1.url = 'https://ee.ofweek.com/news/article1.html'
+
+             request2 = Mock()
+             request2.url = 'https://www.baidu.com/s?wd=test'
+
+             # These requests should not be considered offsite
+             self.assertFalse(middleware._is_offsite_request(request1))
+             self.assertFalse(middleware._is_offsite_request(request2))
+
+     def test_is_offsite_request_with_subdomains(self):
+         """Test requests to subdomains"""
+         # Set ALLOWED_DOMAINS
+         self.settings.set('ALLOWED_DOMAINS', ['ofweek.com', 'baidu.com'])
          self.settings.set('LOG_LEVEL', 'DEBUG')
 
-         mock_logger = MockLogger('OffsiteMiddleware')
-         mock_get_logger.return_value = mock_logger
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+             # Create subdomain requests
+             request1 = Mock()
+             request1.url = 'https://news.ofweek.com/article1.html'
+
+             request2 = Mock()
+             request2.url = 'https://map.baidu.com/location'
+
+             # These requests should not be considered offsite (the root domains are allowed)
+             self.assertFalse(middleware._is_offsite_request(request1))
+             self.assertFalse(middleware._is_offsite_request(request2))
+
+     def test_is_offsite_request_with_disallowed_domains(self):
+         """Test requests to disallowed domains"""
+         # Set ALLOWED_DOMAINS
+         self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+         self.settings.set('LOG_LEVEL', 'DEBUG')
 
-         # Create the middleware instance
-         middleware = OffsiteMiddleware.create_instance(self.crawler)
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+             # Create disallowed requests
+             request1 = Mock()
+             request1.url = 'https://www.google.com/search?q=test'
+
+             request2 = Mock()
+             request2.url = 'https://github.com/user/repo'
+
+             # These requests should be considered offsite
+             self.assertTrue(middleware._is_offsite_request(request1))
+             self.assertTrue(middleware._is_offsite_request(request2))
+
+     def test_process_request_with_allowed_domain(self):
+         """Test processing a request within an allowed domain"""
+         # Set ALLOWED_DOMAINS
+         self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+         self.settings.set('LOG_LEVEL', 'DEBUG')
 
-         # Create a request object (valid request)
-         request = Mock()
-         request.url = 'http://example.com/page'
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+             # Create an allowed request
+             request = Mock()
+             request.url = 'https://ee.ofweek.com/news/article1.html'
+             spider = Mock()
+
+             # Processing the request should not raise
+             result = middleware.process_request(request, spider)
+             self.assertIsNone(result)  # Should return None, meaning the request is allowed
+
+             # Check that no statistics counter was incremented
+             self.assertNotIn('offsite_request_count', self.crawler.stats.stats)
+
+     def test_process_request_with_disallowed_domain(self):
+         """Test processing a request to a disallowed domain"""
+         # Set ALLOWED_DOMAINS
+         self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+         self.settings.set('LOG_LEVEL', 'DEBUG')
 
-         # Should be processed normally, without raising
-         result = asyncio.run(middleware.process_request(request, Mock()))
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+             # Create a disallowed request
+             request = Mock()
+             request.url = 'https://www.google.com/search?q=test'
+             spider = Mock()
+
+             # Processing the request should raise IgnoreRequestError
+             with self.assertRaises(IgnoreRequestError) as context:
+                 middleware.process_request(request, spider)
+
+             self.assertIn("站外请求被过滤", str(context.exception))
+
+             # Check that the statistics counters were incremented
+             self.assertIn('offsite_request_count', self.crawler.stats.stats)
+             self.assertEqual(self.crawler.stats.stats['offsite_request_count'], 1)
+             self.assertIn('offsite_request_count/www.google.com', self.crawler.stats.stats)
+
+     def test_process_request_with_invalid_url(self):
+         """Test processing a request with an invalid URL"""
+         # Set ALLOWED_DOMAINS
+         self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
+         self.settings.set('LOG_LEVEL', 'DEBUG')
 
-         # Returning None means processing continues
-         self.assertIsNone(result)
+         logger = MockLogger('OffsiteMiddleware')
+         with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
+             middleware = OffsiteMiddleware.create_instance(self.crawler)
+
+             # Create a request with an invalid URL
+             request = Mock()
+             request.url = 'not_a_valid_url'
+             spider = Mock()
+
+             # Processing the request should raise IgnoreRequestError
+             with self.assertRaises(IgnoreRequestError) as context:
+                 middleware.process_request(request, spider)
+
+             self.assertIn("站外请求被过滤", str(context.exception))
+
+             # Check that the statistics counters were incremented
+             self.assertIn('offsite_request_count', self.crawler.stats.stats)
+             self.assertEqual(self.crawler.stats.stats['offsite_request_count'], 1)
+             self.assertIn('offsite_request_count/invalid_url', self.crawler.stats.stats)
 
 
  if __name__ == '__main__':
+     # Create an OffsiteMiddleware instance directly for testing, bypassing the complex create_instance logic
      unittest.main()
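
Taken together, the rewritten tests pin down the matching rule: a request is on-site when its host equals an allowed domain or is a subdomain of one, and unparseable URLs are filtered. A minimal standalone sketch of that rule (an illustration of what the tests encode, not crawlo's actual _is_offsite_request implementation):

    from urllib.parse import urlparse

    def is_offsite(url, allowed_domains):
        """True if the URL's host is neither an allowed domain nor a subdomain of one."""
        host = urlparse(url).hostname
        if not host:
            # Unparseable URLs are treated as offsite, matching test_process_request_with_invalid_url
            return True
        return not any(host == d or host.endswith('.' + d) for d in allowed_domains)

    assert not is_offsite('https://news.ofweek.com/article1.html', ['ofweek.com'])
    assert is_offsite('https://www.google.com/search?q=test', ['ee.ofweek.com', 'www.baidu.com'])
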