crawlo 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (45) hide show
  1. crawlo/__init__.py +9 -4
  2. crawlo/__version__.py +1 -1
  3. crawlo/core/__init__.py +8 -2
  4. crawlo/core/scheduler.py +2 -2
  5. crawlo/downloader/aiohttp_downloader.py +7 -2
  6. crawlo/extension/log_interval.py +44 -7
  7. crawlo/initialization/__init__.py +6 -2
  8. crawlo/middleware/middleware_manager.py +1 -1
  9. crawlo/mode_manager.py +13 -7
  10. crawlo/pipelines/bloom_dedup_pipeline.py +5 -15
  11. crawlo/pipelines/database_dedup_pipeline.py +5 -8
  12. crawlo/pipelines/memory_dedup_pipeline.py +5 -15
  13. crawlo/pipelines/redis_dedup_pipeline.py +2 -15
  14. crawlo/project.py +18 -7
  15. crawlo/settings/default_settings.py +114 -150
  16. crawlo/settings/setting_manager.py +14 -9
  17. crawlo/tools/distributed_coordinator.py +4 -8
  18. crawlo/utils/fingerprint.py +123 -0
  19. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/METADATA +1 -1
  20. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/RECORD +45 -29
  21. examples/test_project/__init__.py +7 -0
  22. examples/test_project/run.py +35 -0
  23. examples/test_project/test_project/__init__.py +4 -0
  24. examples/test_project/test_project/items.py +18 -0
  25. examples/test_project/test_project/middlewares.py +119 -0
  26. examples/test_project/test_project/pipelines.py +97 -0
  27. examples/test_project/test_project/settings.py +170 -0
  28. examples/test_project/test_project/spiders/__init__.py +10 -0
  29. examples/test_project/test_project/spiders/of_week_dis.py +144 -0
  30. tests/debug_framework_logger.py +1 -1
  31. tests/debug_log_levels.py +1 -1
  32. tests/test_all_pipeline_fingerprints.py +134 -0
  33. tests/test_default_header_middleware.py +242 -87
  34. tests/test_fingerprint_consistency.py +136 -0
  35. tests/test_fingerprint_simple.py +52 -0
  36. tests/test_framework_logger.py +1 -1
  37. tests/test_framework_startup.py +1 -1
  38. tests/test_hash_performance.py +100 -0
  39. tests/test_mode_change.py +1 -1
  40. tests/test_offsite_middleware.py +185 -162
  41. tests/test_offsite_middleware_simple.py +204 -0
  42. tests/test_pipeline_fingerprint_consistency.py +87 -0
  43. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/WHEEL +0 -0
  44. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/entry_points.txt +0 -0
  45. {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,204 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ OffsiteMiddleware 简单测试文件
5
+ 用于测试站点过滤中间件的功能,特别是多个域名的情况
6
+ """
7
+
8
+ import asyncio
9
+ import unittest
10
+ from unittest.mock import Mock, patch
11
+
12
+ from crawlo.middleware.offsite import OffsiteMiddleware
13
+ from crawlo.settings.setting_manager import SettingManager
14
+ from crawlo.exceptions import IgnoreRequestError
15
+
16
+
17
class MockStats:
    """Mock stats collector that records counter updates for assertions."""

    def __init__(self):
        # Maps counter key -> accumulated value.
        self.stats = {}

    def inc_value(self, key, value=1):
        """Increase the counter *key* by *value*, starting from 0 if absent."""
        self.stats[key] = self.stats.get(key, 0) + value
27
+
28
+
29
class MockLogger:
    """Mock logger that captures every log call for later inspection."""

    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        # Recorded calls as (level, message) tuples, in call order.
        self.logs = []

    def _record(self, level, msg):
        # Single sink shared by all level-specific methods.
        self.logs.append((level, msg))

    def debug(self, msg):
        self._record('debug', msg)

    def info(self, msg):
        self._record('info', msg)

    def warning(self, msg):
        self._record('warning', msg)

    def error(self, msg):
        self._record('error', msg)
47
+
48
+
49
class TestOffsiteMiddleware(unittest.TestCase):
    """Tests for OffsiteMiddleware, focusing on multi-domain configurations.

    Each test patches ``get_logger`` so the middleware logs into a MockLogger,
    constructs the middleware directly (bypassing crawler wiring), and calls
    ``_compile_domains()`` by hand because no crawler lifecycle runs here.
    """

    def setUp(self):
        """Create fresh mock stats and logger before each test."""
        self.stats = MockStats()
        self.logger = MockLogger('OffsiteMiddleware')

    def test_multiple_domains_initialization(self):
        """Initialization should store all allowed domains and compile one regex each."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            # Instantiate directly, passing multiple allowed domains.
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Compile domain patterns manually (normally done during setup).
            middleware._compile_domains()

            # Domains should be stored exactly as configured.
            self.assertEqual(len(middleware.allowed_domains), 2)
            self.assertIn('ee.ofweek.com', middleware.allowed_domains)
            self.assertIn('www.baidu.com', middleware.allowed_domains)

            # One compiled regex is expected per allowed domain.
            self.assertEqual(len(middleware._domain_regexes), 2)

    def test_allowed_requests_with_multiple_domains(self):
        """Requests whose host matches any allowed domain must not be offsite."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Compile domain patterns manually.
            middleware._compile_domains()

            # Build requests that target the allowed hosts.
            request1 = Mock()
            request1.url = 'https://ee.ofweek.com/news/article1.html'

            request2 = Mock()
            request2.url = 'https://www.baidu.com/s?wd=test'

            # Neither request should be classified as offsite.
            self.assertFalse(middleware._is_offsite_request(request1))
            self.assertFalse(middleware._is_offsite_request(request2))

    def test_disallowed_requests_with_multiple_domains(self):
        """Requests to hosts outside every allowed domain must be offsite."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Compile domain patterns manually.
            middleware._compile_domains()

            # Build requests that target hosts not in the allow-list.
            request1 = Mock()
            request1.url = 'https://www.google.com/search?q=test'

            request2 = Mock()
            request2.url = 'https://github.com/user/repo'

            # Both requests should be classified as offsite.
            self.assertTrue(middleware._is_offsite_request(request1))
            self.assertTrue(middleware._is_offsite_request(request2))

    def test_subdomain_requests_with_multiple_domains(self):
        """Allowing a root domain should also allow its subdomains."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            # Use root domains so subdomains are expected to match.
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ofweek.com', 'baidu.com']
            )

            # Compile domain patterns manually.
            middleware._compile_domains()

            # Build requests that target subdomains of the allowed roots.
            request1 = Mock()
            request1.url = 'https://news.ofweek.com/article1.html'

            request2 = Mock()
            request2.url = 'https://map.baidu.com/location'

            # Subdomain requests must not be offsite when the root is allowed.
            self.assertFalse(middleware._is_offsite_request(request1))
            self.assertFalse(middleware._is_offsite_request(request2))

    def test_process_allowed_request_with_multiple_domains(self):
        """process_request should pass allowed requests through (return None)."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Compile domain patterns manually.
            middleware._compile_domains()

            # Build a request targeting an allowed host.
            request = Mock()
            request.url = 'https://ee.ofweek.com/news/article1.html'
            spider = Mock()

            # Processing must not raise; drive the coroutine with asyncio.run.
            result = asyncio.run(middleware.process_request(request, spider))
            self.assertIsNone(result)  # None means the request is allowed through.

            # The offsite counter must not have been incremented.
            self.assertNotIn('offsite_request_count', self.stats.stats)

    def test_process_disallowed_request_with_multiple_domains(self):
        """process_request should reject offsite requests with IgnoreRequestError."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Compile domain patterns manually.
            middleware._compile_domains()

            # Build a request targeting a host outside the allow-list.
            request = Mock()
            request.url = 'https://www.google.com/search?q=test'
            spider = Mock()

            # Processing must raise IgnoreRequestError; drive the coroutine
            # with asyncio.run.
            with self.assertRaises(IgnoreRequestError) as context:
                asyncio.run(middleware.process_request(request, spider))

            self.assertIn("站外请求被过滤", str(context.exception))

            # Both the global and the per-domain offsite counters must be bumped.
            self.assertIn('offsite_request_count', self.stats.stats)
            self.assertEqual(self.stats.stats['offsite_request_count'], 1)
            self.assertIn('offsite_request_count/www.google.com', self.stats.stats)
201
+
202
+
203
if __name__ == '__main__':
    # Run all tests in this module via unittest's CLI entry point.
    unittest.main()
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ 去重管道指纹一致性测试
5
+ ==================
6
+ 验证所有去重管道对相同数据生成一致的指纹
7
+ """
8
+
9
+ import sys
10
+ import os
11
+
12
+ # 添加项目根目录到Python路径
13
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
14
+
15
+ from crawlo.utils.fingerprint import FingerprintGenerator
16
+
17
+
18
class MockItem:
    """Simple stand-in for a scraped item: keyword args become attributes."""

    def __init__(self, **fields):
        # Every keyword argument is stored as an instance attribute.
        self.__dict__.update(fields)

    def to_dict(self):
        """Return all public (non-underscore-prefixed) attributes as a dict."""
        return {
            name: value
            for name, value in vars(self).items()
            if not name.startswith('_')
        }
28
+
29
+
30
def test_pipeline_fingerprint_consistency():
    """Verify fingerprint generation is stable for equal items and differs for different items.

    Uses FingerprintGenerator.item_fingerprint directly; dedup pipelines are
    expected to delegate to the same generator, so stability here implies
    cross-pipeline consistency.
    """
    # Build a representative test item.
    test_item = MockItem(
        title="Test Title",
        url="https://example.com",
        content="Test content",
        price=99.99
    )

    # Generate the reference fingerprint once.
    expected_fingerprint = FingerprintGenerator.item_fingerprint(test_item)

    print(f"Expected fingerprint: {expected_fingerprint}")

    # Regenerate several times: every run must reproduce the same fingerprint.
    for i in range(5):
        fingerprint = FingerprintGenerator.item_fingerprint(test_item)
        print(f"Generated fingerprint {i+1}: {fingerprint}")
        assert fingerprint == expected_fingerprint, f"Fingerprint mismatch at iteration {i+1}"

    print("\n✓ 所有指纹生成一致")

    # A different item (title changed) must yield a different fingerprint.
    test_item2 = MockItem(
        title="Test Title 2",
        url="https://example.com",
        content="Test content",
        price=99.99
    )

    fingerprint2 = FingerprintGenerator.item_fingerprint(test_item2)
    print(f"\nDifferent item fingerprint: {fingerprint2}")
    assert fingerprint2 != expected_fingerprint, "Different items should generate different fingerprints"

    print("✓ 不同数据生成不同指纹")
66
+
67
+
68
def test_data_fingerprint_variants():
    """Verify data_fingerprint is insensitive to dict key ordering."""
    # Fingerprint a plain dict.
    dict_data = {"name": "test", "value": 123}
    dict_fingerprint = FingerprintGenerator.data_fingerprint(dict_data)
    print(f"\nDict fingerprint: {dict_fingerprint}")

    # Same content, different insertion order — fingerprint must match.
    dict_data2 = {"value": 123, "name": "test"}
    dict_fingerprint2 = FingerprintGenerator.data_fingerprint(dict_data2)
    print(f"Reordered dict fingerprint: {dict_fingerprint2}")
    assert dict_fingerprint == dict_fingerprint2, "Reordered dict should generate same fingerprint"

    print("✓ 字典顺序不影响指纹生成")
82
+
83
+
84
if __name__ == '__main__':
    # Run both checks as a plain script; no test runner required.
    test_pipeline_fingerprint_consistency()
    test_data_fingerprint_variants()
    print("\n🎉 所有测试通过!")
File without changes