crawlo 1.2.3-py3-none-any.whl → 1.2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +61 -61
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +60 -60
- crawlo/cleaners/data_formatter.py +225 -225
- crawlo/cleaners/encoding_converter.py +125 -125
- crawlo/cleaners/text_cleaner.py +232 -232
- crawlo/cli.py +81 -81
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +144 -142
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +323 -292
- crawlo/commands/startproject.py +420 -417
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +312 -312
- crawlo/config_validator.py +251 -251
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +354 -354
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +143 -143
- crawlo/crawler.py +1110 -1027
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +107 -107
- crawlo/downloader/__init__.py +266 -266
- crawlo/downloader/aiohttp_downloader.py +220 -220
- crawlo/downloader/cffi_downloader.py +256 -256
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +37 -37
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +43 -43
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +280 -280
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +131 -131
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +114 -114
- crawlo/middleware/proxy.py +367 -367
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +211 -211
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +338 -338
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +317 -317
- crawlo/pipelines/pipeline_manager.py +61 -61
- crawlo/pipelines/redis_dedup_pipeline.py +165 -165
- crawlo/project.py +279 -187
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +337 -337
- crawlo/queue/redis_priority_queue.py +298 -298
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +217 -226
- crawlo/settings/setting_manager.py +122 -122
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/run.py.tmpl +47 -45
- crawlo/templates/project/settings.py.tmpl +350 -325
- crawlo/templates/project/settings_distributed.py.tmpl +160 -121
- crawlo/templates/project/settings_gentle.py.tmpl +133 -94
- crawlo/templates/project/settings_high_performance.py.tmpl +155 -151
- crawlo/templates/project/settings_simple.py.tmpl +108 -68
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +182 -182
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +35 -35
- crawlo/tools/distributed_coordinator.py +386 -386
- crawlo/tools/retry_mechanism.py +220 -220
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/utils/__init__.py +35 -35
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +290 -290
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +105 -105
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +128 -128
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +334 -334
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.2.3.dist-info → crawlo-1.2.4.dist-info}/METADATA +764 -692
- crawlo-1.2.4.dist-info/RECORD +206 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +236 -236
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +102 -102
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_validator.py +193 -193
- tests/test_crawlo_proxy_integration.py +172 -172
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +356 -356
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +241 -241
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +153 -153
- tests/tools_example.py +257 -257
- crawlo-1.2.3.dist-info/RECORD +0 -222
- examples/aiohttp_settings.py +0 -42
- examples/curl_cffi_settings.py +0 -41
- examples/default_header_middleware_example.py +0 -107
- examples/default_header_spider_example.py +0 -129
- examples/download_delay_middleware_example.py +0 -160
- examples/httpx_settings.py +0 -42
- examples/multi_downloader_proxy_example.py +0 -81
- examples/offsite_middleware_example.py +0 -55
- examples/offsite_spider_example.py +0 -107
- examples/proxy_spider_example.py +0 -166
- examples/request_ignore_middleware_example.py +0 -51
- examples/request_ignore_spider_example.py +0 -99
- examples/response_code_middleware_example.py +0 -52
- examples/response_filter_middleware_example.py +0 -67
- examples/tong_hua_shun_settings.py +0 -62
- examples/tong_hua_shun_spider.py +0 -170
- {crawlo-1.2.3.dist-info → crawlo-1.2.4.dist-info}/WHEEL +0 -0
- {crawlo-1.2.3.dist-info → crawlo-1.2.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.3.dist-info → crawlo-1.2.4.dist-info}/top_level.txt +0 -0
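
For readers who want to reproduce this comparison locally, the sketch below is not part of the crawlo distribution and assumes both wheels have already been downloaded into the current directory (the filenames are assumptions); it diffs the two archives file by file using only the standard library:

```python
# Minimal sketch: per-file unified diff between two already-downloaded wheels.
import difflib
import zipfile

OLD_WHEEL = "crawlo-1.2.3-py3-none-any.whl"   # assumed local filename
NEW_WHEEL = "crawlo-1.2.4-py3-none-any.whl"   # assumed local filename

def read_member(wheel_path, member):
    """Return one archive member's text as a list of lines."""
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

# Collect the members present in both wheels.
with zipfile.ZipFile(OLD_WHEEL) as old, zipfile.ZipFile(NEW_WHEEL) as new:
    common = sorted(set(old.namelist()) & set(new.namelist()))

# Print a unified diff for every shared file.
for name in common:
    diff = difflib.unified_diff(
        read_member(OLD_WHEEL, name),
        read_member(NEW_WHEEL, name),
        fromfile=f"1.2.3/{name}",
        tofile=f"1.2.4/{name}",
    )
    for line in diff:
        print(line, end="")
```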
tests/test_request_ignore_middleware.py
@@ -1,183 +1,183 @@
All 182 changed lines in this hunk are removed and re-added with textually identical content in this view; the file reads as follows.

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Test file for RequestIgnoreMiddleware.
Exercises the behaviour of the request-ignore middleware.
"""

import asyncio
import unittest
from unittest.mock import Mock, patch

from crawlo.middleware.request_ignore import RequestIgnoreMiddleware
from crawlo.exceptions import IgnoreRequestError
from crawlo.settings.setting_manager import SettingManager


class MockLogger:
    """Mock Logger class used to capture log output in tests."""
    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        self.logs = []

    def debug(self, msg):
        self.logs.append(('debug', msg))

    def info(self, msg):
        self.logs.append(('info', msg))

    def warning(self, msg):
        self.logs.append(('warning', msg))

    def error(self, msg):
        self.logs.append(('error', msg))


class MockStats:
    """Mock Stats class used to inspect recorded statistics in tests."""
    def __init__(self):
        self.stats = {}

    def inc_value(self, key, value=1):
        if key in self.stats:
            self.stats[key] += value
        else:
            self.stats[key] = value


class TestRequestIgnoreMiddleware(unittest.TestCase):
    """Test cases for RequestIgnoreMiddleware."""

    def setUp(self):
        """Prepare test fixtures."""
        # Create the settings manager
        self.settings = SettingManager()

        # Create a mock crawler object
        self.crawler = Mock()
        self.crawler.settings = self.settings
        self.crawler.stats = MockStats()

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization(self, mock_get_logger):
        """Test middleware initialization."""
        self.settings.set('LOG_LEVEL', 'INFO')
        mock_get_logger.return_value = MockLogger('RequestIgnoreMiddleware')

        # The instance should be created normally
        middleware = RequestIgnoreMiddleware.create_instance(self.crawler)
        self.assertIsInstance(middleware, RequestIgnoreMiddleware)

    @patch('crawlo.utils.log.get_logger')
    def test_request_ignore_event_handling(self, mock_get_logger):
        """Test handling of the request-ignore event."""
        self.settings.set('LOG_LEVEL', 'DEBUG')
        mock_logger = MockLogger('RequestIgnoreMiddleware')
        mock_get_logger.return_value = mock_logger

        # Create the middleware instance
        mock_stats = MockStats()
        middleware = RequestIgnoreMiddleware(
            stats=mock_stats,
            log_level='DEBUG'
        )

        # Create the exception and request objects
        exc = IgnoreRequestError("test reason")
        request = Mock()
        request.url = 'http://example.com/page'

        # Handle the ignore event
        asyncio.run(middleware.request_ignore(exc, request, Mock()))

        # Verify the recorded statistics
        self.assertIn('request_ignore_count', mock_stats.stats)
        self.assertEqual(mock_stats.stats['request_ignore_count'], 1)
        self.assertIn('request_ignore_count/reason/test reason', mock_stats.stats)

    @patch('crawlo.utils.log.get_logger')
    def test_request_ignore_event_handling_with_domain(self, mock_get_logger):
        """Test request-ignore event handling with a domain."""
        self.settings.set('LOG_LEVEL', 'DEBUG')
        mock_logger = MockLogger('RequestIgnoreMiddleware')
        mock_get_logger.return_value = mock_logger

        # Create the middleware instance
        mock_stats = MockStats()
        middleware = RequestIgnoreMiddleware(
            stats=mock_stats,
            log_level='DEBUG'
        )

        # Create the exception and request objects
        exc = IgnoreRequestError("test reason")
        request = Mock()
        request.url = 'http://example.com/page'

        # Handle the ignore event
        asyncio.run(middleware.request_ignore(exc, request, Mock()))

        # Verify the per-domain statistics
        self.assertIn('request_ignore_count/domain/example.com', mock_stats.stats)

    @patch('crawlo.utils.log.get_logger')
    def test_request_ignore_event_handling_with_invalid_url(self, mock_get_logger):
        """Test request-ignore event handling with an invalid URL."""
        self.settings.set('LOG_LEVEL', 'DEBUG')
        mock_logger = MockLogger('RequestIgnoreMiddleware')
        mock_get_logger.return_value = mock_logger

        # Create the middleware instance
        mock_stats = MockStats()
        middleware = RequestIgnoreMiddleware(
            stats=mock_stats,
            log_level='DEBUG'
        )

        # Create the exception and a request object (no url attribute, which triggers an error)
        exc = IgnoreRequestError("test reason")
        request = Mock()
        # request.url is deliberately not set, so accessing it raises AttributeError

        # Handle the ignore event
        asyncio.run(middleware.request_ignore(exc, request, Mock()))

        # Verify the invalid-URL statistics
        self.assertIn('request_ignore_count/domain/invalid_url', mock_stats.stats)

    def test_process_exception_with_ignore_request_error(self):
        """Test handling of an IgnoreRequestError exception."""
        # Create the middleware instance
        middleware = RequestIgnoreMiddleware(
            stats=MockStats(),
            log_level='INFO'
        )

        # Create the exception and request objects
        exc = IgnoreRequestError("test reason")
        request = Mock()

        # Should return True to indicate the exception was handled
        result = middleware.process_exception(request, exc, Mock())
        self.assertTrue(result)

    def test_process_exception_with_other_exception(self):
        """Test handling of other exceptions."""
        # Create the middleware instance
        middleware = RequestIgnoreMiddleware(
            stats=MockStats(),
            log_level='INFO'
        )

        # Create the exception and request objects
        exc = ValueError("test error")
        request = Mock()

        # Should return None to indicate the exception was not handled
        result = middleware.process_exception(request, exc, Mock())
        self.assertIsNone(result)


if __name__ == '__main__':
    unittest.main()
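The module ends with the standard `unittest.main()` guard, so it can be run directly with `python tests/test_request_ignore_middleware.py`. A programmatic alternative (a minimal sketch, assuming the `tests/` package is importable from the project root) is:

```python
# Discover and run only the RequestIgnoreMiddleware tests.
import unittest

suite = unittest.defaultTestLoader.discover(
    "tests", pattern="test_request_ignore_middleware.py"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```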
tests/test_request_serialization.py
@@ -1,71 +1,71 @@
All 70 changed lines in this hunk are removed and re-added with textually identical content in this view; the file reads as follows.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Tests for the Request serialization fix.
"""
import pickle
import sys
sys.path.insert(0, "..")

from crawlo.network.request import Request
from crawlo.core.scheduler import Scheduler
from unittest.mock import Mock

# Simulate a Request that carries a logger
class TestRequest(Request):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Add a logger attribute to reproduce the problem
        from crawlo.utils.log import get_logger
        self.logger = get_logger("test_request")
        self.meta['spider_logger'] = get_logger("spider_logger")

def test_request_serialization():
    """Test Request serialization."""
    print("🔍 Testing the Request serialization fix...")

    # Create a request that carries a logger
    request = TestRequest(
        url="https://example.com",
        meta={"test": "data"}  # Mock objects removed
    )

    print(f" 📦 Original request: {request}")
    print(f" 🔧 Request has logger: {hasattr(request, 'logger')}")
    print(f" 🔧 meta has logger: {'spider_logger' in request.meta}")

    # Create a mock scheduler to exercise the clean-up
    class MockScheduler:
        def _deep_clean_loggers(self, request):
            return Scheduler._deep_clean_loggers(self, request)
        def _remove_logger_from_dict(self, d):
            return Scheduler._remove_logger_from_dict(self, d)

    scheduler = MockScheduler()

    # Run the clean-up
    scheduler._deep_clean_loggers(request)

    print(f" 🧹 Has logger after clean-up: {hasattr(request, 'logger')}")
    print(f" 🧹 meta has logger after clean-up: {'spider_logger' in request.meta}")

    # Test serialization
    try:
        serialized = pickle.dumps(request)
        print(f" ✅ Serialization succeeded, size: {len(serialized)} bytes")

        # Test deserialization
        deserialized = pickle.loads(serialized)
        print(f" ✅ Deserialization succeeded: {deserialized}")
        return True

    except Exception as e:
        print(f" ❌ Serialization failed: {e}")
        return False

if __name__ == "__main__":
    success = test_request_serialization()
    if success:
        print("🎉 Request serialization fix works!")
    else:
        print("❌ Serialization problem is still unresolved")
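The clean-up exercised above strips logger references from a Request before it is pickled. As general background (a generic Python pattern under assumed names, not crawlo's actual implementation), non-picklable helpers are usually kept off the pickle stream with the `__getstate__`/`__setstate__` protocol:

```python
# Generic sketch: exclude a non-picklable helper attribute from pickling.
import pickle
import threading

class RequestLike:
    def __init__(self, url):
        self.url = url
        # Stand-in for a non-picklable helper; thread locks cannot be pickled.
        self._lock = threading.Lock()

    def __getstate__(self):
        state = self.__dict__.copy()
        state.pop("_lock", None)       # drop what should not be serialized
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._lock = threading.Lock()  # recreate the helper after loading

restored = pickle.loads(pickle.dumps(RequestLike("https://example.com")))
assert restored.url == "https://example.com"
```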