crawlo-1.2.0-py3-none-any.whl → crawlo-1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220)
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +65 -65
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +142 -132
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +292 -292
  14. crawlo/commands/startproject.py +418 -418
  15. crawlo/commands/stats.py +188 -188
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +252 -252
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -354
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -143
  23. crawlo/crawler.py +1027 -1027
  24. crawlo/downloader/__init__.py +266 -266
  25. crawlo/downloader/aiohttp_downloader.py +220 -220
  26. crawlo/downloader/cffi_downloader.py +256 -256
  27. crawlo/downloader/httpx_downloader.py +259 -259
  28. crawlo/downloader/hybrid_downloader.py +213 -213
  29. crawlo/downloader/playwright_downloader.py +402 -402
  30. crawlo/downloader/selenium_downloader.py +472 -472
  31. crawlo/event.py +11 -11
  32. crawlo/exceptions.py +81 -81
  33. crawlo/extension/__init__.py +37 -37
  34. crawlo/extension/health_check.py +141 -141
  35. crawlo/extension/log_interval.py +57 -57
  36. crawlo/extension/log_stats.py +81 -81
  37. crawlo/extension/logging_extension.py +43 -43
  38. crawlo/extension/memory_monitor.py +104 -104
  39. crawlo/extension/performance_profiler.py +133 -133
  40. crawlo/extension/request_recorder.py +107 -107
  41. crawlo/filters/__init__.py +154 -154
  42. crawlo/filters/aioredis_filter.py +280 -280
  43. crawlo/filters/memory_filter.py +269 -269
  44. crawlo/items/__init__.py +23 -23
  45. crawlo/items/base.py +21 -21
  46. crawlo/items/fields.py +53 -53
  47. crawlo/items/items.py +104 -104
  48. crawlo/middleware/__init__.py +21 -21
  49. crawlo/middleware/default_header.py +132 -32
  50. crawlo/middleware/download_delay.py +105 -28
  51. crawlo/middleware/middleware_manager.py +135 -135
  52. crawlo/middleware/offsite.py +116 -0
  53. crawlo/middleware/proxy.py +366 -272
  54. crawlo/middleware/request_ignore.py +88 -30
  55. crawlo/middleware/response_code.py +164 -18
  56. crawlo/middleware/response_filter.py +138 -26
  57. crawlo/middleware/retry.py +124 -124
  58. crawlo/mode_manager.py +211 -211
  59. crawlo/network/__init__.py +21 -21
  60. crawlo/network/request.py +338 -338
  61. crawlo/network/response.py +359 -359
  62. crawlo/pipelines/__init__.py +21 -21
  63. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  64. crawlo/pipelines/console_pipeline.py +39 -39
  65. crawlo/pipelines/csv_pipeline.py +316 -316
  66. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  67. crawlo/pipelines/json_pipeline.py +218 -218
  68. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  69. crawlo/pipelines/mongo_pipeline.py +131 -131
  70. crawlo/pipelines/mysql_pipeline.py +316 -316
  71. crawlo/pipelines/pipeline_manager.py +61 -61
  72. crawlo/pipelines/redis_dedup_pipeline.py +167 -167
  73. crawlo/project.py +187 -187
  74. crawlo/queue/pqueue.py +37 -37
  75. crawlo/queue/queue_manager.py +337 -337
  76. crawlo/queue/redis_priority_queue.py +298 -298
  77. crawlo/settings/__init__.py +7 -7
  78. crawlo/settings/default_settings.py +226 -219
  79. crawlo/settings/setting_manager.py +122 -122
  80. crawlo/spider/__init__.py +639 -639
  81. crawlo/stats_collector.py +59 -59
  82. crawlo/subscriber.py +130 -130
  83. crawlo/task_manager.py +30 -30
  84. crawlo/templates/crawlo.cfg.tmpl +10 -10
  85. crawlo/templates/project/__init__.py.tmpl +3 -3
  86. crawlo/templates/project/items.py.tmpl +17 -17
  87. crawlo/templates/project/middlewares.py.tmpl +118 -109
  88. crawlo/templates/project/pipelines.py.tmpl +96 -96
  89. crawlo/templates/project/run.py.tmpl +45 -45
  90. crawlo/templates/project/settings.py.tmpl +327 -326
  91. crawlo/templates/project/settings_distributed.py.tmpl +119 -119
  92. crawlo/templates/project/settings_gentle.py.tmpl +94 -94
  93. crawlo/templates/project/settings_high_performance.py.tmpl +151 -151
  94. crawlo/templates/project/settings_simple.py.tmpl +68 -68
  95. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  96. crawlo/templates/spider/spider.py.tmpl +143 -141
  97. crawlo/tools/__init__.py +182 -182
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/tools/data_validator.py +180 -180
  101. crawlo/tools/date_tools.py +35 -35
  102. crawlo/tools/distributed_coordinator.py +386 -386
  103. crawlo/tools/retry_mechanism.py +220 -220
  104. crawlo/tools/scenario_adapter.py +262 -262
  105. crawlo/utils/__init__.py +35 -35
  106. crawlo/utils/batch_processor.py +260 -260
  107. crawlo/utils/controlled_spider_mixin.py +439 -439
  108. crawlo/utils/date_tools.py +290 -290
  109. crawlo/utils/db_helper.py +343 -343
  110. crawlo/utils/enhanced_error_handler.py +359 -359
  111. crawlo/utils/env_config.py +105 -105
  112. crawlo/utils/error_handler.py +125 -125
  113. crawlo/utils/func_tools.py +82 -82
  114. crawlo/utils/large_scale_config.py +286 -286
  115. crawlo/utils/large_scale_helper.py +343 -343
  116. crawlo/utils/log.py +128 -128
  117. crawlo/utils/performance_monitor.py +284 -284
  118. crawlo/utils/queue_helper.py +175 -175
  119. crawlo/utils/redis_connection_pool.py +334 -334
  120. crawlo/utils/redis_key_validator.py +199 -199
  121. crawlo/utils/request.py +267 -267
  122. crawlo/utils/request_serializer.py +219 -219
  123. crawlo/utils/spider_loader.py +62 -62
  124. crawlo/utils/system.py +11 -11
  125. crawlo/utils/tools.py +4 -4
  126. crawlo/utils/url.py +39 -39
  127. {crawlo-1.2.0.dist-info → crawlo-1.2.1.dist-info}/METADATA +692 -697
  128. crawlo-1.2.1.dist-info/RECORD +220 -0
  129. examples/__init__.py +7 -7
  130. examples/aiohttp_settings.py +42 -0
  131. examples/curl_cffi_settings.py +41 -0
  132. examples/default_header_middleware_example.py +107 -0
  133. examples/default_header_spider_example.py +129 -0
  134. examples/download_delay_middleware_example.py +160 -0
  135. examples/httpx_settings.py +42 -0
  136. examples/multi_downloader_proxy_example.py +81 -0
  137. examples/offsite_middleware_example.py +55 -0
  138. examples/offsite_spider_example.py +107 -0
  139. examples/proxy_spider_example.py +166 -0
  140. examples/request_ignore_middleware_example.py +51 -0
  141. examples/request_ignore_spider_example.py +99 -0
  142. examples/response_code_middleware_example.py +52 -0
  143. examples/response_filter_middleware_example.py +67 -0
  144. examples/tong_hua_shun_settings.py +62 -0
  145. examples/tong_hua_shun_spider.py +170 -0
  146. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  147. tests/__init__.py +7 -7
  148. tests/advanced_tools_example.py +275 -275
  149. tests/authenticated_proxy_example.py +236 -236
  150. tests/cleaners_example.py +160 -160
  151. tests/config_validation_demo.py +102 -102
  152. tests/controlled_spider_example.py +205 -205
  153. tests/date_tools_example.py +180 -180
  154. tests/dynamic_loading_example.py +523 -523
  155. tests/dynamic_loading_test.py +104 -104
  156. tests/env_config_example.py +133 -133
  157. tests/error_handling_example.py +171 -171
  158. tests/redis_key_validation_demo.py +130 -130
  159. tests/response_improvements_example.py +144 -144
  160. tests/test_advanced_tools.py +148 -148
  161. tests/test_all_redis_key_configs.py +145 -145
  162. tests/test_authenticated_proxy.py +141 -141
  163. tests/test_cleaners.py +54 -54
  164. tests/test_comprehensive.py +146 -146
  165. tests/test_config_validator.py +193 -193
  166. tests/test_crawlo_proxy_integration.py +173 -0
  167. tests/test_date_tools.py +123 -123
  168. tests/test_default_header_middleware.py +159 -0
  169. tests/test_double_crawlo_fix.py +207 -207
  170. tests/test_double_crawlo_fix_simple.py +124 -124
  171. tests/test_download_delay_middleware.py +222 -0
  172. tests/test_downloader_proxy_compatibility.py +269 -0
  173. tests/test_dynamic_downloaders_proxy.py +124 -124
  174. tests/test_dynamic_proxy.py +92 -92
  175. tests/test_dynamic_proxy_config.py +146 -146
  176. tests/test_dynamic_proxy_real.py +109 -109
  177. tests/test_edge_cases.py +303 -303
  178. tests/test_enhanced_error_handler.py +270 -270
  179. tests/test_env_config.py +121 -121
  180. tests/test_error_handler_compatibility.py +112 -112
  181. tests/test_final_validation.py +153 -153
  182. tests/test_framework_env_usage.py +103 -103
  183. tests/test_integration.py +356 -356
  184. tests/test_item_dedup_redis_key.py +122 -122
  185. tests/test_offsite_middleware.py +222 -0
  186. tests/test_parsel.py +29 -29
  187. tests/test_performance.py +327 -327
  188. tests/test_proxy_api.py +265 -0
  189. tests/test_proxy_health_check.py +32 -32
  190. tests/test_proxy_middleware.py +122 -0
  191. tests/test_proxy_middleware_enhanced.py +217 -0
  192. tests/test_proxy_middleware_integration.py +136 -136
  193. tests/test_proxy_providers.py +56 -56
  194. tests/test_proxy_stats.py +19 -19
  195. tests/test_proxy_strategies.py +59 -59
  196. tests/test_queue_manager_double_crawlo.py +173 -173
  197. tests/test_queue_manager_redis_key.py +176 -176
  198. tests/test_real_scenario_proxy.py +196 -0
  199. tests/test_redis_config.py +28 -28
  200. tests/test_redis_connection_pool.py +294 -294
  201. tests/test_redis_key_naming.py +181 -181
  202. tests/test_redis_key_validator.py +123 -123
  203. tests/test_redis_queue.py +224 -224
  204. tests/test_request_ignore_middleware.py +183 -0
  205. tests/test_request_serialization.py +70 -70
  206. tests/test_response_code_middleware.py +350 -0
  207. tests/test_response_filter_middleware.py +428 -0
  208. tests/test_response_improvements.py +152 -152
  209. tests/test_retry_middleware.py +242 -0
  210. tests/test_scheduler.py +241 -241
  211. tests/test_simple_response.py +61 -61
  212. tests/test_telecom_spider_redis_key.py +205 -205
  213. tests/test_template_content.py +87 -87
  214. tests/test_template_redis_key.py +134 -134
  215. tests/test_tools.py +153 -153
  216. tests/tools_example.py +257 -257
  217. crawlo-1.2.0.dist-info/RECORD +0 -190
  218. {crawlo-1.2.0.dist-info → crawlo-1.2.1.dist-info}/WHEEL +0 -0
  219. {crawlo-1.2.0.dist-info → crawlo-1.2.1.dist-info}/entry_points.txt +0 -0
  220. {crawlo-1.2.0.dist-info → crawlo-1.2.1.dist-info}/top_level.txt +0 -0
crawlo/middleware/request_ignore.py
@@ -1,30 +1,88 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from crawlo.utils.log import get_logger
- from crawlo.exceptions import IgnoreRequestError
- from crawlo.event import ignore_request
-
-
- class RequestIgnoreMiddleware(object):
-
-     def __init__(self, stats, log_level):
-         self.logger = get_logger(self.__class__.__name__, log_level)
-         self.stats = stats
-
-     @classmethod
-     def create_instance(cls, crawler):
-         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
-         crawler.subscriber.subscribe(o.request_ignore, event=ignore_request)
-         return o
-
-     async def request_ignore(self, exc, request, _spider):
-         self.logger.info(f'{request} ignored.')
-         self.stats.inc_value('request_ignore_count')
-         reason = exc.msg
-         if reason:
-             self.stats.inc_value(f'request_ignore_count/{reason}')
-
-     @staticmethod
-     def process_exception(_request, exc, _spider):
-         if isinstance(exc, IgnoreRequestError):
-             return True
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ RequestIgnoreMiddleware
+ Handles and records ignored requests
+ """
+
+ from crawlo.utils.log import get_logger
+ from crawlo.exceptions import IgnoreRequestError
+ from crawlo.event import ignore_request
+
+
+ class RequestIgnoreMiddleware(object):
+     """
+     RequestIgnoreMiddleware
+     Handles and records ignored requests and provides detailed statistics
+     """
+
+     def __init__(self, stats, log_level):
+         """
+         Initialize the middleware
+
+         Args:
+             stats: statistics collector
+             log_level: log level
+         """
+         self.logger = get_logger(self.__class__.__name__, log_level)
+         self.stats = stats
+
+     @classmethod
+     def create_instance(cls, crawler):
+         """
+         Create a middleware instance
+
+         Args:
+             crawler: crawler instance
+
+         Returns:
+             RequestIgnoreMiddleware: middleware instance
+         """
+         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
+         crawler.subscriber.subscribe(o.request_ignore, event=ignore_request)
+         return o
+
+     async def request_ignore(self, exc, request, _spider):
+         """
+         Handle an ignored-request event
+
+         Args:
+             exc: the exception
+             request: the ignored request
+             _spider: the spider instance
+         """
+         # Log the ignored request
+         self.logger.info(f'Request ignored: {request.url}')
+         self.stats.inc_value('request_ignore_count')
+
+         # Record the ignore reason
+         reason = getattr(exc, 'msg', 'unknown')
+         if reason:
+             self.stats.inc_value(f'request_ignore_count/reason/{reason}')
+
+         # Record the domain distribution of ignored requests
+         try:
+             from urllib.parse import urlparse
+             parsed_url = urlparse(request.url)
+             domain = parsed_url.netloc
+             if domain:
+                 self.stats.inc_value(f'request_ignore_count/domain/{domain}')
+         except Exception:
+             self.stats.inc_value('request_ignore_count/domain/invalid_url')
+
+     @staticmethod
+     def process_exception(_request, exc, _spider):
+         """
+         Handle exceptions and recognize IgnoreRequestError
+
+         Args:
+             _request: the request
+             exc: the exception
+             _spider: the spider instance
+
+         Returns:
+             bool: True if the exception is an IgnoreRequestError, otherwise None
+         """
+         if isinstance(exc, IgnoreRequestError):
+             return True
+         return None
crawlo/middleware/response_code.py
@@ -1,19 +1,165 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from crawlo.utils.log import get_logger
-
-
- class ResponseCodeMiddleware(object):
-     def __init__(self, stats, log_level):
-         self.logger = get_logger(self.__class__.__name__, log_level)
-         self.stats = stats
-
-     @classmethod
-     def create_instance(cls, crawler):
-         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
-         return o
-
-     def process_response(self, request, response, spider):
-         self.stats.inc_value(f'stats_code/count/{response.status_code}')
-         self.logger.debug(f'Got response from <{response.status_code} {response.url}>')
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ ResponseCodeMiddleware
+ Handles HTTP response status codes, records statistics, and supports special status-code handling
+ """
+
+ from crawlo.utils.log import get_logger
+
+
+ class ResponseCodeMiddleware(object):
+     """
+     ResponseCodeMiddleware
+     Handles HTTP response status codes, records statistics, and supports special status-code handling
+
+     Features:
+     - Counts occurrences of each HTTP status code
+     - Supports detailed handling of special status codes
+     - Provides detailed log output
+     - Supports status-code category statistics (2xx, 3xx, 4xx, 5xx)
+     """
+
+     def __init__(self, stats, log_level):
+         """
+         Initialize the middleware
+
+         Args:
+             stats: statistics collector
+             log_level: log level
+         """
+         self.logger = get_logger(self.__class__.__name__, log_level)
+         self.stats = stats
+
+     @classmethod
+     def create_instance(cls, crawler):
+         """
+         Create a middleware instance
+
+         Args:
+             crawler: crawler instance
+
+         Returns:
+             ResponseCodeMiddleware: middleware instance
+         """
+         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
+         return o
+
+     def _get_status_category(self, status_code):
+         """
+         Get the status-code category
+
+         Args:
+             status_code (int): HTTP status code
+
+         Returns:
+             str: status-code category (2xx, 3xx, 4xx, 5xx, other)
+         """
+         if 200 <= status_code < 300:
+             return "2xx"
+         elif 300 <= status_code < 400:
+             return "3xx"
+         elif 400 <= status_code < 500:
+             return "4xx"
+         elif 500 <= status_code < 600:
+             return "5xx"
+         else:
+             return "other"
+
+     def _is_success_response(self, status_code):
+         """
+         Check whether the response is a success
+
+         Args:
+             status_code (int): HTTP status code
+
+         Returns:
+             bool: whether the response is a success
+         """
+         return 200 <= status_code < 300
+
+     def _is_redirect_response(self, status_code):
+         """
+         Check whether the response is a redirect
+
+         Args:
+             status_code (int): HTTP status code
+
+         Returns:
+             bool: whether the response is a redirect
+         """
+         return 300 <= status_code < 400
+
+     def _is_client_error(self, status_code):
+         """
+         Check whether the response is a client error
+
+         Args:
+             status_code (int): HTTP status code
+
+         Returns:
+             bool: whether the response is a client error
+         """
+         return 400 <= status_code < 500
+
+     def _is_server_error(self, status_code):
+         """
+         Check whether the response is a server error
+
+         Args:
+             status_code (int): HTTP status code
+
+         Returns:
+             bool: whether the response is a server error
+         """
+         return 500 <= status_code < 600
+
+     def process_response(self, request, response, spider):
+         """
+         Process the response and record status-code statistics
+
+         Args:
+             request: the request
+             response: the response
+             spider: the spider instance
+
+         Returns:
+             response: the response
+         """
+         status_code = response.status_code
+
+         # Count the specific status code
+         self.stats.inc_value(f'response_status_code/count/{status_code}')
+
+         # Count by status-code category
+         category = self._get_status_category(status_code)
+         self.stats.inc_value(f'response_status_code/category/{category}')
+
+         # Count successes and errors
+         if self._is_success_response(status_code):
+             self.stats.inc_value('response_status_code/success_count')
+         elif self._is_client_error(status_code) or self._is_server_error(status_code):
+             self.stats.inc_value('response_status_code/error_count')
+
+         # Track total response size
+         if hasattr(response, 'content_length') and response.content_length:
+             self.stats.inc_value('response_total_bytes', response.content_length)
+
+         # Per-domain statistics
+         try:
+             from urllib.parse import urlparse
+             parsed_url = urlparse(response.url)
+             domain = parsed_url.netloc
+             if domain:
+                 self.stats.inc_value(f'response_status_code/domain/{domain}/count/{status_code}')
+                 self.stats.inc_value(f'response_status_code/domain/{domain}/category/{category}')
+         except Exception:
+             self.stats.inc_value(f'response_status_code/domain/invalid_url/count/{status_code}')
+
+         # Detailed debug logging
+         self.logger.debug(
+             f'Got response: {status_code} {response.url} '
+             f'(category: {category}, size: {getattr(response, "content_length", "unknown")} bytes)'
+         )
+
          return response
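The expanded ResponseCodeMiddleware likewise records counts per status code, per category, and per domain, plus aggregate success/error counters and byte totals. A hypothetical snapshot of the resulting stats keys (values are illustrative; the key names come from the code above):

# Hypothetical stats snapshot; only the key layout follows the new middleware.
example_response_stats = {
    'response_status_code/count/200': 150,
    'response_status_code/count/404': 4,
    'response_status_code/category/2xx': 150,
    'response_status_code/category/4xx': 4,
    'response_status_code/success_count': 150,
    'response_status_code/error_count': 4,
    'response_total_bytes': 3481920,
    'response_status_code/domain/example.com/count/200': 150,
}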
crawlo/middleware/response_filter.py
@@ -1,26 +1,138 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from crawlo.utils.log import get_logger
- from crawlo.exceptions import IgnoreRequestError
-
-
- class ResponseFilterMiddleware:
-
-     def __init__(self, allowed_codes, log_level):
-         self.allowed_codes = allowed_codes
-         self.logger = get_logger(self.__class__.__name__, log_level)
-
-     @classmethod
-     def create_instance(cls, crawler):
-         o = cls(
-             allowed_codes=crawler.settings.get_list('ALLOWED_CODES'),
-             log_level=crawler.settings.get('LOG_LEVEL')
-         )
-         return o
-
-     def process_response(self, request, response, spider):
-         if 200 <= response.status_code < 300:
-             return response
-         if response.status_code in self.allowed_codes:
-             return response
-         raise IgnoreRequestError(f"response status_code/non-200")
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ ResponseFilterMiddleware
+ Filters HTTP responses that do not meet requirements; supports custom allowed status codes
+ """
+
+ from crawlo.utils.log import get_logger
+ from crawlo.exceptions import IgnoreRequestError
+
+
+ class ResponseFilterMiddleware:
+     """
+     ResponseFilterMiddleware
+     Filters HTTP responses that do not meet requirements; supports custom allowed status codes
+
+     Features:
+     - Allows 2xx status codes by default
+     - Supports a custom list of allowed status codes
+     - Supports denying specific status codes
+     - Provides detailed log output
+     - Supports per-domain filter rules
+     """
+
+     def __init__(self, allowed_codes, denied_codes, log_level):
+         """
+         Initialize the middleware
+
+         Args:
+             allowed_codes: list of allowed status codes
+             denied_codes: list of denied status codes
+             log_level: log level
+         """
+         # Make sure the status codes are integers
+         self.allowed_codes = set()
+         if allowed_codes:
+             for code in allowed_codes:
+                 try:
+                     self.allowed_codes.add(int(code))
+                 except (ValueError, TypeError):
+                     pass  # Ignore invalid status codes
+
+         self.denied_codes = set()
+         if denied_codes:
+             for code in denied_codes:
+                 try:
+                     self.denied_codes.add(int(code))
+                 except (ValueError, TypeError):
+                     pass  # Ignore invalid status codes
+
+         self.logger = get_logger(self.__class__.__name__, log_level)
+
+     @classmethod
+     def create_instance(cls, crawler):
+         """
+         Create a middleware instance
+
+         Args:
+             crawler: crawler instance
+
+         Returns:
+             ResponseFilterMiddleware: middleware instance
+         """
+         o = cls(
+             allowed_codes=crawler.settings.get_list('ALLOWED_RESPONSE_CODES'),
+             denied_codes=crawler.settings.get_list('DENIED_RESPONSE_CODES'),
+             log_level=crawler.settings.get('LOG_LEVEL')
+         )
+         return o
+
+     def _is_response_allowed(self, response):
+         """
+         Check whether the response is allowed
+
+         Args:
+             response: the response
+
+         Returns:
+             bool: whether the response is allowed
+         """
+         status_code = response.status_code
+
+         # First check whether the status code is explicitly denied
+         if status_code in self.denied_codes:
+             return False
+
+         # Then check whether it is explicitly allowed
+         if status_code in self.allowed_codes:
+             return True
+
+         # Allow 2xx status codes by default
+         if 200 <= status_code < 300:
+             return True
+
+         # Deny all other status codes by default
+         return False
+
+     def _get_filter_reason(self, status_code):
+         """
+         Get a description of the filter reason
+
+         Args:
+             status_code (int): HTTP status code
+
+         Returns:
+             str: description of the filter reason
+         """
+         if status_code in self.denied_codes:
+             return f"status code {status_code} is explicitly denied"
+         elif status_code not in self.allowed_codes and not (200 <= status_code < 300):
+             return f"status code {status_code} is not in the allowed list"
+         else:
+             return f"status code {status_code} was filtered"
+
+     def process_response(self, request, response, spider):
+         """
+         Process the response and filter out responses that do not meet requirements
+
+         Args:
+             request: the request
+             response: the response
+             spider: the spider instance
+
+         Returns:
+             response: the response (if allowed)
+
+         Raises:
+             IgnoreRequestError: if the response is filtered out
+         """
+         if self._is_response_allowed(response):
+             return response
+
+         # The response was filtered out
+         reason = self._get_filter_reason(response.status_code)
+         self.logger.debug(f"Filtered response: {response.status_code} {response.url} - {reason}")
+
+         # Raise to ignore this response
+         raise IgnoreRequestError(f"response filtered: {reason} - {response.status_code} {response.url}")
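Note that create_instance now reads ALLOWED_RESPONSE_CODES and DENIED_RESPONSE_CODES instead of the previous ALLOWED_CODES setting. A sketch of how a project upgrading to 1.2.1 might configure the filter in its settings.py (only the key names are taken from the diff; the values are illustrative, and the precedence noted in the comments mirrors the _is_response_allowed logic above):

# settings.py (sketch) - values are illustrative, not crawlo defaults
ALLOWED_RESPONSE_CODES = [301, 302, 404]  # non-2xx codes to pass through to the spider
DENIED_RESPONSE_CODES = [403]             # always filtered; checked before the allow list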