crawlo 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic; see the advisory details below for more information.

Files changed (220)
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +81 -65
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +143 -133
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +292 -292
  14. crawlo/commands/startproject.py +418 -418
  15. crawlo/commands/stats.py +188 -188
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +252 -252
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -354
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -143
  23. crawlo/crawler.py +1027 -1027
  24. crawlo/downloader/__init__.py +266 -266
  25. crawlo/downloader/aiohttp_downloader.py +220 -220
  26. crawlo/downloader/cffi_downloader.py +256 -256
  27. crawlo/downloader/httpx_downloader.py +259 -259
  28. crawlo/downloader/hybrid_downloader.py +213 -213
  29. crawlo/downloader/playwright_downloader.py +402 -402
  30. crawlo/downloader/selenium_downloader.py +472 -472
  31. crawlo/event.py +11 -11
  32. crawlo/exceptions.py +81 -81
  33. crawlo/extension/__init__.py +37 -37
  34. crawlo/extension/health_check.py +141 -141
  35. crawlo/extension/log_interval.py +57 -57
  36. crawlo/extension/log_stats.py +81 -81
  37. crawlo/extension/logging_extension.py +43 -43
  38. crawlo/extension/memory_monitor.py +104 -104
  39. crawlo/extension/performance_profiler.py +133 -133
  40. crawlo/extension/request_recorder.py +107 -107
  41. crawlo/filters/__init__.py +154 -154
  42. crawlo/filters/aioredis_filter.py +280 -280
  43. crawlo/filters/memory_filter.py +269 -269
  44. crawlo/items/__init__.py +23 -23
  45. crawlo/items/base.py +21 -21
  46. crawlo/items/fields.py +53 -53
  47. crawlo/items/items.py +104 -104
  48. crawlo/middleware/__init__.py +21 -21
  49. crawlo/middleware/default_header.py +132 -32
  50. crawlo/middleware/download_delay.py +105 -28
  51. crawlo/middleware/middleware_manager.py +135 -135
  52. crawlo/middleware/offsite.py +116 -0
  53. crawlo/middleware/proxy.py +366 -272
  54. crawlo/middleware/request_ignore.py +88 -30
  55. crawlo/middleware/response_code.py +164 -18
  56. crawlo/middleware/response_filter.py +138 -26
  57. crawlo/middleware/retry.py +124 -124
  58. crawlo/mode_manager.py +211 -211
  59. crawlo/network/__init__.py +21 -21
  60. crawlo/network/request.py +338 -338
  61. crawlo/network/response.py +359 -359
  62. crawlo/pipelines/__init__.py +21 -21
  63. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  64. crawlo/pipelines/console_pipeline.py +39 -39
  65. crawlo/pipelines/csv_pipeline.py +316 -316
  66. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  67. crawlo/pipelines/json_pipeline.py +218 -218
  68. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  69. crawlo/pipelines/mongo_pipeline.py +131 -131
  70. crawlo/pipelines/mysql_pipeline.py +316 -316
  71. crawlo/pipelines/pipeline_manager.py +61 -61
  72. crawlo/pipelines/redis_dedup_pipeline.py +167 -167
  73. crawlo/project.py +187 -187
  74. crawlo/queue/pqueue.py +37 -37
  75. crawlo/queue/queue_manager.py +337 -337
  76. crawlo/queue/redis_priority_queue.py +298 -298
  77. crawlo/settings/__init__.py +7 -7
  78. crawlo/settings/default_settings.py +226 -219
  79. crawlo/settings/setting_manager.py +122 -122
  80. crawlo/spider/__init__.py +639 -639
  81. crawlo/stats_collector.py +59 -59
  82. crawlo/subscriber.py +130 -130
  83. crawlo/task_manager.py +30 -30
  84. crawlo/templates/crawlo.cfg.tmpl +10 -10
  85. crawlo/templates/project/__init__.py.tmpl +3 -3
  86. crawlo/templates/project/items.py.tmpl +17 -17
  87. crawlo/templates/project/middlewares.py.tmpl +118 -109
  88. crawlo/templates/project/pipelines.py.tmpl +96 -96
  89. crawlo/templates/project/run.py.tmpl +45 -45
  90. crawlo/templates/project/settings.py.tmpl +327 -326
  91. crawlo/templates/project/settings_distributed.py.tmpl +119 -119
  92. crawlo/templates/project/settings_gentle.py.tmpl +94 -94
  93. crawlo/templates/project/settings_high_performance.py.tmpl +151 -151
  94. crawlo/templates/project/settings_simple.py.tmpl +68 -68
  95. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  96. crawlo/templates/spider/spider.py.tmpl +143 -141
  97. crawlo/tools/__init__.py +182 -182
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/tools/data_validator.py +180 -180
  101. crawlo/tools/date_tools.py +35 -35
  102. crawlo/tools/distributed_coordinator.py +386 -386
  103. crawlo/tools/retry_mechanism.py +220 -220
  104. crawlo/tools/scenario_adapter.py +262 -262
  105. crawlo/utils/__init__.py +35 -35
  106. crawlo/utils/batch_processor.py +260 -260
  107. crawlo/utils/controlled_spider_mixin.py +439 -439
  108. crawlo/utils/date_tools.py +290 -290
  109. crawlo/utils/db_helper.py +343 -343
  110. crawlo/utils/enhanced_error_handler.py +359 -359
  111. crawlo/utils/env_config.py +105 -105
  112. crawlo/utils/error_handler.py +125 -125
  113. crawlo/utils/func_tools.py +82 -82
  114. crawlo/utils/large_scale_config.py +286 -286
  115. crawlo/utils/large_scale_helper.py +343 -343
  116. crawlo/utils/log.py +128 -128
  117. crawlo/utils/performance_monitor.py +284 -284
  118. crawlo/utils/queue_helper.py +175 -175
  119. crawlo/utils/redis_connection_pool.py +334 -334
  120. crawlo/utils/redis_key_validator.py +199 -199
  121. crawlo/utils/request.py +267 -267
  122. crawlo/utils/request_serializer.py +219 -219
  123. crawlo/utils/spider_loader.py +62 -62
  124. crawlo/utils/system.py +11 -11
  125. crawlo/utils/tools.py +4 -4
  126. crawlo/utils/url.py +39 -39
  127. {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/METADATA +692 -697
  128. crawlo-1.2.2.dist-info/RECORD +220 -0
  129. examples/__init__.py +7 -7
  130. examples/aiohttp_settings.py +42 -0
  131. examples/curl_cffi_settings.py +41 -0
  132. examples/default_header_middleware_example.py +107 -0
  133. examples/default_header_spider_example.py +129 -0
  134. examples/download_delay_middleware_example.py +160 -0
  135. examples/httpx_settings.py +42 -0
  136. examples/multi_downloader_proxy_example.py +81 -0
  137. examples/offsite_middleware_example.py +55 -0
  138. examples/offsite_spider_example.py +107 -0
  139. examples/proxy_spider_example.py +166 -0
  140. examples/request_ignore_middleware_example.py +51 -0
  141. examples/request_ignore_spider_example.py +99 -0
  142. examples/response_code_middleware_example.py +52 -0
  143. examples/response_filter_middleware_example.py +67 -0
  144. examples/tong_hua_shun_settings.py +62 -0
  145. examples/tong_hua_shun_spider.py +170 -0
  146. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  147. tests/__init__.py +7 -7
  148. tests/advanced_tools_example.py +275 -275
  149. tests/authenticated_proxy_example.py +236 -236
  150. tests/cleaners_example.py +160 -160
  151. tests/config_validation_demo.py +102 -102
  152. tests/controlled_spider_example.py +205 -205
  153. tests/date_tools_example.py +180 -180
  154. tests/dynamic_loading_example.py +523 -523
  155. tests/dynamic_loading_test.py +104 -104
  156. tests/env_config_example.py +133 -133
  157. tests/error_handling_example.py +171 -171
  158. tests/redis_key_validation_demo.py +130 -130
  159. tests/response_improvements_example.py +144 -144
  160. tests/test_advanced_tools.py +148 -148
  161. tests/test_all_redis_key_configs.py +145 -145
  162. tests/test_authenticated_proxy.py +141 -141
  163. tests/test_cleaners.py +54 -54
  164. tests/test_comprehensive.py +146 -146
  165. tests/test_config_validator.py +193 -193
  166. tests/test_crawlo_proxy_integration.py +173 -0
  167. tests/test_date_tools.py +123 -123
  168. tests/test_default_header_middleware.py +159 -0
  169. tests/test_double_crawlo_fix.py +207 -207
  170. tests/test_double_crawlo_fix_simple.py +124 -124
  171. tests/test_download_delay_middleware.py +222 -0
  172. tests/test_downloader_proxy_compatibility.py +269 -0
  173. tests/test_dynamic_downloaders_proxy.py +124 -124
  174. tests/test_dynamic_proxy.py +92 -92
  175. tests/test_dynamic_proxy_config.py +146 -146
  176. tests/test_dynamic_proxy_real.py +109 -109
  177. tests/test_edge_cases.py +303 -303
  178. tests/test_enhanced_error_handler.py +270 -270
  179. tests/test_env_config.py +121 -121
  180. tests/test_error_handler_compatibility.py +112 -112
  181. tests/test_final_validation.py +153 -153
  182. tests/test_framework_env_usage.py +103 -103
  183. tests/test_integration.py +356 -356
  184. tests/test_item_dedup_redis_key.py +122 -122
  185. tests/test_offsite_middleware.py +222 -0
  186. tests/test_parsel.py +29 -29
  187. tests/test_performance.py +327 -327
  188. tests/test_proxy_api.py +265 -0
  189. tests/test_proxy_health_check.py +32 -32
  190. tests/test_proxy_middleware.py +122 -0
  191. tests/test_proxy_middleware_enhanced.py +217 -0
  192. tests/test_proxy_middleware_integration.py +136 -136
  193. tests/test_proxy_providers.py +56 -56
  194. tests/test_proxy_stats.py +19 -19
  195. tests/test_proxy_strategies.py +59 -59
  196. tests/test_queue_manager_double_crawlo.py +173 -173
  197. tests/test_queue_manager_redis_key.py +176 -176
  198. tests/test_real_scenario_proxy.py +196 -0
  199. tests/test_redis_config.py +28 -28
  200. tests/test_redis_connection_pool.py +294 -294
  201. tests/test_redis_key_naming.py +181 -181
  202. tests/test_redis_key_validator.py +123 -123
  203. tests/test_redis_queue.py +224 -224
  204. tests/test_request_ignore_middleware.py +183 -0
  205. tests/test_request_serialization.py +70 -70
  206. tests/test_response_code_middleware.py +350 -0
  207. tests/test_response_filter_middleware.py +428 -0
  208. tests/test_response_improvements.py +152 -152
  209. tests/test_retry_middleware.py +242 -0
  210. tests/test_scheduler.py +241 -241
  211. tests/test_simple_response.py +61 -61
  212. tests/test_telecom_spider_redis_key.py +205 -205
  213. tests/test_template_content.py +87 -87
  214. tests/test_template_redis_key.py +134 -134
  215. tests/test_tools.py +153 -153
  216. tests/tools_example.py +257 -257
  217. crawlo-1.2.0.dist-info/RECORD +0 -190
  218. {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/WHEEL +0 -0
  219. {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/entry_points.txt +0 -0
  220. {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/top_level.txt +0 -0
@@ -1,135 +1,135 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- from pprint import pformat
4
- from types import MethodType
5
- from asyncio import create_task
6
- from collections import defaultdict
7
- from typing import List, Dict, Callable, Optional
8
-
9
- from crawlo import Request, Response
10
- from crawlo.utils.log import get_logger
11
- from crawlo.project import load_class
12
- from crawlo.middleware import BaseMiddleware
13
- from crawlo.project import common_call
14
- from crawlo.event import ignore_request, response_received
15
- from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
16
- NotConfiguredError
17
-
18
-
19
- class MiddlewareManager:
20
-
21
- def __init__(self, crawler):
22
- self.crawler = crawler
23
- self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
24
- self.middlewares: List = []
25
- self.methods: Dict[str, List[MethodType]] = defaultdict(list)
26
- middlewares = self.crawler.settings.get_list('MIDDLEWARES')
27
- self._add_middleware(middlewares)
28
- self._add_method()
29
-
30
- self.download_method: Callable = crawler.engine.downloader.download
31
- self._stats = crawler.stats
32
-
33
- async def _process_request(self, request: Request):
34
- for method in self.methods['process_request']:
35
- result = await common_call(method, request, self.crawler.spider)
36
- if result is None:
37
- continue
38
- if isinstance(result, (Request, Response)):
39
- return result
40
- raise InvalidOutputError(
41
- f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(result).__name__}"
42
- )
43
- return await self.download_method(request)
44
-
45
- async def _process_response(self, request: Request, response: Response):
46
- for method in reversed(self.methods['process_response']):
47
- try:
48
- response = await common_call(method, request, response, self.crawler.spider)
49
- except IgnoreRequestError as exp:
50
- create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
51
- if isinstance(response, Request):
52
- return response
53
- if isinstance(response, Response):
54
- continue
55
- raise InvalidOutputError(
56
- f"{method.__self__.__class__.__name__}. must return Request or Response, got {type(response).__name__}"
57
- )
58
- return response
59
-
60
- async def _process_exception(self, request: Request, exp: Exception):
61
- for method in self.methods['process_exception']:
62
- response = await common_call(method, request, exp, self.crawler.spider)
63
- if response is None:
64
- continue
65
- if isinstance(response, (Request, Response)):
66
- return response
67
- if response:
68
- break
69
- raise InvalidOutputError(
70
- f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(response).__name__}"
71
- )
72
- else:
73
- raise exp
74
-
75
- async def download(self, request) -> Optional[Response]:
76
- """ called in the download method. """
77
- try:
78
- response = await self._process_request(request)
79
- except KeyError:
80
- raise RequestMethodError(f"{request.method.lower()} is not supported")
81
- except IgnoreRequestError as exp:
82
- create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
83
- response = await self._process_exception(request, exp)
84
- except Exception as exp:
85
- self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
86
- response = await self._process_exception(request, exp)
87
- else:
88
- create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
89
- # self.crawler.stats.inc_value('response_received_count')
90
- if isinstance(response, Response):
91
- response = await self._process_response(request, response)
92
- if isinstance(response, Request):
93
- await self.crawler.engine.enqueue_request(request)
94
- return None
95
- return response
96
-
97
- @classmethod
98
- def create_instance(cls, *args, **kwargs):
99
- return cls(*args, **kwargs)
100
-
101
- def _add_middleware(self, middlewares):
102
- enabled_middlewares = [m for m in middlewares if self._validate_middleware(m)]
103
- if enabled_middlewares:
104
- self.logger.info(f'enabled middleware:\n {pformat(enabled_middlewares)}')
105
-
106
- def _validate_middleware(self, middleware):
107
- middleware_cls = load_class(middleware)
108
- if not hasattr(middleware_cls, 'create_instance'):
109
- raise MiddlewareInitError(
110
- f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
111
- )
112
- try:
113
- instance = middleware_cls.create_instance(self.crawler)
114
- self.middlewares.append(instance)
115
- return True
116
- except NotConfiguredError:
117
- return False
118
-
119
- def _add_method(self):
120
- for middleware in self.middlewares:
121
- if hasattr(middleware, 'process_request'):
122
- if self._validate_middleware_method(method_name='process_request', middleware=middleware):
123
- self.methods['process_request'].append(middleware.process_request)
124
- if hasattr(middleware, 'process_response'):
125
- if self._validate_middleware_method(method_name='process_response', middleware=middleware):
126
- self.methods['process_response'].append(middleware.process_response)
127
- if hasattr(middleware, 'process_exception'):
128
- if self._validate_middleware_method(method_name='process_exception', middleware=middleware):
129
- self.methods['process_exception'].append(middleware.process_exception)
130
-
131
- @staticmethod
132
- def _validate_middleware_method(method_name, middleware) -> bool:
133
- method = getattr(type(middleware), method_name)
134
- base_method = getattr(BaseMiddleware, method_name)
135
- return False if method == base_method else True
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ from pprint import pformat
4
+ from types import MethodType
5
+ from asyncio import create_task
6
+ from collections import defaultdict
7
+ from typing import List, Dict, Callable, Optional
8
+
9
+ from crawlo import Request, Response
10
+ from crawlo.utils.log import get_logger
11
+ from crawlo.project import load_class
12
+ from crawlo.middleware import BaseMiddleware
13
+ from crawlo.project import common_call
14
+ from crawlo.event import ignore_request, response_received
15
+ from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
16
+ NotConfiguredError
17
+
18
+
19
class MiddlewareManager:
    """Assemble the configured middleware chain and drive every download
    through its process_request / process_response / process_exception hooks.
    """

    def __init__(self, crawler):
        self.crawler = crawler
        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
        self.middlewares: List = []
        # Hook name -> ordered list of bound middleware methods.
        self.methods: Dict[str, List[MethodType]] = defaultdict(list)
        middlewares = self.crawler.settings.get_list('MIDDLEWARES')
        self._add_middleware(middlewares)
        self._add_method()

        self.download_method: Callable = crawler.engine.downloader.download
        self._stats = crawler.stats

    async def _process_request(self, request: Request):
        """Run process_request hooks in declaration order.

        A hook returning a Request or Response short-circuits the chain;
        returning None passes control to the next hook.  When no hook
        intervenes, the request is handed to the real downloader.
        """
        for method in self.methods['process_request']:
            result = await common_call(method, request, self.crawler.spider)
            if result is None:
                continue
            if isinstance(result, (Request, Response)):
                return result
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(result).__name__}"
            )
        return await self.download_method(request)

    async def _process_response(self, request: Request, response: Response):
        """Run process_response hooks in reverse declaration order.

        A hook may replace the response, or return a Request to have it
        re-scheduled.  An IgnoreRequestError raised by a hook is reported
        via the subscriber and the current response is kept.
        """
        for method in reversed(self.methods['process_response']):
            try:
                response = await common_call(method, request, response, self.crawler.spider)
            except IgnoreRequestError as exp:
                create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
            if isinstance(response, Request):
                return response
            if isinstance(response, Response):
                continue
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return Request or Response, got {type(response).__name__}"
            )
        return response

    async def _process_exception(self, request: Request, exp: Exception):
        """Give process_exception hooks a chance to recover from ``exp``.

        A hook returning a Request/Response resolves the error; any other
        truthy return value marks the exception as handled (``download``
        then yields None).  If no hook handles it, ``exp`` is re-raised.
        """
        for method in self.methods['process_exception']:
            response = await common_call(method, request, exp, self.crawler.spider)
            if response is None:
                continue
            if isinstance(response, (Request, Response)):
                return response
            if response:
                break  # truthy sentinel: exception handled, stop the chain
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(response).__name__}"
            )
        else:
            raise exp

    async def download(self, request) -> Optional[Response]:
        """Download ``request`` through the middleware chain.

        Returns the final Response, or None when the outcome was a
        replacement Request (re-queued) or a handled exception.
        """
        try:
            response = await self._process_request(request)
        except KeyError:
            raise RequestMethodError(f"{request.method.lower()} is not supported")
        except IgnoreRequestError as exp:
            create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
            response = await self._process_exception(request, exp)
        except Exception as exp:
            self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
            response = await self._process_exception(request, exp)
        else:
            create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
        if isinstance(response, Response):
            response = await self._process_response(request, response)
        if isinstance(response, Request):
            # BUG FIX: enqueue the Request returned by the middleware chain.
            # Previously the *original* request was re-queued, discarding any
            # middleware modifications (retry counters, rotated headers) and
            # risking an endless re-download of the same request.
            await self.crawler.engine.enqueue_request(response)
            return None
        return response

    @classmethod
    def create_instance(cls, *args, **kwargs):
        """Factory used by the engine to build the manager."""
        return cls(*args, **kwargs)

    def _add_middleware(self, middlewares):
        """Instantiate every configured middleware path, keeping enabled ones."""
        enabled_middlewares = [m for m in middlewares if self._validate_middleware(m)]
        if enabled_middlewares:
            self.logger.info(f'enabled middleware:\n {pformat(enabled_middlewares)}')

    def _validate_middleware(self, middleware):
        """Load and instantiate one middleware; False when it opts out by
        raising NotConfiguredError."""
        middleware_cls = load_class(middleware)
        if not hasattr(middleware_cls, 'create_instance'):
            raise MiddlewareInitError(
                f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
            )
        try:
            instance = middleware_cls.create_instance(self.crawler)
            self.middlewares.append(instance)
            return True
        except NotConfiguredError:
            return False

    def _add_method(self):
        """Register each middleware's genuinely overridden hook methods."""
        for middleware in self.middlewares:
            for method_name in ('process_request', 'process_response', 'process_exception'):
                if hasattr(middleware, method_name) and self._validate_middleware_method(
                        method_name=method_name, middleware=middleware):
                    self.methods[method_name].append(getattr(middleware, method_name))

    @staticmethod
    def _validate_middleware_method(method_name, middleware) -> bool:
        """True when the middleware overrides ``method_name`` rather than
        inheriting BaseMiddleware's no-op implementation."""
        method = getattr(type(middleware), method_name)
        base_method = getattr(BaseMiddleware, method_name)
        return method is not base_method
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ OffsiteMiddleware 中间件
5
+ 用于过滤掉不在指定域名范围内的请求
6
+ """
7
+
8
+ import re
9
+ from urllib.parse import urlparse
10
+
11
+ from crawlo.utils.log import get_logger
12
+ from crawlo.exceptions import IgnoreRequestError
13
+
14
+
15
class OffsiteMiddleware:
    """Filter out requests whose host is not covered by ALLOWED_DOMAINS,
    preventing the spider from wandering onto unrelated sites.
    """

    def __init__(self, stats, log_level, allowed_domains=None):
        self.logger = get_logger(self.__class__.__name__, log_level)
        self.stats = stats
        self.allowed_domains = allowed_domains or []
        # ROBUSTNESS FIX: compile the domain patterns here so a directly
        # constructed instance works too (previously only create_instance
        # compiled them, and _is_offsite_request crashed otherwise).
        self._compile_domains()

    @classmethod
    def create_instance(cls, crawler):
        """Build the middleware from crawler settings.

        Raises NotConfiguredError (disabling the middleware) when
        ALLOWED_DOMAINS is not configured.
        """
        allowed_domains = crawler.settings.get_list('ALLOWED_DOMAINS')

        if not allowed_domains:
            from crawlo.exceptions import NotConfiguredError
            raise NotConfiguredError("未配置ALLOWED_DOMAINS,OffsiteMiddleware已禁用")

        o = cls(
            stats=crawler.stats,
            log_level=crawler.settings.get('LOG_LEVEL'),
            allowed_domains=allowed_domains
        )

        crawler.logger.info(f"OffsiteMiddleware已启用,允许的域名: {allowed_domains}")
        return o

    def _compile_domains(self):
        """Pre-compile one case-insensitive regex per allowed domain.

        Each pattern matches the domain itself or any of its subdomains.
        """
        self._domain_regexes = []
        for domain in self.allowed_domains:
            # Escape regex metacharacters that may appear in the domain.
            escaped_domain = re.escape(domain)
            # "(^|.*\.)domain$" accepts "domain" and "*.domain".
            regex = re.compile(r'(^|.*\.)' + escaped_domain + '$', re.IGNORECASE)
            self._domain_regexes.append(regex)

    def _is_offsite_request(self, request):
        """Return True when the request's host is outside the allowed domains
        (or the URL cannot be parsed / has no hostname)."""
        try:
            parsed_url = urlparse(request.url)
            hostname = parsed_url.hostname

            if not hostname:
                return True  # invalid URL

            for regex in self._domain_regexes:
                if regex.match(hostname):
                    return False  # matches an allowed domain

            return True  # no allowed domain matched
        except Exception:
            # Unparseable URL: treat as offsite.
            return True

    async def process_request(self, request, spider):
        """Drop offsite requests by raising IgnoreRequestError; pass through
        (return None) otherwise."""
        if self._is_offsite_request(request):
            self.stats.inc_value('offsite_request_count')

            # Per-host counter for the filtered request.
            try:
                parsed_url = urlparse(request.url)
                hostname = parsed_url.hostname or "unknown"
                self.stats.inc_value(f'offsite_request_count/{hostname}')
            except Exception:
                # BUG FIX: was a bare `except:` which would also swallow
                # KeyboardInterrupt/SystemExit.
                self.stats.inc_value('offsite_request_count/invalid_url')

            self.logger.debug(f"过滤站外请求: {request.url}")

            raise IgnoreRequestError(f"站外请求被过滤: {request.url}")

        return None

    def process_exception(self, request, exception, spider):
        """Mark our own IgnoreRequestError as handled (return True); return
        None for every other exception so other handlers may run."""
        if isinstance(exception, IgnoreRequestError) and "站外请求被过滤" in str(exception):
            self.logger.debug(f"已过滤站外请求: {request.url}")
            return True  # exception handled
        return None