crawlo 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo may be problematic.
- crawlo/__init__.py +93 -93
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +438 -439
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +291 -257
- crawlo/crawler.py +650 -650
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +233 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +63 -63
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +45 -37
- crawlo/logging/async_handler.py +181 -0
- crawlo/logging/config.py +196 -96
- crawlo/logging/factory.py +171 -128
- crawlo/logging/manager.py +111 -111
- crawlo/logging/monitor.py +153 -0
- crawlo/logging/sampler.py +167 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +219 -219
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +100 -84
- crawlo/pipelines/redis_dedup_pipeline.py +156 -156
- crawlo/project.py +349 -338
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +526 -522
- crawlo/queue/redis_priority_queue.py +370 -367
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -170
- crawlo/templates/project/settings_distributed.py.tmpl +169 -169
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/fingerprint.py +122 -122
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.4.3.dist-info/METADATA +190 -0
- crawlo-1.4.3.dist-info/RECORD +326 -0
- examples/__init__.py +7 -7
- examples/test_project/__init__.py +7 -7
- examples/test_project/run.py +34 -34
- examples/test_project/test_project/__init__.py +3 -3
- examples/test_project/test_project/items.py +17 -17
- examples/test_project/test_project/middlewares.py +118 -118
- examples/test_project/test_project/pipelines.py +96 -96
- examples/test_project/test_project/settings.py +169 -169
- examples/test_project/test_project/spiders/__init__.py +9 -9
- examples/test_project/test_project/spiders/of_week_dis.py +143 -143
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +125 -0
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +375 -0
- tests/test_logging_final.py +185 -0
- tests/test_logging_integration.py +313 -0
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +142 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +212 -0
- tests/test_priority_consistency.py +152 -0
- tests/test_priority_consistency_fixed.py +250 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +131 -0
- tests/test_random_headers_default.py +323 -0
- tests/test_random_headers_necessity.py +309 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +130 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +334 -242
- tests/test_retry_middleware_realistic.py +274 -0
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +177 -0
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.4.1.dist-info/METADATA +0 -1199
- crawlo-1.4.1.dist-info/RECORD +0 -309
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
crawlo/middleware/middleware_manager.py
CHANGED

@@ -1,136 +1,136 @@

The whole file is shown as removed and re-added; the old and new sides render with identical line content (the recorded change appears to be whitespace or line-ending only). The file body, reconstructed from the diff:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
from pprint import pformat
from types import MethodType
from asyncio import create_task
from collections import defaultdict
from typing import List, Dict, Callable, Optional

from crawlo import Request, Response
from crawlo.utils.log import get_logger
from crawlo.utils.class_loader import load_class
from crawlo.middleware import BaseMiddleware
from crawlo.project import common_call
from crawlo.event import ignore_request, response_received
from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
    NotConfiguredError


class MiddlewareManager:

    def __init__(self, crawler):
        self.crawler = crawler
        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
        self.middlewares: List = []
        self.methods: Dict[str, List[MethodType]] = defaultdict(list)
        middlewares = self.crawler.settings.get_list('MIDDLEWARES')
        self._add_middleware(middlewares)
        self._add_method()

        self.download_method: Callable = crawler.engine.downloader.download
        self._stats = crawler.stats

    async def _process_request(self, request: Request):
        for method in self.methods['process_request']:
            result = await common_call(method, request, self.crawler.spider)
            if result is None:
                continue
            if isinstance(result, (Request, Response)):
                return result
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(result).__name__}"
            )
        return await self.download_method(request)

    async def _process_response(self, request: Request, response: Response):
        for method in reversed(self.methods['process_response']):
            try:
                response = await common_call(method, request, response, self.crawler.spider)
            except IgnoreRequestError as exp:
                create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
            if isinstance(response, Request):
                return response
            if isinstance(response, Response):
                continue
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return Request or Response, got {type(response).__name__}"
            )
        return response

    async def _process_exception(self, request: Request, exp: Exception):
        for method in self.methods['process_exception']:
            response = await common_call(method, request, exp, self.crawler.spider)
            if response is None:
                continue
            if isinstance(response, (Request, Response)):
                return response
            if response:
                break
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(response).__name__}"
            )
        else:
            raise exp

    async def download(self, request) -> Optional[Response]:
        """ called in the download method. """
        try:
            response = await self._process_request(request)
        except KeyError:
            raise RequestMethodError(f"{request.method.lower()} is not supported")
        except IgnoreRequestError as exp:
            create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
            response = await self._process_exception(request, exp)
        except Exception as exp:
            self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
            response = await self._process_exception(request, exp)
        else:
            create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
            self._stats.inc_value('response_received_count')
            if isinstance(response, Response):
                response = await self._process_response(request, response)
            if isinstance(response, Request):
                await self.crawler.engine.enqueue_request(request)
                return None
        return response

    @classmethod
    def create_instance(cls, *args, **kwargs):
        return cls(*args, **kwargs)

    def _add_middleware(self, middlewares):
        enabled_middlewares = [m for m in middlewares if self._validate_middleware(m)]
        if enabled_middlewares:
            # Restore INFO-level logging; keep the key enablement message
            self.logger.info(f'Enabled middlewares:\n {pformat(enabled_middlewares)}')

    def _validate_middleware(self, middleware):
        middleware_cls = load_class(middleware)
        if not hasattr(middleware_cls, 'create_instance'):
            raise MiddlewareInitError(
                f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
            )
        try:
            instance = middleware_cls.create_instance(self.crawler)
            self.middlewares.append(instance)
            return True
        except NotConfiguredError:
            return False

    def _add_method(self):
        for middleware in self.middlewares:
            if hasattr(middleware, 'process_request'):
                if self._validate_middleware_method(method_name='process_request', middleware=middleware):
                    self.methods['process_request'].append(middleware.process_request)
            if hasattr(middleware, 'process_response'):
                if self._validate_middleware_method(method_name='process_response', middleware=middleware):
                    self.methods['process_response'].append(middleware.process_response)
            if hasattr(middleware, 'process_exception'):
                if self._validate_middleware_method(method_name='process_exception', middleware=middleware):
                    self.methods['process_exception'].append(middleware.process_exception)

    @staticmethod
    def _validate_middleware_method(method_name, middleware) -> bool:
        method = getattr(type(middleware), method_name)
        base_method = getattr(BaseMiddleware, method_name)
        return False if method == base_method else True
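The manager above defines the middleware contract this release keeps unchanged: a middleware exposes a `create_instance` classmethod (raising `NotConfiguredError` to opt out), and any of `process_request`, `process_response`, `process_exception`; `process_request` may return `None`, a `Request`, or a `Response`, and may raise `IgnoreRequestError` to drop a request. The following is a minimal sketch of a custom middleware written against that contract; it is not part of the package, and the class name and the `BLOCKED_PATH_SUBSTRINGS` setting are invented for illustration, while `create_instance`, `settings.get_list`, `request.url`, `IgnoreRequestError`, and `NotConfiguredError` are taken from the code shown above.

# block_path_middleware.py -- minimal sketch, assuming the interfaces visible in the diff above
from crawlo.middleware import BaseMiddleware
from crawlo.exceptions import IgnoreRequestError, NotConfiguredError


class BlockPathMiddleware(BaseMiddleware):
    def __init__(self, blocked_substrings):
        self.blocked_substrings = blocked_substrings

    @classmethod
    def create_instance(cls, crawler):
        # MiddlewareManager._validate_middleware() catches NotConfiguredError and
        # simply skips the middleware, so opting out when unconfigured is cheap.
        blocked = crawler.settings.get_list('BLOCKED_PATH_SUBSTRINGS')  # hypothetical setting name
        if not blocked:
            raise NotConfiguredError("BLOCKED_PATH_SUBSTRINGS not set; BlockPathMiddleware disabled")
        return cls(blocked)

    async def process_request(self, request, spider):
        # Raising IgnoreRequestError makes MiddlewareManager.download() fire the
        # ignore_request event and skip the download; returning None passes control on.
        if any(s in request.url for s in self.blocked_substrings):
            raise IgnoreRequestError(f"blocked path: {request.url}")
        return None

Such a class would be enabled by adding its import path to the MIDDLEWARES list that MiddlewareManager reads via settings.get_list('MIDDLEWARES').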
crawlo/middleware/offsite.py
CHANGED

@@ -1,124 +1,124 @@

As with the previous file, the old and new sides render with identical line content (the recorded change appears to be whitespace or line-ending only). The reconstructed file body:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
OffsiteMiddleware
Filters out requests whose domain is not within the allowed set
"""
import re
from urllib.parse import urlparse

from crawlo.utils.log import get_logger
from crawlo.exceptions import IgnoreRequestError


class OffsiteMiddleware:
    """
    OffsiteMiddleware
    Filters out requests whose domain is outside the allowed set, preventing the crawler from visiting unrelated sites
    """

    def __init__(self, stats, log_level, allowed_domains=None):
        self.logger = get_logger(self.__class__.__name__, log_level)
        self.stats = stats
        self.allowed_domains = allowed_domains or []

    @classmethod
    def create_instance(cls, crawler):
        """
        Create the middleware instance,
        reading the allowed domain list from the crawler settings
        """
        # Prefer the spider instance's allowed_domains; fall back to the global ALLOWED_DOMAINS setting
        allowed_domains = []

        # Check whether the current spider instance defines allowed_domains
        if hasattr(crawler, 'spider') and crawler.spider and hasattr(crawler.spider, 'allowed_domains'):
            allowed_domains = getattr(crawler.spider, 'allowed_domains', [])

        # If the spider does not set allowed_domains, read it from the global settings
        if not allowed_domains:
            allowed_domains = crawler.settings.get_list('ALLOWED_DOMAINS')

        # If no allowed domains are configured, disable this middleware
        if not allowed_domains:
            from crawlo.exceptions import NotConfiguredError
            raise NotConfiguredError("未配置ALLOWED_DOMAINS,OffsiteMiddleware已禁用")

        o = cls(
            stats=crawler.stats,
            log_level=crawler.settings.get('LOG_LEVEL'),
            allowed_domains=allowed_domains
        )

        # Pre-compile the domain regexes for performance
        o._compile_domains()

        # Use the middleware's own logger instead of crawler.logger
        o.logger.debug(f"OffsiteMiddleware 已启用,允许的域名: {allowed_domains}")
        return o

    def _compile_domains(self):
        """
        Compile the domain regular expressions
        """
        self._domain_regexes = []
        for domain in self.allowed_domains:
            # Escape special characters in the domain
            escaped_domain = re.escape(domain)
            # Build a regex that matches the domain and its subdomains
            regex = re.compile(r'(^|.*\.)' + escaped_domain + '$', re.IGNORECASE)
            self._domain_regexes.append(regex)

    def _is_offsite_request(self, request):
        """
        Decide whether a request targets an off-site domain
        """
        try:
            parsed_url = urlparse(request.url)
            hostname = parsed_url.hostname

            if not hostname:
                return True  # invalid URL

            # Check whether the hostname matches an allowed domain
            for regex in self._domain_regexes:
                if regex.match(hostname):
                    return False  # matches an allowed domain

            return True  # matches none of the allowed domains
        except Exception:
            # URL parsing failed; treat as off-site
            return True

    async def process_request(self, request, spider):
        """
        Process a request and filter it if it is off-site
        """
        if self._is_offsite_request(request):
            # Count the filtered request
            self.stats.inc_value('offsite_request_count')

            # Count the filtered domain
            try:
                parsed_url = urlparse(request.url)
                hostname = parsed_url.hostname or "unknown"
                self.stats.inc_value(f'offsite_request_count/{hostname}')
            except:
                self.stats.inc_value('offsite_request_count/invalid_url')

            self.logger.debug(f"过滤站外请求: {request.url}")

            # Raise to have the request ignored
            raise IgnoreRequestError(f"站外请求被过滤: {request.url}")

        return None

    def process_exception(self, request, exception, spider):
        """
        Handle exceptions
        """
        # If this is an IgnoreRequestError that we raised, handle it
        if isinstance(exception, IgnoreRequestError) and "站外请求被过滤" in str(exception):
            self.logger.debug(f"已过滤站外请求: {request.url}")
            return True  # the exception has been handled

        return None
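The pattern built in _compile_domains above, r'(^|.*\.)' + re.escape(domain) + '$', accepts the configured domain itself and any of its subdomains while rejecting lookalike suffixes. A small self-contained check of that behaviour, using only the standard library and mirroring the hostname handling in _is_offsite_request; the allowed domain and the sample URLs are invented for illustration.

# offsite_check_demo.py -- standalone sketch reproducing the matching logic shown above
import re
from urllib.parse import urlparse

allowed = "example.com"
regex = re.compile(r'(^|.*\.)' + re.escape(allowed) + '$', re.IGNORECASE)

for url in (
    "https://example.com/page",        # exact domain        -> allowed
    "https://news.example.com/item",   # subdomain           -> allowed
    "https://example.com.evil.org/",   # suffix lookalike    -> filtered
    "https://notexample.com/",         # different domain    -> filtered
):
    hostname = urlparse(url).hostname or ""
    offsite = not regex.match(hostname)
    print(f"{url:40s} offsite={offsite}")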