crawlo-1.2.5-py3-none-any.whl → crawlo-1.2.7-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +61 -61
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +60 -60
- crawlo/cleaners/data_formatter.py +225 -225
- crawlo/cleaners/encoding_converter.py +125 -125
- crawlo/cleaners/text_cleaner.py +232 -232
- crawlo/cli.py +75 -88
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -144
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +323 -323
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +312 -312
- crawlo/config_validator.py +251 -251
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +365 -354
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +251 -143
- crawlo/crawler.py +1099 -1110
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +107 -107
- crawlo/downloader/__init__.py +266 -266
- crawlo/downloader/aiohttp_downloader.py +228 -221
- crawlo/downloader/cffi_downloader.py +256 -256
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -38
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +43 -43
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +234 -281
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +131 -131
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +136 -135
- crawlo/middleware/offsite.py +114 -114
- crawlo/middleware/proxy.py +367 -367
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +211 -211
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +338 -338
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +317 -317
- crawlo/pipelines/pipeline_manager.py +62 -61
- crawlo/pipelines/redis_dedup_pipeline.py +166 -165
- crawlo/project.py +314 -279
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +377 -337
- crawlo/queue/redis_priority_queue.py +306 -299
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +219 -217
- crawlo/settings/setting_manager.py +122 -122
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +288 -324
- crawlo/templates/project/settings_distributed.py.tmpl +157 -154
- crawlo/templates/project/settings_gentle.py.tmpl +101 -128
- crawlo/templates/project/settings_high_performance.py.tmpl +135 -150
- crawlo/templates/project/settings_simple.py.tmpl +99 -103
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/run.py.tmpl +45 -47
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +182 -182
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +35 -35
- crawlo/tools/distributed_coordinator.py +386 -386
- crawlo/tools/retry_mechanism.py +220 -220
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/utils/__init__.py +35 -35
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +290 -290
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +143 -106
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +128 -128
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +351 -334
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.2.5.dist-info → crawlo-1.2.7.dist-info}/METADATA +764 -764
- crawlo-1.2.7.dist-info/RECORD +209 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +236 -236
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +102 -102
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +81 -0
- tests/test_config_validator.py +193 -193
- tests/test_crawlo_proxy_integration.py +172 -172
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +356 -356
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_mode_consistency.py +52 -0
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -241
- tests/test_scheduler_config_update.py +134 -0
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +153 -153
- tests/tools_example.py +257 -257
- crawlo-1.2.5.dist-info/RECORD +0 -206
- {crawlo-1.2.5.dist-info → crawlo-1.2.7.dist-info}/WHEEL +0 -0
- {crawlo-1.2.5.dist-info → crawlo-1.2.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.5.dist-info → crawlo-1.2.7.dist-info}/top_level.txt +0 -0
crawlo/middleware/download_delay.py
@@ -1,105 +1,105 @@
(Lines 1-104 appear as removed and re-added with identical text, a whitespace-only change; line 105 is unchanged. The resulting file:)

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
DownloadDelayMiddleware
Controls the delay between requests; supports fixed and random delays.
"""

from asyncio import sleep
from random import uniform
from crawlo.utils.log import get_logger
from crawlo.exceptions import NotConfiguredError


class DownloadDelayMiddleware(object):
    """
    DownloadDelayMiddleware
    Controls the delay between requests; supports fixed and random delays.

    Features:
    - fixed delay
    - random delay
    - detailed logging
    - delay statistics
    """

    def __init__(self, settings, log_level, stats=None):
        """
        Initialize the middleware.

        Args:
            settings: settings manager
            log_level: log level
            stats: stats collector (optional)
        """
        self.delay = settings.get_float("DOWNLOAD_DELAY")
        if not self.delay:
            raise NotConfiguredError("DOWNLOAD_DELAY not set or is zero")

        self.randomness = settings.get_bool("RANDOMNESS", False)

        # Safely read the random-range configuration
        random_range = settings.get_list("RANDOM_RANGE")
        if len(random_range) >= 2:
            try:
                self.floor = float(random_range[0])
                self.upper = float(random_range[1])
            except (ValueError, TypeError):
                # Fall back to defaults if the configuration is invalid
                self.floor, self.upper = 0.5, 1.5
        else:
            # Fall back to defaults if the configuration is incomplete
            self.floor, self.upper = 0.5, 1.5

        self.logger = get_logger(self.__class__.__name__, log_level)
        self.stats = stats

    @classmethod
    def create_instance(cls, crawler):
        """
        Create a middleware instance.

        Args:
            crawler: crawler instance

        Returns:
            DownloadDelayMiddleware: middleware instance
        """
        o = cls(
            settings=crawler.settings,
            log_level=crawler.settings.get('LOG_LEVEL'),
            stats=getattr(crawler, 'stats', None)
        )
        return o

    async def process_request(self, _request, _spider):
        """
        Process the request by applying a delay.

        Args:
            _request: request object
            _spider: spider instance
        """
        if self.randomness:
            # Compute the random delay
            delay_time = uniform(self.delay * self.floor, self.delay * self.upper)
            await sleep(delay_time)

            # Record statistics
            if self.stats:
                self.stats.inc_value('download_delay/random_count')
                self.stats.inc_value('download_delay/random_total_time', delay_time)

            # Log it
            self.logger.debug(f"Applied random delay: {delay_time:.2f}s (range: {self.delay * self.floor:.2f} - {self.delay * self.upper:.2f})")
        else:
            # Apply the fixed delay
            await sleep(self.delay)

            # Record statistics
            if self.stats:
                self.stats.inc_value('download_delay/fixed_count')
                self.stats.inc_value('download_delay/fixed_total_time', self.delay)

            # Log it
            self.logger.debug(f"Applied fixed delay: {self.delay:.2f}s")
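To make the delay contract concrete, here is a minimal, self-contained sketch of the computation process_request performs. The setting names (DOWNLOAD_DELAY, RANDOMNESS, RANDOM_RANGE) come from the code above; the standalone harness itself is illustrative, not part of crawlo's API.

import asyncio
from random import uniform

# Hypothetical project settings; the names match what the middleware reads.
DOWNLOAD_DELAY = 2.0
RANDOMNESS = True
RANDOM_RANGE = [0.5, 1.5]

async def apply_delay() -> float:
    floor, upper = float(RANDOM_RANGE[0]), float(RANDOM_RANGE[1])
    if RANDOMNESS:
        # Same math as process_request: a delay drawn uniformly from
        # [DOWNLOAD_DELAY * floor, DOWNLOAD_DELAY * upper], here [1.0, 3.0] seconds.
        delay_time = uniform(DOWNLOAD_DELAY * floor, DOWNLOAD_DELAY * upper)
    else:
        delay_time = DOWNLOAD_DELAY
    await asyncio.sleep(delay_time)
    return delay_time

if __name__ == "__main__":
    print(f"slept {asyncio.run(apply_delay()):.2f}s")

Note that because __init__ raises NotConfiguredError when DOWNLOAD_DELAY is unset or zero, omitting the setting disables the middleware entirely rather than producing a zero delay.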
crawlo/middleware/middleware_manager.py
@@ -1,135 +1,136 @@
(The removed side of this hunk is truncated in the diff viewer from roughly line 104 onward; the complete new file, 136 lines, follows:)

#!/usr/bin/python
# -*- coding:UTF-8 -*-
from pprint import pformat
from types import MethodType
from asyncio import create_task
from collections import defaultdict
from typing import List, Dict, Callable, Optional

from crawlo import Request, Response
from crawlo.utils.log import get_logger
from crawlo.project import load_class
from crawlo.middleware import BaseMiddleware
from crawlo.project import common_call
from crawlo.event import ignore_request, response_received
from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
    NotConfiguredError


class MiddlewareManager:

    def __init__(self, crawler):
        self.crawler = crawler
        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
        self.middlewares: List = []
        self.methods: Dict[str, List[MethodType]] = defaultdict(list)
        middlewares = self.crawler.settings.get_list('MIDDLEWARES')
        self._add_middleware(middlewares)
        self._add_method()

        self.download_method: Callable = crawler.engine.downloader.download
        self._stats = crawler.stats

    async def _process_request(self, request: Request):
        for method in self.methods['process_request']:
            result = await common_call(method, request, self.crawler.spider)
            if result is None:
                continue
            if isinstance(result, (Request, Response)):
                return result
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(result).__name__}"
            )
        return await self.download_method(request)

    async def _process_response(self, request: Request, response: Response):
        for method in reversed(self.methods['process_response']):
            try:
                response = await common_call(method, request, response, self.crawler.spider)
            except IgnoreRequestError as exp:
                create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
            if isinstance(response, Request):
                return response
            if isinstance(response, Response):
                continue
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return Request or Response, got {type(response).__name__}"
            )
        return response

    async def _process_exception(self, request: Request, exp: Exception):
        for method in self.methods['process_exception']:
            response = await common_call(method, request, exp, self.crawler.spider)
            if response is None:
                continue
            if isinstance(response, (Request, Response)):
                return response
            if response:
                break
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(response).__name__}"
            )
        else:
            raise exp

    async def download(self, request) -> Optional[Response]:
        """ called in the download method. """
        try:
            response = await self._process_request(request)
        except KeyError:
            raise RequestMethodError(f"{request.method.lower()} is not supported")
        except IgnoreRequestError as exp:
            create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
            response = await self._process_exception(request, exp)
        except Exception as exp:
            self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
            response = await self._process_exception(request, exp)
        else:
            create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
            # self.crawler.stats.inc_value('response_received_count')
        if isinstance(response, Response):
            response = await self._process_response(request, response)
        if isinstance(response, Request):
            await self.crawler.engine.enqueue_request(request)
            return None
        return response

    @classmethod
    def create_instance(cls, *args, **kwargs):
        return cls(*args, **kwargs)

    def _add_middleware(self, middlewares):
        enabled_middlewares = [m for m in middlewares if self._validate_middleware(m)]
        if enabled_middlewares:
            # Restore the INFO-level log line; keep the key enablement information
            self.logger.info(f'enabled middleware:\n {pformat(enabled_middlewares)}')

    def _validate_middleware(self, middleware):
        middleware_cls = load_class(middleware)
        if not hasattr(middleware_cls, 'create_instance'):
            raise MiddlewareInitError(
                f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
            )
        try:
            instance = middleware_cls.create_instance(self.crawler)
            self.middlewares.append(instance)
            return True
        except NotConfiguredError:
            return False

    def _add_method(self):
        for middleware in self.middlewares:
            if hasattr(middleware, 'process_request'):
                if self._validate_middleware_method(method_name='process_request', middleware=middleware):
                    self.methods['process_request'].append(middleware.process_request)
            if hasattr(middleware, 'process_response'):
                if self._validate_middleware_method(method_name='process_response', middleware=middleware):
                    self.methods['process_response'].append(middleware.process_response)
            if hasattr(middleware, 'process_exception'):
                if self._validate_middleware_method(method_name='process_exception', middleware=middleware):
                    self.methods['process_exception'].append(middleware.process_exception)

    @staticmethod
    def _validate_middleware_method(method_name, middleware) -> bool:
        method = getattr(type(middleware), method_name)
        base_method = getattr(BaseMiddleware, method_name)
        return False if method == base_method else True
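For orientation, a minimal sketch of a middleware that satisfies the contract _validate_middleware and _add_method enforce above: it is built through create_instance(crawler), opts out by raising NotConfiguredError (which _validate_middleware swallows, returning False), and overrides process_request so that _validate_middleware_method registers it. The UA_HEADER setting and the request.headers mutation are illustrative assumptions, not established crawlo API.

from crawlo.middleware import BaseMiddleware
from crawlo.exceptions import NotConfiguredError


class ExampleHeaderMiddleware(BaseMiddleware):
    """Hypothetical middleware illustrating the manager's contract."""

    def __init__(self, ua):
        self.ua = ua

    @classmethod
    def create_instance(cls, crawler):
        ua = crawler.settings.get('UA_HEADER')  # hypothetical setting name
        if not ua:
            # _validate_middleware catches this and simply skips the middleware.
            raise NotConfiguredError("UA_HEADER not set")
        return cls(ua)

    async def process_request(self, request, spider):
        # Hooks must return None, a Request, or a Response (see _process_request).
        # Returning None passes the request on to the next middleware and,
        # eventually, to the downloader.
        request.headers['User-Agent'] = self.ua  # assumes a dict-like headers attribute
        return None

Enabling it would then be a matter of adding its dotted path to the MIDDLEWARES setting, which MiddlewareManager reads via settings.get_list('MIDDLEWARES').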