crawlo 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +63 -63
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +322 -314
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +365 -365
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +256 -256
- crawlo/crawler.py +1166 -1168
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +226 -226
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +52 -45
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +234 -234
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -115
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +187 -148
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +318 -318
- crawlo/pipelines/pipeline_manager.py +75 -75
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +325 -297
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +379 -379
- crawlo/queue/redis_priority_queue.py +306 -306
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +225 -225
- crawlo/settings/setting_manager.py +198 -198
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +266 -261
- crawlo/templates/project/settings_distributed.py.tmpl +179 -174
- crawlo/templates/project/settings_gentle.py.tmpl +60 -95
- crawlo/templates/project/settings_high_performance.py.tmpl +130 -125
- crawlo/templates/project/settings_minimal.py.tmpl +34 -29
- crawlo/templates/project/settings_simple.py.tmpl +101 -96
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/run.py.tmpl +38 -47
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +199 -146
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +351 -351
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/METADATA +1020 -1020
- crawlo-1.3.3.dist-info/RECORD +219 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +107 -107
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_pipelines.py +66 -66
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/verify_distributed.py +117 -117
- crawlo-1.3.1.dist-info/RECORD +0 -219
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/WHEEL +0 -0
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/top_level.txt +0 -0
crawlo/middleware/offsite.py
CHANGED
@@ -1,116 +1,124 @@

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
OffsiteMiddleware
Filters out requests whose domains are outside the allowed scope.
"""
import re
from urllib.parse import urlparse

from crawlo.utils.log import get_logger
from crawlo.exceptions import IgnoreRequestError


class OffsiteMiddleware:
    """
    OffsiteMiddleware
    Filters out requests that fall outside the allowed domains, keeping the crawler from visiting unrelated sites.
    """

    def __init__(self, stats, log_level, allowed_domains=None):
        self.logger = get_logger(self.__class__.__name__, log_level)
        self.stats = stats
        self.allowed_domains = allowed_domains or []

    @classmethod
    def create_instance(cls, crawler):
        """
        Create the middleware instance.
        Reads the allowed domain list from the crawler settings.
        """
        # Prefer the spider instance's allowed_domains, falling back to the global ALLOWED_DOMAINS setting
        allowed_domains = []

        # Check whether the current spider instance defines allowed_domains
        if hasattr(crawler, 'spider') and crawler.spider and hasattr(crawler.spider, 'allowed_domains'):
            allowed_domains = getattr(crawler.spider, 'allowed_domains', [])

        # If the spider does not define allowed_domains, fall back to the global setting
        if not allowed_domains:
            allowed_domains = crawler.settings.get_list('ALLOWED_DOMAINS')

        # If no allowed domains are configured, disable this middleware
        if not allowed_domains:
            from crawlo.exceptions import NotConfiguredError
            raise NotConfiguredError("ALLOWED_DOMAINS is not configured; OffsiteMiddleware is disabled")

        o = cls(
            stats=crawler.stats,
            log_level=crawler.settings.get('LOG_LEVEL'),
            allowed_domains=allowed_domains
        )

        # Pre-compile the domain regexes for performance
        o._compile_domains()

        # Use the middleware's own logger instead of crawler.logger
        o.logger.debug(f"OffsiteMiddleware enabled, allowed domains: {allowed_domains}")
        return o

    def _compile_domains(self):
        """
        Compile the domain regular expressions.
        """
        self._domain_regexes = []
        for domain in self.allowed_domains:
            # Escape special characters in the domain
            escaped_domain = re.escape(domain)
            # Build a regex that matches the domain itself and any of its subdomains
            regex = re.compile(r'(^|.*\.)' + escaped_domain + '$', re.IGNORECASE)
            self._domain_regexes.append(regex)

    def _is_offsite_request(self, request):
        """
        Determine whether a request is offsite.
        """
        try:
            parsed_url = urlparse(request.url)
            hostname = parsed_url.hostname

            if not hostname:
                return True  # invalid URL

            # Check whether the hostname matches an allowed domain
            for regex in self._domain_regexes:
                if regex.match(hostname):
                    return False  # matches an allowed domain

            return True  # matches none of the allowed domains
        except Exception:
            # URL parsing failed; treat it as an offsite request
            return True

    async def process_request(self, request, spider):
        """
        Process a request and filter it out if it is offsite.
        """
        if self._is_offsite_request(request):
            # Count the filtered request
            self.stats.inc_value('offsite_request_count')

            # Record the filtered domain
            try:
                parsed_url = urlparse(request.url)
                hostname = parsed_url.hostname or "unknown"
                self.stats.inc_value(f'offsite_request_count/{hostname}')
            except Exception:
                self.stats.inc_value('offsite_request_count/invalid_url')

            self.logger.debug(f"Filtering offsite request: {request.url}")

            # Raise to ignore this request
            raise IgnoreRequestError(f"Offsite request filtered: {request.url}")

        return None

    def process_exception(self, request, exception, spider):
        """
        Handle exceptions.
        """
        # Handle the IgnoreRequestError raised by this middleware itself
        if isinstance(exception, IgnoreRequestError) and "Offsite request filtered" in str(exception):
            self.logger.debug(f"Offsite request has been filtered: {request.url}")
            return True  # the exception has been handled

        return None