crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/__init__.py
CHANGED
@@ -31,6 +31,21 @@ from .selector_helper import (
     is_xpath
 )

+from .encoding_helper import (
+    html_body_declared_encoding,
+    http_content_type_encoding,
+    read_bom,
+    resolve_encoding,
+    html_to_unicode
+)
+
+from .response_helper import (
+    parse_cookies,
+    regex_search,
+    regex_findall,
+    get_header_value
+)
+
 __all__ = [
     "TimeUtils",
     "parse_time",
@@ -47,5 +62,14 @@ __all__ = [
     "extract_texts",
     "extract_attr",
     "extract_attrs",
-    "is_xpath"
+    "is_xpath",
+    "html_body_declared_encoding",
+    "http_content_type_encoding",
+    "read_bom",
+    "resolve_encoding",
+    "html_to_unicode",
+    "parse_cookies",
+    "regex_search",
+    "regex_findall",
+    "get_header_value"
 ]
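With these re-exports, downstream code can pull the new encoding and response helpers straight from `crawlo.utils`. A minimal sketch, assuming the helpers mirror the w3lib-style functions of the same names (the signatures are not confirmed by this diff):

    from crawlo.utils import http_content_type_encoding, html_to_unicode

    # Charset declared in a Content-Type header (assumed signature)
    encoding = http_content_type_encoding("text/html; charset=gb2312")

    # Decode raw bytes using header, BOM and meta-tag detection (assumed signature)
    encoding, text = html_to_unicode("text/html; charset=utf-8", b"<html><body>ok</body></html>")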
crawlo/utils/batch_processor.py
CHANGED
@@ -9,7 +9,7 @@ from functools import wraps
 from typing import List, Callable, Any, Optional, Dict

 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.
+from crawlo.logging import get_logger


 class BatchProcessor:
@@ -145,12 +145,18 @@ class RedisBatchProcessor:

             # Execute once each time the batch size is reached
             if count % self.batch_size == 0:
-
+                result = pipe.execute()
+                # Handle a possibly asynchronous result
+                if asyncio.iscoroutine(result):
+                    await result
                 pipe = self.redis_client.pipeline()

         # Execute the remaining operations
         if count % self.batch_size != 0:
-
+            result = pipe.execute()
+            # Handle a possibly asynchronous result
+            if asyncio.iscoroutine(result):
+                await result

         self.logger.debug(f"Batch set {count} key-value pairs")
         return count
@@ -178,7 +184,12 @@ class RedisBatchProcessor:
         for key in keys:
             pipe.get(key)

-
+        result = pipe.execute()
+        # Handle a possibly asynchronous result
+        if asyncio.iscoroutine(result):
+            results = await result
+        else:
+            results = result

         # Build the result dictionary
         result_dict = {}
@@ -216,12 +227,18 @@ class RedisBatchProcessor:

             # Execute once each time the batch size is reached
             if count % self.batch_size == 0:
-
+                result = pipe.execute()
+                # Handle a possibly asynchronous result
+                if asyncio.iscoroutine(result):
+                    await result
                 pipe = self.redis_client.pipeline()

         # Execute the remaining operations
         if count % self.batch_size != 0:
-
+            result = pipe.execute()
+            # Handle a possibly asynchronous result
+            if asyncio.iscoroutine(result):
+                await result

         self.logger.debug(f"Batch deleted {count} keys")
         return count
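The repeated `pipe.execute()` pattern added above is what lets RedisBatchProcessor work with either a synchronous or an asynchronous Redis client: synchronous pipelines return the results list directly, while `redis.asyncio` pipelines return a coroutine that must be awaited. A standalone sketch of the same idea, using a hypothetical `flush_pipeline` helper that is not part of crawlo:

    import asyncio

    async def flush_pipeline(pipe):
        # Works for both redis.Redis and redis.asyncio.Redis pipelines.
        result = pipe.execute()
        if asyncio.iscoroutine(result):
            result = await result
        return result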
crawlo/utils/config_manager.py
ADDED
@@ -0,0 +1,442 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+Configuration management module
+===========
+Unified configuration management interface that consolidates the generic config utilities, environment variable management, and large-scale crawler configuration.
+
+This module contains:
+1. ConfigUtils - generic configuration utility class
+2. EnvConfigManager - environment variable configuration manager
+3. LargeScaleConfig - large-scale crawler configuration class
+4. Convenience functions - quick access to common configuration features
+"""
+
+import os
+import re
+from typing import Any, Dict, List, Optional, Union
+
+
+# ============================================================================
+# Part 1: generic configuration utilities
+# ============================================================================
+
+class ConfigUtils:
+    """Generic configuration utility class"""
+
+    @staticmethod
+    def get_config_value(
+        config_sources: List[Union[Dict, Any]],
+        key: str,
+        default: Any = None,
+        value_type: type = str
+    ) -> Any:
+        """
+        Get a configuration value from multiple configuration sources
+
+        Args:
+            config_sources: list of configuration sources, ordered by priority
+            key: configuration key name
+            default: default value
+            value_type: value type
+
+        Returns:
+            The configuration value or the default
+        """
+        for config_source in config_sources:
+            if not config_source:
+                continue
+
+            # Get the configuration value
+            value = None
+            if hasattr(config_source, 'get'):
+                value = config_source.get(key)
+            elif hasattr(config_source, key):
+                value = getattr(config_source, key)
+            else:
+                continue
+
+            if value is not None:
+                # Type conversion
+                try:
+                    if value_type == bool:
+                        if isinstance(value, str):
+                            return value.lower() in ('1', 'true', 'yes', 'on')
+                        return bool(value)
+                    elif value_type == int:
+                        return int(value)
+                    elif value_type == float:
+                        return float(value)
+                    else:
+                        return value_type(value)
+                except (ValueError, TypeError):
+                    continue
+
+        return default
+
+    @staticmethod
+    def has_config_prefix(config_source: Union[Dict, Any], prefix: str) -> bool:
+        """
+        Check whether a configuration source contains items with the given prefix
+
+        Args:
+            config_source: configuration source
+            prefix: prefix
+
+        Returns:
+            Whether items with the given prefix exist
+        """
+        if not config_source:
+            return False
+
+        if hasattr(config_source, 'keys'):
+            return any(key.startswith(prefix) for key in config_source.keys())
+        elif hasattr(config_source, '__dict__'):
+            return any(key.startswith(prefix) for key in config_source.__dict__.keys())
+        else:
+            return any(key.startswith(prefix) for key in dir(config_source))
+
+    @staticmethod
+    def merge_config_sources(config_sources: List[Union[Dict, Any]]) -> Dict[str, Any]:
+        """
+        Merge multiple configuration sources; later sources take precedence
+
+        Args:
+            config_sources: list of configuration sources
+
+        Returns:
+            The merged configuration dictionary
+        """
+        merged_config = {}
+
+        for config_source in config_sources:
+            if not config_source:
+                continue
+
+            if hasattr(config_source, 'keys'):
+                # Dict-like configuration source
+                for key, value in config_source.items():
+                    if key.isupper():  # Only merge upper-case configuration keys
+                        merged_config[key] = value
+            elif hasattr(config_source, '__dict__'):
+                # Object-like configuration source
+                for key, value in config_source.__dict__.items():
+                    if key.isupper():
+                        merged_config[key] = value
+            else:
+                # Other configuration source types
+                for key in dir(config_source):
+                    if key.isupper():
+                        merged_config[key] = getattr(config_source, key)
+
+        return merged_config
+
+
+# ============================================================================
+# Part 2: environment variable configuration management
+# ============================================================================
+
+class EnvConfigManager:
+    """Environment variable configuration manager"""
+
+    @staticmethod
+    def get_env_var(var_name: str, default: Any = None, var_type: type = str) -> Any:
+        """
+        Get an environment variable value
+
+        Args:
+            var_name: environment variable name
+            default: default value
+            var_type: variable type (str, int, float, bool)
+
+        Returns:
+            The environment variable value or the default
+        """
+        value = os.getenv(var_name)
+        if value is None:
+            return default
+
+        try:
+            if var_type == bool:
+                return value.lower() in ('1', 'true', 'yes', 'on')
+            elif var_type == int:
+                return int(value)
+            elif var_type == float:
+                return float(value)
+            else:
+                return value
+        except (ValueError, TypeError):
+            return default
+
+    @staticmethod
+    def get_redis_config() -> dict:
+        """
+        Get the Redis configuration
+
+        Returns:
+            Redis configuration dictionary
+        """
+        return {
+            'REDIS_HOST': EnvConfigManager.get_env_var('CRAWLO_REDIS_HOST', '127.0.0.1', str),
+            'REDIS_PORT': EnvConfigManager.get_env_var('CRAWLO_REDIS_PORT', 6379, int),
+            'REDIS_PASSWORD': EnvConfigManager.get_env_var('CRAWLO_REDIS_PASSWORD', '', str),
+            'REDIS_DB': EnvConfigManager.get_env_var('CRAWLO_REDIS_DB', 0, int),
+        }
+
+    @staticmethod
+    def get_runtime_config() -> dict:
+        """
+        Get the runtime configuration
+
+        Returns:
+            Runtime configuration dictionary
+        """
+        return {
+            'CRAWLO_MODE': EnvConfigManager.get_env_var('CRAWLO_MODE', 'standalone', str),
+            'PROJECT_NAME': EnvConfigManager.get_env_var('CRAWLO_PROJECT_NAME', 'crawlo', str),
+            'CONCURRENCY': EnvConfigManager.get_env_var('CRAWLO_CONCURRENCY', 8, int),
+        }
+
+    @staticmethod
+    def get_version() -> str:
+        """
+        Get the framework version number
+
+        Returns:
+            The framework version string
+        """
+        # Get the version file path
+        version_file = os.path.join(os.path.dirname(__file__), '..', '__version__.py')
+        default_version = '1.0.0'
+
+        if os.path.exists(version_file):
+            try:
+                with open(version_file, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                # Extract the version number with a regular expression
+                version_match = re.search(r"__version__\s*=\s*['\"]([^'\"]*)['\"]", content)
+                if version_match:
+                    return version_match.group(1)
+            except Exception:
+                # Fall back to the default version if reading fails
+                pass
+
+        return default_version
+
+
+# ============================================================================
+# Part 3: large-scale crawler configuration
+# ============================================================================
+
+class LargeScaleConfig:
+    """Large-scale crawler configuration class"""
+
+    @staticmethod
+    def conservative_config(concurrency: int = 8) -> Dict[str, Any]:
+        """
+        Conservative configuration - for resource-constrained environments
+
+        Characteristics:
+        - smaller queue capacity
+        - lower concurrency
+        - longer delays
+        """
+        from crawlo.utils.queue_helper import QueueHelper
+
+        config = QueueHelper.use_redis_queue(
+            queue_name="crawlo:conservative",
+            max_retries=3,
+            timeout=300
+        )
+
+        config.update({
+            # Concurrency control
+            'CONCURRENCY': concurrency,
+            'SCHEDULER_MAX_QUEUE_SIZE': concurrency * 10,
+            'MAX_RUNNING_SPIDERS': 1,
+
+            # Request control
+            'DOWNLOAD_DELAY': 0.2,
+            'RANDOMNESS': True,
+            'RANDOM_RANGE': (0.8, 1.5),
+
+            # Memory control
+            'DOWNLOAD_MAXSIZE': 5 * 1024 * 1024,  # 5MB
+            'CONNECTION_POOL_LIMIT': concurrency * 2,
+
+            # Retry policy
+            'MAX_RETRY_TIMES': 2,
+
+            # Use the enhanced engine
+            'ENGINE_CLASS': 'crawlo.core.engine.Engine'
+        })
+
+        return config
+
+    @staticmethod
+    def balanced_config(concurrency: int = 16) -> Dict[str, Any]:
+        """
+        Balanced configuration - for typical production environments
+
+        Characteristics:
+        - medium queue capacity
+        - balanced concurrency
+        - moderate delays
+        """
+        from crawlo.utils.queue_helper import QueueHelper
+
+        config = QueueHelper.use_redis_queue(
+            queue_name="crawlo:balanced",
+            max_retries=5,
+            timeout=600
+        )
+
+        config.update({
+            # Concurrency control
+            'CONCURRENCY': concurrency,
+            'SCHEDULER_MAX_QUEUE_SIZE': concurrency * 15,
+            'MAX_RUNNING_SPIDERS': 2,
+
+            # Request control
+            'DOWNLOAD_DELAY': 0.1,
+            'RANDOMNESS': True,
+            'RANDOM_RANGE': (0.5, 1.2),
+
+            # Memory control
+            'DOWNLOAD_MAXSIZE': 10 * 1024 * 1024,  # 10MB
+            'CONNECTION_POOL_LIMIT': concurrency * 3,
+
+            # Retry policy
+            'MAX_RETRY_TIMES': 3,
+
+            # Use the enhanced engine
+            'ENGINE_CLASS': 'crawlo.core.engine.Engine'
+        })
+
+        return config
+
+    @staticmethod
+    def aggressive_config(concurrency: int = 32) -> Dict[str, Any]:
+        """
+        Aggressive configuration - for high-performance environments
+
+        Characteristics:
+        - large queue capacity
+        - high concurrency
+        - shorter delays
+        """
+        from crawlo.utils.queue_helper import QueueHelper
+
+        config = QueueHelper.use_redis_queue(
+            queue_name="crawlo:aggressive",
+            max_retries=10,
+            timeout=900
+        )
+
+        config.update({
+            # Concurrency control
+            'CONCURRENCY': concurrency,
+            'SCHEDULER_MAX_QUEUE_SIZE': concurrency * 20,
+            'MAX_RUNNING_SPIDERS': 3,
+
+            # Request control
+            'DOWNLOAD_DELAY': 0.05,
+            'RANDOMNESS': True,
+            'RANDOM_RANGE': (0.3, 1.0),
+
+            # Memory control
+            'DOWNLOAD_MAXSIZE': 20 * 1024 * 1024,  # 20MB
+            'CONNECTION_POOL_LIMIT': concurrency * 4,
+
+            # Retry policy
+            'MAX_RETRY_TIMES': 5,
+
+            # Use the enhanced engine
+            'ENGINE_CLASS': 'crawlo.core.engine.Engine'
+        })
+
+        return config
+
+    @staticmethod
+    def memory_optimized_config(concurrency: int = 12) -> Dict[str, Any]:
+        """
+        Memory-optimized configuration - for large-scale but memory-constrained scenarios
+
+        Characteristics:
+        - small queues with fast turnover
+        - strict memory control
+        - uses Redis to reduce memory pressure
+        """
+        from crawlo.utils.queue_helper import QueueHelper
+
+        config = QueueHelper.use_redis_queue(
+            queue_name="crawlo:memory_optimized",
+            max_retries=3,
+            timeout=300
+        )
+
+        config.update({
+            # Concurrency control
+            'CONCURRENCY': concurrency,
+            'SCHEDULER_MAX_QUEUE_SIZE': concurrency * 5,
+            'MAX_RUNNING_SPIDERS': 1,
+
+            # Request control
+            'DOWNLOAD_DELAY': 0.1,
+            'RANDOMNESS': False,
+
+            # Strict memory control
+            'DOWNLOAD_MAXSIZE': 2 * 1024 * 1024,  # 2MB
+            'DOWNLOAD_WARN_SIZE': 512 * 1024,  # 512KB
+            'CONNECTION_POOL_LIMIT': concurrency,
+
+            # Retry policy
+            'MAX_RETRY_TIMES': 2,
+
+            # Use the enhanced engine
+            'ENGINE_CLASS': 'crawlo.core.engine.Engine'
+        })
+
+        return config
+
+
+def apply_large_scale_config(
+    settings_dict: Dict[str, Any],
+    config_type: str = "balanced",
+    concurrency: Optional[int] = None
+):
+    """
+    Apply a large-scale configuration
+
+    Args:
+        settings_dict: settings dictionary
+        config_type: configuration type ("conservative", "balanced", "aggressive", "memory_optimized")
+        concurrency: concurrency (optional; the default is used if not specified)
+    """
+    config_map = {
+        "conservative": LargeScaleConfig.conservative_config,
+        "balanced": LargeScaleConfig.balanced_config,
+        "aggressive": LargeScaleConfig.aggressive_config,
+        "memory_optimized": LargeScaleConfig.memory_optimized_config
+    }
+
+    if config_type not in config_map:
+        raise ValueError(f"Unsupported configuration type: {config_type}")
+
+    if concurrency:
+        config = config_map[config_type](concurrency)
+    else:
+        config = config_map[config_type]()
+
+    settings_dict.update(config)
+
+    return config
+
+
+# Export all public APIs
+__all__ = [
+    'ConfigUtils',
+    'EnvConfigManager',
+    'LargeScaleConfig',
+    'apply_large_scale_config',
+]
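For orientation, a usage sketch based only on the definitions shown above; note that `apply_large_scale_config` internally calls `QueueHelper.use_redis_queue`, so it assumes a Redis-backed queue is available:

    from crawlo.utils.config_manager import ConfigUtils, EnvConfigManager, apply_large_scale_config

    # Typed reads from the environment, with defaults when a variable is unset
    concurrency = EnvConfigManager.get_env_var('CRAWLO_CONCURRENCY', 8, int)
    redis_settings = EnvConfigManager.get_redis_config()

    # Resolve a key across several configuration sources; the first usable value wins
    timeout = ConfigUtils.get_config_value([{'DOWNLOAD_TIMEOUT': '30'}, {}], 'DOWNLOAD_TIMEOUT', default=20, value_type=int)

    # Merge one of the large-scale presets into a settings dict
    settings = {}
    apply_large_scale_config(settings, config_type="balanced", concurrency=concurrency)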
crawlo/utils/db_helper.py
CHANGED