crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.
Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/__init__.py CHANGED
@@ -31,6 +31,21 @@ from .selector_helper import (
     is_xpath
 )
 
+from .encoding_helper import (
+    html_body_declared_encoding,
+    http_content_type_encoding,
+    read_bom,
+    resolve_encoding,
+    html_to_unicode
+)
+
+from .response_helper import (
+    parse_cookies,
+    regex_search,
+    regex_findall,
+    get_header_value
+)
+
 __all__ = [
     "TimeUtils",
     "parse_time",
@@ -47,5 +62,14 @@ __all__ = [
     "extract_texts",
     "extract_attr",
     "extract_attrs",
-    "is_xpath"
+    "is_xpath",
+    "html_body_declared_encoding",
+    "http_content_type_encoding",
+    "read_bom",
+    "resolve_encoding",
+    "html_to_unicode",
+    "parse_cookies",
+    "regex_search",
+    "regex_findall",
+    "get_header_value"
 ]
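
With this change the new encoding and response helpers are re-exported from crawlo.utils. A minimal illustration grounded only in the names added above (their call signatures are not shown in this diff, so no arguments are assumed):

    # Both imports are confirmed by the __all__ additions in the hunk above.
    from crawlo.utils import html_to_unicode, read_bom, parse_cookies, get_header_value

    import crawlo.utils
    assert "html_to_unicode" in crawlo.utils.__all__  # new in 1.4.8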
crawlo/utils/batch_processor.py CHANGED
@@ -9,7 +9,7 @@ from functools import wraps
 from typing import List, Callable, Any, Optional, Dict
 
 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 
 class BatchProcessor:
@@ -145,12 +145,18 @@ class RedisBatchProcessor:
 
             # Execute once every time the batch size is reached
             if count % self.batch_size == 0:
-                await pipe.execute()
+                result = pipe.execute()
+                # Handle the possibly-async case
+                if asyncio.iscoroutine(result):
+                    await result
                 pipe = self.redis_client.pipeline()
 
         # Execute the remaining operations
         if count % self.batch_size != 0:
-            await pipe.execute()
+            result = pipe.execute()
+            # Handle the possibly-async case
+            if asyncio.iscoroutine(result):
+                await result
 
         self.logger.debug(f"Batch-set {count} key/value pairs")
         return count
@@ -178,7 +184,12 @@ class RedisBatchProcessor:
         for key in keys:
             pipe.get(key)
 
-        results = await pipe.execute()
+        result = pipe.execute()
+        # Handle the possibly-async case
+        if asyncio.iscoroutine(result):
+            results = await result
+        else:
+            results = result
 
         # Build the result dictionary
         result_dict = {}
@@ -216,12 +227,18 @@ class RedisBatchProcessor:
 
             # Execute once every time the batch size is reached
             if count % self.batch_size == 0:
-                await pipe.execute()
+                result = pipe.execute()
+                # Handle the possibly-async case
+                if asyncio.iscoroutine(result):
+                    await result
                 pipe = self.redis_client.pipeline()
 
         # Execute the remaining operations
         if count % self.batch_size != 0:
-            await pipe.execute()
+            result = pipe.execute()
+            # Handle the possibly-async case
+            if asyncio.iscoroutine(result):
+                await result
 
         self.logger.debug(f"Batch-deleted {count} keys")
         return count
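
The repeated pattern in the hunks above lets RedisBatchProcessor work with both a synchronous redis client, whose pipeline.execute() returns the results list directly, and an asynchronous redis.asyncio client, whose execute() returns a coroutine. A self-contained sketch of the same idea (the helper name is chosen here for illustration and is not part of the package):

    import asyncio

    async def flush_pipeline(pipe):
        """Execute a Redis pipeline whether the client is sync or async."""
        result = pipe.execute()
        # redis.asyncio pipelines return a coroutine here; plain redis returns a list.
        if asyncio.iscoroutine(result):
            result = await result
        return result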
crawlo/utils/config_manager.py ADDED
@@ -0,0 +1,442 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+Configuration management module
+===============================
+Unified configuration management interface that consolidates the generic configuration utilities, environment-variable management, and large-scale crawler configuration.
+
+This module contains:
+1. ConfigUtils - generic configuration utility class
+2. EnvConfigManager - environment variable configuration manager
+3. LargeScaleConfig - large-scale crawler configuration class
+4. Convenience functions - quick access to common configuration features
+"""
+
+import os
+import re
+from typing import Any, Dict, List, Optional, Union
+
+
+# ============================================================================
+# Part 1: generic configuration utilities
+# ============================================================================
+
+class ConfigUtils:
+    """Generic configuration utility class."""
+
+    @staticmethod
+    def get_config_value(
+        config_sources: List[Union[Dict, Any]],
+        key: str,
+        default: Any = None,
+        value_type: type = str
+    ) -> Any:
+        """
+        Get a configuration value from multiple configuration sources.
+
+        Args:
+            config_sources: list of configuration sources, ordered by priority
+            key: configuration key name
+            default: default value
+            value_type: expected value type
+
+        Returns:
+            The configuration value, or the default.
+        """
+        for config_source in config_sources:
+            if not config_source:
+                continue
+
+            # Fetch the raw value
+            value = None
+            if hasattr(config_source, 'get'):
+                value = config_source.get(key)
+            elif hasattr(config_source, key):
+                value = getattr(config_source, key)
+            else:
+                continue
+
+            if value is not None:
+                # Type conversion
+                try:
+                    if value_type == bool:
+                        if isinstance(value, str):
+                            return value.lower() in ('1', 'true', 'yes', 'on')
+                        return bool(value)
+                    elif value_type == int:
+                        return int(value)
+                    elif value_type == float:
+                        return float(value)
+                    else:
+                        return value_type(value)
+                except (ValueError, TypeError):
+                    continue
+
+        return default
+
+    @staticmethod
+    def has_config_prefix(config_source: Union[Dict, Any], prefix: str) -> bool:
+        """
+        Check whether a configuration source contains keys with the given prefix.
+
+        Args:
+            config_source: configuration source
+            prefix: key prefix
+
+        Returns:
+            Whether any configuration key starts with the prefix.
+        """
+        if not config_source:
+            return False
+
+        if hasattr(config_source, 'keys'):
+            return any(key.startswith(prefix) for key in config_source.keys())
+        elif hasattr(config_source, '__dict__'):
+            return any(key.startswith(prefix) for key in config_source.__dict__.keys())
+        else:
+            return any(key.startswith(prefix) for key in dir(config_source))
+
+    @staticmethod
+    def merge_config_sources(config_sources: List[Union[Dict, Any]]) -> Dict[str, Any]:
+        """
+        Merge multiple configuration sources; later sources take precedence.
+
+        Args:
+            config_sources: list of configuration sources
+
+        Returns:
+            The merged configuration dictionary.
+        """
+        merged_config = {}
+
+        for config_source in config_sources:
+            if not config_source:
+                continue
+
+            if hasattr(config_source, 'keys'):
+                # Dict-like configuration source
+                for key, value in config_source.items():
+                    if key.isupper():  # only merge upper-case settings
+                        merged_config[key] = value
+            elif hasattr(config_source, '__dict__'):
+                # Object-style configuration source
+                for key, value in config_source.__dict__.items():
+                    if key.isupper():
+                        merged_config[key] = value
+            else:
+                # Any other kind of configuration source
+                for key in dir(config_source):
+                    if key.isupper():
+                        merged_config[key] = getattr(config_source, key)
+
+        return merged_config
+
+
+# ============================================================================
+# Part 2: environment-variable configuration management
+# ============================================================================
+
+class EnvConfigManager:
+    """Environment variable configuration manager."""
+
+    @staticmethod
+    def get_env_var(var_name: str, default: Any = None, var_type: type = str) -> Any:
+        """
+        Get the value of an environment variable.
+
+        Args:
+            var_name: environment variable name
+            default: default value
+            var_type: variable type (str, int, float, bool)
+
+        Returns:
+            The environment variable value, or the default.
+        """
+        value = os.getenv(var_name)
+        if value is None:
+            return default
+
+        try:
+            if var_type == bool:
+                return value.lower() in ('1', 'true', 'yes', 'on')
+            elif var_type == int:
+                return int(value)
+            elif var_type == float:
+                return float(value)
+            else:
+                return value
+        except (ValueError, TypeError):
+            return default
+
+    @staticmethod
+    def get_redis_config() -> dict:
+        """
+        Get the Redis configuration.
+
+        Returns:
+            Redis configuration dictionary.
+        """
+        return {
+            'REDIS_HOST': EnvConfigManager.get_env_var('CRAWLO_REDIS_HOST', '127.0.0.1', str),
+            'REDIS_PORT': EnvConfigManager.get_env_var('CRAWLO_REDIS_PORT', 6379, int),
+            'REDIS_PASSWORD': EnvConfigManager.get_env_var('CRAWLO_REDIS_PASSWORD', '', str),
+            'REDIS_DB': EnvConfigManager.get_env_var('CRAWLO_REDIS_DB', 0, int),
+        }
+
+    @staticmethod
+    def get_runtime_config() -> dict:
+        """
+        Get the runtime configuration.
+
+        Returns:
+            Runtime configuration dictionary.
+        """
+        return {
+            'CRAWLO_MODE': EnvConfigManager.get_env_var('CRAWLO_MODE', 'standalone', str),
+            'PROJECT_NAME': EnvConfigManager.get_env_var('CRAWLO_PROJECT_NAME', 'crawlo', str),
+            'CONCURRENCY': EnvConfigManager.get_env_var('CRAWLO_CONCURRENCY', 8, int),
+        }
+
+    @staticmethod
+    def get_version() -> str:
+        """
+        Get the framework version number.
+
+        Returns:
+            Framework version string.
+        """
+        # Path of the version file
+        version_file = os.path.join(os.path.dirname(__file__), '..', '__version__.py')
+        default_version = '1.0.0'
+
+        if os.path.exists(version_file):
+            try:
+                with open(version_file, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                # Extract the version number with a regular expression
+                version_match = re.search(r"__version__\s*=\s*['\"]([^'\"]*)['\"]", content)
+                if version_match:
+                    return version_match.group(1)
+            except Exception:
+                # Fall back to the default version if reading fails
+                pass
+
+        return default_version
+
+
+# ============================================================================
+# Part 3: large-scale crawler configuration
+# ============================================================================
+
+class LargeScaleConfig:
+    """Large-scale crawler configuration class."""
+
+    @staticmethod
+    def conservative_config(concurrency: int = 8) -> Dict[str, Any]:
+        """
+        Conservative preset - for resource-constrained environments.
+
+        Characteristics:
+        - smaller queue capacity
+        - lower concurrency
+        - longer delays
+        """
+        from crawlo.utils.queue_helper import QueueHelper
+
+        config = QueueHelper.use_redis_queue(
+            queue_name="crawlo:conservative",
+            max_retries=3,
+            timeout=300
+        )
+
+        config.update({
+            # Concurrency control
+            'CONCURRENCY': concurrency,
+            'SCHEDULER_MAX_QUEUE_SIZE': concurrency * 10,
+            'MAX_RUNNING_SPIDERS': 1,
+
+            # Request control
+            'DOWNLOAD_DELAY': 0.2,
+            'RANDOMNESS': True,
+            'RANDOM_RANGE': (0.8, 1.5),
+
+            # Memory control
+            'DOWNLOAD_MAXSIZE': 5 * 1024 * 1024,  # 5MB
+            'CONNECTION_POOL_LIMIT': concurrency * 2,
+
+            # Retry policy
+            'MAX_RETRY_TIMES': 2,
+
+            # Use the enhanced engine
+            'ENGINE_CLASS': 'crawlo.core.engine.Engine'
+        })
+
+        return config
+
+    @staticmethod
+    def balanced_config(concurrency: int = 16) -> Dict[str, Any]:
+        """
+        Balanced preset - for typical production environments.
+
+        Characteristics:
+        - medium queue capacity
+        - balanced concurrency
+        - moderate delays
+        """
+        from crawlo.utils.queue_helper import QueueHelper
+
+        config = QueueHelper.use_redis_queue(
+            queue_name="crawlo:balanced",
+            max_retries=5,
+            timeout=600
+        )
+
+        config.update({
+            # Concurrency control
+            'CONCURRENCY': concurrency,
+            'SCHEDULER_MAX_QUEUE_SIZE': concurrency * 15,
+            'MAX_RUNNING_SPIDERS': 2,
+
+            # Request control
+            'DOWNLOAD_DELAY': 0.1,
+            'RANDOMNESS': True,
+            'RANDOM_RANGE': (0.5, 1.2),
+
+            # Memory control
+            'DOWNLOAD_MAXSIZE': 10 * 1024 * 1024,  # 10MB
+            'CONNECTION_POOL_LIMIT': concurrency * 3,
+
+            # Retry policy
+            'MAX_RETRY_TIMES': 3,
+
+            # Use the enhanced engine
+            'ENGINE_CLASS': 'crawlo.core.engine.Engine'
+        })
+
+        return config
+
+    @staticmethod
+    def aggressive_config(concurrency: int = 32) -> Dict[str, Any]:
+        """
+        Aggressive preset - for high-performance environments.
+
+        Characteristics:
+        - large queue capacity
+        - high concurrency
+        - shorter delays
+        """
+        from crawlo.utils.queue_helper import QueueHelper
+
+        config = QueueHelper.use_redis_queue(
+            queue_name="crawlo:aggressive",
+            max_retries=10,
+            timeout=900
+        )
+
+        config.update({
+            # Concurrency control
+            'CONCURRENCY': concurrency,
+            'SCHEDULER_MAX_QUEUE_SIZE': concurrency * 20,
+            'MAX_RUNNING_SPIDERS': 3,
+
+            # Request control
+            'DOWNLOAD_DELAY': 0.05,
+            'RANDOMNESS': True,
+            'RANDOM_RANGE': (0.3, 1.0),
+
+            # Memory control
+            'DOWNLOAD_MAXSIZE': 20 * 1024 * 1024,  # 20MB
+            'CONNECTION_POOL_LIMIT': concurrency * 4,
+
+            # Retry policy
+            'MAX_RETRY_TIMES': 5,
+
+            # Use the enhanced engine
+            'ENGINE_CLASS': 'crawlo.core.engine.Engine'
+        })
+
+        return config
+
+    @staticmethod
+    def memory_optimized_config(concurrency: int = 12) -> Dict[str, Any]:
+        """
+        Memory-optimized preset - for large-scale crawls with limited memory.
+
+        Characteristics:
+        - small queues with fast turnover
+        - strict memory limits
+        - Redis-backed queues to reduce memory pressure
+        """
+        from crawlo.utils.queue_helper import QueueHelper
+
+        config = QueueHelper.use_redis_queue(
+            queue_name="crawlo:memory_optimized",
+            max_retries=3,
+            timeout=300
+        )
+
+        config.update({
+            # Concurrency control
+            'CONCURRENCY': concurrency,
+            'SCHEDULER_MAX_QUEUE_SIZE': concurrency * 5,
+            'MAX_RUNNING_SPIDERS': 1,
+
+            # Request control
+            'DOWNLOAD_DELAY': 0.1,
+            'RANDOMNESS': False,
+
+            # Strict memory limits
+            'DOWNLOAD_MAXSIZE': 2 * 1024 * 1024,  # 2MB
+            'DOWNLOAD_WARN_SIZE': 512 * 1024,  # 512KB
+            'CONNECTION_POOL_LIMIT': concurrency,
+
+            # Retry policy
+            'MAX_RETRY_TIMES': 2,
+
+            # Use the enhanced engine
+            'ENGINE_CLASS': 'crawlo.core.engine.Engine'
+        })
+
+        return config
+
+
+def apply_large_scale_config(
+    settings_dict: Dict[str, Any],
+    config_type: str = "balanced",
+    concurrency: Optional[int] = None
+):
+    """
+    Apply a large-scale configuration preset.
+
+    Args:
+        settings_dict: settings dictionary to update
+        config_type: preset name ("conservative", "balanced", "aggressive", "memory_optimized")
+        concurrency: concurrency level (optional; the preset default is used if omitted)
+    """
+    config_map = {
+        "conservative": LargeScaleConfig.conservative_config,
+        "balanced": LargeScaleConfig.balanced_config,
+        "aggressive": LargeScaleConfig.aggressive_config,
+        "memory_optimized": LargeScaleConfig.memory_optimized_config
+    }
+
+    if config_type not in config_map:
+        raise ValueError(f"Unsupported configuration type: {config_type}")
+
+    if concurrency:
+        config = config_map[config_type](concurrency)
+    else:
+        config = config_map[config_type]()
+
+    settings_dict.update(config)
+
+    return config
+
+
+# Public API exports
+__all__ = [
+    'ConfigUtils',
+    'EnvConfigManager',
+    'LargeScaleConfig',
+    'apply_large_scale_config',
+]
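
For orientation, a brief usage sketch based only on the signatures shown above; the import path follows from the file location, and running it requires the installed package (the presets import QueueHelper internally):

    from crawlo.utils.config_manager import EnvConfigManager, apply_large_scale_config

    # Read an environment variable with type coercion and a fallback.
    concurrency = EnvConfigManager.get_env_var('CRAWLO_CONCURRENCY', 8, int)

    # Merge the "balanced" preset into an existing settings dict.
    settings = {'PROJECT_NAME': 'demo'}
    apply_large_scale_config(settings, config_type="balanced", concurrency=concurrency)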
crawlo/utils/controlled_spider_mixin.py CHANGED
@@ -10,7 +10,7 @@ from collections import deque
 from typing import Generator, Optional
 
 from crawlo import Request
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 
 class ControlledRequestMixin:
crawlo/utils/db_helper.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Union, List, Dict, Tuple, Optional
 from datetime import date, time, datetime
 from enum import Enum
 
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 logger = get_logger(__name__)
 
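
Several hunks in this release (batch_processor.py, controlled_spider_mixin.py, and db_helper.py above) swap the old crawlo.utils.log import for the crawlo.logging package, and the file list shows crawlo/utils/log.py itself being removed. Downstream code that still imports the old path would need the same one-line change; the snippet below is an illustration, not part of the diff:

    # from crawlo.utils.log import get_logger   # old path, removed in 1.4.8
    from crawlo.logging import get_logger       # new path

    logger = get_logger(__name__)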