crawlo 1.1.3-py3-none-any.whl → 1.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (118)
  1. crawlo/__init__.py +34 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/list.py +155 -155
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -196
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +186 -186
  12. crawlo/config.py +279 -279
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -171
  15. crawlo/core/enhanced_engine.py +189 -189
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +165 -165
  18. crawlo/crawler.py +1027 -1027
  19. crawlo/downloader/__init__.py +242 -242
  20. crawlo/downloader/aiohttp_downloader.py +212 -212
  21. crawlo/downloader/cffi_downloader.py +251 -251
  22. crawlo/downloader/httpx_downloader.py +259 -259
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +81 -81
  25. crawlo/extension/__init__.py +38 -31
  26. crawlo/extension/health_check.py +142 -0
  27. crawlo/extension/log_interval.py +58 -49
  28. crawlo/extension/log_stats.py +82 -44
  29. crawlo/extension/logging_extension.py +44 -35
  30. crawlo/extension/memory_monitor.py +89 -0
  31. crawlo/extension/performance_profiler.py +118 -0
  32. crawlo/extension/request_recorder.py +108 -0
  33. crawlo/filters/__init__.py +154 -154
  34. crawlo/filters/aioredis_filter.py +241 -241
  35. crawlo/filters/memory_filter.py +269 -269
  36. crawlo/items/__init__.py +23 -23
  37. crawlo/items/base.py +21 -21
  38. crawlo/items/fields.py +53 -53
  39. crawlo/items/items.py +104 -104
  40. crawlo/middleware/__init__.py +21 -21
  41. crawlo/middleware/default_header.py +32 -32
  42. crawlo/middleware/download_delay.py +28 -28
  43. crawlo/middleware/middleware_manager.py +135 -135
  44. crawlo/middleware/proxy.py +248 -248
  45. crawlo/middleware/request_ignore.py +30 -30
  46. crawlo/middleware/response_code.py +18 -18
  47. crawlo/middleware/response_filter.py +26 -26
  48. crawlo/middleware/retry.py +124 -124
  49. crawlo/mode_manager.py +200 -200
  50. crawlo/network/__init__.py +21 -21
  51. crawlo/network/request.py +311 -311
  52. crawlo/network/response.py +271 -271
  53. crawlo/pipelines/__init__.py +21 -21
  54. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  55. crawlo/pipelines/console_pipeline.py +39 -39
  56. crawlo/pipelines/csv_pipeline.py +316 -316
  57. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  58. crawlo/pipelines/json_pipeline.py +218 -218
  59. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  60. crawlo/pipelines/mongo_pipeline.py +132 -117
  61. crawlo/pipelines/mysql_pipeline.py +317 -195
  62. crawlo/pipelines/pipeline_manager.py +56 -56
  63. crawlo/pipelines/redis_dedup_pipeline.py +162 -162
  64. crawlo/project.py +153 -153
  65. crawlo/queue/pqueue.py +37 -37
  66. crawlo/queue/queue_manager.py +307 -307
  67. crawlo/queue/redis_priority_queue.py +208 -208
  68. crawlo/settings/__init__.py +7 -7
  69. crawlo/settings/default_settings.py +278 -244
  70. crawlo/settings/setting_manager.py +99 -99
  71. crawlo/spider/__init__.py +639 -639
  72. crawlo/stats_collector.py +59 -59
  73. crawlo/subscriber.py +131 -106
  74. crawlo/task_manager.py +30 -30
  75. crawlo/templates/crawlo.cfg.tmpl +10 -10
  76. crawlo/templates/project/__init__.py.tmpl +3 -3
  77. crawlo/templates/project/items.py.tmpl +17 -17
  78. crawlo/templates/project/middlewares.py.tmpl +111 -87
  79. crawlo/templates/project/pipelines.py.tmpl +97 -341
  80. crawlo/templates/project/run.py.tmpl +251 -251
  81. crawlo/templates/project/settings.py.tmpl +279 -250
  82. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  83. crawlo/templates/spider/spider.py.tmpl +142 -178
  84. crawlo/utils/__init__.py +7 -7
  85. crawlo/utils/controlled_spider_mixin.py +439 -439
  86. crawlo/utils/date_tools.py +233 -233
  87. crawlo/utils/db_helper.py +343 -343
  88. crawlo/utils/func_tools.py +82 -82
  89. crawlo/utils/large_scale_config.py +286 -286
  90. crawlo/utils/large_scale_helper.py +343 -343
  91. crawlo/utils/log.py +128 -128
  92. crawlo/utils/queue_helper.py +175 -175
  93. crawlo/utils/request.py +267 -267
  94. crawlo/utils/request_serializer.py +219 -219
  95. crawlo/utils/spider_loader.py +62 -62
  96. crawlo/utils/system.py +11 -11
  97. crawlo/utils/tools.py +4 -4
  98. crawlo/utils/url.py +39 -39
  99. crawlo-1.1.4.dist-info/METADATA +403 -0
  100. crawlo-1.1.4.dist-info/RECORD +117 -0
  101. examples/__init__.py +7 -7
  102. examples/controlled_spider_example.py +205 -205
  103. tests/__init__.py +7 -7
  104. tests/test_final_validation.py +153 -153
  105. tests/test_proxy_health_check.py +32 -32
  106. tests/test_proxy_middleware_integration.py +136 -136
  107. tests/test_proxy_providers.py +56 -56
  108. tests/test_proxy_stats.py +19 -19
  109. tests/test_proxy_strategies.py +59 -59
  110. tests/test_redis_config.py +28 -28
  111. tests/test_redis_queue.py +224 -224
  112. tests/test_request_serialization.py +70 -70
  113. tests/test_scheduler.py +241 -241
  114. crawlo-1.1.3.dist-info/METADATA +0 -635
  115. crawlo-1.1.3.dist-info/RECORD +0 -113
  116. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
  117. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
  118. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/utils/log.py CHANGED
@@ -1,129 +1,129 @@
# -*- coding: UTF-8 -*-
"""
Logger manager: safe version that uses stringified cache keys to avoid unhashable-key problems
"""
import os
from logging import (
    Formatter,
    StreamHandler,
    FileHandler,
    Logger,
    DEBUG,
    INFO,
    WARNING,
    ERROR,
    CRITICAL,
)

LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'


class LoggerManager:
    logger_cache = {}
    _default_filename = None
    _default_level = INFO
    _default_file_level = INFO
    _default_console_level = INFO
    _default_log_format = LOG_FORMAT
    _default_encoding = 'utf-8'

    _level_map = {
        'DEBUG': DEBUG,
        'INFO': INFO,
        'WARNING': WARNING,
        'ERROR': ERROR,
        'CRITICAL': CRITICAL,
    }

    @classmethod
    def _to_level(cls, level):
        """Safely convert a value to a logging level int"""
        if level is None:
            return INFO
        if isinstance(level, int):
            return level
        if isinstance(level, str):
            return cls._level_map.get(level.upper(), INFO)
        if hasattr(level, 'get'):  # e.g. a SettingManager or dict
            lv = level.get('LOG_LEVEL')
            if isinstance(lv, int):
                return lv
            if isinstance(lv, str):
                return cls._level_map.get(lv.upper(), INFO)
        return INFO

    @classmethod
    def configure(cls, settings=None, **kwargs):
        """
        Configure logging from a settings object or keyword arguments
        """
        # Prefer the settings object; otherwise fall back to kwargs
        get_val = settings.get if hasattr(settings, 'get') else (lambda k, d=None: kwargs.get(k, d))

        filename = get_val('LOG_FILE')
        level = get_val('LOG_LEVEL', 'INFO')
        file_level = get_val('LOG_FILE_LEVEL', level)
        console_level = get_val('LOG_CONSOLE_LEVEL', level)
        log_format = get_val('LOG_FORMAT', LOG_FORMAT)
        encoding = get_val('LOG_ENCODING', 'utf-8')

        cls._default_filename = filename
        cls._default_level = cls._to_level(level)
        cls._default_file_level = cls._to_level(file_level)
        cls._default_console_level = cls._to_level(console_level)
        cls._default_log_format = log_format
        cls._default_encoding = encoding

    @classmethod
    def get_logger(cls, name='default', level=None, filename=None):
        """
        Simplified interface that exposes only the essential parameters
        """
        # Resolve the effective parameters
        final_level = cls._to_level(level) if level is not None else cls._default_level
        final_filename = filename if filename is not None else cls._default_filename

        # Safe stringified cache key; avoids any unhashable types
        key_parts = [
            name,
            str(final_level),
            final_filename or 'no_file',
        ]
        key = '|'.join(key_parts)  # e.g. "my_spider|20|logs/app.log"

        if key in cls.logger_cache:
            return cls.logger_cache[key]

        # Create the logger
        _logger = Logger(name=name)
        _logger.setLevel(final_level)

        formatter = Formatter(cls._default_log_format)

        # Console handler
        if cls._default_console_level is not False:
            ch = StreamHandler()
            ch.setFormatter(formatter)
            ch.setLevel(cls._default_console_level)
            _logger.addHandler(ch)

        # File handler
        if final_filename:
            try:
                log_dir = os.path.dirname(final_filename)
                if log_dir and not os.path.exists(log_dir):
                    os.makedirs(log_dir, exist_ok=True)

                fh = FileHandler(final_filename, encoding=cls._default_encoding)
                fh.setFormatter(formatter)
                fh.setLevel(cls._default_file_level)
                _logger.addHandler(fh)
            except Exception as e:
                print(f"[Logger] Failed to create log file {final_filename}: {e}")

        cls.logger_cache[key] = _logger
        return _logger


# Global convenience function
  get_logger = LoggerManager.get_logger
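
For reference, a minimal usage sketch based on the module above (the spider name and log path are illustrative, not taken from the package's docs): configure the defaults once, then fetch cached loggers by name.

from crawlo.utils.log import LoggerManager, get_logger

# Configure defaults via keyword arguments (a settings object exposing .get() works the same way)
LoggerManager.configure(LOG_LEVEL='DEBUG', LOG_FILE='logs/app.log', LOG_CONSOLE_LEVEL='INFO')

# Loggers are cached by a "name|level|filename" string key, so repeated calls return the same object
logger = get_logger('my_spider')
logger.info('spider started')
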
crawlo/utils/queue_helper.py CHANGED
@@ -1,176 +1,176 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Queue configuration helper utilities
Provide a concise queue-configuration interface for users
"""
from typing import Dict, Any, Optional


class QueueHelper:
    """Queue configuration helper class"""

    @staticmethod
    def use_memory_queue(max_size: int = 2000) -> Dict[str, Any]:
        """
        Configure the in-memory queue

        Args:
            max_size: maximum queue capacity

        Returns:
            Configuration dict
        """
        return {
            'QUEUE_TYPE': 'memory',
            'SCHEDULER_MAX_QUEUE_SIZE': max_size,
        }

    @staticmethod
    def use_redis_queue(
        host: str = "127.0.0.1",
        port: int = 6379,
        password: Optional[str] = None,
        db: int = 0,
        queue_name: str = "crawlo:requests",
        max_retries: int = 3,
        timeout: int = 300
    ) -> Dict[str, Any]:
        """
        Configure the Redis distributed queue

        Args:
            host: Redis host address
            port: Redis port
            password: Redis password (optional)
            db: Redis database number
            queue_name: queue name
            max_retries: maximum number of retries
            timeout: operation timeout in seconds

        Returns:
            Configuration dict
        """
        if password:
            redis_url = f"redis://:{password}@{host}:{port}/{db}"
        else:
            redis_url = f"redis://{host}:{port}/{db}"

        return {
            'QUEUE_TYPE': 'redis',
            'REDIS_URL': redis_url,
            'REDIS_HOST': host,
            'REDIS_PORT': port,
            'REDIS_PASSWORD': password or '',
            'REDIS_DB': db,
            'SCHEDULER_QUEUE_NAME': queue_name,
            'QUEUE_MAX_RETRIES': max_retries,
            'QUEUE_TIMEOUT': timeout,
        }

    @staticmethod
    def auto_queue(
        redis_fallback: bool = True,
        memory_max_size: int = 2000,
        **redis_kwargs
    ) -> Dict[str, Any]:
        """
        Configure automatic queue-type selection

        Args:
            redis_fallback: whether to fall back to the memory queue when Redis is unavailable
            memory_max_size: maximum capacity of the memory queue
            **redis_kwargs: Redis configuration parameters

        Returns:
            Configuration dict
        """
        config = {
            'QUEUE_TYPE': 'auto',
            'SCHEDULER_MAX_QUEUE_SIZE': memory_max_size,
        }

        # Add Redis configuration (used for auto-detection)
        if redis_kwargs:
            redis_config = QueueHelper.use_redis_queue(**redis_kwargs)
            config.update(redis_config)
            config['QUEUE_TYPE'] = 'auto'  # make sure auto mode is preserved

        return config


# Predefined common configurations
class QueuePresets:
    """Predefined queue configurations"""

    # Development: in-memory queue
    DEVELOPMENT = QueueHelper.use_memory_queue(max_size=1000)

    # Production: Redis distributed queue
    PRODUCTION = QueueHelper.use_redis_queue(
        host="127.0.0.1",
        port=6379,
        queue_name="crawlo:production",
        max_retries=5,
        timeout=600
    )

    # Testing: auto-select; fall back to the memory queue if Redis is unavailable
    TESTING = QueueHelper.auto_queue(
        redis_fallback=True,
        memory_max_size=500,
        host="127.0.0.1",
        port=6379,
        queue_name="crawlo:testing"
    )

    # High performance: Redis cluster
    HIGH_PERFORMANCE = QueueHelper.use_redis_queue(
        host="redis-cluster.example.com",
        port=6379,
        queue_name="crawlo:cluster",
        max_retries=10,
        timeout=300
    )


def apply_queue_config(settings_dict: Dict[str, Any], config: Dict[str, Any]) -> None:
    """
    Apply a queue configuration to a settings dict

    Args:
        settings_dict: the existing settings dict
        config: the queue configuration dict
    """
    settings_dict.update(config)


# Usage examples and documentation
USAGE_EXAMPLES = """
# Usage examples:

# 1. Use the memory queue in settings.py
from crawlo.utils.queue_helper import QueueHelper, apply_queue_config
apply_queue_config(locals(), QueueHelper.use_memory_queue())

# 2. Use the Redis queue in settings.py
apply_queue_config(locals(), QueueHelper.use_redis_queue(
    host="redis.example.com",
    password="your_password"
))

# 3. Use a predefined preset
from crawlo.utils.queue_helper import QueuePresets
apply_queue_config(locals(), QueuePresets.PRODUCTION)

# 4. Let the queue type be selected automatically
apply_queue_config(locals(), QueueHelper.auto_queue(
    host="127.0.0.1",
    port=6379
))

# 5. Configure directly in settings
QUEUE_TYPE = 'auto'  # 'memory', 'redis', 'auto'
REDIS_URL = 'redis://127.0.0.1:6379/0'
SCHEDULER_MAX_QUEUE_SIZE = 2000
  """