crawlo-1.1.2-py3-none-any.whl → crawlo-1.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crawlo has been flagged as potentially problematic.

Files changed (113)
  1. crawlo/__init__.py +34 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/list.py +155 -155
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -196
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +186 -186
  12. crawlo/config.py +279 -279
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -171
  15. crawlo/core/enhanced_engine.py +189 -189
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +166 -162
  18. crawlo/crawler.py +1027 -1027
  19. crawlo/downloader/__init__.py +242 -242
  20. crawlo/downloader/aiohttp_downloader.py +212 -212
  21. crawlo/downloader/cffi_downloader.py +251 -251
  22. crawlo/downloader/httpx_downloader.py +259 -257
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +82 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -154
  30. crawlo/filters/aioredis_filter.py +242 -242
  31. crawlo/filters/memory_filter.py +269 -269
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -248
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -125
  45. crawlo/mode_manager.py +200 -200
  46. crawlo/network/__init__.py +21 -21
  47. crawlo/network/request.py +311 -311
  48. crawlo/network/response.py +271 -269
  49. crawlo/pipelines/__init__.py +22 -13
  50. crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
  51. crawlo/pipelines/console_pipeline.py +39 -39
  52. crawlo/pipelines/csv_pipeline.py +316 -316
  53. crawlo/pipelines/database_dedup_pipeline.py +225 -0
  54. crawlo/pipelines/json_pipeline.py +218 -218
  55. crawlo/pipelines/memory_dedup_pipeline.py +116 -0
  56. crawlo/pipelines/mongo_pipeline.py +116 -116
  57. crawlo/pipelines/mysql_pipeline.py +195 -195
  58. crawlo/pipelines/pipeline_manager.py +56 -56
  59. crawlo/pipelines/redis_dedup_pipeline.py +163 -0
  60. crawlo/project.py +153 -153
  61. crawlo/queue/pqueue.py +37 -37
  62. crawlo/queue/queue_manager.py +307 -303
  63. crawlo/queue/redis_priority_queue.py +208 -191
  64. crawlo/settings/__init__.py +7 -7
  65. crawlo/settings/default_settings.py +245 -226
  66. crawlo/settings/setting_manager.py +99 -99
  67. crawlo/spider/__init__.py +639 -639
  68. crawlo/stats_collector.py +59 -59
  69. crawlo/subscriber.py +106 -106
  70. crawlo/task_manager.py +30 -30
  71. crawlo/templates/crawlo.cfg.tmpl +10 -10
  72. crawlo/templates/project/__init__.py.tmpl +3 -3
  73. crawlo/templates/project/items.py.tmpl +17 -17
  74. crawlo/templates/project/middlewares.py.tmpl +86 -86
  75. crawlo/templates/project/pipelines.py.tmpl +341 -335
  76. crawlo/templates/project/run.py.tmpl +251 -238
  77. crawlo/templates/project/settings.py.tmpl +250 -247
  78. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  79. crawlo/templates/spider/spider.py.tmpl +177 -177
  80. crawlo/utils/__init__.py +7 -7
  81. crawlo/utils/controlled_spider_mixin.py +439 -335
  82. crawlo/utils/date_tools.py +233 -233
  83. crawlo/utils/db_helper.py +343 -343
  84. crawlo/utils/func_tools.py +82 -82
  85. crawlo/utils/large_scale_config.py +286 -286
  86. crawlo/utils/large_scale_helper.py +343 -343
  87. crawlo/utils/log.py +128 -128
  88. crawlo/utils/queue_helper.py +175 -175
  89. crawlo/utils/request.py +267 -267
  90. crawlo/utils/request_serializer.py +219 -219
  91. crawlo/utils/spider_loader.py +62 -62
  92. crawlo/utils/system.py +11 -11
  93. crawlo/utils/tools.py +4 -4
  94. crawlo/utils/url.py +39 -39
  95. {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/METADATA +635 -567
  96. crawlo-1.1.3.dist-info/RECORD +113 -0
  97. examples/__init__.py +7 -7
  98. examples/controlled_spider_example.py +205 -0
  99. tests/__init__.py +7 -7
  100. tests/test_final_validation.py +153 -153
  101. tests/test_proxy_health_check.py +32 -32
  102. tests/test_proxy_middleware_integration.py +136 -136
  103. tests/test_proxy_providers.py +56 -56
  104. tests/test_proxy_stats.py +19 -19
  105. tests/test_proxy_strategies.py +59 -59
  106. tests/test_redis_config.py +28 -28
  107. tests/test_redis_queue.py +224 -224
  108. tests/test_request_serialization.py +70 -70
  109. tests/test_scheduler.py +241 -241
  110. crawlo-1.1.2.dist-info/RECORD +0 -108
  111. {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
  112. {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
  113. {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
crawlo/utils/log.py CHANGED
@@ -1,129 +1,129 @@
# -*- coding: UTF-8 -*-
"""
Logger manager: safe version that uses stringified cache keys to avoid unhashable-type problems.
"""
import os
from logging import (
    Formatter,
    StreamHandler,
    FileHandler,
    Logger,
    DEBUG,
    INFO,
    WARNING,
    ERROR,
    CRITICAL,
)

LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'


class LoggerManager:
    logger_cache = {}
    _default_filename = None
    _default_level = INFO
    _default_file_level = INFO
    _default_console_level = INFO
    _default_log_format = LOG_FORMAT
    _default_encoding = 'utf-8'

    _level_map = {
        'DEBUG': DEBUG,
        'INFO': INFO,
        'WARNING': WARNING,
        'ERROR': ERROR,
        'CRITICAL': CRITICAL,
    }

    @classmethod
    def _to_level(cls, level):
        """Safely convert a value into an integer log level."""
        if level is None:
            return INFO
        if isinstance(level, int):
            return level
        if isinstance(level, str):
            return cls._level_map.get(level.upper(), INFO)
        if hasattr(level, 'get'):  # e.g. a SettingManager or a dict
            lv = level.get('LOG_LEVEL')
            if isinstance(lv, int):
                return lv
            if isinstance(lv, str):
                return cls._level_map.get(lv.upper(), INFO)
        return INFO

    @classmethod
    def configure(cls, settings=None, **kwargs):
        """
        Configure logging from a settings object or from keyword arguments.
        """
        # Prefer the settings object; otherwise fall back to kwargs
        get_val = settings.get if hasattr(settings, 'get') else (lambda k, d=None: kwargs.get(k, d))

        filename = get_val('LOG_FILE')
        level = get_val('LOG_LEVEL', 'INFO')
        file_level = get_val('LOG_FILE_LEVEL', level)
        console_level = get_val('LOG_CONSOLE_LEVEL', level)
        log_format = get_val('LOG_FORMAT', LOG_FORMAT)
        encoding = get_val('LOG_ENCODING', 'utf-8')

        cls._default_filename = filename
        cls._default_level = cls._to_level(level)
        cls._default_file_level = cls._to_level(file_level)
        cls._default_console_level = cls._to_level(console_level)
        cls._default_log_format = log_format
        cls._default_encoding = encoding

    @classmethod
    def get_logger(cls, name='default', level=None, filename=None):
        """
        Simplified interface that only exposes the essential parameters.
        """
        # Resolve the effective parameters
        final_level = cls._to_level(level) if level is not None else cls._default_level
        final_filename = filename if filename is not None else cls._default_filename

        # Safe stringified cache key that avoids any unhashable types
        key_parts = [
            name,
            str(final_level),
            final_filename or 'no_file',
        ]
        key = '|'.join(key_parts)  # e.g. "my_spider|20|logs/app.log"

        if key in cls.logger_cache:
            return cls.logger_cache[key]

        # Create the logger
        _logger = Logger(name=name)
        _logger.setLevel(final_level)

        formatter = Formatter(cls._default_log_format)

        # Console handler
        if cls._default_console_level is not False:
            ch = StreamHandler()
            ch.setFormatter(formatter)
            ch.setLevel(cls._default_console_level)
            _logger.addHandler(ch)

        # File handler
        if final_filename:
            try:
                log_dir = os.path.dirname(final_filename)
                if log_dir and not os.path.exists(log_dir):
                    os.makedirs(log_dir, exist_ok=True)

                fh = FileHandler(final_filename, encoding=cls._default_encoding)
                fh.setFormatter(formatter)
                fh.setLevel(cls._default_file_level)
                _logger.addHandler(fh)
            except Exception as e:
                print(f"[Logger] Could not create log file {final_filename}: {e}")

        cls.logger_cache[key] = _logger
        return _logger


# Global shortcut
get_logger = LoggerManager.get_logger
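For reference, a minimal usage sketch of the logger API above; the LOG_FILE path and logger name are placeholder values, not settings shipped with the package:

from crawlo.utils.log import LoggerManager, get_logger

# Configure the module-level defaults once (keys match those read by configure()).
LoggerManager.configure(
    LOG_FILE='logs/app.log',   # placeholder path
    LOG_LEVEL='DEBUG',
    LOG_CONSOLE_LEVEL='INFO',
)

# Loggers are cached under the stringified "name|level|filename" key shown above,
# so repeated calls with the same arguments return the same instance.
logger = get_logger('my_spider')
logger.info('spider started')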
crawlo/utils/queue_helper.py CHANGED
@@ -1,176 +1,176 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Queue configuration helpers.
Provide users with a concise interface for configuring queues.
"""
from typing import Dict, Any, Optional


class QueueHelper:
    """Queue configuration helper."""

    @staticmethod
    def use_memory_queue(max_size: int = 2000) -> Dict[str, Any]:
        """
        Configure an in-memory queue.

        Args:
            max_size: maximum queue capacity

        Returns:
            A settings dict.
        """
        return {
            'QUEUE_TYPE': 'memory',
            'SCHEDULER_MAX_QUEUE_SIZE': max_size,
        }

    @staticmethod
    def use_redis_queue(
        host: str = "127.0.0.1",
        port: int = 6379,
        password: Optional[str] = None,
        db: int = 0,
        queue_name: str = "crawlo:requests",
        max_retries: int = 3,
        timeout: int = 300
    ) -> Dict[str, Any]:
        """
        Configure a distributed Redis queue.

        Args:
            host: Redis host
            port: Redis port
            password: Redis password (optional)
            db: Redis database number
            queue_name: queue name
            max_retries: maximum number of retries
            timeout: operation timeout in seconds

        Returns:
            A settings dict.
        """
        if password:
            redis_url = f"redis://:{password}@{host}:{port}/{db}"
        else:
            redis_url = f"redis://{host}:{port}/{db}"

        return {
            'QUEUE_TYPE': 'redis',
            'REDIS_URL': redis_url,
            'REDIS_HOST': host,
            'REDIS_PORT': port,
            'REDIS_PASSWORD': password or '',
            'REDIS_DB': db,
            'SCHEDULER_QUEUE_NAME': queue_name,
            'QUEUE_MAX_RETRIES': max_retries,
            'QUEUE_TIMEOUT': timeout,
        }

    @staticmethod
    def auto_queue(
        redis_fallback: bool = True,
        memory_max_size: int = 2000,
        **redis_kwargs
    ) -> Dict[str, Any]:
        """
        Configure automatic queue-type selection.

        Args:
            redis_fallback: whether to fall back to the memory queue when Redis is unavailable
            memory_max_size: maximum capacity of the memory queue
            **redis_kwargs: Redis configuration parameters

        Returns:
            A settings dict.
        """
        config = {
            'QUEUE_TYPE': 'auto',
            'SCHEDULER_MAX_QUEUE_SIZE': memory_max_size,
        }

        # Add the Redis configuration (used for auto-detection)
        if redis_kwargs:
            redis_config = QueueHelper.use_redis_queue(**redis_kwargs)
            config.update(redis_config)
            config['QUEUE_TYPE'] = 'auto'  # keep auto mode

        return config


# Predefined common configurations
class QueuePresets:
    """Predefined queue configurations."""

    # Development: in-memory queue
    DEVELOPMENT = QueueHelper.use_memory_queue(max_size=1000)

    # Production: distributed Redis queue
    PRODUCTION = QueueHelper.use_redis_queue(
        host="127.0.0.1",
        port=6379,
        queue_name="crawlo:production",
        max_retries=5,
        timeout=600
    )

    # Testing: automatic selection, falling back to the memory queue when Redis is unavailable
    TESTING = QueueHelper.auto_queue(
        redis_fallback=True,
        memory_max_size=500,
        host="127.0.0.1",
        port=6379,
        queue_name="crawlo:testing"
    )

    # High-performance: Redis cluster
    HIGH_PERFORMANCE = QueueHelper.use_redis_queue(
        host="redis-cluster.example.com",
        port=6379,
        queue_name="crawlo:cluster",
        max_retries=10,
        timeout=300
    )


def apply_queue_config(settings_dict: Dict[str, Any], config: Dict[str, Any]) -> None:
    """
    Apply a queue configuration to a settings dict.

    Args:
        settings_dict: the existing settings dict
        config: the queue configuration dict
    """
    settings_dict.update(config)


# Usage examples and documentation
USAGE_EXAMPLES = """
# Usage examples:

# 1. Use the in-memory queue in settings.py
from crawlo.utils.queue_helper import QueueHelper
apply_queue_config(locals(), QueueHelper.use_memory_queue())

# 2. Use the Redis queue in settings.py
apply_queue_config(locals(), QueueHelper.use_redis_queue(
    host="redis.example.com",
    password="your_password"
))

# 3. Use a predefined preset
from crawlo.utils.queue_helper import QueuePresets
apply_queue_config(locals(), QueuePresets.PRODUCTION)

# 4. Let the framework choose the queue type automatically
apply_queue_config(locals(), QueueHelper.auto_queue(
    host="127.0.0.1",
    port=6379
))

# 5. Configure directly in settings
QUEUE_TYPE = 'auto'  # 'memory', 'redis', 'auto'
REDIS_URL = 'redis://127.0.0.1:6379/0'
SCHEDULER_MAX_QUEUE_SIZE = 2000
"""