crawlo 1.3.2__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (219) hide show
  1. crawlo/__init__.py +63 -63
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +322 -314
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +2 -2
  16. crawlo/core/engine.py +365 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +256 -256
  19. crawlo/crawler.py +1166 -1168
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +226 -226
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +52 -45
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/filters/__init__.py +154 -154
  40. crawlo/filters/aioredis_filter.py +234 -234
  41. crawlo/filters/memory_filter.py +269 -269
  42. crawlo/items/__init__.py +23 -23
  43. crawlo/items/base.py +21 -21
  44. crawlo/items/fields.py +52 -52
  45. crawlo/items/items.py +104 -104
  46. crawlo/middleware/__init__.py +21 -21
  47. crawlo/middleware/default_header.py +132 -132
  48. crawlo/middleware/download_delay.py +104 -104
  49. crawlo/middleware/middleware_manager.py +135 -135
  50. crawlo/middleware/offsite.py +123 -123
  51. crawlo/middleware/proxy.py +386 -386
  52. crawlo/middleware/request_ignore.py +86 -86
  53. crawlo/middleware/response_code.py +163 -163
  54. crawlo/middleware/response_filter.py +136 -136
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/middleware/simple_proxy.py +65 -65
  57. crawlo/mode_manager.py +187 -187
  58. crawlo/network/__init__.py +21 -21
  59. crawlo/network/request.py +379 -379
  60. crawlo/network/response.py +359 -359
  61. crawlo/pipelines/__init__.py +21 -21
  62. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  63. crawlo/pipelines/console_pipeline.py +39 -39
  64. crawlo/pipelines/csv_pipeline.py +316 -316
  65. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  66. crawlo/pipelines/json_pipeline.py +218 -218
  67. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  68. crawlo/pipelines/mongo_pipeline.py +131 -131
  69. crawlo/pipelines/mysql_pipeline.py +318 -318
  70. crawlo/pipelines/pipeline_manager.py +75 -75
  71. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  72. crawlo/project.py +325 -297
  73. crawlo/queue/pqueue.py +37 -37
  74. crawlo/queue/queue_manager.py +379 -379
  75. crawlo/queue/redis_priority_queue.py +306 -306
  76. crawlo/settings/__init__.py +7 -7
  77. crawlo/settings/default_settings.py +225 -225
  78. crawlo/settings/setting_manager.py +198 -198
  79. crawlo/spider/__init__.py +639 -639
  80. crawlo/stats_collector.py +59 -59
  81. crawlo/subscriber.py +129 -129
  82. crawlo/task_manager.py +30 -30
  83. crawlo/templates/crawlo.cfg.tmpl +10 -10
  84. crawlo/templates/project/__init__.py.tmpl +3 -3
  85. crawlo/templates/project/items.py.tmpl +17 -17
  86. crawlo/templates/project/middlewares.py.tmpl +118 -118
  87. crawlo/templates/project/pipelines.py.tmpl +96 -96
  88. crawlo/templates/project/settings.py.tmpl +266 -266
  89. crawlo/templates/project/settings_distributed.py.tmpl +179 -179
  90. crawlo/templates/project/settings_gentle.py.tmpl +60 -60
  91. crawlo/templates/project/settings_high_performance.py.tmpl +130 -130
  92. crawlo/templates/project/settings_minimal.py.tmpl +34 -34
  93. crawlo/templates/project/settings_simple.py.tmpl +101 -101
  94. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  95. crawlo/templates/run.py.tmpl +38 -38
  96. crawlo/templates/spider/spider.py.tmpl +143 -143
  97. crawlo/tools/__init__.py +200 -200
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/tools/data_formatter.py +225 -225
  101. crawlo/tools/data_validator.py +180 -180
  102. crawlo/tools/date_tools.py +289 -289
  103. crawlo/tools/distributed_coordinator.py +388 -388
  104. crawlo/tools/encoding_converter.py +127 -127
  105. crawlo/tools/request_tools.py +82 -82
  106. crawlo/tools/retry_mechanism.py +224 -224
  107. crawlo/tools/scenario_adapter.py +262 -262
  108. crawlo/tools/text_cleaner.py +232 -232
  109. crawlo/utils/__init__.py +34 -34
  110. crawlo/utils/batch_processor.py +259 -259
  111. crawlo/utils/controlled_spider_mixin.py +439 -439
  112. crawlo/utils/db_helper.py +343 -343
  113. crawlo/utils/enhanced_error_handler.py +356 -356
  114. crawlo/utils/env_config.py +142 -142
  115. crawlo/utils/error_handler.py +123 -123
  116. crawlo/utils/func_tools.py +82 -82
  117. crawlo/utils/large_scale_config.py +286 -286
  118. crawlo/utils/large_scale_helper.py +344 -344
  119. crawlo/utils/log.py +199 -146
  120. crawlo/utils/performance_monitor.py +285 -285
  121. crawlo/utils/queue_helper.py +175 -175
  122. crawlo/utils/redis_connection_pool.py +351 -351
  123. crawlo/utils/redis_key_validator.py +198 -198
  124. crawlo/utils/request.py +267 -267
  125. crawlo/utils/request_serializer.py +218 -218
  126. crawlo/utils/spider_loader.py +61 -61
  127. crawlo/utils/system.py +11 -11
  128. crawlo/utils/tools.py +4 -4
  129. crawlo/utils/url.py +39 -39
  130. {crawlo-1.3.2.dist-info → crawlo-1.3.3.dist-info}/METADATA +1020 -1020
  131. crawlo-1.3.3.dist-info/RECORD +219 -0
  132. examples/__init__.py +7 -7
  133. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  134. tests/__init__.py +7 -7
  135. tests/advanced_tools_example.py +275 -275
  136. tests/authenticated_proxy_example.py +107 -107
  137. tests/cleaners_example.py +160 -160
  138. tests/config_validation_demo.py +142 -142
  139. tests/controlled_spider_example.py +205 -205
  140. tests/date_tools_example.py +180 -180
  141. tests/debug_pipelines.py +66 -66
  142. tests/dynamic_loading_example.py +523 -523
  143. tests/dynamic_loading_test.py +104 -104
  144. tests/env_config_example.py +133 -133
  145. tests/error_handling_example.py +171 -171
  146. tests/redis_key_validation_demo.py +130 -130
  147. tests/request_params_example.py +150 -150
  148. tests/response_improvements_example.py +144 -144
  149. tests/test_advanced_tools.py +148 -148
  150. tests/test_all_redis_key_configs.py +145 -145
  151. tests/test_authenticated_proxy.py +141 -141
  152. tests/test_cleaners.py +54 -54
  153. tests/test_comprehensive.py +146 -146
  154. tests/test_config_consistency.py +80 -80
  155. tests/test_config_merge.py +152 -152
  156. tests/test_config_validator.py +182 -182
  157. tests/test_crawlo_proxy_integration.py +108 -108
  158. tests/test_date_tools.py +123 -123
  159. tests/test_default_header_middleware.py +158 -158
  160. tests/test_distributed.py +65 -65
  161. tests/test_double_crawlo_fix.py +207 -207
  162. tests/test_double_crawlo_fix_simple.py +124 -124
  163. tests/test_download_delay_middleware.py +221 -221
  164. tests/test_downloader_proxy_compatibility.py +268 -268
  165. tests/test_dynamic_downloaders_proxy.py +124 -124
  166. tests/test_dynamic_proxy.py +92 -92
  167. tests/test_dynamic_proxy_config.py +146 -146
  168. tests/test_dynamic_proxy_real.py +109 -109
  169. tests/test_edge_cases.py +303 -303
  170. tests/test_enhanced_error_handler.py +270 -270
  171. tests/test_env_config.py +121 -121
  172. tests/test_error_handler_compatibility.py +112 -112
  173. tests/test_final_validation.py +153 -153
  174. tests/test_framework_env_usage.py +103 -103
  175. tests/test_integration.py +169 -169
  176. tests/test_item_dedup_redis_key.py +122 -122
  177. tests/test_mode_consistency.py +51 -51
  178. tests/test_offsite_middleware.py +221 -221
  179. tests/test_parsel.py +29 -29
  180. tests/test_performance.py +327 -327
  181. tests/test_proxy_api.py +264 -264
  182. tests/test_proxy_health_check.py +32 -32
  183. tests/test_proxy_middleware.py +121 -121
  184. tests/test_proxy_middleware_enhanced.py +216 -216
  185. tests/test_proxy_middleware_integration.py +136 -136
  186. tests/test_proxy_middleware_refactored.py +184 -184
  187. tests/test_proxy_providers.py +56 -56
  188. tests/test_proxy_stats.py +19 -19
  189. tests/test_proxy_strategies.py +59 -59
  190. tests/test_queue_manager_double_crawlo.py +173 -173
  191. tests/test_queue_manager_redis_key.py +176 -176
  192. tests/test_random_user_agent.py +72 -72
  193. tests/test_real_scenario_proxy.py +195 -195
  194. tests/test_redis_config.py +28 -28
  195. tests/test_redis_connection_pool.py +294 -294
  196. tests/test_redis_key_naming.py +181 -181
  197. tests/test_redis_key_validator.py +123 -123
  198. tests/test_redis_queue.py +224 -224
  199. tests/test_request_ignore_middleware.py +182 -182
  200. tests/test_request_params.py +111 -111
  201. tests/test_request_serialization.py +70 -70
  202. tests/test_response_code_middleware.py +349 -349
  203. tests/test_response_filter_middleware.py +427 -427
  204. tests/test_response_improvements.py +152 -152
  205. tests/test_retry_middleware.py +241 -241
  206. tests/test_scheduler.py +252 -252
  207. tests/test_scheduler_config_update.py +133 -133
  208. tests/test_simple_response.py +61 -61
  209. tests/test_telecom_spider_redis_key.py +205 -205
  210. tests/test_template_content.py +87 -87
  211. tests/test_template_redis_key.py +134 -134
  212. tests/test_tools.py +159 -159
  213. tests/test_user_agents.py +96 -96
  214. tests/tools_example.py +260 -260
  215. tests/verify_distributed.py +117 -117
  216. crawlo-1.3.2.dist-info/RECORD +0 -219
  217. {crawlo-1.3.2.dist-info → crawlo-1.3.3.dist-info}/WHEEL +0 -0
  218. {crawlo-1.3.2.dist-info → crawlo-1.3.3.dist-info}/entry_points.txt +0 -0
  219. {crawlo-1.3.2.dist-info → crawlo-1.3.3.dist-info}/top_level.txt +0 -0
crawlo/mode_manager.py CHANGED
@@ -1,188 +1,188 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 运行模式管理器
5
- ==============
6
- 管理 Crawlo 框架的不同运行模式,提供优雅的配置方式。
7
-
8
- 支持的运行模式:
9
- 1. standalone - 单机模式(默认)
10
- 2. distributed - 分布式模式
11
- 3. auto - 自动检测模式
12
- """
13
- import os
14
- from enum import Enum
15
- from typing import Dict, Any, Optional
16
-
17
-
18
- class RunMode(Enum):
19
- """运行模式枚举"""
20
- STANDALONE = "standalone" # 单机模式
21
- DISTRIBUTED = "distributed" # 分布式模式
22
- AUTO = "auto" # 自动检测模式
23
-
24
-
25
- class ModeManager:
26
- """运行模式管理器"""
27
-
28
- def __init__(self):
29
- pass
30
-
31
- @staticmethod
32
- def get_standalone_settings() -> Dict[str, Any]:
33
- """获取单机模式配置"""
34
- return {
35
- 'QUEUE_TYPE': 'memory',
36
- 'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
37
- 'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
38
- 'CONCURRENCY': 8,
39
- 'MAX_RUNNING_SPIDERS': 1,
40
- 'DOWNLOAD_DELAY': 1.0,
41
- }
42
-
43
- @staticmethod
44
- def get_distributed_settings(
45
- redis_host: str = '127.0.0.1',
46
- redis_port: int = 6379,
47
- redis_password: Optional[str] = None,
48
- redis_db: int = 0,
49
- project_name: str = 'crawlo'
50
- ) -> Dict[str, Any]:
51
- """获取分布式模式配置"""
52
- # 构建 Redis URL
53
- if redis_password:
54
- redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
55
- else:
56
- redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
57
-
58
- return {
59
- 'QUEUE_TYPE': 'redis',
60
- 'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
61
- 'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
62
- 'REDIS_HOST': redis_host,
63
- 'REDIS_PORT': redis_port,
64
- 'REDIS_PASSWORD': redis_password,
65
- 'REDIS_DB': redis_db,
66
- 'REDIS_URL': redis_url,
67
- 'PROJECT_NAME': project_name,
68
- 'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',
69
- 'CONCURRENCY': 16,
70
- 'MAX_RUNNING_SPIDERS': 10,
71
- 'DOWNLOAD_DELAY': 1.0,
72
- }
73
-
74
- @staticmethod
75
- def get_auto_settings() -> Dict[str, Any]:
76
- """获取自动检测模式配置"""
77
- # 默认使用内存队列和过滤器
78
- settings = ModeManager.get_standalone_settings()
79
- settings['QUEUE_TYPE'] = 'auto'
80
- return settings
81
-
82
- def resolve_mode_settings(
83
- self,
84
- mode: str = 'standalone',
85
- **kwargs
86
- ) -> Dict[str, Any]:
87
- """
88
- 解析运行模式并返回对应配置
89
-
90
- Args:
91
- mode: 运行模式 ('standalone', 'distributed', 'auto')
92
- **kwargs: 额外配置参数
93
-
94
- Returns:
95
- Dict[str, Any]: 配置字典
96
- """
97
- mode = RunMode(mode.lower())
98
- mode_info = None
99
-
100
- if mode == RunMode.STANDALONE:
101
- mode_info = "使用单机模式 - 简单快速,适合开发和中小规模爬取"
102
- settings = self.get_standalone_settings()
103
-
104
- elif mode == RunMode.DISTRIBUTED:
105
- mode_info = "使用分布式模式 - 支持多节点扩展,适合大规模爬取"
106
- settings = self.get_distributed_settings(
107
- redis_host=kwargs.get('redis_host', '127.0.0.1'),
108
- redis_port=kwargs.get('redis_port', 6379),
109
- redis_password=kwargs.get('redis_password'),
110
- redis_db=kwargs.get('redis_db', 0), # 添加 redis_db 参数
111
- project_name=kwargs.get('project_name', 'crawlo')
112
- )
113
-
114
- elif mode == RunMode.AUTO:
115
- mode_info = "使用自动检测模式 - 智能选择最佳运行方式"
116
- settings = self.get_auto_settings()
117
-
118
- else:
119
- raise ValueError(f"不支持的运行模式: {mode}")
120
-
121
- # 合并用户自定义配置
122
- user_settings = {k: v for k, v in kwargs.items()
123
- if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
124
- settings.update(user_settings)
125
-
126
- # 将模式信息添加到配置中,供后续使用
127
- settings['_mode_info'] = mode_info
128
-
129
- return settings
130
-
131
- def from_environment(self) -> Dict[str, Any]:
132
- """从环境变量构建配置"""
133
- config = {}
134
-
135
- # 扫描 CRAWLO_ 前缀的环境变量
136
- for key, value in os.environ.items():
137
- if key.startswith('CRAWLO_'):
138
- config_key = key[7:] # 去掉 'CRAWLO_' 前缀
139
- # 简单的类型转换
140
- if value.lower() in ('true', 'false'):
141
- config[config_key] = value.lower() == 'true'
142
- elif value.isdigit():
143
- config[config_key] = int(value)
144
- else:
145
- try:
146
- config[config_key] = float(value)
147
- except ValueError:
148
- config[config_key] = value
149
-
150
- return config
151
-
152
-
153
- # 便利函数
154
- def standalone_mode(**kwargs) -> Dict[str, Any]:
155
- """快速创建单机模式配置"""
156
- return ModeManager().resolve_mode_settings('standalone', **kwargs)
157
-
158
-
159
- def distributed_mode(
160
- redis_host: str = '127.0.0.1',
161
- redis_port: int = 6379,
162
- redis_password: Optional[str] = None,
163
- redis_db: int = 0, # 添加 redis_db 参数
164
- project_name: str = 'crawlo',
165
- **kwargs
166
- ) -> Dict[str, Any]:
167
- """快速创建分布式模式配置"""
168
- return ModeManager().resolve_mode_settings(
169
- 'distributed',
170
- redis_host=redis_host,
171
- redis_port=redis_port,
172
- redis_password=redis_password,
173
- redis_db=redis_db, # 传递 redis_db 参数
174
- project_name=project_name,
175
- **kwargs
176
- )
177
-
178
-
179
- def auto_mode(**kwargs) -> Dict[str, Any]:
180
- """快速创建自动检测模式配置"""
181
- return ModeManager().resolve_mode_settings('auto', **kwargs)
182
-
183
-
184
- # 环境变量支持
185
- def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
186
- """从环境变量创建配置"""
187
- # 移除直接使用 os.getenv(),要求通过 settings 配置
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 运行模式管理器
5
+ ==============
6
+ 管理 Crawlo 框架的不同运行模式,提供优雅的配置方式。
7
+
8
+ 支持的运行模式:
9
+ 1. standalone - 单机模式(默认)
10
+ 2. distributed - 分布式模式
11
+ 3. auto - 自动检测模式
12
+ """
13
+ import os
14
+ from enum import Enum
15
+ from typing import Dict, Any, Optional
16
+
17
+
18
class RunMode(Enum):
    """Enumeration of the run modes supported by the Crawlo framework."""
    STANDALONE = "standalone"    # single-node mode (default)
    DISTRIBUTED = "distributed"  # multi-node mode backed by Redis
    AUTO = "auto"                # auto-detect the best mode at runtime


class ModeManager:
    """Builds settings dictionaries for each Crawlo run mode.

    Provides static factories for standalone / distributed / auto settings,
    plus `resolve_mode_settings` which picks the right factory from a mode
    string and merges in user overrides.
    """

    # Keyword arguments consumed by get_distributed_settings(). These must
    # NOT be copied into the merged settings as user overrides.
    # BUG FIX: 'redis_db' was previously missing from this exclusion list,
    # so a stray lowercase 'redis_db' key leaked into the final settings.
    _MODE_KWARGS = ('redis_host', 'redis_port', 'redis_password', 'redis_db', 'project_name')

    @staticmethod
    def get_standalone_settings() -> Dict[str, Any]:
        """Return default settings for standalone (single-node) runs."""
        return {
            'QUEUE_TYPE': 'memory',
            'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
            'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
            'CONCURRENCY': 8,
            'MAX_RUNNING_SPIDERS': 1,
            'DOWNLOAD_DELAY': 1.0,
        }

    @staticmethod
    def get_distributed_settings(
        redis_host: str = '127.0.0.1',
        redis_port: int = 6379,
        redis_password: Optional[str] = None,
        redis_db: int = 0,
        project_name: str = 'crawlo'
    ) -> Dict[str, Any]:
        """Return settings for distributed runs (Redis-backed queue/filter).

        Args:
            redis_host: Redis server host.
            redis_port: Redis server port.
            redis_password: Optional Redis password (embedded in REDIS_URL).
            redis_db: Redis logical database index.
            project_name: Used to namespace the scheduler queue key.
        """
        # Build the Redis connection URL; include the password only when set.
        if redis_password:
            redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
        else:
            redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'

        return {
            'QUEUE_TYPE': 'redis',
            'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
            'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
            'REDIS_HOST': redis_host,
            'REDIS_PORT': redis_port,
            'REDIS_PASSWORD': redis_password,
            'REDIS_DB': redis_db,
            'REDIS_URL': redis_url,
            'PROJECT_NAME': project_name,
            'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',
            'CONCURRENCY': 16,
            'MAX_RUNNING_SPIDERS': 10,
            'DOWNLOAD_DELAY': 1.0,
        }

    @staticmethod
    def get_auto_settings() -> Dict[str, Any]:
        """Return auto-detect settings: standalone defaults with QUEUE_TYPE='auto'."""
        settings = ModeManager.get_standalone_settings()
        settings['QUEUE_TYPE'] = 'auto'
        return settings

    def resolve_mode_settings(
        self,
        mode: str = 'standalone',
        **kwargs
    ) -> Dict[str, Any]:
        """Resolve a mode string to its settings dict, merged with overrides.

        Args:
            mode: One of 'standalone', 'distributed', 'auto' (case-insensitive).
            **kwargs: Redis/project parameters for distributed mode, plus any
                extra keys that are copied verbatim into the result.

        Returns:
            The settings dict, including a '_mode_info' description string.

        Raises:
            ValueError: If `mode` is not a supported run mode.
        """
        # BUG FIX: RunMode(...) raises its own ValueError for unknown values,
        # which previously made the custom error branch unreachable. Catch it
        # and re-raise with the intended message.
        try:
            run_mode = RunMode(mode.lower())
        except ValueError:
            raise ValueError(f"不支持的运行模式: {mode}") from None

        if run_mode is RunMode.STANDALONE:
            mode_info = "使用单机模式 - 简单快速,适合开发和中小规模爬取"
            settings = self.get_standalone_settings()
        elif run_mode is RunMode.DISTRIBUTED:
            mode_info = "使用分布式模式 - 支持多节点扩展,适合大规模爬取"
            settings = self.get_distributed_settings(
                redis_host=kwargs.get('redis_host', '127.0.0.1'),
                redis_port=kwargs.get('redis_port', 6379),
                redis_password=kwargs.get('redis_password'),
                redis_db=kwargs.get('redis_db', 0),
                project_name=kwargs.get('project_name', 'crawlo')
            )
        else:  # RunMode.AUTO
            mode_info = "使用自动检测模式 - 智能选择最佳运行方式"
            settings = self.get_auto_settings()

        # Merge user overrides, excluding the kwargs already consumed above.
        user_settings = {k: v for k, v in kwargs.items() if k not in self._MODE_KWARGS}
        settings.update(user_settings)

        # Attach the human-readable mode description for later logging.
        settings['_mode_info'] = mode_info
        return settings

    def from_environment(self) -> Dict[str, Any]:
        """Build a config dict from CRAWLO_-prefixed environment variables.

        Values are coerced: 'true'/'false' -> bool, integer-looking -> int,
        float-looking -> float, anything else stays a string.
        """
        config = {}
        for key, value in os.environ.items():
            if not key.startswith('CRAWLO_'):
                continue
            config_key = key[len('CRAWLO_'):]
            lowered = value.lower()
            if lowered in ('true', 'false'):
                config[config_key] = lowered == 'true'
            else:
                # BUG FIX: the old isdigit() check rejected negative integers,
                # so '-5' was stored as the float -5.0. int() handles signs.
                try:
                    config[config_key] = int(value)
                except ValueError:
                    try:
                        config[config_key] = float(value)
                    except ValueError:
                        config[config_key] = value
        return config
151
+
152
+
153
+ # 便利函数
154
def standalone_mode(**kwargs) -> Dict[str, Any]:
    """Convenience wrapper: build a standalone-mode settings dict."""
    manager = ModeManager()
    return manager.resolve_mode_settings('standalone', **kwargs)
157
+
158
+
159
def distributed_mode(
    redis_host: str = '127.0.0.1',
    redis_port: int = 6379,
    redis_password: Optional[str] = None,
    redis_db: int = 0,
    project_name: str = 'crawlo',
    **kwargs
) -> Dict[str, Any]:
    """Convenience wrapper: build a distributed-mode settings dict.

    Forwards the Redis connection parameters and the project name to
    ModeManager.resolve_mode_settings, along with any extra overrides.
    """
    redis_options = dict(
        redis_host=redis_host,
        redis_port=redis_port,
        redis_password=redis_password,
        redis_db=redis_db,
        project_name=project_name,
    )
    return ModeManager().resolve_mode_settings('distributed', **redis_options, **kwargs)
177
+
178
+
179
def auto_mode(**kwargs) -> Dict[str, Any]:
    """Convenience wrapper: build an auto-detect-mode settings dict."""
    manager = ModeManager()
    return manager.resolve_mode_settings('auto', **kwargs)
182
+
183
+
184
+ # 环境变量支持
185
def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
    """Deprecated entry point: environment-variable configuration was removed.

    Kept only so old callers fail loudly; configure via settings instead.
    Always raises RuntimeError.
    """
    # Direct os.getenv() support was intentionally dropped in this release.
    raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
@@ -1,21 +1,21 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- Crawlo Network Module
5
- ====================
6
- 提供HTTP请求和响应对象的封装。
7
-
8
- 主要组件:
9
- - Request: HTTP请求封装
10
- - Response: HTTP响应封装
11
- - RequestPriority: 请求优先级常量
12
- """
13
-
14
- from .request import Request, RequestPriority
15
- from .response import Response
16
-
17
- __all__ = [
18
- 'Request',
19
- 'RequestPriority',
20
- 'Response',
21
- ]
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ Crawlo Network Module
5
+ ====================
6
+ 提供HTTP请求和响应对象的封装。
7
+
8
+ 主要组件:
9
+ - Request: HTTP请求封装
10
+ - Response: HTTP响应封装
11
+ - RequestPriority: 请求优先级常量
12
+ """
13
+
14
+ from .request import Request, RequestPriority
15
+ from .response import Response
16
+
17
+ __all__ = [
18
+ 'Request',
19
+ 'RequestPriority',
20
+ 'Response',
21
+ ]