crawlo-1.1.4-py3-none-any.whl → crawlo-1.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Potentially problematic release.

This version of crawlo might be problematic.

Files changed (186)
  1. crawlo/__init__.py +61 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/cli.py +40 -40
  8. crawlo/commands/__init__.py +13 -13
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/list.py +155 -155
  12. crawlo/commands/run.py +285 -285
  13. crawlo/commands/startproject.py +300 -196
  14. crawlo/commands/stats.py +188 -188
  15. crawlo/commands/utils.py +186 -186
  16. crawlo/config.py +309 -279
  17. crawlo/config_validator.py +253 -0
  18. crawlo/core/__init__.py +2 -2
  19. crawlo/core/engine.py +346 -172
  20. crawlo/core/processor.py +40 -40
  21. crawlo/core/scheduler.py +137 -166
  22. crawlo/crawler.py +1027 -1027
  23. crawlo/downloader/__init__.py +266 -242
  24. crawlo/downloader/aiohttp_downloader.py +220 -212
  25. crawlo/downloader/cffi_downloader.py +256 -251
  26. crawlo/downloader/httpx_downloader.py +259 -259
  27. crawlo/downloader/hybrid_downloader.py +214 -0
  28. crawlo/downloader/playwright_downloader.py +403 -0
  29. crawlo/downloader/selenium_downloader.py +473 -0
  30. crawlo/event.py +11 -11
  31. crawlo/exceptions.py +81 -81
  32. crawlo/extension/__init__.py +37 -37
  33. crawlo/extension/health_check.py +141 -141
  34. crawlo/extension/log_interval.py +57 -57
  35. crawlo/extension/log_stats.py +81 -81
  36. crawlo/extension/logging_extension.py +43 -43
  37. crawlo/extension/memory_monitor.py +104 -88
  38. crawlo/extension/performance_profiler.py +133 -117
  39. crawlo/extension/request_recorder.py +107 -107
  40. crawlo/filters/__init__.py +154 -154
  41. crawlo/filters/aioredis_filter.py +280 -242
  42. crawlo/filters/memory_filter.py +269 -269
  43. crawlo/items/__init__.py +23 -23
  44. crawlo/items/base.py +21 -21
  45. crawlo/items/fields.py +53 -53
  46. crawlo/items/items.py +104 -104
  47. crawlo/middleware/__init__.py +21 -21
  48. crawlo/middleware/default_header.py +32 -32
  49. crawlo/middleware/download_delay.py +28 -28
  50. crawlo/middleware/middleware_manager.py +135 -135
  51. crawlo/middleware/proxy.py +272 -248
  52. crawlo/middleware/request_ignore.py +30 -30
  53. crawlo/middleware/response_code.py +18 -18
  54. crawlo/middleware/response_filter.py +26 -26
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/mode_manager.py +206 -201
  57. crawlo/network/__init__.py +21 -21
  58. crawlo/network/request.py +338 -311
  59. crawlo/network/response.py +360 -271
  60. crawlo/pipelines/__init__.py +21 -21
  61. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  62. crawlo/pipelines/console_pipeline.py +39 -39
  63. crawlo/pipelines/csv_pipeline.py +316 -316
  64. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  65. crawlo/pipelines/json_pipeline.py +218 -218
  66. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  67. crawlo/pipelines/mongo_pipeline.py +131 -131
  68. crawlo/pipelines/mysql_pipeline.py +316 -316
  69. crawlo/pipelines/pipeline_manager.py +56 -56
  70. crawlo/pipelines/redis_dedup_pipeline.py +166 -162
  71. crawlo/project.py +153 -153
  72. crawlo/queue/pqueue.py +37 -37
  73. crawlo/queue/queue_manager.py +320 -307
  74. crawlo/queue/redis_priority_queue.py +277 -209
  75. crawlo/settings/__init__.py +7 -7
  76. crawlo/settings/default_settings.py +216 -278
  77. crawlo/settings/setting_manager.py +99 -99
  78. crawlo/spider/__init__.py +639 -639
  79. crawlo/stats_collector.py +59 -59
  80. crawlo/subscriber.py +130 -130
  81. crawlo/task_manager.py +30 -30
  82. crawlo/templates/crawlo.cfg.tmpl +10 -10
  83. crawlo/templates/project/__init__.py.tmpl +3 -3
  84. crawlo/templates/project/items.py.tmpl +17 -17
  85. crawlo/templates/project/middlewares.py.tmpl +110 -110
  86. crawlo/templates/project/pipelines.py.tmpl +97 -97
  87. crawlo/templates/project/run.py.tmpl +251 -251
  88. crawlo/templates/project/settings.py.tmpl +326 -279
  89. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  90. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  91. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  92. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  93. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  94. crawlo/templates/spider/spider.py.tmpl +141 -141
  95. crawlo/tools/__init__.py +183 -0
  96. crawlo/tools/anti_crawler.py +269 -0
  97. crawlo/tools/authenticated_proxy.py +241 -0
  98. crawlo/tools/data_validator.py +181 -0
  99. crawlo/tools/date_tools.py +36 -0
  100. crawlo/tools/distributed_coordinator.py +387 -0
  101. crawlo/tools/retry_mechanism.py +221 -0
  102. crawlo/tools/scenario_adapter.py +263 -0
  103. crawlo/utils/__init__.py +35 -7
  104. crawlo/utils/batch_processor.py +261 -0
  105. crawlo/utils/controlled_spider_mixin.py +439 -439
  106. crawlo/utils/date_tools.py +290 -233
  107. crawlo/utils/db_helper.py +343 -343
  108. crawlo/utils/enhanced_error_handler.py +360 -0
  109. crawlo/utils/env_config.py +106 -0
  110. crawlo/utils/error_handler.py +126 -0
  111. crawlo/utils/func_tools.py +82 -82
  112. crawlo/utils/large_scale_config.py +286 -286
  113. crawlo/utils/large_scale_helper.py +343 -343
  114. crawlo/utils/log.py +128 -128
  115. crawlo/utils/performance_monitor.py +285 -0
  116. crawlo/utils/queue_helper.py +175 -175
  117. crawlo/utils/redis_connection_pool.py +335 -0
  118. crawlo/utils/redis_key_validator.py +200 -0
  119. crawlo/utils/request.py +267 -267
  120. crawlo/utils/request_serializer.py +219 -219
  121. crawlo/utils/spider_loader.py +62 -62
  122. crawlo/utils/system.py +11 -11
  123. crawlo/utils/tools.py +4 -4
  124. crawlo/utils/url.py +39 -39
  125. {crawlo-1.1.4.dist-info → crawlo-1.1.5.dist-info}/METADATA +401 -403
  126. crawlo-1.1.5.dist-info/RECORD +185 -0
  127. examples/__init__.py +7 -7
  128. tests/__init__.py +7 -7
  129. tests/advanced_tools_example.py +276 -0
  130. tests/authenticated_proxy_example.py +237 -0
  131. tests/cleaners_example.py +161 -0
  132. tests/config_validation_demo.py +103 -0
  133. {examples → tests}/controlled_spider_example.py +205 -205
  134. tests/date_tools_example.py +181 -0
  135. tests/dynamic_loading_example.py +524 -0
  136. tests/dynamic_loading_test.py +105 -0
  137. tests/env_config_example.py +134 -0
  138. tests/error_handling_example.py +172 -0
  139. tests/redis_key_validation_demo.py +131 -0
  140. tests/response_improvements_example.py +145 -0
  141. tests/test_advanced_tools.py +149 -0
  142. tests/test_all_redis_key_configs.py +146 -0
  143. tests/test_authenticated_proxy.py +142 -0
  144. tests/test_cleaners.py +55 -0
  145. tests/test_comprehensive.py +147 -0
  146. tests/test_config_validator.py +194 -0
  147. tests/test_date_tools.py +124 -0
  148. tests/test_dynamic_downloaders_proxy.py +125 -0
  149. tests/test_dynamic_proxy.py +93 -0
  150. tests/test_dynamic_proxy_config.py +147 -0
  151. tests/test_dynamic_proxy_real.py +110 -0
  152. tests/test_edge_cases.py +304 -0
  153. tests/test_enhanced_error_handler.py +271 -0
  154. tests/test_env_config.py +122 -0
  155. tests/test_error_handler_compatibility.py +113 -0
  156. tests/test_final_validation.py +153 -153
  157. tests/test_framework_env_usage.py +104 -0
  158. tests/test_integration.py +357 -0
  159. tests/test_item_dedup_redis_key.py +123 -0
  160. tests/test_parsel.py +30 -0
  161. tests/test_performance.py +328 -0
  162. tests/test_proxy_health_check.py +32 -32
  163. tests/test_proxy_middleware_integration.py +136 -136
  164. tests/test_proxy_providers.py +56 -56
  165. tests/test_proxy_stats.py +19 -19
  166. tests/test_proxy_strategies.py +59 -59
  167. tests/test_queue_manager_redis_key.py +177 -0
  168. tests/test_redis_config.py +28 -28
  169. tests/test_redis_connection_pool.py +295 -0
  170. tests/test_redis_key_naming.py +182 -0
  171. tests/test_redis_key_validator.py +124 -0
  172. tests/test_redis_queue.py +224 -224
  173. tests/test_request_serialization.py +70 -70
  174. tests/test_response_improvements.py +153 -0
  175. tests/test_scheduler.py +241 -241
  176. tests/test_simple_response.py +62 -0
  177. tests/test_telecom_spider_redis_key.py +206 -0
  178. tests/test_template_content.py +88 -0
  179. tests/test_template_redis_key.py +135 -0
  180. tests/test_tools.py +154 -0
  181. tests/tools_example.py +258 -0
  182. crawlo/core/enhanced_engine.py +0 -190
  183. crawlo-1.1.4.dist-info/RECORD +0 -117
  184. {crawlo-1.1.4.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
  185. {crawlo-1.1.4.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
  186. {crawlo-1.1.4.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
crawlo/mode_manager.py CHANGED
@@ -1,201 +1,206 @@
  #!/usr/bin/python
  # -*- coding: UTF-8 -*-
  """
  Run Mode Manager
  ==============
  Manages the different run modes of the Crawlo framework and provides a clean way to configure them.

  Supported run modes:
  1. standalone  - single-node mode (default)
  2. distributed - distributed mode
  3. auto        - auto-detection mode
  """

  from typing import Dict, Any, Optional
  from enum import Enum
  import os
  from crawlo.utils.log import get_logger


  class RunMode(Enum):
      """Run mode enumeration"""
      STANDALONE = "standalone"    # single-node mode
      DISTRIBUTED = "distributed"  # distributed mode
      AUTO = "auto"                # auto-detection mode


  class ModeManager:
      """Run mode manager"""

      def __init__(self):
          self.logger = get_logger(self.__class__.__name__)

      @staticmethod
      def get_standalone_settings() -> Dict[str, Any]:
          """Return the standalone-mode settings"""
          return {
              'QUEUE_TYPE': 'memory',
              'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
              'CONCURRENCY': 8,
              'MAX_RUNNING_SPIDERS': 1,
              'DOWNLOAD_DELAY': 1.0,
              'LOG_LEVEL': 'INFO',
          }

      @staticmethod
      def get_distributed_settings(
              redis_host: str = '127.0.0.1',
              redis_port: int = 6379,
              redis_password: Optional[str] = None,
              project_name: str = 'crawlo'
      ) -> Dict[str, Any]:
          """Return the distributed-mode settings"""
          # Build the Redis URL
          if redis_password:
              redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/0'
          else:
              redis_url = f'redis://{redis_host}:{redis_port}/0'

          return {
              'QUEUE_TYPE': 'redis',
              'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
              'REDIS_HOST': redis_host,
              'REDIS_PORT': redis_port,
              'REDIS_PASSWORD': redis_password,
              'REDIS_URL': redis_url,
-             'SCHEDULER_QUEUE_NAME': f'{project_name}:requests',
-             'REDIS_KEY': f'{project_name}:fingerprint',
+             'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',  # unified naming scheme
+             # Redis key settings have moved into the individual components and follow the unified naming scheme:
+             # crawlo:{project_name}:filter:fingerprint (request deduplication)
              'CONCURRENCY': 16,
              'MAX_RUNNING_SPIDERS': 1,
              'DOWNLOAD_DELAY': 1.0,
              'LOG_LEVEL': 'INFO',
          }

      @staticmethod
      def get_auto_settings() -> Dict[str, Any]:
          """Return the auto-detection-mode settings"""
          return {
              'QUEUE_TYPE': 'auto',
              'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',  # default in-memory filter
              'CONCURRENCY': 12,
              'MAX_RUNNING_SPIDERS': 1,
              'DOWNLOAD_DELAY': 1.0,
              'LOG_LEVEL': 'INFO',
          }

      def resolve_mode_settings(
              self,
              mode: str = 'standalone',
              **kwargs
      ) -> Dict[str, Any]:
          """
          Resolve the run mode and return the corresponding settings.

          Args:
              mode: run mode ('standalone', 'distributed', 'auto')
              **kwargs: extra configuration parameters

          Returns:
              Dict[str, Any]: settings dictionary
          """
          mode = RunMode(mode.lower())

          if mode == RunMode.STANDALONE:
              self.logger.info("🏠 Using standalone mode - simple and fast, suited to development and small-to-medium crawls")
              settings = self.get_standalone_settings()

          elif mode == RunMode.DISTRIBUTED:
              self.logger.info("🌐 Using distributed mode - scales across multiple nodes, suited to large crawls")
              settings = self.get_distributed_settings(
                  redis_host=kwargs.get('redis_host', '127.0.0.1'),
                  redis_port=kwargs.get('redis_port', 6379),
                  redis_password=kwargs.get('redis_password'),
                  project_name=kwargs.get('project_name', 'crawlo')
              )

          elif mode == RunMode.AUTO:
              self.logger.info("🤖 Using auto-detection mode - picks the best run strategy automatically")
              settings = self.get_auto_settings()

          else:
              raise ValueError(f"Unsupported run mode: {mode}")

          # Merge user-supplied settings
          user_settings = {k: v for k, v in kwargs.items()
                           if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
          settings.update(user_settings)

          return settings

      def from_environment(self) -> Dict[str, Any]:
          """Build settings from environment variables"""
          config = {}

          # Scan environment variables with the CRAWLO_ prefix
          for key, value in os.environ.items():
              if key.startswith('CRAWLO_'):
                  config_key = key[7:]  # strip the 'CRAWLO_' prefix
                  # Simple type coercion
                  if value.lower() in ('true', 'false'):
                      config[config_key] = value.lower() == 'true'
                  elif value.isdigit():
                      config[config_key] = int(value)
                  else:
                      try:
                          config[config_key] = float(value)
                      except ValueError:
                          config[config_key] = value

          return config


  # Convenience functions
  def standalone_mode(**kwargs) -> Dict[str, Any]:
      """Quickly build a standalone-mode configuration"""
      return ModeManager().resolve_mode_settings('standalone', **kwargs)


  def distributed_mode(
          redis_host: str = '127.0.0.1',
          redis_port: int = 6379,
          redis_password: Optional[str] = None,
          project_name: str = 'crawlo',
          **kwargs
  ) -> Dict[str, Any]:
      """Quickly build a distributed-mode configuration"""
      return ModeManager().resolve_mode_settings(
          'distributed',
          redis_host=redis_host,
          redis_port=redis_port,
          redis_password=redis_password,
          project_name=project_name,
          **kwargs
      )


  def auto_mode(**kwargs) -> Dict[str, Any]:
      """Quickly build an auto-detection-mode configuration"""
      return ModeManager().resolve_mode_settings('auto', **kwargs)


  # Environment-variable support
  def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
      """Build a configuration from environment variables"""
-     mode = os.getenv('CRAWLO_MODE', default_mode).lower()
-
-     if mode == 'distributed':
-         return distributed_mode(
-             redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
-             redis_port=int(os.getenv('REDIS_PORT', 6379)),
-             redis_password=os.getenv('REDIS_PASSWORD'),
-             project_name=os.getenv('PROJECT_NAME', 'crawlo'),
-             CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
-         )
-     elif mode == 'auto':
-         return auto_mode(
-             CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
-         )
-     else:  # standalone
-         return standalone_mode(
-             CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
-         )
+     # Direct use of os.getenv() removed; configure these parameters via settings instead
+     raise RuntimeError("Environment-variable configuration has been removed; set these parameters in settings")
+
+     # Previous implementation kept for reference:
+     # mode = os.getenv('CRAWLO_MODE', default_mode).lower()
+     #
+     # if mode == 'distributed':
+     #     return distributed_mode(
+     #         redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+     #         redis_port=int(os.getenv('REDIS_PORT', 6379)),
+     #         redis_password=os.getenv('REDIS_PASSWORD'),
+     #         project_name=os.getenv('PROJECT_NAME', 'crawlo'),
+     #         CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
+     #     )
+     # elif mode == 'auto':
+     #     return auto_mode(
+     #         CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
+     #     )
+     # else:  # standalone
+     #     return standalone_mode(
+     #         CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
+     #     )
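The convenience helpers above are the public entry points for mode configuration, and this release changes their observable behaviour in two ways: the scheduler queue follows the unified crawlo:{project_name}:... key scheme, and from_env() now fails fast. A minimal usage sketch, assuming only the signatures visible in this diff (the host, password, and project name are illustrative values, not package defaults):

    # Sketch based only on the helper signatures shown in the diff above;
    # the concrete values are illustrative, not package defaults.
    from crawlo.mode_manager import standalone_mode, distributed_mode, from_env

    # Standalone: in-memory queue and filter; extra keyword arguments are
    # merged into the returned settings dict by resolve_mode_settings().
    settings = standalone_mode(DOWNLOAD_DELAY=0.5)

    # Distributed: Redis-backed queue and filter. As of 1.1.5 the scheduler
    # queue is named 'crawlo:<project_name>:queue:requests'.
    settings = distributed_mode(
        redis_host='10.0.0.5',          # illustrative
        redis_password='secret',        # illustrative
        project_name='myproject',
        CONCURRENCY=32,                 # merged into the settings dict
    )

    # New in 1.1.5: from_env() raises RuntimeError, since environment-variable
    # configuration was removed in favour of explicit settings.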
crawlo/network/__init__.py CHANGED
@@ -1,21 +1,21 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
  """
  Crawlo Network Module
  ====================
  Provides wrappers around HTTP request and response objects.

  Main components:
  - Request: HTTP request wrapper
  - Response: HTTP response wrapper
  - RequestPriority: request priority constants
  """

  from .request import Request, RequestPriority
  from .response import Response

  __all__ = [
      'Request',
      'RequestPriority',
      'Response',
  ]
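Since the re-exports are unchanged, downstream code keeps importing the request/response primitives from crawlo.network. Only the import path and the three exported names are confirmed by this diff; the constructor keyword in the sketch below is a hypothetical illustration:

    # Only the names Request, RequestPriority and Response are confirmed by
    # the __all__ above; the url keyword is a hypothetical illustration.
    from crawlo.network import Request, RequestPriority, Response

    req = Request(url="https://example.com")  # assumed to accept a url argument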