crawlo 1.1.2-py3-none-any.whl → 1.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (113)
  1. crawlo/__init__.py +34 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/list.py +155 -155
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -196
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +186 -186
  12. crawlo/config.py +279 -279
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -171
  15. crawlo/core/enhanced_engine.py +189 -189
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +166 -162
  18. crawlo/crawler.py +1027 -1027
  19. crawlo/downloader/__init__.py +242 -242
  20. crawlo/downloader/aiohttp_downloader.py +212 -212
  21. crawlo/downloader/cffi_downloader.py +251 -251
  22. crawlo/downloader/httpx_downloader.py +259 -257
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +82 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -154
  30. crawlo/filters/aioredis_filter.py +242 -242
  31. crawlo/filters/memory_filter.py +269 -269
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -248
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -125
  45. crawlo/mode_manager.py +200 -200
  46. crawlo/network/__init__.py +21 -21
  47. crawlo/network/request.py +311 -311
  48. crawlo/network/response.py +271 -269
  49. crawlo/pipelines/__init__.py +22 -13
  50. crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
  51. crawlo/pipelines/console_pipeline.py +39 -39
  52. crawlo/pipelines/csv_pipeline.py +316 -316
  53. crawlo/pipelines/database_dedup_pipeline.py +225 -0
  54. crawlo/pipelines/json_pipeline.py +218 -218
  55. crawlo/pipelines/memory_dedup_pipeline.py +116 -0
  56. crawlo/pipelines/mongo_pipeline.py +116 -116
  57. crawlo/pipelines/mysql_pipeline.py +195 -195
  58. crawlo/pipelines/pipeline_manager.py +56 -56
  59. crawlo/pipelines/redis_dedup_pipeline.py +163 -0
  60. crawlo/project.py +153 -153
  61. crawlo/queue/pqueue.py +37 -37
  62. crawlo/queue/queue_manager.py +307 -303
  63. crawlo/queue/redis_priority_queue.py +208 -191
  64. crawlo/settings/__init__.py +7 -7
  65. crawlo/settings/default_settings.py +245 -226
  66. crawlo/settings/setting_manager.py +99 -99
  67. crawlo/spider/__init__.py +639 -639
  68. crawlo/stats_collector.py +59 -59
  69. crawlo/subscriber.py +106 -106
  70. crawlo/task_manager.py +30 -30
  71. crawlo/templates/crawlo.cfg.tmpl +10 -10
  72. crawlo/templates/project/__init__.py.tmpl +3 -3
  73. crawlo/templates/project/items.py.tmpl +17 -17
  74. crawlo/templates/project/middlewares.py.tmpl +86 -86
  75. crawlo/templates/project/pipelines.py.tmpl +341 -335
  76. crawlo/templates/project/run.py.tmpl +251 -238
  77. crawlo/templates/project/settings.py.tmpl +250 -247
  78. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  79. crawlo/templates/spider/spider.py.tmpl +177 -177
  80. crawlo/utils/__init__.py +7 -7
  81. crawlo/utils/controlled_spider_mixin.py +439 -335
  82. crawlo/utils/date_tools.py +233 -233
  83. crawlo/utils/db_helper.py +343 -343
  84. crawlo/utils/func_tools.py +82 -82
  85. crawlo/utils/large_scale_config.py +286 -286
  86. crawlo/utils/large_scale_helper.py +343 -343
  87. crawlo/utils/log.py +128 -128
  88. crawlo/utils/queue_helper.py +175 -175
  89. crawlo/utils/request.py +267 -267
  90. crawlo/utils/request_serializer.py +219 -219
  91. crawlo/utils/spider_loader.py +62 -62
  92. crawlo/utils/system.py +11 -11
  93. crawlo/utils/tools.py +4 -4
  94. crawlo/utils/url.py +39 -39
  95. {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/METADATA +635 -567
  96. crawlo-1.1.3.dist-info/RECORD +113 -0
  97. examples/__init__.py +7 -7
  98. examples/controlled_spider_example.py +205 -0
  99. tests/__init__.py +7 -7
  100. tests/test_final_validation.py +153 -153
  101. tests/test_proxy_health_check.py +32 -32
  102. tests/test_proxy_middleware_integration.py +136 -136
  103. tests/test_proxy_providers.py +56 -56
  104. tests/test_proxy_stats.py +19 -19
  105. tests/test_proxy_strategies.py +59 -59
  106. tests/test_redis_config.py +28 -28
  107. tests/test_redis_queue.py +224 -224
  108. tests/test_request_serialization.py +70 -70
  109. tests/test_scheduler.py +241 -241
  110. crawlo-1.1.2.dist-info/RECORD +0 -108
  111. {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
  112. {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
  113. {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
crawlo/middleware/retry.py CHANGED
@@ -1,125 +1,125 @@
#!/usr/bin/python
# -*- coding:UTF-8 -*-
from typing import List
- from asyncio.exceptions import TimeoutError
+ import asyncio

try:
    from anyio import EndOfStream
except ImportError:
    # If anyio is unavailable or EndOfStream is missing, create a placeholder
    class EndOfStream(Exception):
        pass

try:
    from httpcore import ReadError
except ImportError:
    class ReadError(Exception):
        pass

try:
    from httpx import RemoteProtocolError, ConnectError, ReadTimeout
except ImportError:
    class RemoteProtocolError(Exception):
        pass
    class ConnectError(Exception):
        pass
    class ReadTimeout(Exception):
        pass

try:
    from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
    from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError
except ImportError:
    class ClientConnectionError(Exception):
        pass
    class ClientPayloadError(Exception):
        pass
    class ClientConnectorError(Exception):
        pass
    class ClientTimeout(Exception):
        pass
    class ClientConnectorSSLError(Exception):
        pass
    class ClientResponseError(Exception):
        pass

from crawlo.utils.log import get_logger
from crawlo.stats_collector import StatsCollector

_retry_exceptions = [
    EndOfStream,
    ReadError,
-     TimeoutError,
+     asyncio.TimeoutError,
    ConnectError,
    ReadTimeout,
    ClientConnectorError,
    ClientResponseError,
    RemoteProtocolError,
    ClientTimeout,
    ClientConnectorSSLError,
    ClientPayloadError,
    ClientConnectionError
]


class RetryMiddleware(object):

    def __init__(
        self,
        *,
        retry_http_codes: List,
        ignore_http_codes: List,
        max_retry_times: int,
        retry_exceptions: List,
        stats: StatsCollector,
        retry_priority: int
    ):
        self.retry_http_codes = retry_http_codes
        self.ignore_http_codes = ignore_http_codes
        self.max_retry_times = max_retry_times
        self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
        self.retry_priority = retry_priority
        self.stats = stats
        self.logger = get_logger(self.__class__.__name__)

    @classmethod
    def create_instance(cls, crawler):
        o = cls(
            retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
            ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
            max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
            retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
            stats=crawler.stats,
            retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
        )
        return o

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status_code in self.ignore_http_codes:
            return response
        if response.status_code in self.retry_http_codes:
            # Retry logic
            reason = f"response code {response.status_code}"
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exc, spider):
        if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
            return self._retry(request=request, reason=type(exc).__name__, spider=spider)

    def _retry(self, request, reason, spider):
        retry_times = request.meta.get('retry_times', 0)
        if retry_times < self.max_retry_times:
            retry_times += 1
            self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
            request.meta['retry_times'] = retry_times
            # request.dont_retry = True
            request.meta['dont_retry'] = True
            request.priority = request.priority + self.retry_priority
            self.stats.inc_value("retry_count")
            return request
        else:
            self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
            return None
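
The one functional change in this file is the TimeoutError import: 1.1.2 shadowed the built-in TimeoutError with "from asyncio.exceptions import TimeoutError", while 1.1.3 imports asyncio and uses the qualified name asyncio.TimeoutError (on Python 3.11+ the two are the same class, as asyncio.TimeoutError became an alias of the built-in). Below is a minimal sketch of the retry path, assuming crawlo 1.1.3 is installed; StubStats and StubRequest are hypothetical stand-ins that expose only the attributes the middleware actually touches:

import asyncio

from crawlo.middleware.retry import RetryMiddleware


class StubStats:
    # hypothetical stand-in for crawlo.stats_collector.StatsCollector
    def inc_value(self, key):
        print(f"stats: {key} += 1")


class StubRequest:
    # hypothetical stand-in for crawlo's Request object
    def __init__(self):
        self.meta = {}     # retry bookkeeping lives here
        self.priority = 0  # adjusted by retry_priority on each retry


mw = RetryMiddleware(
    retry_http_codes=[500, 502, 503],
    ignore_http_codes=[404],
    max_retry_times=2,
    retry_exceptions=[],  # merged with the module-level _retry_exceptions
    stats=StubStats(),
    retry_priority=-1,
)

req = StubRequest()
# asyncio.TimeoutError is in _retry_exceptions, so the request is rescheduled:
out = mw.process_exception(req, asyncio.TimeoutError(), spider="demo")
print(out is req)  # True
print(req.meta)    # {'retry_times': 1, 'dont_retry': True}

Note that _retry also sets dont_retry on the rescheduled request, so as shipped a request is retried at most once regardless of MAX_RETRY_TIMES.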
crawlo/mode_manager.py CHANGED
@@ -1,201 +1,201 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Run mode manager
================
Manages the different run modes of the Crawlo framework and provides a clean way to configure them.

Supported run modes:
1. standalone - single-node mode (default)
2. distributed - distributed mode
3. auto - auto-detect mode
"""

from typing import Dict, Any, Optional
from enum import Enum
import os
from crawlo.utils.log import get_logger


class RunMode(Enum):
    """Run mode enum"""
    STANDALONE = "standalone"    # single-node mode
    DISTRIBUTED = "distributed"  # distributed mode
    AUTO = "auto"                # auto-detect mode


class ModeManager:
    """Run mode manager"""

    def __init__(self):
        self.logger = get_logger(self.__class__.__name__)

    @staticmethod
    def get_standalone_settings() -> Dict[str, Any]:
        """Return the standalone-mode settings"""
        return {
            'QUEUE_TYPE': 'memory',
            'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
            'CONCURRENCY': 8,
            'MAX_RUNNING_SPIDERS': 1,
            'DOWNLOAD_DELAY': 1.0,
            'LOG_LEVEL': 'INFO',
        }

    @staticmethod
    def get_distributed_settings(
        redis_host: str = '127.0.0.1',
        redis_port: int = 6379,
        redis_password: Optional[str] = None,
        project_name: str = 'crawlo'
    ) -> Dict[str, Any]:
        """Return the distributed-mode settings"""
        # Build the Redis URL
        if redis_password:
            redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/0'
        else:
            redis_url = f'redis://{redis_host}:{redis_port}/0'

        return {
            'QUEUE_TYPE': 'redis',
            'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
            'REDIS_HOST': redis_host,
            'REDIS_PORT': redis_port,
            'REDIS_PASSWORD': redis_password,
            'REDIS_URL': redis_url,
            'SCHEDULER_QUEUE_NAME': f'{project_name}:requests',
            'REDIS_KEY': f'{project_name}:fingerprint',
            'CONCURRENCY': 16,
            'MAX_RUNNING_SPIDERS': 1,
            'DOWNLOAD_DELAY': 1.0,
            'LOG_LEVEL': 'INFO',
        }

    @staticmethod
    def get_auto_settings() -> Dict[str, Any]:
        """Return the auto-detect-mode settings"""
        return {
            'QUEUE_TYPE': 'auto',
            'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',  # default in-memory filter
            'CONCURRENCY': 12,
            'MAX_RUNNING_SPIDERS': 1,
            'DOWNLOAD_DELAY': 1.0,
            'LOG_LEVEL': 'INFO',
        }

    def resolve_mode_settings(
        self,
        mode: str = 'standalone',
        **kwargs
    ) -> Dict[str, Any]:
        """
        Resolve the run mode and return the matching settings

        Args:
            mode: run mode ('standalone', 'distributed', 'auto')
            **kwargs: extra settings

        Returns:
            Dict[str, Any]: settings dict
        """
        mode = RunMode(mode.lower())

        if mode == RunMode.STANDALONE:
            self.logger.info("🏠 Standalone mode - simple and fast, suited to development and small-to-medium crawls")
            settings = self.get_standalone_settings()

        elif mode == RunMode.DISTRIBUTED:
            self.logger.info("🌐 Distributed mode - scales across nodes, suited to large crawls")
            settings = self.get_distributed_settings(
                redis_host=kwargs.get('redis_host', '127.0.0.1'),
                redis_port=kwargs.get('redis_port', 6379),
                redis_password=kwargs.get('redis_password'),
                project_name=kwargs.get('project_name', 'crawlo')
            )

        elif mode == RunMode.AUTO:
            self.logger.info("🤖 Auto-detect mode - picks the best way to run automatically")
            settings = self.get_auto_settings()

        else:
            raise ValueError(f"Unsupported run mode: {mode}")

        # Merge user-supplied settings
        user_settings = {k: v for k, v in kwargs.items()
                         if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
        settings.update(user_settings)

        return settings

    def from_environment(self) -> Dict[str, Any]:
        """Build settings from environment variables"""
        config = {}

        # Scan environment variables carrying the CRAWLO_ prefix
        for key, value in os.environ.items():
            if key.startswith('CRAWLO_'):
                config_key = key[7:]  # strip the 'CRAWLO_' prefix
                # Simple type coercion
                if value.lower() in ('true', 'false'):
                    config[config_key] = value.lower() == 'true'
                elif value.isdigit():
                    config[config_key] = int(value)
                else:
                    try:
                        config[config_key] = float(value)
                    except ValueError:
                        config[config_key] = value

        return config


# Convenience functions
def standalone_mode(**kwargs) -> Dict[str, Any]:
    """Quickly create standalone-mode settings"""
    return ModeManager().resolve_mode_settings('standalone', **kwargs)


def distributed_mode(
    redis_host: str = '127.0.0.1',
    redis_port: int = 6379,
    redis_password: Optional[str] = None,
    project_name: str = 'crawlo',
    **kwargs
) -> Dict[str, Any]:
    """Quickly create distributed-mode settings"""
    return ModeManager().resolve_mode_settings(
        'distributed',
        redis_host=redis_host,
        redis_port=redis_port,
        redis_password=redis_password,
        project_name=project_name,
        **kwargs
    )


def auto_mode(**kwargs) -> Dict[str, Any]:
    """Quickly create auto-detect-mode settings"""
    return ModeManager().resolve_mode_settings('auto', **kwargs)


# Environment variable support
def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
    """Create settings from environment variables"""
    mode = os.getenv('CRAWLO_MODE', default_mode).lower()

    if mode == 'distributed':
        return distributed_mode(
            redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
            redis_port=int(os.getenv('REDIS_PORT', 6379)),
            redis_password=os.getenv('REDIS_PASSWORD'),
            project_name=os.getenv('PROJECT_NAME', 'crawlo'),
            CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
        )
    elif mode == 'auto':
        return auto_mode(
            CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
        )
    else:  # standalone
        return standalone_mode(
            CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
        )
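
The convenience helpers at the bottom make these settings easy to exercise. A short usage sketch, assuming crawlo 1.1.3 is installed; the Redis host and project name are illustrative values only:

import os

from crawlo.mode_manager import distributed_mode, from_env

# Explicit configuration; extra keyword arguments override the mode defaults.
settings = distributed_mode(
    redis_host="10.0.0.5",
    project_name="news",
    CONCURRENCY=32,
)
print(settings["REDIS_URL"])             # redis://10.0.0.5:6379/0
print(settings["SCHEDULER_QUEUE_NAME"])  # news:requests
print(settings["CONCURRENCY"])           # 32 (overrides the mode default of 16)

# Environment-driven configuration:
os.environ["CRAWLO_MODE"] = "standalone"
os.environ["CONCURRENCY"] = "4"
print(from_env()["CONCURRENCY"])         # 4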