crawlo 1.1.3__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (118) hide show
  1. crawlo/__init__.py +34 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/list.py +155 -155
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -196
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +186 -186
  12. crawlo/config.py +279 -279
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -171
  15. crawlo/core/enhanced_engine.py +189 -189
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +165 -165
  18. crawlo/crawler.py +1027 -1027
  19. crawlo/downloader/__init__.py +242 -242
  20. crawlo/downloader/aiohttp_downloader.py +212 -212
  21. crawlo/downloader/cffi_downloader.py +251 -251
  22. crawlo/downloader/httpx_downloader.py +259 -259
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +81 -81
  25. crawlo/extension/__init__.py +38 -31
  26. crawlo/extension/health_check.py +142 -0
  27. crawlo/extension/log_interval.py +58 -49
  28. crawlo/extension/log_stats.py +82 -44
  29. crawlo/extension/logging_extension.py +44 -35
  30. crawlo/extension/memory_monitor.py +89 -0
  31. crawlo/extension/performance_profiler.py +118 -0
  32. crawlo/extension/request_recorder.py +108 -0
  33. crawlo/filters/__init__.py +154 -154
  34. crawlo/filters/aioredis_filter.py +241 -241
  35. crawlo/filters/memory_filter.py +269 -269
  36. crawlo/items/__init__.py +23 -23
  37. crawlo/items/base.py +21 -21
  38. crawlo/items/fields.py +53 -53
  39. crawlo/items/items.py +104 -104
  40. crawlo/middleware/__init__.py +21 -21
  41. crawlo/middleware/default_header.py +32 -32
  42. crawlo/middleware/download_delay.py +28 -28
  43. crawlo/middleware/middleware_manager.py +135 -135
  44. crawlo/middleware/proxy.py +248 -248
  45. crawlo/middleware/request_ignore.py +30 -30
  46. crawlo/middleware/response_code.py +18 -18
  47. crawlo/middleware/response_filter.py +26 -26
  48. crawlo/middleware/retry.py +124 -124
  49. crawlo/mode_manager.py +200 -200
  50. crawlo/network/__init__.py +21 -21
  51. crawlo/network/request.py +311 -311
  52. crawlo/network/response.py +271 -271
  53. crawlo/pipelines/__init__.py +21 -21
  54. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  55. crawlo/pipelines/console_pipeline.py +39 -39
  56. crawlo/pipelines/csv_pipeline.py +316 -316
  57. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  58. crawlo/pipelines/json_pipeline.py +218 -218
  59. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  60. crawlo/pipelines/mongo_pipeline.py +132 -117
  61. crawlo/pipelines/mysql_pipeline.py +317 -195
  62. crawlo/pipelines/pipeline_manager.py +56 -56
  63. crawlo/pipelines/redis_dedup_pipeline.py +162 -162
  64. crawlo/project.py +153 -153
  65. crawlo/queue/pqueue.py +37 -37
  66. crawlo/queue/queue_manager.py +307 -307
  67. crawlo/queue/redis_priority_queue.py +208 -208
  68. crawlo/settings/__init__.py +7 -7
  69. crawlo/settings/default_settings.py +278 -244
  70. crawlo/settings/setting_manager.py +99 -99
  71. crawlo/spider/__init__.py +639 -639
  72. crawlo/stats_collector.py +59 -59
  73. crawlo/subscriber.py +131 -106
  74. crawlo/task_manager.py +30 -30
  75. crawlo/templates/crawlo.cfg.tmpl +10 -10
  76. crawlo/templates/project/__init__.py.tmpl +3 -3
  77. crawlo/templates/project/items.py.tmpl +17 -17
  78. crawlo/templates/project/middlewares.py.tmpl +111 -87
  79. crawlo/templates/project/pipelines.py.tmpl +97 -341
  80. crawlo/templates/project/run.py.tmpl +251 -251
  81. crawlo/templates/project/settings.py.tmpl +279 -250
  82. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  83. crawlo/templates/spider/spider.py.tmpl +142 -178
  84. crawlo/utils/__init__.py +7 -7
  85. crawlo/utils/controlled_spider_mixin.py +439 -439
  86. crawlo/utils/date_tools.py +233 -233
  87. crawlo/utils/db_helper.py +343 -343
  88. crawlo/utils/func_tools.py +82 -82
  89. crawlo/utils/large_scale_config.py +286 -286
  90. crawlo/utils/large_scale_helper.py +343 -343
  91. crawlo/utils/log.py +128 -128
  92. crawlo/utils/queue_helper.py +175 -175
  93. crawlo/utils/request.py +267 -267
  94. crawlo/utils/request_serializer.py +219 -219
  95. crawlo/utils/spider_loader.py +62 -62
  96. crawlo/utils/system.py +11 -11
  97. crawlo/utils/tools.py +4 -4
  98. crawlo/utils/url.py +39 -39
  99. crawlo-1.1.4.dist-info/METADATA +403 -0
  100. crawlo-1.1.4.dist-info/RECORD +117 -0
  101. examples/__init__.py +7 -7
  102. examples/controlled_spider_example.py +205 -205
  103. tests/__init__.py +7 -7
  104. tests/test_final_validation.py +153 -153
  105. tests/test_proxy_health_check.py +32 -32
  106. tests/test_proxy_middleware_integration.py +136 -136
  107. tests/test_proxy_providers.py +56 -56
  108. tests/test_proxy_stats.py +19 -19
  109. tests/test_proxy_strategies.py +59 -59
  110. tests/test_redis_config.py +28 -28
  111. tests/test_redis_queue.py +224 -224
  112. tests/test_request_serialization.py +70 -70
  113. tests/test_scheduler.py +241 -241
  114. crawlo-1.1.3.dist-info/METADATA +0 -635
  115. crawlo-1.1.3.dist-info/RECORD +0 -113
  116. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
  117. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
  118. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
@@ -1,125 +1,125 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- from typing import List
4
- import asyncio
5
-
6
- try:
7
- from anyio import EndOfStream
8
- except ImportError:
9
- # 如果 anyio 不可用或者 EndOfStream 不存在,创建一个占位符
10
- class EndOfStream(Exception):
11
- pass
12
-
13
- try:
14
- from httpcore import ReadError
15
- except ImportError:
16
- class ReadError(Exception):
17
- pass
18
-
19
- try:
20
- from httpx import RemoteProtocolError, ConnectError, ReadTimeout
21
- except ImportError:
22
- class RemoteProtocolError(Exception):
23
- pass
24
- class ConnectError(Exception):
25
- pass
26
- class ReadTimeout(Exception):
27
- pass
28
-
29
- try:
30
- from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
31
- from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError
32
- except ImportError:
33
- class ClientConnectionError(Exception):
34
- pass
35
- class ClientPayloadError(Exception):
36
- pass
37
- class ClientConnectorError(Exception):
38
- pass
39
- class ClientTimeout(Exception):
40
- pass
41
- class ClientConnectorSSLError(Exception):
42
- pass
43
- class ClientResponseError(Exception):
44
- pass
45
-
46
- from crawlo.utils.log import get_logger
47
- from crawlo.stats_collector import StatsCollector
48
-
49
- _retry_exceptions = [
50
- EndOfStream,
51
- ReadError,
52
- asyncio.TimeoutError,
53
- ConnectError,
54
- ReadTimeout,
55
- ClientConnectorError,
56
- ClientResponseError,
57
- RemoteProtocolError,
58
- ClientTimeout,
59
- ClientConnectorSSLError,
60
- ClientPayloadError,
61
- ClientConnectionError
62
- ]
63
-
64
-
65
- class RetryMiddleware(object):
66
-
67
- def __init__(
68
- self,
69
- *,
70
- retry_http_codes: List,
71
- ignore_http_codes: List,
72
- max_retry_times: int,
73
- retry_exceptions: List,
74
- stats: StatsCollector,
75
- retry_priority: int
76
- ):
77
- self.retry_http_codes = retry_http_codes
78
- self.ignore_http_codes = ignore_http_codes
79
- self.max_retry_times = max_retry_times
80
- self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
81
- self.retry_priority = retry_priority
82
- self.stats = stats
83
- self.logger = get_logger(self.__class__.__name__)
84
-
85
- @classmethod
86
- def create_instance(cls, crawler):
87
- o = cls(
88
- retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
89
- ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
90
- max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
91
- retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
92
- stats=crawler.stats,
93
- retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
94
- )
95
- return o
96
-
97
- def process_response(self, request, response, spider):
98
- if request.meta.get('dont_retry', False):
99
- return response
100
- if response.status_code in self.ignore_http_codes:
101
- return response
102
- if response.status_code in self.retry_http_codes:
103
- # 重试逻辑
104
- reason = f"response code {response.status_code}"
105
- return self._retry(request, reason, spider) or response
106
- return response
107
-
108
- def process_exception(self, request, exc, spider):
109
- if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
110
- return self._retry(request=request, reason=type(exc).__name__, spider=spider)
111
-
112
- def _retry(self, request, reason, spider):
113
- retry_times = request.meta.get('retry_times', 0)
114
- if retry_times < self.max_retry_times:
115
- retry_times += 1
116
- self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
117
- request.meta['retry_times'] = retry_times
118
- # request.dont_retry = True
119
- request.meta['dont_retry'] = True
120
- request.priority = request.priority + self.retry_priority
121
- self.stats.inc_value("retry_count")
122
- return request
123
- else:
124
- self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ from typing import List
4
+ import asyncio
5
+
6
+ try:
7
+ from anyio import EndOfStream
8
+ except ImportError:
9
+ # 如果 anyio 不可用或者 EndOfStream 不存在,创建一个占位符
10
+ class EndOfStream(Exception):
11
+ pass
12
+
13
+ try:
14
+ from httpcore import ReadError
15
+ except ImportError:
16
+ class ReadError(Exception):
17
+ pass
18
+
19
+ try:
20
+ from httpx import RemoteProtocolError, ConnectError, ReadTimeout
21
+ except ImportError:
22
+ class RemoteProtocolError(Exception):
23
+ pass
24
+ class ConnectError(Exception):
25
+ pass
26
+ class ReadTimeout(Exception):
27
+ pass
28
+
29
+ try:
30
+ from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
31
+ from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError
32
+ except ImportError:
33
+ class ClientConnectionError(Exception):
34
+ pass
35
+ class ClientPayloadError(Exception):
36
+ pass
37
+ class ClientConnectorError(Exception):
38
+ pass
39
+ class ClientTimeout(Exception):
40
+ pass
41
+ class ClientConnectorSSLError(Exception):
42
+ pass
43
+ class ClientResponseError(Exception):
44
+ pass
45
+
46
+ from crawlo.utils.log import get_logger
47
+ from crawlo.stats_collector import StatsCollector
48
+
49
+ _retry_exceptions = [
50
+ EndOfStream,
51
+ ReadError,
52
+ asyncio.TimeoutError,
53
+ ConnectError,
54
+ ReadTimeout,
55
+ ClientConnectorError,
56
+ ClientResponseError,
57
+ RemoteProtocolError,
58
+ ClientTimeout,
59
+ ClientConnectorSSLError,
60
+ ClientPayloadError,
61
+ ClientConnectionError
62
+ ]
63
+
64
+
65
+ class RetryMiddleware(object):
66
+
67
+ def __init__(
68
+ self,
69
+ *,
70
+ retry_http_codes: List,
71
+ ignore_http_codes: List,
72
+ max_retry_times: int,
73
+ retry_exceptions: List,
74
+ stats: StatsCollector,
75
+ retry_priority: int
76
+ ):
77
+ self.retry_http_codes = retry_http_codes
78
+ self.ignore_http_codes = ignore_http_codes
79
+ self.max_retry_times = max_retry_times
80
+ self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
81
+ self.retry_priority = retry_priority
82
+ self.stats = stats
83
+ self.logger = get_logger(self.__class__.__name__)
84
+
85
+ @classmethod
86
+ def create_instance(cls, crawler):
87
+ o = cls(
88
+ retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
89
+ ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
90
+ max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
91
+ retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
92
+ stats=crawler.stats,
93
+ retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
94
+ )
95
+ return o
96
+
97
+ def process_response(self, request, response, spider):
98
+ if request.meta.get('dont_retry', False):
99
+ return response
100
+ if response.status_code in self.ignore_http_codes:
101
+ return response
102
+ if response.status_code in self.retry_http_codes:
103
+ # 重试逻辑
104
+ reason = f"response code {response.status_code}"
105
+ return self._retry(request, reason, spider) or response
106
+ return response
107
+
108
+ def process_exception(self, request, exc, spider):
109
+ if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
110
+ return self._retry(request=request, reason=type(exc).__name__, spider=spider)
111
+
112
+ def _retry(self, request, reason, spider):
113
+ retry_times = request.meta.get('retry_times', 0)
114
+ if retry_times < self.max_retry_times:
115
+ retry_times += 1
116
+ self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
117
+ request.meta['retry_times'] = retry_times
118
+ # request.dont_retry = True
119
+ request.meta['dont_retry'] = True
120
+ request.priority = request.priority + self.retry_priority
121
+ self.stats.inc_value("retry_count")
122
+ return request
123
+ else:
124
+ self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
125
125
  return None
crawlo/mode_manager.py CHANGED
@@ -1,201 +1,201 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 运行模式管理器
5
- ==============
6
- 管理 Crawlo 框架的不同运行模式,提供优雅的配置方式。
7
-
8
- 支持的运行模式:
9
- 1. standalone - 单机模式(默认)
10
- 2. distributed - 分布式模式
11
- 3. auto - 自动检测模式
12
- """
13
-
14
- from typing import Dict, Any, Optional
15
- from enum import Enum
16
- import os
17
- from crawlo.utils.log import get_logger
18
-
19
-
20
- class RunMode(Enum):
21
- """运行模式枚举"""
22
- STANDALONE = "standalone" # 单机模式
23
- DISTRIBUTED = "distributed" # 分布式模式
24
- AUTO = "auto" # 自动检测模式
25
-
26
-
27
- class ModeManager:
28
- """运行模式管理器"""
29
-
30
- def __init__(self):
31
- self.logger = get_logger(self.__class__.__name__)
32
-
33
- @staticmethod
34
- def get_standalone_settings() -> Dict[str, Any]:
35
- """获取单机模式配置"""
36
- return {
37
- 'QUEUE_TYPE': 'memory',
38
- 'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
39
- 'CONCURRENCY': 8,
40
- 'MAX_RUNNING_SPIDERS': 1,
41
- 'DOWNLOAD_DELAY': 1.0,
42
- 'LOG_LEVEL': 'INFO',
43
- }
44
-
45
- @staticmethod
46
- def get_distributed_settings(
47
- redis_host: str = '127.0.0.1',
48
- redis_port: int = 6379,
49
- redis_password: Optional[str] = None,
50
- project_name: str = 'crawlo'
51
- ) -> Dict[str, Any]:
52
- """获取分布式模式配置"""
53
- # 构建 Redis URL
54
- if redis_password:
55
- redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/0'
56
- else:
57
- redis_url = f'redis://{redis_host}:{redis_port}/0'
58
-
59
- return {
60
- 'QUEUE_TYPE': 'redis',
61
- 'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
62
- 'REDIS_HOST': redis_host,
63
- 'REDIS_PORT': redis_port,
64
- 'REDIS_PASSWORD': redis_password,
65
- 'REDIS_URL': redis_url,
66
- 'SCHEDULER_QUEUE_NAME': f'{project_name}:requests',
67
- 'REDIS_KEY': f'{project_name}:fingerprint',
68
- 'CONCURRENCY': 16,
69
- 'MAX_RUNNING_SPIDERS': 1,
70
- 'DOWNLOAD_DELAY': 1.0,
71
- 'LOG_LEVEL': 'INFO',
72
- }
73
-
74
- @staticmethod
75
- def get_auto_settings() -> Dict[str, Any]:
76
- """获取自动检测模式配置"""
77
- return {
78
- 'QUEUE_TYPE': 'auto',
79
- 'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter', # 默认内存过滤器
80
- 'CONCURRENCY': 12,
81
- 'MAX_RUNNING_SPIDERS': 1,
82
- 'DOWNLOAD_DELAY': 1.0,
83
- 'LOG_LEVEL': 'INFO',
84
- }
85
-
86
- def resolve_mode_settings(
87
- self,
88
- mode: str = 'standalone',
89
- **kwargs
90
- ) -> Dict[str, Any]:
91
- """
92
- 解析运行模式并返回对应配置
93
-
94
- Args:
95
- mode: 运行模式 ('standalone', 'distributed', 'auto')
96
- **kwargs: 额外配置参数
97
-
98
- Returns:
99
- Dict[str, Any]: 配置字典
100
- """
101
- mode = RunMode(mode.lower())
102
-
103
- if mode == RunMode.STANDALONE:
104
- self.logger.info("🏠 使用单机模式 - 简单快速,适合开发和中小规模爬取")
105
- settings = self.get_standalone_settings()
106
-
107
- elif mode == RunMode.DISTRIBUTED:
108
- self.logger.info("🌐 使用分布式模式 - 支持多节点扩展,适合大规模爬取")
109
- settings = self.get_distributed_settings(
110
- redis_host=kwargs.get('redis_host', '127.0.0.1'),
111
- redis_port=kwargs.get('redis_port', 6379),
112
- redis_password=kwargs.get('redis_password'),
113
- project_name=kwargs.get('project_name', 'crawlo')
114
- )
115
-
116
- elif mode == RunMode.AUTO:
117
- self.logger.info("🤖 使用自动检测模式 - 智能选择最佳运行方式")
118
- settings = self.get_auto_settings()
119
-
120
- else:
121
- raise ValueError(f"不支持的运行模式: {mode}")
122
-
123
- # 合并用户自定义配置
124
- user_settings = {k: v for k, v in kwargs.items()
125
- if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
126
- settings.update(user_settings)
127
-
128
- return settings
129
-
130
- def from_environment(self) -> Dict[str, Any]:
131
- """从环境变量构建配置"""
132
- config = {}
133
-
134
- # 扫描 CRAWLO_ 前缀的环境变量
135
- for key, value in os.environ.items():
136
- if key.startswith('CRAWLO_'):
137
- config_key = key[7:] # 去掉 'CRAWLO_' 前缀
138
- # 简单的类型转换
139
- if value.lower() in ('true', 'false'):
140
- config[config_key] = value.lower() == 'true'
141
- elif value.isdigit():
142
- config[config_key] = int(value)
143
- else:
144
- try:
145
- config[config_key] = float(value)
146
- except ValueError:
147
- config[config_key] = value
148
-
149
- return config
150
-
151
-
152
- # 便利函数
153
- def standalone_mode(**kwargs) -> Dict[str, Any]:
154
- """快速创建单机模式配置"""
155
- return ModeManager().resolve_mode_settings('standalone', **kwargs)
156
-
157
-
158
- def distributed_mode(
159
- redis_host: str = '127.0.0.1',
160
- redis_port: int = 6379,
161
- redis_password: Optional[str] = None,
162
- project_name: str = 'crawlo',
163
- **kwargs
164
- ) -> Dict[str, Any]:
165
- """快速创建分布式模式配置"""
166
- return ModeManager().resolve_mode_settings(
167
- 'distributed',
168
- redis_host=redis_host,
169
- redis_port=redis_port,
170
- redis_password=redis_password,
171
- project_name=project_name,
172
- **kwargs
173
- )
174
-
175
-
176
- def auto_mode(**kwargs) -> Dict[str, Any]:
177
- """快速创建自动检测模式配置"""
178
- return ModeManager().resolve_mode_settings('auto', **kwargs)
179
-
180
-
181
- # 环境变量支持
182
- def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
183
- """从环境变量创建配置"""
184
- mode = os.getenv('CRAWLO_MODE', default_mode).lower()
185
-
186
- if mode == 'distributed':
187
- return distributed_mode(
188
- redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
189
- redis_port=int(os.getenv('REDIS_PORT', 6379)),
190
- redis_password=os.getenv('REDIS_PASSWORD'),
191
- project_name=os.getenv('PROJECT_NAME', 'crawlo'),
192
- CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
193
- )
194
- elif mode == 'auto':
195
- return auto_mode(
196
- CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
197
- )
198
- else: # standalone
199
- return standalone_mode(
200
- CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 运行模式管理器
5
+ ==============
6
+ 管理 Crawlo 框架的不同运行模式,提供优雅的配置方式。
7
+
8
+ 支持的运行模式:
9
+ 1. standalone - 单机模式(默认)
10
+ 2. distributed - 分布式模式
11
+ 3. auto - 自动检测模式
12
+ """
13
+
14
+ from typing import Dict, Any, Optional
15
+ from enum import Enum
16
+ import os
17
+ from crawlo.utils.log import get_logger
18
+
19
+
20
+ class RunMode(Enum):
21
+ """运行模式枚举"""
22
+ STANDALONE = "standalone" # 单机模式
23
+ DISTRIBUTED = "distributed" # 分布式模式
24
+ AUTO = "auto" # 自动检测模式
25
+
26
+
27
+ class ModeManager:
28
+ """运行模式管理器"""
29
+
30
+ def __init__(self):
31
+ self.logger = get_logger(self.__class__.__name__)
32
+
33
+ @staticmethod
34
+ def get_standalone_settings() -> Dict[str, Any]:
35
+ """获取单机模式配置"""
36
+ return {
37
+ 'QUEUE_TYPE': 'memory',
38
+ 'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
39
+ 'CONCURRENCY': 8,
40
+ 'MAX_RUNNING_SPIDERS': 1,
41
+ 'DOWNLOAD_DELAY': 1.0,
42
+ 'LOG_LEVEL': 'INFO',
43
+ }
44
+
45
+ @staticmethod
46
+ def get_distributed_settings(
47
+ redis_host: str = '127.0.0.1',
48
+ redis_port: int = 6379,
49
+ redis_password: Optional[str] = None,
50
+ project_name: str = 'crawlo'
51
+ ) -> Dict[str, Any]:
52
+ """获取分布式模式配置"""
53
+ # 构建 Redis URL
54
+ if redis_password:
55
+ redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/0'
56
+ else:
57
+ redis_url = f'redis://{redis_host}:{redis_port}/0'
58
+
59
+ return {
60
+ 'QUEUE_TYPE': 'redis',
61
+ 'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
62
+ 'REDIS_HOST': redis_host,
63
+ 'REDIS_PORT': redis_port,
64
+ 'REDIS_PASSWORD': redis_password,
65
+ 'REDIS_URL': redis_url,
66
+ 'SCHEDULER_QUEUE_NAME': f'{project_name}:requests',
67
+ 'REDIS_KEY': f'{project_name}:fingerprint',
68
+ 'CONCURRENCY': 16,
69
+ 'MAX_RUNNING_SPIDERS': 1,
70
+ 'DOWNLOAD_DELAY': 1.0,
71
+ 'LOG_LEVEL': 'INFO',
72
+ }
73
+
74
+ @staticmethod
75
+ def get_auto_settings() -> Dict[str, Any]:
76
+ """获取自动检测模式配置"""
77
+ return {
78
+ 'QUEUE_TYPE': 'auto',
79
+ 'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter', # 默认内存过滤器
80
+ 'CONCURRENCY': 12,
81
+ 'MAX_RUNNING_SPIDERS': 1,
82
+ 'DOWNLOAD_DELAY': 1.0,
83
+ 'LOG_LEVEL': 'INFO',
84
+ }
85
+
86
+ def resolve_mode_settings(
87
+ self,
88
+ mode: str = 'standalone',
89
+ **kwargs
90
+ ) -> Dict[str, Any]:
91
+ """
92
+ 解析运行模式并返回对应配置
93
+
94
+ Args:
95
+ mode: 运行模式 ('standalone', 'distributed', 'auto')
96
+ **kwargs: 额外配置参数
97
+
98
+ Returns:
99
+ Dict[str, Any]: 配置字典
100
+ """
101
+ mode = RunMode(mode.lower())
102
+
103
+ if mode == RunMode.STANDALONE:
104
+ self.logger.info("🏠 使用单机模式 - 简单快速,适合开发和中小规模爬取")
105
+ settings = self.get_standalone_settings()
106
+
107
+ elif mode == RunMode.DISTRIBUTED:
108
+ self.logger.info("🌐 使用分布式模式 - 支持多节点扩展,适合大规模爬取")
109
+ settings = self.get_distributed_settings(
110
+ redis_host=kwargs.get('redis_host', '127.0.0.1'),
111
+ redis_port=kwargs.get('redis_port', 6379),
112
+ redis_password=kwargs.get('redis_password'),
113
+ project_name=kwargs.get('project_name', 'crawlo')
114
+ )
115
+
116
+ elif mode == RunMode.AUTO:
117
+ self.logger.info("🤖 使用自动检测模式 - 智能选择最佳运行方式")
118
+ settings = self.get_auto_settings()
119
+
120
+ else:
121
+ raise ValueError(f"不支持的运行模式: {mode}")
122
+
123
+ # 合并用户自定义配置
124
+ user_settings = {k: v for k, v in kwargs.items()
125
+ if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
126
+ settings.update(user_settings)
127
+
128
+ return settings
129
+
130
+ def from_environment(self) -> Dict[str, Any]:
131
+ """从环境变量构建配置"""
132
+ config = {}
133
+
134
+ # 扫描 CRAWLO_ 前缀的环境变量
135
+ for key, value in os.environ.items():
136
+ if key.startswith('CRAWLO_'):
137
+ config_key = key[7:] # 去掉 'CRAWLO_' 前缀
138
+ # 简单的类型转换
139
+ if value.lower() in ('true', 'false'):
140
+ config[config_key] = value.lower() == 'true'
141
+ elif value.isdigit():
142
+ config[config_key] = int(value)
143
+ else:
144
+ try:
145
+ config[config_key] = float(value)
146
+ except ValueError:
147
+ config[config_key] = value
148
+
149
+ return config
150
+
151
+
152
+ # 便利函数
153
+ def standalone_mode(**kwargs) -> Dict[str, Any]:
154
+ """快速创建单机模式配置"""
155
+ return ModeManager().resolve_mode_settings('standalone', **kwargs)
156
+
157
+
158
+ def distributed_mode(
159
+ redis_host: str = '127.0.0.1',
160
+ redis_port: int = 6379,
161
+ redis_password: Optional[str] = None,
162
+ project_name: str = 'crawlo',
163
+ **kwargs
164
+ ) -> Dict[str, Any]:
165
+ """快速创建分布式模式配置"""
166
+ return ModeManager().resolve_mode_settings(
167
+ 'distributed',
168
+ redis_host=redis_host,
169
+ redis_port=redis_port,
170
+ redis_password=redis_password,
171
+ project_name=project_name,
172
+ **kwargs
173
+ )
174
+
175
+
176
+ def auto_mode(**kwargs) -> Dict[str, Any]:
177
+ """快速创建自动检测模式配置"""
178
+ return ModeManager().resolve_mode_settings('auto', **kwargs)
179
+
180
+
181
+ # 环境变量支持
182
+ def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
183
+ """从环境变量创建配置"""
184
+ mode = os.getenv('CRAWLO_MODE', default_mode).lower()
185
+
186
+ if mode == 'distributed':
187
+ return distributed_mode(
188
+ redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
189
+ redis_port=int(os.getenv('REDIS_PORT', 6379)),
190
+ redis_password=os.getenv('REDIS_PASSWORD'),
191
+ project_name=os.getenv('PROJECT_NAME', 'crawlo'),
192
+ CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
193
+ )
194
+ elif mode == 'auto':
195
+ return auto_mode(
196
+ CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
197
+ )
198
+ else: # standalone
199
+ return standalone_mode(
200
+ CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
201
201
  )