crawlo 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/filters/aioredis_filter.py CHANGED
@@ -1,10 +1,19 @@
-from typing import Optional
+from typing import Optional, Dict, Any, Union, Awaitable, Literal
 import redis.asyncio as aioredis
+import asyncio
+from inspect import iscoroutinefunction
+
+# Try to import Redis Cluster support
+try:
+    from redis.asyncio.cluster import RedisCluster
+    REDIS_CLUSTER_AVAILABLE = True
+except ImportError:
+    RedisCluster = None
+    REDIS_CLUSTER_AVAILABLE = False
 
 from crawlo.filters import BaseFilter
-from crawlo.utils.log import get_logger
-from crawlo.utils.request import request_fingerprint
-from crawlo.utils.redis_connection_pool import get_redis_pool
+from crawlo.logging import get_logger
+from crawlo.utils.redis_connection_pool import get_redis_pool, RedisConnectionPool
 
 
 class AioRedisFilter(BaseFilter):
@@ -16,20 +25,16 @@ class AioRedisFilter(BaseFilter):
     - TTL-based automatic expiration and cleanup
     - Pipeline batch operations for better performance
     - Fault-tolerant design and connection pool management
-
-    Typical use cases:
-    - Distributed crawler systems
-    - Large-scale data processing
-    - Workloads that require persistent deduplication
+    - Redis Cluster support
     """
 
     def __init__(
         self,
         redis_key: str,
-        client: aioredis.Redis,
-        stats: dict,
+        client: Optional[aioredis.Redis] = None,
+        stats: Optional[Dict[str, Any]] = None,
         debug: bool = False,
-        log_level: str = 'INFO',
+        log_level: int = 20,  # logging.INFO
         cleanup_fp: bool = False,
         ttl: Optional[int] = None
     ):
@@ -44,7 +49,7 @@ class AioRedisFilter(BaseFilter):
         :param cleanup_fp: whether to clear fingerprints on close
         :param ttl: fingerprint expiration time in seconds
         """
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)
         super().__init__(self.logger, stats, debug)
 
         self.redis_key = redis_key
@@ -53,7 +58,7 @@ class AioRedisFilter(BaseFilter):
         self.ttl = ttl
 
         # Keep a reference to the connection pool (for lazy initialization)
-        self._redis_pool = None
+        self._redis_pool: Optional[RedisConnectionPool] = None
 
         # Performance counters
        self._redis_operations = 0
@@ -105,7 +110,7 @@ class AioRedisFilter(BaseFilter):
             cleanup_fp=crawler.settings.get_bool('CLEANUP_FP', False),
             ttl=ttl,
             debug=crawler.settings.get_bool('FILTER_DEBUG', False),
-            log_level=crawler.settings.get('LOG_LEVEL', 'INFO')
+            log_level=getattr(crawler.settings, 'LOG_LEVEL_NUM', 20)  # default INFO level
         )
 
         # Keep a reference to the connection pool so a connection can be obtained when needed
@@ -120,16 +125,41 @@ class AioRedisFilter(BaseFilter):
 
         if self.redis is None and self._redis_pool is not None:
             try:
-                self.redis = await self._redis_pool.get_connection()
+                connection = await self._redis_pool.get_connection()
+                # Make sure we got a Redis client rather than the connection pool itself
+                if hasattr(connection, 'ping'):
+                    self.redis = connection
+                else:
+                    self.redis = connection
             except Exception as e:
                 self._connection_failed = True
                 self.logger.error(f"Redis connection failed, falling back to local deduplication: {e}")
                 return None
         return self.redis
 
-    async def requested(self, request) -> bool:
+    def _is_cluster_mode(self) -> bool:
+        """Check whether the client is running in cluster mode."""
+        if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None:
+            # Check whether self.redis is a RedisCluster instance
+            if self.redis is not None and isinstance(self.redis, RedisCluster):
+                return True
+        return False
+
+    def requested(self, request) -> bool:
+        """
+        Check whether the request has already been seen (synchronous method)
+
+        :param request: the request object
+        :return: True means duplicate, False means new request
+        """
+        # This method has to be synchronous, but the Redis operations are asynchronous.
+        # In practice, _requested_async should be awaited instead.
+        # Because BaseFilter requires a synchronous method, we return False (not a duplicate) here.
+        return False
+
+    async def requested_async(self, request) -> bool:
         """
-        Check whether the request already exists (optimized version)
+        Asynchronously check whether the request already exists
 
         :param request: the request object
         :return: True means duplicate, False means new request
@@ -142,32 +172,38 @@ class AioRedisFilter(BaseFilter):
             if redis_client is None:
                 return False
 
-            # Use the unified fingerprint generator
-            from crawlo.utils.fingerprint import FingerprintGenerator
-            fp = str(FingerprintGenerator.request_fingerprint(
-                request.method,
-                request.url,
-                request.body or b'',
-                dict(request.headers) if hasattr(request, 'headers') else None
-            ))
+            # Use the base class fingerprint helper
+            fp = str(self._get_fingerprint(request))
 
             self._redis_operations += 1
 
-            # Use a pipeline to optimize performance
-            pipe = redis_client.pipeline()
-            pipe.sismember(self.redis_key, fp)
-
-            results = await pipe.execute()
-            exists = results[0]
+            # Check whether the fingerprint exists
+            if self._is_cluster_mode():
+                # In cluster mode, use a hash tag to keep the keys in the same slot
+                hash_tag = "{filter}"
+                redis_key_with_tag = f"{self.redis_key}{hash_tag}"
+                # Call the async method directly
+                result = redis_client.sismember(redis_key_with_tag, fp)
+                if asyncio.iscoroutine(result):
+                    exists = await result
+                else:
+                    exists = result
+            else:
+                # Call the async method directly
+                result = redis_client.sismember(self.redis_key, fp)
+                if asyncio.iscoroutine(result):
+                    exists = await result
+                else:
+                    exists = result
 
             self._pipeline_operations += 1
 
             if exists:
                 if self.debug:
                     self.logger.debug(f"Duplicate request found: {fp[:20]}...")
-                return True
+                return bool(exists)
 
             # If it does not exist, add the fingerprint and set the TTL
-            await self.add_fingerprint(fp)
+            await self._add_fingerprint_async(fp)
             return False
 
         except Exception as e:
@@ -175,9 +211,19 @@ class AioRedisFilter(BaseFilter):
             # Return False on network errors to avoid losing requests
             return False
 
-    async def add_fingerprint(self, fp: str) -> bool:
+    def add_fingerprint(self, fp: str) -> None:
         """
-        Add a new fingerprint to the Redis set (optimized version)
+        Add a new fingerprint to the Redis set (synchronous method)
+
+        :param fp: request fingerprint string
+        """
+        # This method has to be synchronous, but the Redis operations are asynchronous.
+        # In practice, _add_fingerprint_async should be awaited instead.
+        pass
+
+    async def _add_fingerprint_async(self, fp: str) -> bool:
+        """
+        Asynchronously add a new fingerprint to the Redis set
 
         :param fp: request fingerprint string
         :return: whether it was added (True means newly added, False means it already existed)
@@ -192,22 +238,44 @@ class AioRedisFilter(BaseFilter):
 
 
             fp = str(fp)
 
-            # Use a pipeline to optimize performance
-            pipe = redis_client.pipeline()
-            pipe.sadd(self.redis_key, fp)
-
-            if self.ttl and self.ttl > 0:
-                pipe.expire(self.redis_key, self.ttl)
-
-            results = await pipe.execute()
-            added = results[0] == 1  # sadd returns 1 for a newly added member
+            # Add the fingerprint
+            if self._is_cluster_mode():
+                # In cluster mode, use a hash tag to keep the keys in the same slot
+                hash_tag = "{filter}"
+                redis_key_with_tag = f"{self.redis_key}{hash_tag}"
+                # Call the async method directly
+                result = redis_client.sadd(redis_key_with_tag, fp)
+                if asyncio.iscoroutine(result):
+                    added = await result
+                else:
+                    added = result
+                if self.ttl and self.ttl > 0:
+                    expire_result = redis_client.expire(redis_key_with_tag, self.ttl)
+                    if asyncio.iscoroutine(expire_result):
+                        await expire_result
+                    else:
+                        expire_result  # no need to await a synchronous result
+                added = added == 1  # sadd returns 1 for a newly added member
+            else:
+                # Call the async method directly
+                result = redis_client.sadd(self.redis_key, fp)
+                if asyncio.iscoroutine(result):
+                    added = await result
+                else:
+                    added = result
+                if self.ttl and self.ttl > 0:
+                    expire_result = redis_client.expire(self.redis_key, self.ttl)
+                    if asyncio.iscoroutine(expire_result):
+                        await expire_result
+                    else:
+                        expire_result  # no need to await a synchronous result
 
             self._pipeline_operations += 1
 
             if self.debug and added:
                 self.logger.debug(f"New fingerprint added: {fp[:20]}...")
 
-            return added
+            return bool(added)
 
         except Exception as e:
             self.logger.error(f"Failed to add fingerprint: {fp[:20]}... - {e}")
@@ -252,8 +320,24 @@ class AioRedisFilter(BaseFilter):
                 return False
 
             # Check whether the fingerprint exists
-            exists = await redis_client.sismember(self.redis_key, str(fp))
-            return exists
+            if self._is_cluster_mode():
+                # In cluster mode, use a hash tag to keep the keys in the same slot
+                hash_tag = "{filter}"
+                redis_key_with_tag = f"{self.redis_key}{hash_tag}"
+                # Call the async method directly
+                result = redis_client.sismember(redis_key_with_tag, str(fp))
+                if asyncio.iscoroutine(result):
+                    exists = await result
+                else:
+                    exists = result
+            else:
+                # Call the async method directly
+                result = redis_client.sismember(self.redis_key, str(fp))
+                if asyncio.iscoroutine(result):
+                    exists = await result
+                else:
+                    exists = result
+            return bool(exists)
         except Exception as e:
             self.logger.error(f"Failed to check fingerprint existence: {fp[:20]}... - {e}")
             # Return False on network errors to avoid losing requests
@@ -261,4 +345,4 @@ class AioRedisFilter(BaseFilter):
 
 
 # Ensure the class is exported for compatibility
-__all__ = ['AioRedisFilter']
+__all__ = ['AioRedisFilter']
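
Note on the API change above: the synchronous requested() is now a no-op that always reports "not seen", so Redis-backed deduplication only happens through the awaitable requested_async(). A minimal usage sketch, not taken from the package, assuming a reachable Redis behind the filter's connection pool (the requests argument is a placeholder for framework Request objects):

import asyncio

from crawlo.filters.aioredis_filter import AioRedisFilter

async def dedup_example(requests):
    # Standalone construction for illustration; inside the framework the filter
    # is normally built from crawler settings (redis key, TTL, CLEANUP_FP, ...).
    flt = AioRedisFilter(redis_key="crawlo:example:fingerprints", ttl=3600)
    fresh = []
    for request in requests:
        # requested_async() returns True for a duplicate; for a new request it
        # records the fingerprint (applying the TTL) and returns False.
        # If no Redis connection is available it degrades to treating every
        # request as new, matching the error-handling path in the diff.
        if not await flt.requested_async(request):
            fresh.append(request)
    return fresh

# asyncio.run(dedup_example(my_requests))  # my_requests: placeholder list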
crawlo/filters/memory_filter.py CHANGED
@@ -15,8 +15,7 @@ from weakref import WeakSet
 from typing import Set, TextIO, Optional
 
 from crawlo.filters import BaseFilter
-from crawlo.utils.log import get_logger
-from crawlo.utils.request import request_fingerprint
+from crawlo.logging import get_logger
 
 
 class MemoryFilter(BaseFilter):
@@ -47,10 +46,7 @@ class MemoryFilter(BaseFilter):
 
         # Initialize logging and stats
         debug = crawler.settings.get_bool('FILTER_DEBUG', False)
-        logger = get_logger(
-            self.__class__.__name__,
-            crawler.settings.get('LOG_LEVEL', 'INFO')
-        )
+        logger = get_logger(self.__class__.__name__)
         super().__init__(logger, crawler.stats, debug)
 
         # Performance counters
@@ -102,18 +98,10 @@ class MemoryFilter(BaseFilter):
         :return: whether the request is a duplicate
         """
         with self._lock:
-            # Use the unified fingerprint generator
-            from crawlo.utils.fingerprint import FingerprintGenerator
-            fp = FingerprintGenerator.request_fingerprint(
-                request.method,
-                request.url,
-                request.body or b'',
-                dict(request.headers) if hasattr(request, 'headers') else None
-            )
+            # Use the base class fingerprint helper
+            fp = self._get_fingerprint(request)
             if fp in self.fingerprints:
                 self._dupe_count += 1
-                # if self.debug:
-                #     self.logger.debug(f"Duplicate request found: {fp[:20]}...")  # duplicate log, commented out
                 return True
 
             self.add_fingerprint(fp)
@@ -185,17 +173,14 @@ class MemoryFileFilter(BaseFilter):
     def __init__(self, crawler):
         """
         Initialize the filter
-        :param crawler: Scrapy Crawler object, used to read configuration
+        :param crawler: framework Crawler object, used to read configuration
         """
         self.fingerprints: Set[str] = set()  # main fingerprint set
         self._lock = threading.RLock()  # thread-safety lock
         self._file: Optional[TextIO] = None  # file handle
 
         debug = crawler.settings.get_bool("FILTER_DEBUG", False)
-        logger = get_logger(
-            self.__class__.__name__,  # use the class name as the logger name
-            crawler.settings.get("LOG_LEVEL", "INFO")
-        )
+        logger = get_logger(self.__class__.__name__)
         super().__init__(logger, crawler.stats, debug)
 
         # Initialize file storage
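
Both filters now delegate fingerprinting to the shared BaseFilter helper (self._get_fingerprint(request)) rather than calling FingerprintGenerator at each call site. The in-memory side of the pattern reduces to a lock-guarded set; an illustrative stand-in follows (not crawlo code, fingerprint computation omitted):

import threading

class TinyMemoryDedup:
    """Minimal stand-in for the MemoryFilter pattern shown above."""

    def __init__(self) -> None:
        self.fingerprints: set[str] = set()   # main fingerprint store
        self._lock = threading.RLock()        # thread-safety lock

    def seen(self, fp: str) -> bool:
        # Returns True for a duplicate fingerprint, otherwise records it.
        with self._lock:
            if fp in self.fingerprints:
                return True
            self.fingerprints.add(fp)
            return False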
crawlo/framework.py CHANGED
@@ -11,10 +11,10 @@ import os
 import sys
 from typing import Type, Optional, List, Union
 
-from .crawler import ModernCrawler, CrawlerProcess
+from .crawler import Crawler, CrawlerProcess
 from .initialization import initialize_framework
 from .logging import get_logger
-from .utils.env_config import get_version
+from .utils.config_manager import EnvConfigManager
 
 
 class CrawloFramework:
@@ -50,7 +50,7 @@ class CrawloFramework:
         self._logger = get_logger('crawlo.framework')
 
         # Get the version number
-        version = get_version()
+        version = EnvConfigManager.get_version()
 
         # Create the process manager
         self._process = CrawlerProcess(self._settings)
@@ -195,9 +195,13 @@ class CrawloFramework:
 
         self._logger.info(f"Starting spiders: {', '.join(spider_names)}")
 
-        return await self._process.crawl_multiple(spider_classes_or_names, settings)
+        try:
+            return await self._process.crawl_multiple(spider_classes_or_names, settings)
+        finally:
+            # Clean up the global Redis connection pools
+            await self._cleanup_global_resources()
 
-    def create_crawler(self, spider_cls: Type, settings=None) -> ModernCrawler:
+    def create_crawler(self, spider_cls: Type, settings=None) -> Crawler:
         """
         Create a Crawler instance
 
@@ -206,10 +210,10 @@ class CrawloFramework:
             settings: additional configuration
 
         Returns:
-            ModernCrawler instance
+            Crawler instance
         """
         merged_settings = self._merge_settings(settings)
-        return ModernCrawler(spider_cls, merged_settings)
+        return Crawler(spider_cls, merged_settings)
 
     def _merge_settings(self, additional_settings):
         """Merge settings."""
@@ -234,6 +238,16 @@ class CrawloFramework:
     def get_metrics(self) -> dict:
         """Get framework metrics."""
         return self._process.get_metrics()
+
+    async def _cleanup_global_resources(self):
+        """Clean up global resources (Redis connection pools, etc.)."""
+        try:
+            # Close the global Redis connection pools
+            from crawlo.utils.redis_connection_pool import close_all_pools
+            await close_all_pools()
+            self._logger.debug("Global resources cleaned up")
+        except Exception as e:
+            self._logger.warning(f"Failed to cleanup global resources: {e}")
 
 
 # Global framework instance
@@ -279,7 +293,7 @@ async def run_spiders(spider_classes_or_names: List[Union[Type, str]],
     return await framework.run_multiple(spider_classes_or_names)
 
 
-def create_crawler(spider_cls: Type, settings=None, **kwargs) -> ModernCrawler:
+def create_crawler(spider_cls: Type, settings=None, **kwargs) -> Crawler:
     """Convenience function for creating a Crawler."""
     framework = get_framework(settings, **kwargs)
     return framework.create_crawler(spider_cls)
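
The run_multiple() change wraps the crawl in try/finally so the global Redis connection pools are released even if a spider fails. The same pattern applies when embedding the framework directly; a hedged sketch using the close_all_pools helper referenced in the diff (process and spiders are placeholders):

from crawlo.utils.redis_connection_pool import close_all_pools

async def run_with_cleanup(process, spiders, settings=None):
    """Mirror the new run_multiple() flow: always release global pools."""
    try:
        return await process.crawl_multiple(spiders, settings)
    finally:
        try:
            await close_all_pools()  # close shared Redis connection pools
        except Exception as exc:
            # Cleanup failures are only logged in the framework; never let
            # them mask the crawl result here either.
            print(f"Failed to cleanup global resources: {exc}")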
crawlo/initialization/built_in.py CHANGED
@@ -5,10 +5,15 @@
 """
 
 import time
+from typing import TYPE_CHECKING
+
 from .registry import BaseInitializer, register_initializer
 from .phases import InitializationPhase, PhaseResult
 from .context import InitializationContext
 
+if TYPE_CHECKING:
+    from crawlo.logging import LogConfig
+
 
 class LoggingInitializer(BaseInitializer):
     """Logging system initializer"""
@@ -28,7 +33,7 @@ class LoggingInitializer(BaseInitializer):
             log_config = self._get_log_config(context)
 
             # Make sure the log directory exists
-            if log_config.file_path and log_config.file_enabled:
+            if log_config and log_config.file_path and log_config.file_enabled:
                 import os
                 log_dir = os.path.dirname(log_config.file_path)
                 if log_dir and not os.path.exists(log_dir):
@@ -58,7 +63,7 @@ class LoggingInitializer(BaseInitializer):
                 error=e
             )
 
-    def _get_log_config(self, context: InitializationContext) -> 'LogConfig':
+    def _get_log_config(self, context: InitializationContext) -> 'LogConfig | None':
         """
         Get the logging configuration
 
@@ -70,6 +75,7 @@ class LoggingInitializer(BaseInitializer):
         """
         # Import the log configuration class
         from crawlo.logging import LogConfig
+        from crawlo.utils.config_manager import ConfigUtils
 
         # Resolve configuration by priority: custom config > context config > project config > defaults
         config_sources = [
@@ -80,7 +86,7 @@ class LoggingInitializer(BaseInitializer):
 
         # Iterate over the config sources
         for config_source in config_sources:
-            if config_source:
+            if config_source and ConfigUtils.has_config_prefix(config_source, 'LOG_'):
                 log_config = self._create_log_config_from_source(config_source)
                 if log_config:
                     return log_config
@@ -88,7 +94,7 @@ class LoggingInitializer(BaseInitializer):
         # Fall back to the default configuration
         return LogConfig()
 
-    def _create_log_config_from_source(self, config_source) -> 'LogConfig':
+    def _create_log_config_from_source(self, config_source) -> 'LogConfig | None':
         """
         Create a log configuration from a config source
 
@@ -100,30 +106,25 @@ class LoggingInitializer(BaseInitializer):
         """
         # Import the log configuration class
         from crawlo.logging import LogConfig
+        from crawlo.utils.config_manager import ConfigUtils
 
         # Check that the config source is valid
         if not config_source:
             return None
 
         # Check whether there is any logging-related configuration
-        has_keys_method = hasattr(config_source, 'keys')
-        if has_keys_method:
-            has_log_config = any(key.startswith('LOG_') for key in config_source.keys())
-        else:
-            has_log_config = any(key.startswith('LOG_') for key in dir(config_source))
-
-        if not has_log_config:
+        if not ConfigUtils.has_config_prefix(config_source, 'LOG_'):
             return None
 
         # Read the log configuration from the config source
-        log_level = config_source.get('LOG_LEVEL', 'INFO')
-        log_file = config_source.get('LOG_FILE')
-        log_format = config_source.get('LOG_FORMAT', '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s')
-        log_encoding = config_source.get('LOG_ENCODING', 'utf-8')
-        log_max_bytes = config_source.get('LOG_MAX_BYTES', 10 * 1024 * 1024)
-        log_backup_count = config_source.get('LOG_BACKUP_COUNT', 5)
-        log_console_enabled = config_source.get('LOG_CONSOLE_ENABLED', True)
-        log_file_enabled = config_source.get('LOG_FILE_ENABLED', True)
+        log_level = ConfigUtils.get_config_value([config_source], 'LOG_LEVEL', 'INFO')
+        log_file = ConfigUtils.get_config_value([config_source], 'LOG_FILE')
+        log_format = ConfigUtils.get_config_value([config_source], 'LOG_FORMAT', '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s')
+        log_encoding = ConfigUtils.get_config_value([config_source], 'LOG_ENCODING', 'utf-8')
+        log_max_bytes = ConfigUtils.get_config_value([config_source], 'LOG_MAX_BYTES', 10 * 1024 * 1024, int)
+        log_backup_count = ConfigUtils.get_config_value([config_source], 'LOG_BACKUP_COUNT', 5, int)
+        log_console_enabled = ConfigUtils.get_config_value([config_source], 'LOG_CONSOLE_ENABLED', True, bool)
+        log_file_enabled = ConfigUtils.get_config_value([config_source], 'LOG_FILE_ENABLED', True, bool)
 
         # Build the log configuration
         return LogConfig(
@@ -176,10 +177,8 @@ class LoggingInitializer(BaseInitializer):
             settings_module = importlib.import_module(settings_module_path)
 
             # Build the configuration dictionary
-            project_config = {}
-            for key in dir(settings_module):
-                if key.isupper():
-                    project_config[key] = getattr(settings_module, key)
+            from crawlo.utils.config_manager import ConfigUtils
+            project_config = ConfigUtils.merge_config_sources([settings_module])
 
             return project_config
 
@@ -268,50 +267,8 @@ class CoreComponentsInitializer(BaseInitializer):
                 error=e
             )
 
-    def _initialize_engine(self, context: InitializationContext):
-        """Initialize the engine"""
-        try:
-            # Note: Engine requires a crawler argument and cannot be initialized at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize engine: {e}")
-            raise
-
-    def _initialize_scheduler(self, context: InitializationContext):
-        """Initialize the scheduler"""
-        try:
-            # Note: Scheduler requires many arguments and cannot be initialized at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize scheduler: {e}")
-            raise
-
-    def _initialize_downloader(self, context: InitializationContext):
-        """Initialize the downloader"""
-        try:
-            # Note: downloader classes require a crawler argument, so no instance can be created at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize downloader: {e}")
-            raise
-
-    def _initialize_pipeline_manager(self, context: InitializationContext):
-        """Initialize the pipeline manager"""
-        try:
-            # Note: PipelineManager requires a crawler argument and cannot be initialized at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize pipeline manager: {e}")
-            raise
-
-    def _initialize_middleware_manager(self, context: InitializationContext):
-        """Initialize the middleware manager"""
-        try:
-            # Note: MiddlewareManager requires a crawler argument and cannot be initialized at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize middleware manager: {e}")
-            raise
+    # Note: core components require a crawler argument and cannot be initialized at this stage.
+    # Their actual initialization happens when the crawler is created.
 
 
 class ExtensionsInitializer(BaseInitializer):
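
All the LOG_* lookups above now go through ConfigUtils from the new crawlo/utils/config_manager.py. That module's internals are not part of this diff, so the following is only an illustrative sketch of what such helpers could look like, modeled on the inline logic they replace (the real implementation may differ):

def has_config_prefix(source, prefix: str) -> bool:
    # Works for dict-like settings objects and plain settings modules.
    keys = source.keys() if hasattr(source, "keys") else dir(source)
    return any(str(key).startswith(prefix) for key in keys)

def get_config_value(sources, key: str, default=None, cast=None):
    # Return the first non-None value found across the config sources,
    # optionally cast to the requested type (int, bool, ...).
    for source in sources:
        if hasattr(source, "get"):
            value = source.get(key)
        else:
            value = getattr(source, key, None)
        if value is not None:
            return cast(value) if cast else value
    return default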