crawlo: 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Note: this release of crawlo has been flagged as potentially problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/filters/aioredis_filter.py
CHANGED

@@ -1,10 +1,19 @@
-from typing import Optional
+from typing import Optional, Dict, Any, Union, Awaitable, Literal
 import redis.asyncio as aioredis
+import asyncio
+from inspect import iscoroutinefunction
+
+# Try to import Redis Cluster support
+try:
+    from redis.asyncio.cluster import RedisCluster
+    REDIS_CLUSTER_AVAILABLE = True
+except ImportError:
+    RedisCluster = None
+    REDIS_CLUSTER_AVAILABLE = False
 
 from crawlo.filters import BaseFilter
-from crawlo.
-from crawlo.utils.
-from crawlo.utils.redis_connection_pool import get_redis_pool
+from crawlo.logging import get_logger
+from crawlo.utils.redis_connection_pool import get_redis_pool, RedisConnectionPool
 
 
 class AioRedisFilter(BaseFilter):
@@ -16,20 +25,16 @@ class AioRedisFilter(BaseFilter):
     - TTL-based automatic expiry cleanup
     - Pipeline batch operations for performance
     - Fault-tolerant design and connection pool management
-
-    Typical scenarios:
-    - Distributed crawler systems
-    - Large-scale data processing
-    - Workloads that need persistent deduplication
+    - Redis Cluster support
     """
 
     def __init__(
         self,
         redis_key: str,
-        client: aioredis.Redis,
-        stats:
+        client: Optional[aioredis.Redis] = None,
+        stats: Optional[Dict[str, Any]] = None,
         debug: bool = False,
-        log_level:
+        log_level: int = 20,  # logging.INFO
         cleanup_fp: bool = False,
         ttl: Optional[int] = None
     ):
@@ -44,7 +49,7 @@ class AioRedisFilter(BaseFilter):
         :param cleanup_fp: whether to clear fingerprints on close
         :param ttl: fingerprint TTL in seconds
         """
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         super().__init__(self.logger, stats, debug)
 
         self.redis_key = redis_key
@@ -53,7 +58,7 @@ class AioRedisFilter(BaseFilter):
         self.ttl = ttl
 
         # Keep a reference to the connection pool (for lazy initialization)
-        self._redis_pool = None
+        self._redis_pool: Optional[RedisConnectionPool] = None
 
         # Performance counters
         self._redis_operations = 0
@@ -105,7 +110,7 @@ class AioRedisFilter(BaseFilter):
            cleanup_fp=crawler.settings.get_bool('CLEANUP_FP', False),
            ttl=ttl,
            debug=crawler.settings.get_bool('FILTER_DEBUG', False),
-           log_level=crawler.settings
+           log_level=getattr(crawler.settings, 'LOG_LEVEL_NUM', 20)  # default: INFO
        )
 
        # Keep a reference to the connection pool so a connection can be fetched when needed
@@ -120,16 +125,41 @@ class AioRedisFilter(BaseFilter):
 
         if self.redis is None and self._redis_pool is not None:
             try:
-
+                connection = await self._redis_pool.get_connection()
+                # Make sure we got a Redis client rather than the pool itself
+                if hasattr(connection, 'ping'):
+                    self.redis = connection
+                else:
+                    self.redis = connection
             except Exception as e:
                 self._connection_failed = True
                 self.logger.error(f"Redis connection failed, falling back to local deduplication: {e}")
                 return None
         return self.redis
 
-
+    def _is_cluster_mode(self) -> bool:
+        """Check whether we are running against a Redis cluster."""
+        if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None:
+            # Check whether redis is a RedisCluster instance
+            if self.redis is not None and isinstance(self.redis, RedisCluster):
+                return True
+        return False
+
+    def requested(self, request) -> bool:
+        """
+        Check whether the request has been seen before (synchronous method).
+
+        :param request: the request object
+        :return: True if duplicate, False if new
+        """
+        # This method has to be synchronous, but the Redis operations are async.
+        # In practice it should be called asynchronously via _requested_async.
+        # Since BaseFilter requires a sync method, return False (not duplicate) here.
+        return False
+
+    async def requested_async(self, request) -> bool:
         """
-
+        Asynchronously check whether the request has been seen before.
 
         :param request: the request object
         :return: True if duplicate, False if new
@@ -142,32 +172,38 @@ class AioRedisFilter(BaseFilter):
             if redis_client is None:
                 return False
 
-            #
-
-            fp = str(FingerprintGenerator.request_fingerprint(
-                request.method,
-                request.url,
-                request.body or b'',
-                dict(request.headers) if hasattr(request, 'headers') else None
-            ))
+            # Use the base class fingerprint generator
+            fp = str(self._get_fingerprint(request))
             self._redis_operations += 1
 
-            #
-
-
-
-
-
+            # Check whether the fingerprint exists
+            if self._is_cluster_mode():
+                # In cluster mode, use a hash tag so related keys land in the same slot
+                hash_tag = "{filter}"
+                redis_key_with_tag = f"{self.redis_key}{hash_tag}"
+                # Call the async method directly
+                result = redis_client.sismember(redis_key_with_tag, fp)
+                if asyncio.iscoroutine(result):
+                    exists = await result
+                else:
+                    exists = result
+            else:
+                # Call the async method directly
+                result = redis_client.sismember(self.redis_key, fp)
+                if asyncio.iscoroutine(result):
+                    exists = await result
+                else:
+                    exists = result
 
             self._pipeline_operations += 1
 
             if exists:
                 if self.debug:
                     self.logger.debug(f"Duplicate request found: {fp[:20]}...")
-                return
+                return bool(exists)
 
             # If it does not exist, add the fingerprint and set the TTL
-            await self.
+            await self._add_fingerprint_async(fp)
             return False
 
         except Exception as e:
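The requested_async hunk repeats the same asyncio.iscoroutine check after every client call so the filter works both with redis-py's asyncio client, whose commands return coroutines, and with a synchronous client or test double that returns plain values. A minimal sketch of that pattern factored into a helper; the name maybe_await is illustrative and not part of crawlo:

    import asyncio
    from typing import Any, Awaitable, Union

    async def maybe_await(result: Union[Any, Awaitable[Any]]) -> Any:
        # Await only if the client call actually returned a coroutine.
        if asyncio.iscoroutine(result):
            return await result
        return result

    # Hypothetical usage inside requested_async:
    #   exists = await maybe_await(redis_client.sismember(self.redis_key, fp))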
@@ -175,9 +211,19 @@ class AioRedisFilter(BaseFilter):
             # Return False on network errors to avoid dropping requests
             return False
 
-
+    def add_fingerprint(self, fp: str) -> None:
         """
-        Add a new fingerprint to Redis
+        Add a new fingerprint to the Redis set (synchronous method).
+
+        :param fp: request fingerprint string
+        """
+        # This method has to be synchronous, but the Redis operations are async.
+        # In practice it should be called asynchronously via _add_fingerprint_async.
+        pass
+
+    async def _add_fingerprint_async(self, fp: str) -> bool:
+        """
+        Asynchronously add a new fingerprint to the Redis set.
 
         :param fp: request fingerprint string
         :return: whether it was newly added (True if new, False if it already existed)
@@ -192,22 +238,44 @@ class AioRedisFilter(BaseFilter):
 
             fp = str(fp)
 
-            #
-
-
-
-
-
-
-
+            # Add the fingerprint
+            if self._is_cluster_mode():
+                # In cluster mode, use a hash tag so related keys land in the same slot
+                hash_tag = "{filter}"
+                redis_key_with_tag = f"{self.redis_key}{hash_tag}"
+                # Call the async method directly
+                result = redis_client.sadd(redis_key_with_tag, fp)
+                if asyncio.iscoroutine(result):
+                    added = await result
+                else:
+                    added = result
+                if self.ttl and self.ttl > 0:
+                    expire_result = redis_client.expire(redis_key_with_tag, self.ttl)
+                    if asyncio.iscoroutine(expire_result):
+                        await expire_result
+                    else:
+                        expire_result  # no need to await a synchronous result
+                added = added == 1  # sadd returns 1 when the member is newly added
+            else:
+                # Call the async method directly
+                result = redis_client.sadd(self.redis_key, fp)
+                if asyncio.iscoroutine(result):
+                    added = await result
+                else:
+                    added = result
+                if self.ttl and self.ttl > 0:
+                    expire_result = redis_client.expire(self.redis_key, self.ttl)
+                    if asyncio.iscoroutine(expire_result):
+                        await expire_result
+                    else:
+                        expire_result  # no need to await a synchronous result
 
             self._pipeline_operations += 1
 
             if self.debug and added:
                 self.logger.debug(f"Added new fingerprint: {fp[:20]}...")
 
-            return added
+            return bool(added)
 
         except Exception as e:
             self.logger.error(f"Failed to add fingerprint: {fp[:20]}... - {e}")
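_add_fingerprint_async issues SADD and EXPIRE as two separate round trips. In non-cluster mode the two commands could be batched into a single round trip with a pipeline; a minimal sketch assuming a plain redis.asyncio client, with an illustrative function name that is not part of crawlo:

    import redis.asyncio as aioredis

    async def add_fingerprint_pipelined(client: aioredis.Redis, key: str,
                                        fp: str, ttl: int) -> bool:
        # Queue SADD and EXPIRE, then execute both in one round trip.
        async with client.pipeline(transaction=False) as pipe:
            pipe.sadd(key, fp)
            if ttl > 0:
                pipe.expire(key, ttl)
            results = await pipe.execute()
        return results[0] == 1  # SADD returns 1 for a newly added member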
@@ -252,8 +320,24 @@ class AioRedisFilter(BaseFilter):
                 return False
 
             # Check whether the fingerprint exists
-
-
+            if self._is_cluster_mode():
+                # In cluster mode, use a hash tag so related keys land in the same slot
+                hash_tag = "{filter}"
+                redis_key_with_tag = f"{self.redis_key}{hash_tag}"
+                # Call the async method directly
+                result = redis_client.sismember(redis_key_with_tag, str(fp))
+                if asyncio.iscoroutine(result):
+                    exists = await result
+                else:
+                    exists = result
+            else:
+                # Call the async method directly
+                result = redis_client.sismember(self.redis_key, str(fp))
+                if asyncio.iscoroutine(result):
+                    exists = await result
+                else:
+                    exists = result
+            return bool(exists)
         except Exception as e:
             self.logger.error(f"Failed to check fingerprint existence: {fp[:20]}... - {e}")
             # Return False on network errors to avoid dropping requests
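All three cluster-mode branches append the literal hash tag "{filter}" to the key. Redis Cluster computes a key's slot from the substring between the first "{" and "}", so every key that carries the same tag maps to the same slot, which keeps multi-key operations on those keys valid. A quick way to confirm the behavior with redis-py's synchronous cluster client; host, port, and key names are placeholders:

    from redis.cluster import RedisCluster

    rc = RedisCluster(host="localhost", port=7000)
    # Both keys hash only on "filter", so keyslot() returns the same slot:
    print(rc.keyslot("crawlo:filter:fingerprint{filter}"))
    print(rc.keyslot("crawlo:request:queue{filter}"))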
@@ -261,4 +345,4 @@ class AioRedisFilter(BaseFilter):
 
 
 # Ensure the class is exported for compatibility
-__all__ = ['AioRedisFilter']
+__all__ = ['AioRedisFilter']
crawlo/filters/memory_filter.py
CHANGED
@@ -15,8 +15,7 @@ from weakref import WeakSet
 from typing import Set, TextIO, Optional
 
 from crawlo.filters import BaseFilter
-from crawlo.
-from crawlo.utils.request import request_fingerprint
+from crawlo.logging import get_logger
 
 
 class MemoryFilter(BaseFilter):
@@ -47,10 +46,7 @@ class MemoryFilter(BaseFilter):
 
         # Initialize logging and stats
         debug = crawler.settings.get_bool('FILTER_DEBUG', False)
-        logger = get_logger(
-            self.__class__.__name__,
-            crawler.settings.get('LOG_LEVEL', 'INFO')
-        )
+        logger = get_logger(self.__class__.__name__)
         super().__init__(logger, crawler.stats, debug)
 
         # Performance counters
@@ -102,18 +98,10 @@ class MemoryFilter(BaseFilter):
         :return: whether the request is a duplicate
         """
         with self._lock:
-            #
-
-            fp = FingerprintGenerator.request_fingerprint(
-                request.method,
-                request.url,
-                request.body or b'',
-                dict(request.headers) if hasattr(request, 'headers') else None
-            )
+            # Use the base class fingerprint generator
+            fp = self._get_fingerprint(request)
             if fp in self.fingerprints:
                 self._dupe_count += 1
-                # if self.debug:
-                #     self.logger.debug(f"Duplicate request found: {fp[:20]}...")  # duplicate log, commented out
                 return True
 
             self.add_fingerprint(fp)
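Both filters now delegate to the base class's _get_fingerprint helper instead of calling FingerprintGenerator directly. The removed code shows which fields such a fingerprint covers (method, URL, body, and headers); the following is a sketch of one plausible implementation, not necessarily crawlo's exact algorithm:

    import hashlib
    from typing import Mapping, Optional

    def request_fingerprint(method: str, url: str, body: bytes = b"",
                            headers: Optional[Mapping[str, str]] = None) -> str:
        # Hash the request fields into a stable hex digest.
        h = hashlib.sha256()
        h.update(method.upper().encode())
        h.update(url.encode())
        h.update(body)
        for name in sorted(headers or {}):  # sorted for a stable order
            h.update(f"{name.lower()}:{headers[name]}".encode())
        return h.hexdigest()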
@@ -185,17 +173,14 @@ class MemoryFileFilter(BaseFilter):
     def __init__(self, crawler):
         """
         Initialize the filter
-        :param crawler:
+        :param crawler: the framework Crawler object, used to read settings
         """
         self.fingerprints: Set[str] = set()  # primary storage set
         self._lock = threading.RLock()  # thread-safety lock
         self._file: Optional[TextIO] = None  # file handle
 
         debug = crawler.settings.get_bool("FILTER_DEBUG", False)
-        logger = get_logger(
-            self.__class__.__name__,  # use the class name as the logger name
-            crawler.settings.get("LOG_LEVEL", "INFO")
-        )
+        logger = get_logger(self.__class__.__name__)
         super().__init__(logger, crawler.stats, debug)
 
         # Initialize file storage
crawlo/framework.py
CHANGED
@@ -11,10 +11,10 @@ import os
 import sys
 from typing import Type, Optional, List, Union
 
-from .crawler import
+from .crawler import Crawler, CrawlerProcess
 from .initialization import initialize_framework
 from .logging import get_logger
-from .utils.
+from .utils.config_manager import EnvConfigManager
 
 
 class CrawloFramework:
@@ -50,7 +50,7 @@ class CrawloFramework:
         self._logger = get_logger('crawlo.framework')
 
         # Get the version number
-        version = get_version()
+        version = EnvConfigManager.get_version()
 
         # Create the process manager
         self._process = CrawlerProcess(self._settings)
@@ -195,9 +195,13 @@ class CrawloFramework:
 
         self._logger.info(f"Starting spiders: {', '.join(spider_names)}")
 
-
+        try:
+            return await self._process.crawl_multiple(spider_classes_or_names, settings)
+        finally:
+            # Clean up the global Redis connection pools
+            await self._cleanup_global_resources()
 
-    def create_crawler(self, spider_cls: Type, settings=None) ->
+    def create_crawler(self, spider_cls: Type, settings=None) -> Crawler:
         """
         Create a Crawler instance
 
@@ -206,10 +210,10 @@ class CrawloFramework:
             settings: extra configuration
 
         Returns:
-
+            A Crawler instance
         """
         merged_settings = self._merge_settings(settings)
-        return
+        return Crawler(spider_cls, merged_settings)
 
     def _merge_settings(self, additional_settings):
         """Merge configuration"""
@@ -234,6 +238,16 @@ class CrawloFramework:
     def get_metrics(self) -> dict:
         """Get framework metrics"""
         return self._process.get_metrics()
+
+    async def _cleanup_global_resources(self):
+        """Clean up global resources (Redis connection pools, etc.)"""
+        try:
+            # Clean up the global Redis connection pools
+            from crawlo.utils.redis_connection_pool import close_all_pools
+            await close_all_pools()
+            self._logger.debug("Global resources cleaned up")
+        except Exception as e:
+            self._logger.warning(f"Failed to cleanup global resources: {e}")
 
 
 # Global framework instance
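The new _cleanup_global_resources pairs with the try/finally added to run_multiple above: the spiders run first, then the shared Redis pools are closed even if a spider raises. A minimal usage sketch; the constructor arguments and MySpider are hypothetical:

    import asyncio
    from crawlo.framework import CrawloFramework

    async def main():
        framework = CrawloFramework()
        # run_multiple closes the global Redis connection pools in its
        # finally block, even when a spider raises.
        await framework.run_multiple([MySpider])

    asyncio.run(main())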
@@ -279,7 +293,7 @@ async def run_spiders(spider_classes_or_names: List[Union[Type, str]],
     return await framework.run_multiple(spider_classes_or_names)
 
 
-def create_crawler(spider_cls: Type, settings=None, **kwargs) ->
+def create_crawler(spider_cls: Type, settings=None, **kwargs) -> Crawler:
     """Convenience function for creating a Crawler"""
     framework = get_framework(settings, **kwargs)
     return framework.create_crawler(spider_cls)
crawlo/initialization/built_in.py
CHANGED

@@ -5,10 +5,15 @@
 """
 
 import time
+from typing import TYPE_CHECKING
+
 from .registry import BaseInitializer, register_initializer
 from .phases import InitializationPhase, PhaseResult
 from .context import InitializationContext
 
+if TYPE_CHECKING:
+    from crawlo.logging import LogConfig
+
 
 class LoggingInitializer(BaseInitializer):
     """Logging system initializer"""
@@ -28,7 +33,7 @@ class LoggingInitializer(BaseInitializer):
         log_config = self._get_log_config(context)
 
         # Make sure the log directory exists
-        if log_config.file_path and log_config.file_enabled:
+        if log_config and log_config.file_path and log_config.file_enabled:
             import os
             log_dir = os.path.dirname(log_config.file_path)
             if log_dir and not os.path.exists(log_dir):
@@ -58,7 +63,7 @@ class LoggingInitializer(BaseInitializer):
                 error=e
             )
 
-    def _get_log_config(self, context: InitializationContext) -> 'LogConfig':
+    def _get_log_config(self, context: InitializationContext) -> 'LogConfig | None':
         """
         Get the log configuration
 
@@ -70,6 +75,7 @@ class LoggingInitializer(BaseInitializer):
         """
         # Import the log config class
         from crawlo.logging import LogConfig
+        from crawlo.utils.config_manager import ConfigUtils
 
         # Resolve config by priority: custom config > context config > project config > defaults
         config_sources = [
@@ -80,7 +86,7 @@ class LoggingInitializer(BaseInitializer):
 
         # Iterate over the config sources
         for config_source in config_sources:
-            if config_source:
+            if config_source and ConfigUtils.has_config_prefix(config_source, 'LOG_'):
                 log_config = self._create_log_config_from_source(config_source)
                 if log_config:
                     return log_config
@@ -88,7 +94,7 @@ class LoggingInitializer(BaseInitializer):
         # Fall back to the default configuration
         return LogConfig()
 
-    def _create_log_config_from_source(self, config_source) -> 'LogConfig':
+    def _create_log_config_from_source(self, config_source) -> 'LogConfig | None':
         """
         Create a log configuration from a config source
 
@@ -100,30 +106,25 @@ class LoggingInitializer(BaseInitializer):
         """
         # Import the log config class
         from crawlo.logging import LogConfig
+        from crawlo.utils.config_manager import ConfigUtils
 
         # Check that the config source is valid
         if not config_source:
             return None
 
         # Check for logging-related settings
-
-        if has_keys_method:
-            has_log_config = any(key.startswith('LOG_') for key in config_source.keys())
-        else:
-            has_log_config = any(key.startswith('LOG_') for key in dir(config_source))
-
-        if not has_log_config:
+        if not ConfigUtils.has_config_prefix(config_source, 'LOG_'):
             return None
 
         # Read the log settings from the config source
-        log_level =
-        log_file =
-        log_format =
-        log_encoding =
-        log_max_bytes =
-        log_backup_count =
-        log_console_enabled =
-        log_file_enabled =
+        log_level = ConfigUtils.get_config_value([config_source], 'LOG_LEVEL', 'INFO')
+        log_file = ConfigUtils.get_config_value([config_source], 'LOG_FILE')
+        log_format = ConfigUtils.get_config_value([config_source], 'LOG_FORMAT', '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s')
+        log_encoding = ConfigUtils.get_config_value([config_source], 'LOG_ENCODING', 'utf-8')
+        log_max_bytes = ConfigUtils.get_config_value([config_source], 'LOG_MAX_BYTES', 10 * 1024 * 1024, int)
+        log_backup_count = ConfigUtils.get_config_value([config_source], 'LOG_BACKUP_COUNT', 5, int)
+        log_console_enabled = ConfigUtils.get_config_value([config_source], 'LOG_CONSOLE_ENABLED', True, bool)
+        log_file_enabled = ConfigUtils.get_config_value([config_source], 'LOG_FILE_ENABLED', True, bool)
 
         # Create the log config
         return LogConfig(
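The call sites above imply two small ConfigUtils helpers: has_config_prefix(source, prefix) tests whether a dict-like or module-like source defines any key with the given prefix, and get_config_value(sources, key, default, cast) reads the first matching value. The real implementations ship in the new crawlo/utils/config_manager.py; this sketch is inferred from the call sites and from the keys()/dir() logic the hunk removes:

    class ConfigUtils:
        # Sketch only; the shipped class lives in crawlo/utils/config_manager.py.

        @staticmethod
        def has_config_prefix(source, prefix: str) -> bool:
            # Dict-like sources expose keys(); module-like sources are scanned via dir().
            keys = source.keys() if hasattr(source, 'keys') else dir(source)
            return any(key.startswith(prefix) for key in keys)

        @staticmethod
        def get_config_value(sources, key: str, default=None, cast=None):
            # Return the first value found for key across the sources, optionally cast.
            for source in sources:
                if hasattr(source, 'keys'):
                    if key not in source:
                        continue
                    value = source[key]
                elif hasattr(source, key):
                    value = getattr(source, key)
                else:
                    continue
                return cast(value) if cast is not None else value
            return default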
@@ -176,10 +177,8 @@ class LoggingInitializer(BaseInitializer):
             settings_module = importlib.import_module(settings_module_path)
 
             # Build the config dict
-
-
-                if key.isupper():
-                    project_config[key] = getattr(settings_module, key)
+            from crawlo.utils.config_manager import ConfigUtils
+            project_config = ConfigUtils.merge_config_sources([settings_module])
 
             return project_config
 
@@ -268,50 +267,8 @@ class CoreComponentsInitializer(BaseInitializer):
                 error=e
             )
 
-
-
-        try:
-            # Note: Engine requires a crawler argument and cannot be initialized at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize engine: {e}")
-            raise
-
-    def _initialize_scheduler(self, context: InitializationContext):
-        """Initialize the scheduler"""
-        try:
-            # Note: Scheduler requires many arguments and cannot be initialized at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize scheduler: {e}")
-            raise
-
-    def _initialize_downloader(self, context: InitializationContext):
-        """Initialize the downloader"""
-        try:
-            # Note: downloader classes require a crawler argument, so instances cannot be created at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize downloader: {e}")
-            raise
-
-    def _initialize_pipeline_manager(self, context: InitializationContext):
-        """Initialize the pipeline manager"""
-        try:
-            # Note: PipelineManager requires a crawler argument and cannot be initialized at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize pipeline manager: {e}")
-            raise
-
-    def _initialize_middleware_manager(self, context: InitializationContext):
-        """Initialize the middleware manager"""
-        try:
-            # Note: MiddlewareManager requires a crawler argument and cannot be initialized at this stage
-            pass
-        except Exception as e:
-            context.add_error(f"Failed to initialize middleware manager: {e}")
-            raise
+        # Note: core components require a crawler argument and cannot be initialized at this stage;
+        # actual initialization happens when the crawler is created.
 
 
 class ExtensionsInitializer(BaseInitializer):