crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/pipelines/mongo_pipeline.py
CHANGED

@@ -1,16 +1,18 @@
 # -*- coding: utf-8 -*-
 from typing import Optional, List, Dict
-
+
 from pymongo.errors import PyMongoError
-
+
 from crawlo.exceptions import ItemDiscard
+from crawlo.logging import get_logger
+from crawlo.utils.mongo_connection_pool import MongoConnectionPoolManager
 
 
 class MongoPipeline:
     def __init__(self, crawler):
         self.crawler = crawler
         self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
 
         # Initialize connection parameters
         self.client = None
@@ -43,17 +45,21 @@ class MongoPipeline:
     async def _ensure_connection(self):
         """Ensure the connection has been established"""
         if self.client is None:
-            #
-            self.client =
-                self.mongo_uri,
-
-
-
-
+            # Use the singleton connection pool manager
+            self.client = await MongoConnectionPoolManager.get_client(
+                mongo_uri=self.mongo_uri,
+                db_name=self.db_name,
+                max_pool_size=self.max_pool_size,
+                min_pool_size=self.min_pool_size,
+                connect_timeout_ms=self.connect_timeout_ms,
+                socket_timeout_ms=self.socket_timeout_ms
             )
            self.db = self.client[self.db_name]
            self.collection = self.db[self.collection_name]
-            self.logger.info(
+            self.logger.info(
+                f"MongoDB connection established (collection: {self.collection_name}, "
+                f"using the globally shared connection pool)"
+            )
 
     async def process_item(self, item, spider) -> Optional[dict]:
         """Core item-processing method (with retry)"""
@@ -126,7 +132,10 @@ class MongoPipeline:
         # Flush any remaining batched data before closing
         if self.use_batch and self.batch_buffer:
             await self._flush_batch(self.crawler.spider)
-
+
+        # Note: the client is no longer closed here because it is shared globally
+        # Client shutdown is handled centrally by MongoConnectionPoolManager.close_all_clients()
         if self.client:
-            self.
-
+            self.logger.info(
+                f"MongoDB pipeline closed; the globally shared connection pool is kept for other crawlers"
+            )

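The MongoConnectionPoolManager referenced above lives in the new crawlo/utils/mongo_connection_pool.py module (+157 lines in this release), which is not shown in this diff. Below is a minimal sketch of the singleton pattern the pipeline appears to rely on: the class name, the get_client keyword arguments, and close_all_clients come from the diff; the Motor-based client construction, the cache key, and the default values are illustrative assumptions, not the actual implementation.

import asyncio
from typing import Dict

from motor.motor_asyncio import AsyncIOMotorClient  # assumed driver; crawlo's real choice may differ


class MongoConnectionPoolManager:
    """Illustrative singleton that shares one Mongo client per connection configuration."""

    _clients: Dict[str, AsyncIOMotorClient] = {}
    _lock = asyncio.Lock()

    @classmethod
    async def get_client(cls, mongo_uri: str, db_name: str,
                         max_pool_size: int = 100, min_pool_size: int = 0,
                         connect_timeout_ms: int = 20000,
                         socket_timeout_ms: int = 20000) -> AsyncIOMotorClient:
        key = f"{mongo_uri}/{db_name}"
        async with cls._lock:  # serialize creation so concurrent pipelines share one client
            if key not in cls._clients:
                cls._clients[key] = AsyncIOMotorClient(
                    mongo_uri,
                    maxPoolSize=max_pool_size,
                    minPoolSize=min_pool_size,
                    connectTimeoutMS=connect_timeout_ms,
                    socketTimeoutMS=socket_timeout_ms,
                )
            return cls._clients[key]

    @classmethod
    async def close_all_clients(cls) -> None:
        """Centralized shutdown, matching the comment in the close hook above."""
        async with cls._lock:
            for client in cls._clients.values():
                client.close()
            cls._clients.clear()

Because every MongoPipeline instance now resolves to the same cached client, closing one pipeline no longer tears down connections that other crawlers may still be using.
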
crawlo/pipelines/mysql_pipeline.py
CHANGED

@@ -1,15 +1,14 @@
 # -*- coding: utf-8 -*-
 import asyncio
-import aiomysql
-from asyncmy import create_pool
-from typing import Optional, List, Dict, Any
-from abc import ABC, abstractmethod
 import async_timeout
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any
 
-from crawlo.items import Item
 from crawlo.exceptions import ItemDiscard
+from crawlo.items import Item
 from crawlo.utils.db_helper import SQLBuilder
-from crawlo.
+from crawlo.logging import get_logger
+from crawlo.utils.mysql_connection_pool import MySQLConnectionPoolManager
 from . import BasePipeline
 
 
@@ -19,7 +18,7 @@ class BaseMySQLPipeline(BasePipeline, ABC):
     def __init__(self, crawler):
         self.crawler = crawler
         self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
 
         # Log pipeline initialization
         self.logger.info(f"MySQL pipeline initialized: {self.__class__.__name__}")
@@ -203,20 +202,13 @@ class BaseMySQLPipeline(BasePipeline, ABC):
                 await self._flush_batch(spider_name)
             except Exception as e:
                 self.logger.error(f"Failed to flush batched data while closing the spider: {e}")
-
+
+        # Note: the connection pool is no longer closed here because it is shared globally
+        # Pool shutdown is handled centrally by MySQLConnectionPoolManager.close_all_pools()
         if self.pool:
-
-
-
-                    'minsize': getattr(self.pool, 'minsize', 'unknown'),
-                    'maxsize': getattr(self.pool, 'maxsize', 'unknown')
-                }
-                self.logger.info(f"Closing MySQL connection pool, current state: {pool_stats}")
-                self.pool.close()
-                await self.pool.wait_closed()
-                self.logger.info("MySQL connection pool closed")
-            except Exception as e:
-                self.logger.error(f"Error while closing the MySQL connection pool: {e}")
+            self.logger.info(
+                f"MySQL pipeline closed; the globally shared connection pool is kept for other crawlers"
+            )
 
     async def _make_insert_sql(self, item_dict: Dict, **kwargs) -> str:
         """Generate the INSERT SQL statement; subclasses may override this method"""
@@ -253,9 +245,9 @@ class AsyncmyMySQLPipeline(BaseMySQLPipeline):
 
     async def _ensure_pool(self):
         """Ensure the connection pool is initialized (thread-safe)"""
-        if self._pool_initialized:
+        if self._pool_initialized and self.pool:
             # Check whether the pool is still valid
-            if
+            if hasattr(self.pool, 'closed') and not self.pool.closed:
                 return
             else:
                 self.logger.warning("Pool was initialized but is invalid; reinitializing")
@@ -263,7 +255,9 @@ class AsyncmyMySQLPipeline(BaseMySQLPipeline):
         async with self._pool_lock:
             if not self._pool_initialized:  # double-check to avoid race conditions
                 try:
-
+                    # Use the singleton connection pool manager
+                    self.pool = await MySQLConnectionPoolManager.get_pool(
+                        pool_type='asyncmy',
                         host=self.settings.get('MYSQL_HOST', 'localhost'),
                         port=self.settings.get_int('MYSQL_PORT', 3306),
                         user=self.settings.get('MYSQL_USER', 'root'),
@@ -274,11 +268,10 @@ class AsyncmyMySQLPipeline(BaseMySQLPipeline):
                         echo=self.settings.get_bool('MYSQL_ECHO', False)
                     )
                     self._pool_initialized = True
-
-
-
-
-                    self.logger.info(f"MySQL connection pool initialized (table: {self.table_name}, config: {pool_stats})")
+                    self.logger.info(
+                        f"MySQL connection pool initialized (table: {self.table_name}, "
+                        f"using the globally shared connection pool)"
+                    )
                 except Exception as e:
                     self.logger.error(f"MySQL connection pool initialization failed: {e}")
                     # Reset state so initialization can be retried
@@ -391,9 +384,9 @@ class AiomysqlMySQLPipeline(BaseMySQLPipeline):
 
     async def _ensure_pool(self):
         """Lazily initialize the connection pool (thread-safe)"""
-        if self._pool_initialized:
+        if self._pool_initialized and self.pool:
             # Check whether the pool is still valid
-            if
+            if hasattr(self.pool, 'closed') and not self.pool.closed:
                 return
             else:
                 self.logger.warning("Pool was initialized but is invalid; reinitializing")
@@ -401,23 +394,22 @@ class AiomysqlMySQLPipeline(BaseMySQLPipeline):
         async with self._pool_lock:
             if not self._pool_initialized:
                 try:
-
+                    # Use the singleton connection pool manager
+                    self.pool = await MySQLConnectionPoolManager.get_pool(
+                        pool_type='aiomysql',
                         host=self.settings.get('MYSQL_HOST', 'localhost'),
                         port=self.settings.get_int('MYSQL_PORT', 3306),
                         user=self.settings.get('MYSQL_USER', 'root'),
                         password=self.settings.get('MYSQL_PASSWORD', ''),
                         db=self.settings.get('MYSQL_DB', 'scrapy_db'),
                         minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
-                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5)
-                        cursorclass=aiomysql.DictCursor,
-                        autocommit=False
+                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5)
                     )
                     self._pool_initialized = True
-
-
-
-
-                    self.logger.info(f"aiomysql connection pool initialized (table: {self.table_name}, config: {pool_stats})")
+                    self.logger.info(
+                        f"aiomysql connection pool initialized (table: {self.table_name}, "
+                        f"using the globally shared connection pool)"
+                    )
                 except Exception as e:
                     self.logger.error(f"aiomysql connection pool initialization failed: {e}")
                     # Reset state so initialization can be retried

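For reference, the settings keys read by the MySQL pipelines in the hunks above can be configured roughly as follows. Only the key names and defaults come from the diff; the values shown are illustrative placeholders.

# settings.py (illustrative values; key names as read by the MySQL pipelines above)
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DB = 'scrapy_db'
MYSQL_POOL_MIN = 2    # minsize passed to MySQLConnectionPoolManager.get_pool()
MYSQL_POOL_MAX = 5    # maxsize passed to MySQLConnectionPoolManager.get_pool()
MYSQL_ECHO = False    # read by the asyncmy pipeline in the hunk above
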
crawlo/pipelines/pipeline_manager.py
CHANGED

@@ -4,11 +4,11 @@ from typing import List
 from pprint import pformat
 from asyncio import create_task
 
-from crawlo.
-from crawlo.event import
+from crawlo.logging import get_logger
+from crawlo.event import CrawlerEvent
 from crawlo.utils.misc import load_object
 from crawlo.project import common_call
-from crawlo.exceptions import PipelineInitError, ItemDiscard, InvalidOutputError
+from crawlo.exceptions import PipelineInitError, ItemDiscard, InvalidOutputError
 
 
 def get_dedup_pipeline_classes():
@@ -34,7 +34,7 @@ class PipelineManager:
         self.pipelines: List = []
         self.methods: List = []
 
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         pipelines = self.crawler.settings.get_list('PIPELINES')
         dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
 
@@ -88,13 +88,13 @@ class PipelineManager:
                 item = await common_call(method, item, self.crawler.spider)
                 if item is None:
                     raise InvalidOutputError(f"{method.__qualname__} return None is not supported.")
-            except
+            except ItemDiscard as exc:
                 self.logger.debug(f"Item discarded by pipeline: {exc}")
-                create_task(self.crawler.subscriber.notify(
+                create_task(self.crawler.subscriber.notify(CrawlerEvent.ITEM_DISCARD, item, exc, self.crawler.spider))
                 # Re-raise so upstream callers can also catch it and later pipelines stop running
                 raise
-            except
+            except ItemDiscard:
                 # The exception has already been handled and notified; just re-raise here
                 raise
             else:
-                create_task(self.crawler.subscriber.notify(
+                create_task(self.crawler.subscriber.notify(CrawlerEvent.ITEM_SUCCESSFUL, item, self.crawler.spider))

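The manager now publishes CrawlerEvent.ITEM_DISCARD with the item and exception, and CrawlerEvent.ITEM_SUCCESSFUL otherwise. A minimal sketch of a user pipeline that exercises the discard path is shown below; the process_item(item, spider) signature matches the pipelines elsewhere in this diff, while the dict-style item.get access and the 'title' field are illustrative assumptions.

from crawlo.exceptions import ItemDiscard


class DropEmptyTitlePipeline:
    """Illustrative pipeline: raising ItemDiscard is caught by PipelineManager,
    logged at debug level, and republished as CrawlerEvent.ITEM_DISCARD."""

    def process_item(self, item, spider):
        if not item.get('title'):  # hypothetical field check
            # Stops later pipelines for this item and triggers the discard event
            raise ItemDiscard('missing title')
        return item
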
crawlo/pipelines/redis_dedup_pipeline.py
CHANGED

@@ -17,9 +17,9 @@ from typing import Optional
 
 from crawlo import Item
 from crawlo.spider import Spider
-from crawlo.exceptions import
+from crawlo.exceptions import ItemDiscard
 from crawlo.utils.fingerprint import FingerprintGenerator
-from crawlo.
+from crawlo.logging import get_logger
 
 
 class RedisDedupPipeline:
@@ -31,8 +31,7 @@ class RedisDedupPipeline:
         redis_port: int = 6379,
         redis_db: int = 0,
         redis_password: Optional[str] = None,
-        redis_key: str = 'crawlo:item_fingerprints'
-        log_level: str = "INFO"
+        redis_key: str = 'crawlo:item_fingerprints'
     ):
         """
         Initialize the Redis dedup pipeline
@@ -42,9 +41,8 @@ class RedisDedupPipeline:
         :param redis_db: Redis database number
         :param redis_password: Redis password
         :param redis_key: Redis key used to store fingerprints
-        :param log_level: log level
         """
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
 
         # Initialize the Redis connection
         try:
@@ -59,8 +57,6 @@ class RedisDedupPipeline:
             )
             # Test the connection
             self.redis_client.ping()
-            # Change INFO level log to DEBUG level to avoid redundant output
-            # self.logger.debug(f"Redis connection successful: {redis_host}:{redis_port}/{redis_db}")  # duplicated log, commented out
         except Exception as e:
             self.logger.error(f"Redis connection failed: {e}")
             raise RuntimeError(f"Redis connection failed: {e}")
@@ -82,8 +78,7 @@ class RedisDedupPipeline:
             redis_port=settings.get_int('REDIS_PORT', 6379),
             redis_db=settings.get_int('REDIS_DB', 0),
             redis_password=settings.get('REDIS_PASSWORD') or None,
-            redis_key=redis_key
-            log_level=settings.get('LOG_LEVEL', 'INFO')
+            redis_key=redis_key
         )
 
     def process_item(self, item: Item, spider: Spider) -> Item:
@@ -92,7 +87,7 @@ class RedisDedupPipeline:
 
         :param item: the item to process
         :param spider: the spider instance
-        :return: the processed item, or raises
+        :return: the processed item, or raises an ItemDiscard exception
         """
         try:
             # Generate the item fingerprint
@@ -150,8 +145,12 @@ class RedisDedupPipeline:
 
             # Note: by default, fingerprints in Redis are not cleaned up
            # Cleanup can be enabled in the settings if needed
-
-
-
+            # Safely access crawler and settings
+            crawler = getattr(spider, 'crawler', None)
+            if crawler and hasattr(crawler, 'settings'):
+                settings = crawler.settings
+                if settings.getbool('REDIS_DEDUP_CLEANUP', False):
+                    deleted = self.redis_client.delete(self.redis_key)
+                    self.logger.info(f" - Cleaned fingerprints: {deleted}")
         except Exception as e:
             self.logger.error(f"Error closing spider: {e}")

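The optional cleanup added above is driven entirely by settings. The sketch below shows the relevant keys read by RedisDedupPipeline.from_crawler, plus the SADD-based fingerprint check such a pipeline typically performs; crawlo's exact dedup logic is not visible in this hunk, so the second half is an illustration of the general pattern using the standard redis-py client, with a placeholder fingerprint value.

# settings.py (key names as read by RedisDedupPipeline.from_crawler above)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PASSWORD = None
REDIS_DEDUP_CLEANUP = False  # set True to delete the fingerprint set in close_spider

# Typical fingerprint check (illustrative): redis-py's SADD returns 1 only for new members.
import redis

client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PASSWORD)
is_new = client.sadd('crawlo:item_fingerprints', 'sha1-of-item')  # hypothetical fingerprint value
if not is_new:
    print('duplicate item: a pipeline like this would raise ItemDiscard')
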
crawlo/project.py
CHANGED
@@ -5,7 +5,7 @@ from inspect import iscoroutinefunction
 from typing import Callable, Optional, Any
 
 from crawlo.settings.setting_manager import SettingManager
-from crawlo.
+from crawlo.logging import get_logger
 
 # Use a global logger to avoid every module creating its own lazy-init helper
 # The logger is fetched lazily so it is obtained after the logging system has been configured

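The comments above describe a lazy, module-level logger. A minimal sketch of that pattern follows, assuming get_logger should only run after the logging system is configured; the _get_module_logger helper name is hypothetical and not part of crawlo's API.

from crawlo.logging import get_logger

_logger = None


def _get_module_logger():
    """Fetch the module logger on first use, after logging has been configured."""
    global _logger
    if _logger is None:
        _logger = get_logger(__name__)
    return _logger
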
crawlo/queue/__init__.py
CHANGED
crawlo/queue/queue_manager.py
CHANGED
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
 
 from crawlo.queue.pqueue import SpiderPriorityQueue
 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.utils.request_serializer import RequestSerializer
 
 try:
@@ -123,9 +123,11 @@ class QueueConfig:
         max_queue_size: int = 1000,
         max_retries: int = 3,
         timeout: int = 300,
+        run_mode: Optional[str] = None,  # New: run mode
         **kwargs
     ):
         self.queue_type = QueueType(queue_type) if isinstance(queue_type, str) else queue_type
+        self.run_mode = run_mode  # Store the run mode
 
         # Redis configuration
         if redis_url:
@@ -166,7 +168,8 @@ class QueueConfig:
             queue_name=queue_name,
             max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
             max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
-            timeout=settings.get_int('QUEUE_TIMEOUT', 300)
+            timeout=settings.get_int('QUEUE_TIMEOUT', 300),
+            run_mode=settings.get('RUN_MODE')  # Pass through the run mode
         )
 
 
@@ -224,6 +227,17 @@ class QueueManager:
 
             return False  # By default no config update is needed
 
+        except RuntimeError as e:
+            # A RuntimeError in distributed mode must be re-raised
+            if self.config.run_mode == 'distributed':
+                self.logger.error(f"Queue initialization failed: {e}")
+                self._health_status = "error"
+                raise  # Re-raise the exception
+            # Other modes log the error without raising
+            self.logger.error(f"Queue initialization failed: {e}")
+            self.logger.debug(f"Detailed error information:\n{traceback.format_exc()}")
+            self._health_status = "error"
+            return False
         except Exception as e:
             # Log detailed error information and the stack trace
             self.logger.error(f"Queue initialization failed: {e}")
@@ -403,23 +417,63 @@ class QueueManager:
                 return QueueType.MEMORY
 
         elif self.config.queue_type == QueueType.REDIS:
-            #
-
-
+            # Distributed mode: Redis is mandatory, no fallback allowed
+            if self.config.run_mode == 'distributed':
+                # Distributed mode must make sure Redis is available
+                if not REDIS_AVAILABLE:
+                    error_msg = (
+                        "Distributed mode requires Redis, but the Redis client library is not installed.\n"
+                        "Please install Redis support: pip install redis"
+                    )
+                    self.logger.error(error_msg)
+                    raise RuntimeError(error_msg)
+
+                if not self.config.redis_url:
+                    error_msg = (
+                        "Distributed mode requires Redis connection settings.\n"
+                        "Please configure REDIS_HOST, REDIS_PORT, etc. in settings.py"
+                    )
+                    self.logger.error(error_msg)
+                    raise RuntimeError(error_msg)
+
                # Test the Redis connection
                 try:
                     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
                     test_queue = RedisPriorityQueue(self.config.redis_url)
                     await test_queue.connect()
                     await test_queue.close()
-                    self.logger.debug("
+                    self.logger.debug("Distributed mode: Redis connection verified")
                     return QueueType.REDIS
                 except Exception as e:
-
-
+                    error_msg = (
+                        f"Distributed mode requires Redis, but the Redis server could not be reached.\n"
+                        f"Error: {e}\n"
+                        f"Redis URL: {self.config.redis_url}\n"
+                        f"Please check:\n"
+                        f"  1. whether the Redis service is running\n"
+                        f"  2. whether the Redis connection settings are correct\n"
+                        f"  3. whether the network connection is working"
+                    )
+                    self.logger.error(error_msg)
+                    raise RuntimeError(error_msg) from e
             else:
-
-
+                # Non-distributed mode: QUEUE_TYPE='redis' may fall back to memory
+                # This provides backward compatibility and better fault tolerance
+                if REDIS_AVAILABLE and self.config.redis_url:
+                    # Test the Redis connection
+                    try:
+                        from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+                        test_queue = RedisPriorityQueue(self.config.redis_url)
+                        await test_queue.connect()
+                        await test_queue.close()
+                        self.logger.debug("Redis mode: Redis available, using distributed queue")
+                        return QueueType.REDIS
+                    except Exception as e:
+                        self.logger.warning(f"Redis mode: Redis unavailable ({e}), falling back to memory queue")
+                        return QueueType.MEMORY
+                else:
+                    self.logger.warning("Redis mode: Redis not configured, falling back to memory queue")
+                    return QueueType.MEMORY
 
         elif self.config.queue_type == QueueType.MEMORY:
             return QueueType.MEMORY
@@ -489,9 +543,21 @@ class QueueManager:
         except Exception as e:
             self.logger.warning(f"Queue health check failed: {e}")
             self._health_status = "unhealthy"
-
-            #
-            if self.
+
+            # In distributed mode a failed Redis health check must raise an error
+            if self.config.run_mode == 'distributed':
+                error_msg = (
+                    f"Redis health check failed in distributed mode.\n"
+                    f"Error: {e}\n"
+                    f"Redis URL: {self.config.redis_url}\n"
+                    f"Distributed mode does not allow falling back to the memory queue; please fix the Redis connection."
+                )
+                self.logger.error(error_msg)
+                raise RuntimeError(error_msg) from e
+
+            # Non-distributed mode: if this is a Redis queue and the health check failed, try switching to the memory queue
+            # Fallback is only allowed for AUTO mode
+            if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
                 self.logger.info("Redis queue unavailable, attempting to switch to memory queue...")
                 try:
                     await self._queue.close()