crawlo 1.1.5.tar.gz → 1.1.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- {crawlo-1.1.5/crawlo.egg-info → crawlo-1.1.6}/PKG-INFO +1 -1
- crawlo-1.1.6/crawlo/__version__.py +1 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/run.py +7 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/startproject.py +134 -15
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/config.py +3 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/core/engine.py +1 -1
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/filters/aioredis_filter.py +4 -3
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/mode_manager.py +9 -3
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/pipeline_manager.py +11 -6
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/redis_dedup_pipeline.py +3 -2
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/project.py +38 -3
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/queue/queue_manager.py +15 -1
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/queue/redis_priority_queue.py +37 -15
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/settings/default_settings.py +3 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/settings/setting_manager.py +24 -1
- {crawlo-1.1.5 → crawlo-1.1.6/crawlo.egg-info}/PKG-INFO +1 -1
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo.egg-info/SOURCES.txt +4 -0
- crawlo-1.1.6/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +82 -0
- crawlo-1.1.6/tests/test_double_crawlo_fix.py +208 -0
- crawlo-1.1.6/tests/test_double_crawlo_fix_simple.py +125 -0
- crawlo-1.1.6/tests/test_queue_manager_double_crawlo.py +231 -0
- crawlo-1.1.5/crawlo/__version__.py +0 -1
- {crawlo-1.1.5 → crawlo-1.1.6}/LICENSE +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/MANIFEST.in +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/README.md +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/cleaners/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/cleaners/data_formatter.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/cleaners/encoding_converter.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/cleaners/text_cleaner.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/cli.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/check.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/list.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/stats.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/utils.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/config_validator.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/core/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/core/processor.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/core/scheduler.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/crawler.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/event.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/exceptions.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/extension/logging_extension.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/items/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/items/base.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/items/fields.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/items/items.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/middleware_manager.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/network/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/network/request.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/network/response.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/mysql_pipeline.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/stats_collector.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/subscriber.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/task_manager.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/run.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/settings.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/date_tools.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/error_handler.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/log.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/redis_connection_pool.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/request.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/system.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/tools.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo/utils/url.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/examples/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/pyproject.toml +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/requirements.txt +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/setup.cfg +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/__init__.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/cleaners_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/config_validation_demo.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/date_tools_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/env_config_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/error_handling_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/response_improvements_example.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_cleaners.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_comprehensive.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_config_validator.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_date_tools.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_edge_cases.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_env_config.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_final_validation.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_integration.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_parsel.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_performance.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_redis_config.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_redis_queue.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_request_serialization.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_response_improvements.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_scheduler.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_simple_response.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_template_content.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/test_tools.py +0 -0
- {crawlo-1.1.5 → crawlo-1.1.6}/tests/tools_example.py +0 -0
crawlo-1.1.6/crawlo/__version__.py (new file)

```diff
@@ -0,0 +1 @@
+__version__ = "1.1.6"
```

{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/run.py

```diff
@@ -8,6 +8,7 @@
 import sys
 import asyncio
 import configparser
+import os
 from pathlib import Path
 from importlib import import_module
 
@@ -32,6 +33,12 @@ def get_project_root():
     向上查找 crawlo.cfg 来确定项目根目录
     """
     current = Path.cwd()
+    # 首先检查当前目录及其子目录
+    for root, dirs, files in os.walk(current):
+        if "crawlo.cfg" in files:
+            return Path(root)
+
+    # 如果在子目录中没找到,再向上查找
     for _ in range(10):
         cfg = current / "crawlo.cfg"
         if cfg.exists():
```

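Both `get_project_root()` above and `_find_project_root()` in crawlo/project.py (further down) now scan downward with `os.walk` for `crawlo.cfg` before falling back to the upward search. A minimal standalone sketch of that lookup order; `find_project_root` is a hypothetical helper, not part of the package:

```python
import os
from pathlib import Path
from typing import Optional


def find_project_root(start: Optional[Path] = None) -> Optional[Path]:
    """Sketch of the new lookup order: scan downward first, then walk upward."""
    start = start or Path.cwd()

    # 1. Look for crawlo.cfg in the start directory and any of its subdirectories.
    for root, _dirs, files in os.walk(start):
        if "crawlo.cfg" in files:
            return Path(root)

    # 2. Fall back to walking up a bounded number of parent directories.
    current = start
    for _ in range(10):
        if (current / "crawlo.cfg").exists():
            return current
        if current.parent == current:  # reached the filesystem root
            break
        current = current.parent
    return None


if __name__ == "__main__":
    print(find_project_root() or "no crawlo.cfg found")
```
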
{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/commands/startproject.py

```diff
@@ -10,7 +10,7 @@ import re
 import sys
 import os
 from pathlib import Path
-from typing import Optional
+from typing import Optional, List
 
 # 添加项目根目录到路径,以便能够导入utils模块
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
```

```diff
@@ -56,6 +56,19 @@ TEMPLATE_TYPES = {
     'gentle': '温和模板 - 低负载配置,对目标网站友好'
 }
 
+# 可选的模块组件
+OPTIONAL_MODULES = {
+    'mysql': 'MySQL数据库支持',
+    'mongodb': 'MongoDB数据库支持',
+    'redis': 'Redis支持(分布式队列和去重)',
+    'proxy': '代理支持',
+    'monitoring': '监控和性能分析',
+    'dedup': '去重功能',
+    'httpx': 'HttpX下载器',
+    'aiohttp': 'AioHttp下载器',
+    'curl': 'CurlCffi下载器'
+}
+
 
 def show_error_panel(title, content):
     """显示错误面板的简单实现"""
```

```diff
@@ -84,9 +97,10 @@ def _render_template(tmpl_path, context):
     return content
 
 
-def _copytree_with_templates(src, dst, context, template_type='default'):
+def _copytree_with_templates(src, dst, context, template_type='default', modules: List[str] = None):
     """
     递归复制目录,将 .tmpl 文件渲染后复制(去除 .tmpl 后缀),其他文件直接复制。
+    支持选择性模块复制。
     """
     src_path = Path(src)
     dst_path = Path(dst)
```

```diff
@@ -96,24 +110,43 @@ def _copytree_with_templates(src, dst, context, template_type='default'):
         rel_path = item.relative_to(src_path)
         dst_item = dst_path / rel_path
 
+        # 检查是否应该包含此文件(基于模块选择)
+        if not _should_include_file(rel_path, modules):
+            continue
+
         if item.is_dir():
             dst_item.mkdir(parents=True, exist_ok=True)
         else:
             if item.suffix == '.tmpl':
+                rendered_content = None
                 # 处理特定模板类型的设置文件
-                if item.name == 'settings.py.tmpl'
-                    #
-
-
-
-
+                if item.name == 'settings.py.tmpl':
+                    # 对于设置文件,根据模板类型选择相应的内容模板
+                    if template_type != 'default':
+                        # 使用特定模板类型的设置文件
+                        template_file_name = f'settings_{template_type}.py.tmpl'
+                        template_file_path = src_path / template_file_name
+                        if template_file_path.exists():
+                            rendered_content = _render_template(template_file_path, context)
+                        else:
+                            # 如果特定模板不存在,使用默认模板
+                            rendered_content = _render_template(item, context)
                     else:
-                        #
+                        # 使用默认模板
                         rendered_content = _render_template(item, context)
+                # 跳过其他以 settings_ 开头的模板文件,避免重复处理
+                elif item.name.startswith('settings_') and item.name.endswith('.py.tmpl'):
+                    continue
                 else:
                     rendered_content = _render_template(item, context)
 
-
+                # 确保设置文件始终命名为 settings.py
+                if item.name == 'settings.py.tmpl':
+                    # 特殊处理设置模板文件,统一生成为 settings.py
+                    final_dst = dst_item.parent / 'settings.py'
+                else:
+                    final_dst = dst_item.with_suffix('')
+
                 final_dst.parent.mkdir(parents=True, exist_ok=True)
                 with open(final_dst, 'w', encoding='utf-8') as f:
                     f.write(rendered_content)
```

```diff
@@ -121,6 +154,54 @@ def _copytree_with_templates(src, dst, context, template_type='default'):
             shutil.copy2(item, dst_item)
 
 
+def _should_include_file(rel_path, modules: List[str]) -> bool:
+    """
+    根据选择的模块决定是否包含文件
+    """
+    if modules is None:
+        # 如果没有指定模块,则包含所有文件
+        return True
+
+    # 基础文件始终包含
+    basic_files = [
+        '__init__.py.tmpl',
+        'settings.py.tmpl',
+        'spiders/__init__.py.tmpl',
+        'items.py.tmpl',
+        'middlewares.py.tmpl',
+        'run.py.tmpl'
+    ]
+
+    path_str = str(rel_path).replace('\\', '/')
+
+    # 始终包含基础文件
+    if path_str in basic_files:
+        return True
+
+    # 根据模块选择包含特定文件
+    if 'mysql' in modules and 'mysql' in path_str:
+        return True
+    if 'mongodb' in modules and 'mongo' in path_str:
+        return True
+    if 'redis' in modules and 'redis' in path_str:
+        return True
+    if 'proxy' in modules and 'proxy' in path_str:
+        return True
+    if 'monitoring' in modules and ('monitor' in path_str or 'stats' in path_str):
+        return True
+    if 'dedup' in modules and 'dedup' in path_str:
+        return True
+    if 'httpx' in modules and 'httpx' in path_str:
+        return True
+    if 'aiohttp' in modules and 'aiohttp' in path_str:
+        return True
+    if 'curl' in modules and 'cffi' in path_str:
+        return True
+
+    # 默认不包含特定模块文件
+    return False
+
+
 def validate_project_name(project_name: str) -> tuple[bool, str]:
     """
     验证项目名称是否有效
```

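The `_should_include_file` helper added above decides inclusion by simple substring matches between the selected module names and the template path (with a couple of aliases such as `mongodb` matching `mongo` and `curl` matching `cffi`). A rough, self-contained illustration of the resulting behaviour; the `selected` list and candidate paths are made up for the example, and the real mapping in startproject.py (whose diff continues below) is slightly richer:

```python
# Substring-based module filtering, mirroring the idea of _should_include_file above.
selected = ['mysql', 'redis']  # hypothetical --modules selection

basic_files = {'__init__.py.tmpl', 'settings.py.tmpl', 'spiders/__init__.py.tmpl',
               'items.py.tmpl', 'middlewares.py.tmpl', 'run.py.tmpl'}

candidates = [
    'items.py.tmpl',                     # basic file: always kept
    'pipelines/mysql_pipeline.py.tmpl',  # kept: 'mysql' selected and present in the path
    'pipelines/mongo_pipeline.py.tmpl',  # skipped: 'mongodb' was not selected
    'filters/aioredis_filter.py.tmpl',   # kept: 'redis' selected and present in the path
]

for path in candidates:
    keep = path in basic_files or any(mod in path for mod in selected)
    print(f"{path:38s} -> {'keep' if keep else 'skip'}")
```
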
```diff
@@ -184,18 +265,52 @@ def show_template_options():
         print(f" {template_type}: {description}")
 
 
+def show_module_options():
+    """显示可用的模块选项"""
+    if RICH_AVAILABLE:
+        table = Table(title="可选模块组件", show_header=True, header_style="bold magenta")
+        table.add_column("模块", style="cyan", no_wrap=True)
+        table.add_column("描述", style="green")
+
+        for module, description in OPTIONAL_MODULES.items():
+            table.add_row(module, description)
+
+        console.print(table)
+    else:
+        print("可选模块组件:")
+        for module, description in OPTIONAL_MODULES.items():
+            print(f" {module}: {description}")
+
+
 def main(args):
-    if len(args) < 1
-        console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name> [template_type]")
+    if len(args) < 1:
+        console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name> [template_type] [--modules module1,module2]")
         console.print("💡 Examples:")
         console.print(" [blue]crawlo startproject[/blue] my_spider_project")
         console.print(" [blue]crawlo startproject[/blue] news_crawler simple")
-        console.print(" [blue]crawlo startproject[/blue] ecommerce_spider distributed")
+        console.print(" [blue]crawlo startproject[/blue] ecommerce_spider distributed --modules mysql,proxy")
         show_template_options()
+        show_module_options()
         return 1
 
+    # 解析参数
     project_name = args[0]
-    template_type =
+    template_type = 'default'
+    modules = None
+
+    # 解析可选参数
+    if len(args) > 1:
+        for i, arg in enumerate(args[1:], 1):
+            if arg.startswith('--modules='):
+                modules_str = arg.split('=', 1)[1]
+                modules = [m.strip() for m in modules_str.split(',') if m.strip()]
+            elif arg.startswith('--modules'):
+                # 处理 --modules module1,module2 格式
+                if i + 1 < len(args):
+                    modules_str = args[i + 1]
+                    modules = [m.strip() for m in modules_str.split(',') if m.strip()]
+            elif not arg.startswith('--') and arg in TEMPLATE_TYPES:
+                template_type = arg
 
     # 验证模板类型
     if template_type not in TEMPLATE_TYPES:
```

```diff
@@ -249,7 +364,7 @@ def main(args):
 
     # 3. 复制并渲染项目包内容
     package_dir = project_dir / project_name
-    _copytree_with_templates(template_dir, package_dir, context, template_type)
+    _copytree_with_templates(template_dir, package_dir, context, template_type, modules)
     console.print(f":white_check_mark: Created project package: [green]{package_dir}[/green]")
 
     # 4. 创建 logs 目录
```

```diff
@@ -267,6 +382,10 @@
     # 显示使用的模板类型
     if template_type != 'default':
         console.print(f":information: 使用模板类型: [bold blue]{template_type}[/bold blue] - {TEMPLATE_TYPES[template_type]}")
+
+    # 显示选择的模块
+    if modules:
+        console.print(f":information: 选择的模块: [bold blue]{', '.join(modules)}[/bold blue]")
 
     # 下一步操作提示(对齐美观 + 语法高亮)
     next_steps = f"""
```

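Taken together, the startproject.py changes add a `--modules` option (both the `--modules=a,b` and `--modules a,b` forms are accepted) next to the existing positional template type, e.g. `crawlo startproject ecommerce_spider distributed --modules mysql,proxy`. A self-contained sketch of the same argument handling; the helper name is hypothetical and the template names are inferred from the bundled `settings_*.py.tmpl` files:

```python
from typing import List, Optional, Tuple

TEMPLATE_TYPES = {'default', 'simple', 'distributed', 'high_performance', 'gentle'}


def parse_startproject_args(args: List[str]) -> Tuple[str, str, Optional[List[str]]]:
    """Return (project_name, template_type, modules) from startproject-style arguments."""
    project_name = args[0]
    template_type = 'default'
    modules: Optional[List[str]] = None

    for i, arg in enumerate(args[1:], 1):
        if arg.startswith('--modules='):
            modules = [m.strip() for m in arg.split('=', 1)[1].split(',') if m.strip()]
        elif arg == '--modules' and i + 1 < len(args):
            modules = [m.strip() for m in args[i + 1].split(',') if m.strip()]
        elif not arg.startswith('--') and arg in TEMPLATE_TYPES:
            template_type = arg
    return project_name, template_type, modules


print(parse_startproject_args(['ecommerce_spider', 'distributed', '--modules', 'mysql,proxy']))
# -> ('ecommerce_spider', 'distributed', ['mysql', 'proxy'])
```
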
{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/config.py

```diff
@@ -158,6 +158,7 @@ class CrawloConfig:
         redis_host: str = '127.0.0.1',
         redis_port: int = 6379,
         redis_password: Optional[str] = None,
+        redis_db: int = 0,  # 添加 redis_db 参数
         project_name: str = 'crawlo',
         concurrency: int = 16,
         download_delay: float = 1.0,
@@ -170,6 +171,7 @@
             redis_host: Redis 服务器地址
             redis_port: Redis 端口
             redis_password: Redis 密码
+            redis_db: Redis 数据库编号
             project_name: 项目名称(用于命名空间)
             concurrency: 并发数
             download_delay: 下载延迟
@@ -179,6 +181,7 @@
             redis_host=redis_host,
             redis_port=redis_port,
             redis_password=redis_password,
+            redis_db=redis_db,  # 传递 redis_db 参数
             project_name=project_name,
             CONCURRENCY=concurrency,
             DOWNLOAD_DELAY=download_delay,
```

{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/core/engine.py

```diff
@@ -70,7 +70,7 @@ class Engine(object):
     def engine_start(self):
         self.running = True
         self.logger.info(
-            f"Crawlo (version {self.settings.
+            f"Crawlo (version {self.settings.get('VERSION')}) started. "
             f"(project name : {self.settings.get('PROJECT_NAME')})"
         )
 
```

{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/filters/aioredis_filter.py

```diff
@@ -75,7 +75,8 @@ class AioRedisFilter(BaseFilter):
     def create_instance(cls, crawler) -> 'BaseFilter':
         """从爬虫配置创建过滤器实例"""
         redis_url = crawler.settings.get('REDIS_URL', 'redis://localhost:6379')
-        decode_responses
+        # 确保 decode_responses=False 以避免编码问题
+        decode_responses = False  # crawler.settings.get_bool('DECODE_RESPONSES', False)
         ttl_setting = crawler.settings.get_int('REDIS_TTL')
 
         # 处理TTL设置
@@ -84,7 +85,7 @@
         ttl = max(0, int(ttl_setting)) if ttl_setting > 0 else None
 
         try:
-            #
+            # 使用优化的连接池,确保 decode_responses=False
             redis_pool = get_redis_pool(
                 redis_url,
                 max_connections=20,
@@ -92,7 +93,7 @@
                 socket_timeout=30,
                 health_check_interval=30,
                 retry_on_timeout=True,
-                decode_responses=decode_responses,
+                decode_responses=decode_responses,  # 确保不自动解码响应
                 encoding='utf-8'
             )
 
```

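This filter, and the Redis priority queue further down, now pin `decode_responses=False` on their connection pools because the values they store are pickled bytes rather than UTF-8 text. A quick standalone illustration of why automatic response decoding breaks on such payloads (standard library only, no crawlo or redis code):

```python
import pickle

payload = pickle.dumps({"url": "https://example.com", "priority": 5})

# Pickle output is binary; with protocol 2+ the stream even starts with a 0x80 byte,
# so decoding it as UTF-8 (which decode_responses=True would effectively do) fails.
try:
    payload.decode("utf-8")
except UnicodeDecodeError as exc:
    print(f"decoding pickled bytes as UTF-8 fails: {exc}")

# Handled as raw bytes, the same payload round-trips cleanly.
print(pickle.loads(payload))
```
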
{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/mode_manager.py

```diff
@@ -47,21 +47,24 @@ class ModeManager:
         redis_host: str = '127.0.0.1',
         redis_port: int = 6379,
         redis_password: Optional[str] = None,
+        redis_db: int = 0,  # 添加 redis_db 参数
         project_name: str = 'crawlo'
     ) -> Dict[str, Any]:
         """获取分布式模式配置"""
-        # 构建 Redis URL
+        # 构建 Redis URL,使用传入的 redis_db 参数
         if redis_password:
-            redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/
+            redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
         else:
-            redis_url = f'redis://{redis_host}:{redis_port}/
+            redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
 
         return {
+            'PROJECT_NAME': project_name,  # 添加项目名称到配置中
             'QUEUE_TYPE': 'redis',
             'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
             'REDIS_HOST': redis_host,
             'REDIS_PORT': redis_port,
             'REDIS_PASSWORD': redis_password,
+            'REDIS_DB': redis_db,  # 添加 Redis 数据库编号到配置中
             'REDIS_URL': redis_url,
             'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',  # 使用统一命名规范
             # Redis key配置已移至各组件中,使用统一的命名规范
@@ -111,6 +114,7 @@ class ModeManager:
             redis_host=kwargs.get('redis_host', '127.0.0.1'),
             redis_port=kwargs.get('redis_port', 6379),
             redis_password=kwargs.get('redis_password'),
+            redis_db=kwargs.get('redis_db', 0),  # 添加 redis_db 参数
             project_name=kwargs.get('project_name', 'crawlo')
         )
 
@@ -160,6 +164,7 @@ def distributed_mode(
     redis_host: str = '127.0.0.1',
     redis_port: int = 6379,
     redis_password: Optional[str] = None,
+    redis_db: int = 0,  # 添加 redis_db 参数
     project_name: str = 'crawlo',
     **kwargs
 ) -> Dict[str, Any]:
@@ -169,6 +174,7 @@ def distributed_mode(
         redis_host=redis_host,
         redis_port=redis_port,
         redis_password=redis_password,
+        redis_db=redis_db,  # 传递 redis_db 参数
         project_name=project_name,
         **kwargs
     )
```

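`distributed_mode()` and the related config factory now accept a `redis_db` argument and embed it in the generated `REDIS_URL` instead of leaving the database index off the end of the URL. A small sketch of that URL construction with a hypothetical helper name:

```python
from typing import Optional


def build_redis_url(host: str = "127.0.0.1", port: int = 6379,
                    password: Optional[str] = None, db: int = 0) -> str:
    """Mirror the URL building shown above, including the database index."""
    if password:
        return f"redis://:{password}@{host}:{port}/{db}"
    return f"redis://{host}:{port}/{db}"


print(build_redis_url())                         # redis://127.0.0.1:6379/0
print(build_redis_url(password="s3cret", db=3))  # redis://:s3cret@127.0.0.1:6379/3
```
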
{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/pipeline_manager.py

```diff
@@ -30,12 +30,17 @@ class PipelineManager:
 
     def _add_pipelines(self, pipelines):
         for pipeline in pipelines:
-
-
-
-
-
-
+            try:
+                pipeline_cls = load_class(pipeline)
+                if not hasattr(pipeline_cls, 'from_crawler'):
+                    raise PipelineInitError(
+                        f"Pipeline init failed, must inherit from `BasePipeline` or have a `create_instance` method"
+                    )
+                self.pipelines.append(pipeline_cls.from_crawler(self.crawler))
+            except Exception as e:
+                self.logger.error(f"Failed to load pipeline {pipeline}: {e}")
+                # 可以选择继续加载其他管道或抛出异常
+                raise
         if pipelines:
             self.logger.info(f"enabled pipelines: \n {pformat(pipelines)}")
 
```

{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/pipelines/redis_dedup_pipeline.py

```diff
@@ -13,13 +13,14 @@
 """
 
 import hashlib
-from typing import
+from typing import Optional
+
 import redis
 
 from crawlo import Item
+from crawlo.exceptions import DropItem
 from crawlo.spider import Spider
 from crawlo.utils.log import get_logger
-from crawlo.exceptions import DropItem
 
 
 class RedisDedupPipeline:
```

{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/project.py

```diff
@@ -30,6 +30,15 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
     2. 存在 '__init__.py' 和 'settings.py'(即 Python 包)
     """
     path = os.path.abspath(start_path)
+
+    # 首先检查当前目录及其子目录
+    for root, dirs, files in os.walk(path):
+        if "crawlo.cfg" in files:
+            cfg_path = os.path.join(root, "crawlo.cfg")
+            logger.info(f"✅ 找到项目配置文件: {cfg_path}")
+            return root
+
+    # 如果在子目录中没找到,再向上查找
     while True:
         cfg_file = os.path.join(path, "crawlo.cfg")
         if os.path.isfile(cfg_file):
@@ -128,17 +137,43 @@ def load_class(_path):
         raise TypeError(f"args expect str or object, got {_path}")
 
     module_name, class_name = _path.rsplit('.', 1)
-
+
+    try:
+        module = import_module(module_name)
+    except ImportError as e:
+        # 尝试不同的导入方式
+        try:
+            # 尝试直接导入完整路径
+            module = import_module(_path)
+            return module
+        except ImportError:
+            pass
+        raise ImportError(f"Cannot import module {module_name}: {e}")
 
     try:
         cls = getattr(module, class_name)
     except AttributeError:
-
+        # 提供更详细的错误信息
+        available_attrs = [attr for attr in dir(module) if not attr.startswith('_')]
+        raise NameError(f"Module {module_name!r} has no class named {class_name!r}. Available attributes: {available_attrs}")
     return cls
 
 
 def merge_settings(spider, settings):
     spider_name = getattr(spider, 'name', 'UnknownSpider')
+    # 检查 settings 是否为 SettingManager 实例
+    if not hasattr(settings, 'update_attributes'):
+        logger.error(f"merge_settings 接收到的 settings 不是 SettingManager 实例: {type(settings)}")
+        # 如果是字典,创建一个新的 SettingManager 实例
+        if isinstance(settings, dict):
+            from crawlo.settings.setting_manager import SettingManager
+            new_settings = SettingManager()
+            new_settings.update_attributes(settings)
+            settings = new_settings
+        else:
+            logger.error("无法处理的 settings 类型")
+            return
+
     if hasattr(spider, 'custom_settings'):
         custom_settings = getattr(spider, 'custom_settings')
         settings.update_attributes(custom_settings)
@@ -150,4 +185,4 @@ async def common_call(func: Callable, *args, **kwargs):
     if iscoroutinefunction(func):
         return await func(*args, **kwargs)
     else:
-        return func(*args, **kwargs)
+        return func(*args, **kwargs)
```

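`load_class()` above now retries the import with the full dotted path and, when the attribute lookup fails, reports which names the module actually exposes. A compact standalone sketch of that resolution logic using only the standard library (the real function also accepts non-string inputs):

```python
from importlib import import_module


def load_class(path: str):
    """Resolve 'package.module.Name', falling back to importing the full path."""
    module_name, class_name = path.rsplit('.', 1)
    try:
        module = import_module(module_name)
    except ImportError as exc:
        try:
            return import_module(path)  # maybe the whole string is itself a module
        except ImportError:
            raise ImportError(f"Cannot import module {module_name}: {exc}")
    try:
        return getattr(module, class_name)
    except AttributeError:
        available = [a for a in dir(module) if not a.startswith('_')]
        raise NameError(f"Module {module_name!r} has no attribute {class_name!r}. "
                        f"Available: {available}")


print(load_class('collections.OrderedDict'))  # <class 'collections.OrderedDict'>
print(load_class('os.path.join'))             # <function join ...>
```
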
{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/queue/queue_manager.py

```diff
@@ -267,7 +267,21 @@ class QueueManager:
         if ':' in self.config.queue_name:
             parts = self.config.queue_name.split(':')
             if len(parts) >= 2:
-
+                # 处理可能的双重 crawlo 前缀
+                if parts[0] == "crawlo" and parts[1] == "crawlo":
+                    # 双重 crawlo 前缀,取第三个部分作为项目名称
+                    if len(parts) >= 3:
+                        project_name = parts[2]
+                    else:
+                        project_name = "default"
+                elif parts[0] == "crawlo":
+                    # 正常的 crawlo 前缀,取第二个部分作为项目名称
+                    project_name = parts[1]
+                else:
+                    # 没有 crawlo 前缀,使用第一个部分作为项目名称
+                    project_name = parts[0]
+            else:
+                project_name = self.config.queue_name or "default"
         else:
             project_name = self.config.queue_name or "default"
 
```

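The hunk above, covered by the new tests/test_queue_manager_double_crawlo.py and related test files, makes project-name extraction tolerant of queue names that accidentally carry a doubled `crawlo:` prefix. A standalone sketch of the same parsing rule with a hypothetical helper name:

```python
def project_from_queue_name(queue_name: str) -> str:
    """Extract the project name, tolerating a doubled 'crawlo:' prefix."""
    if ':' not in queue_name:
        return queue_name or "default"
    parts = queue_name.split(':')
    if parts[0] == "crawlo" and parts[1] == "crawlo":
        return parts[2] if len(parts) >= 3 else "default"  # doubled prefix
    if parts[0] == "crawlo":
        return parts[1]                                     # normal prefix
    return parts[0]                                         # no crawlo prefix


print(project_from_queue_name("crawlo:my_project:queue:requests"))         # my_project
print(project_from_queue_name("crawlo:crawlo:my_project:queue:requests"))  # my_project
print(project_from_queue_name("my_project:queue:requests"))                # my_project
```
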
{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/queue/redis_priority_queue.py

```diff
@@ -1,17 +1,16 @@
+import asyncio
 import pickle
 import time
-import
+import traceback
 from typing import Optional
+
 import redis.asyncio as aioredis
-import traceback
-import os
 
 from crawlo import Request
-from crawlo.utils.log import get_logger
-from crawlo.utils.request_serializer import RequestSerializer
 from crawlo.utils.error_handler import ErrorHandler
+from crawlo.utils.log import get_logger
 from crawlo.utils.redis_connection_pool import get_redis_pool, OptimizedRedisConnectionPool
-
+from crawlo.utils.request_serializer import RequestSerializer
 
 logger = get_logger(__name__)
 error_handler = ErrorHandler(__name__)
@@ -45,11 +44,8 @@ class RedisPriorityQueue:
         if queue_name is None:
             self.queue_name = f"crawlo:{module_name}:queue:requests"
         else:
-            #
-
-                self.queue_name = f"crawlo:{module_name}:queue:requests"
-            else:
-                self.queue_name = queue_name
+            # 保持用户提供的队列名称不变,不做修改
+            self.queue_name = queue_name
 
         # 如果未提供 processing_queue,则根据 queue_name 自动生成
         if processing_queue is None:
@@ -85,14 +81,16 @@ class RedisPriorityQueue:
 
         for attempt in range(max_retries):
             try:
-                #
+                # 使用优化的连接池,确保 decode_responses=False 以避免编码问题
                 self._redis_pool = get_redis_pool(
                     self.redis_url,
                     max_connections=self.max_connections,
                     socket_connect_timeout=5,
                     socket_timeout=30,
                     health_check_interval=30,
-                    retry_on_timeout=True
+                    retry_on_timeout=True,
+                    decode_responses=False,  # 确保不自动解码响应
+                    encoding='utf-8'
                 )
 
                 self._redis = await self._redis_pool.get_connection()
@@ -131,7 +129,15 @@ class RedisPriorityQueue:
             # 🔥 使用专用的序列化工具清理 Request
             clean_request = self.request_serializer.prepare_for_serialization(request)
 
-
+            # 确保序列化后的数据可以被正确反序列化
+            try:
+                serialized = pickle.dumps(clean_request)
+                # 验证序列化数据可以被反序列化
+                pickle.loads(serialized)
+            except Exception as serialize_error:
+                logger.error(f"❌ 请求序列化验证失败 (Module: {self.module_name}): {serialize_error}")
+                return False
+
             pipe = self._redis.pipeline()
             pipe.zadd(self.queue_name, {key: score})
             pipe.hset(f"{self.queue_name}:data", key, serialized)
@@ -174,7 +180,23 @@ class RedisPriorityQueue:
                 pipe.hdel(f"{self.queue_name}:data", key)
                 await pipe.execute()
 
-
+                # 更安全的反序列化方式
+                try:
+                    # 首先尝试标准的 pickle 反序列化
+                    request = pickle.loads(serialized)
+                    return request
+                except UnicodeDecodeError:
+                    # 如果出现编码错误,尝试使用 latin1 解码
+                    request = pickle.loads(serialized, encoding='latin1')
+                    return request
+                except Exception as pickle_error:
+                    # 如果pickle反序列化失败,记录错误并跳过这个任务
+                    logger.error(f"❌ 无法反序列化请求数据 (Module: {self.module_name}): {pickle_error}")
+                    # 从processing队列中移除这个无效的任务
+                    await self._redis.zrem(self.processing_queue, processing_key)
+                    await self._redis.hdel(f"{self.processing_queue}:data", processing_key)
+                    # 继续尝试下一个任务
+                    continue
 
                 # 检查是否超时
                 if asyncio.get_event_loop().time() - start_time > timeout:
```

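The queue changes above validate every request with a pickle round trip before it is enqueued, and when a stored payload cannot be deserialized they fall back to a latin1 decode attempt and otherwise drop the entry and continue. A minimal sketch of the validate-before-enqueue part, independent of Redis and of crawlo's Request type:

```python
import logging
import pickle
from typing import Optional

logger = logging.getLogger("queue_sketch")


def safe_serialize(obj) -> Optional[bytes]:
    """Serialize and immediately verify the round trip, as the queue now does before zadd/hset."""
    try:
        data = pickle.dumps(obj)
        pickle.loads(data)  # verification step: fail fast rather than at dequeue time
        return data
    except Exception as exc:  # unpicklable members (locks, sockets, lambdas, ...) end up here
        logger.error("serialization check failed: %s", exc)
        return None


print(safe_serialize({"url": "https://example.com", "meta": {"retry": 0}}) is not None)  # True
print(safe_serialize(lambda response: response))  # None: lambdas cannot be pickled
```
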
{crawlo-1.1.5 → crawlo-1.1.6}/crawlo/settings/setting_manager.py

```diff
@@ -97,4 +97,27 @@ class SettingManager(MutableMapping):
         self.set(key, value)
 
     def copy(self):
-        return deepcopy(self)
+        return deepcopy(self)
+
+    def __deepcopy__(self, memo):
+        """
+        自定义深度复制方法,避免复制logger等不可pickle的对象
+        """
+        # 创建一个新的实例
+        cls = self.__class__
+        new_instance = cls.__new__(cls)
+
+        # 复制attributes字典,但排除不可pickle的对象
+        new_attributes = {}
+        for key, value in self.attributes.items():
+            try:
+                # 尝试深度复制值
+                new_attributes[key] = deepcopy(value, memo)
+            except Exception:
+                # 如果复制失败,保留原始引用(对于logger等对象)
+                new_attributes[key] = value
+
+        # 设置新实例的attributes
+        new_instance.attributes = new_attributes
+
+        return new_instance
```
