crawlo 1.2.4.tar.gz → 1.2.6.tar.gz
This diff shows the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of crawlo might be problematic.
- {crawlo-1.2.4/crawlo.egg-info → crawlo-1.2.6}/PKG-INFO +1 -1
- crawlo-1.2.6/crawlo/__version__.py +1 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cli.py +12 -5
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/startproject.py +22 -6
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/core/engine.py +3 -1
- crawlo-1.2.6/crawlo/core/scheduler.py +240 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/filters/aioredis_filter.py +44 -91
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/queue/queue_manager.py +47 -8
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/queue/redis_priority_queue.py +9 -2
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/settings/default_settings.py +5 -7
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/settings.py.tmpl +3 -65
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/settings_distributed.py.tmpl +4 -7
- crawlo-1.2.6/crawlo/templates/project/settings_gentle.py.tmpl +101 -0
- crawlo-1.2.6/crawlo/templates/project/settings_high_performance.py.tmpl +135 -0
- crawlo-1.2.6/crawlo/templates/project/settings_simple.py.tmpl +99 -0
- {crawlo-1.2.4/crawlo/templates/project → crawlo-1.2.6/crawlo/templates}/run.py.tmpl +1 -3
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/redis_connection_pool.py +19 -2
- {crawlo-1.2.4 → crawlo-1.2.6/crawlo.egg-info}/PKG-INFO +1 -1
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/SOURCES.txt +1 -1
- crawlo-1.2.4/crawlo/__version__.py +0 -1
- crawlo-1.2.4/crawlo/core/scheduler.py +0 -144
- crawlo-1.2.4/crawlo/templates/project/settings_gentle.py.tmpl +0 -134
- crawlo-1.2.4/crawlo/templates/project/settings_high_performance.py.tmpl +0 -156
- crawlo-1.2.4/crawlo/templates/project/settings_simple.py.tmpl +0 -109
- {crawlo-1.2.4 → crawlo-1.2.6}/LICENSE +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/MANIFEST.in +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/README.md +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cleaners/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cleaners/data_formatter.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cleaners/encoding_converter.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cleaners/text_cleaner.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/check.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/help.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/list.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/run.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/stats.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/utils.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/config.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/config_validator.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/core/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/core/processor.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/crawler.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/data/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/data/user_agents.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/event.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/exceptions.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/logging_extension.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/items/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/items/base.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/items/fields.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/items/items.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/middleware_manager.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/offsite.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/mode_manager.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/network/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/network/request.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/network/response.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/mysql_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/pipeline_manager.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/project.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/settings/setting_manager.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/stats_collector.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/subscriber.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/task_manager.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/date_tools.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/error_handler.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/log.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/request.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/system.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/tools.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/url.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/examples/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/pyproject.toml +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/requirements.txt +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/setup.cfg +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/__init__.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/cleaners_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/config_validation_demo.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/date_tools_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/env_config_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/error_handling_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/response_improvements_example.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_cleaners.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_comprehensive.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_config_validator.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_crawlo_proxy_integration.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_date_tools.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_default_header_middleware.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_double_crawlo_fix.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_download_delay_middleware.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_downloader_proxy_compatibility.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_edge_cases.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_env_config.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_final_validation.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_integration.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_offsite_middleware.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_parsel.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_performance.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_api.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_middleware.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_middleware_enhanced.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_real_scenario_proxy.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_config.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_queue.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_request_ignore_middleware.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_request_serialization.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_response_code_middleware.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_response_filter_middleware.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_response_improvements.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_retry_middleware.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_scheduler.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_simple_response.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_template_content.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_tools.py +0 -0
- {crawlo-1.2.4 → crawlo-1.2.6}/tests/tools_example.py +0 -0
crawlo/__version__.py (new file)

```diff
@@ -0,0 +1 @@
+__version__ = "1.2.6"
```
crawlo/cli.py

```diff
@@ -10,12 +10,19 @@ from crawlo.commands import get_commands
 def main():
     # Get the framework version number
     version_file = os.path.join(os.path.dirname(__file__), '__version__.py')
+    VERSION = '1.0.0'  # default version number
     if os.path.exists(version_file):
-        ...
+        try:
+            with open(version_file, 'r') as f:
+                content = f.read()
+            # Extract the version number with a regular expression
+            import re
+            version_match = re.search(r"__version__\s*=\s*['\"]([^'\"]*)['\"]", content)
+            if version_match:
+                VERSION = version_match.group(1)
+        except Exception:
+            # Fall back to the default version number if reading fails
+            pass
 
     # Get all available commands
     commands = get_commands()
```
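The new fallback reads `__version__.py` as plain text and extracts the version with a regular expression instead of importing the module, so a malformed file can never break the CLI. A minimal standalone sketch of that pattern (the file path and default value here are illustrative):

```python
import os
import re


def read_version(version_file: str, default: str = "1.0.0") -> str:
    """Extract __version__ from a Python file without importing it."""
    if not os.path.exists(version_file):
        return default
    try:
        with open(version_file, "r") as f:
            content = f.read()
        match = re.search(r"__version__\s*=\s*['\"]([^'\"]*)['\"]", content)
        return match.group(1) if match else default
    except Exception:
        # Any read or parse failure falls back to the default
        return default


print(read_version(os.path.join("crawlo", "__version__.py")))
```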
crawlo/commands/startproject.py

```diff
@@ -108,7 +108,11 @@ def _copytree_with_templates(src, dst, context, template_type='default', modules
 
     for item in src_path.rglob('*'):
         rel_path = item.relative_to(src_path)
-        ...
+        # run.py.tmpl needs special handling: place it in the project root directory
+        if item.name == 'run.py.tmpl':
+            dst_item = dst_path.parent / rel_path  # project root
+        else:
+            dst_item = dst_path / rel_path
 
         # Check whether this file should be included
         path_str = str(rel_path).replace('\\', '/')
@@ -147,6 +151,9 @@ def _copytree_with_templates(src, dst, context, template_type='default', modules
         if item.name == 'settings.py.tmpl':
             # Special case: settings templates are always generated as settings.py
             final_dst = dst_item.parent / 'settings.py'
+        # Special handling for run.py.tmpl
+        elif item.name == 'run.py.tmpl':
+            final_dst = dst_item.with_suffix('')  # strip the .tmpl suffix
         else:
             final_dst = dst_item.with_suffix('')
 
@@ -171,8 +178,8 @@ def _should_include_file(rel_path, modules: List[str]) -> bool:
         'settings.py.tmpl',
         'spiders/__init__.py.tmpl',
         'items.py.tmpl',
-        'middlewares.py.tmpl'
-        'run.py.tmpl'
+        'middlewares.py.tmpl'
+        # 'run.py.tmpl' was removed here because it now lives in the template root directory
     ]
 
     path_str = str(rel_path).replace('\\', '/')
@@ -364,16 +371,25 @@ def main(args):
     else:
         console.print("[yellow]⚠ Warning:[/yellow] template 'crawlo.cfg.tmpl' not found.")
 
-    # 3.
+    # 3. Render run.py.tmpl (placed in the project root directory)
+    run_template = TEMPLATES_DIR / 'run.py.tmpl'
+    if run_template.exists():
+        run_content = _render_template(run_template, context)
+        (project_dir / 'run.py').write_text(run_content, encoding='utf-8')
+        console.print(f":white_check_mark: Created [green]{project_dir / 'run.py'}[/green]")
+    else:
+        console.print("[yellow]⚠ Warning:[/yellow] template 'run.py.tmpl' not found.")
+
+    # 4. Copy and render the project package contents
     package_dir = project_dir / project_name
     _copytree_with_templates(template_dir, package_dir, context, template_type, modules)
     console.print(f":white_check_mark: Created project package: [green]{package_dir}[/green]")
 
-    #
+    # 5. Create the logs directory
     (project_dir / 'logs').mkdir(exist_ok=True)
     console.print(":white_check_mark: Created logs directory")
 
-    #
+    # 6. Create the output directory (for data output)
     (project_dir / 'output').mkdir(exist_ok=True)
     console.print(":white_check_mark: Created output directory")
 
```
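The copy loop now special-cases `run.py.tmpl` so the rendered `run.py` ends up in the project root rather than inside the package directory, while every other template keeps its relative location and simply loses the `.tmpl` suffix. A simplified sketch of that path logic (directory names are made up for illustration):

```python
from pathlib import Path


def destination_for(item: Path, src_root: Path, dst_root: Path) -> Path:
    """Mirror the template tree, but lift run.py.tmpl to the parent of dst_root."""
    rel_path = item.relative_to(src_root)
    if item.name == "run.py.tmpl":
        dst_item = dst_root.parent / rel_path   # project root
    else:
        dst_item = dst_root / rel_path          # inside the package directory
    # Strip the .tmpl suffix for the final file name
    return dst_item.with_suffix("")


src = Path("crawlo/templates/project")
dst = Path("myproject/myproject")
print(destination_for(src / "run.py.tmpl", src, dst))       # myproject/run.py
print(destination_for(src / "settings.py.tmpl", src, dst))  # myproject/myproject/settings.py
```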
crawlo/core/engine.py

```diff
@@ -88,8 +88,9 @@ class Engine(object):
         self.downloader = downloader_cls(self.crawler)
         if hasattr(self.downloader, 'open'):
             if asyncio.iscoroutinefunction(self.downloader.open):
-                ...
+                self.downloader.open()
             else:
+                # DownloaderBase.open() is a synchronous method; call it directly rather than awaiting it
                 self.downloader.open()
 
         self.processor = Processor(self.crawler)
@@ -97,6 +98,7 @@ class Engine(object):
         if asyncio.iscoroutinefunction(self.processor.open):
             await self.processor.open()
         else:
+            # Processor.open() is a synchronous method
             self.processor.open()
 
         self.start_requests = iter(spider.start_requests())
```
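Both engine hunks rely on the same dispatch idea: probe whether a component's `open()` is a coroutine function and only await it in that case, as the processor branch shows. A generic sketch of that dispatch, not tied to Crawlo's classes:

```python
import asyncio


async def open_component(component) -> None:
    """Call component.open(), awaiting it only when it is a coroutine function."""
    open_method = getattr(component, "open", None)
    if open_method is None:
        return
    if asyncio.iscoroutinefunction(open_method):
        await open_method()
    else:
        # Synchronous open(): call it directly instead of awaiting
        open_method()


class SyncComponent:
    def open(self):
        print("sync open")


class AsyncComponent:
    async def open(self):
        print("async open")


asyncio.run(open_component(SyncComponent()))
asyncio.run(open_component(AsyncComponent()))
```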
crawlo/core/scheduler.py (new file)

```diff
@@ -0,0 +1,240 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+from typing import Optional, Callable
+import traceback
+
+from crawlo.utils.log import get_logger
+from crawlo.utils.request import set_request
+from crawlo.utils.request_serializer import RequestSerializer
+from crawlo.utils.error_handler import ErrorHandler
+from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
+from crawlo.project import load_class, common_call
+
+
+class Scheduler:
+    def __init__(self, crawler, dupe_filter, stats, log_level, priority):
+        self.crawler = crawler
+        self.queue_manager: Optional[QueueManager] = None
+        self.request_serializer = RequestSerializer()  # handles serialization
+
+        self.logger = get_logger(name=self.__class__.__name__, level=log_level)
+        self.error_handler = ErrorHandler(self.__class__.__name__, log_level)
+        self.stats = stats
+        self.dupe_filter = dupe_filter
+        self.priority = priority
+
+    @classmethod
+    def create_instance(cls, crawler):
+        filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
+        o = cls(
+            crawler=crawler,
+            dupe_filter=filter_cls.create_instance(crawler),
+            stats=crawler.stats,
+            log_level=crawler.settings.get('LOG_LEVEL'),
+            priority=crawler.settings.get('DEPTH_PRIORITY')
+        )
+        return o
+
+    async def open(self):
+        """Initialize the scheduler and its queue"""
+        self.logger.info("Initializing scheduler...")
+        try:
+            # Create the queue configuration
+            queue_config = QueueConfig.from_settings(self.crawler.settings)
+
+            # Create the queue manager
+            self.queue_manager = QueueManager(queue_config)
+
+            # Initialize the queue
+            self.logger.info("Initializing queue manager...")
+            needs_config_update = await self.queue_manager.initialize()
+
+            self.logger.info(f"Queue initialized, needs_config_update: {needs_config_update}")
+            self.logger.info(f"Current queue type: {self.queue_manager._queue_type}")
+
+            # Check whether the filter configuration needs to be updated
+            if needs_config_update:
+                # True means the queue type changed; inspect the current type to decide which way to update
+                self.logger.info("Configuration update required...")
+                if self.queue_manager._queue_type == QueueType.REDIS:
+                    self.logger.info("Updating to the Redis configuration...")
+                    self._update_filter_config_for_redis()
+                else:
+                    self.logger.info("Updating to the memory configuration...")
+                    self._update_filter_config_if_needed()
+            else:
+                # Check whether the configuration needs updating even if the queue manager did not ask for it
+                self.logger.debug("Checking whether the configuration needs updating...")
+                if self.queue_manager._queue_type == QueueType.REDIS:
+                    # Check whether the current filter is the memory filter
+                    current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+                    if 'memory_filter' in current_filter_class:
+                        self.logger.info("Detected that an update to the Redis configuration is needed...")
+                        self._update_filter_config_for_redis()
+                elif self.queue_manager._queue_type == QueueType.MEMORY:
+                    # Check whether the current filter is a Redis filter
+                    current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+                    if 'aioredis_filter' in current_filter_class or 'redis_filter' in current_filter_class:
+                        self.logger.info("Detected that an update to the memory configuration is needed...")
+                        self._update_filter_config_if_needed()
+
+            # Only re-create the filter instance when the configuration really needs updating
+            # Check whether a configuration update actually took place
+            filter_updated = (
+                (self.queue_manager._queue_type == QueueType.REDIS and 'memory_filter' in self.crawler.settings.get('FILTER_CLASS', '')) or
+                (self.queue_manager._queue_type == QueueType.MEMORY and ('aioredis_filter' in self.crawler.settings.get('FILTER_CLASS', '') or 'redis_filter' in self.crawler.settings.get('FILTER_CLASS', '')))
+            )
+
+            if needs_config_update or filter_updated:
+                # Re-create the filter instance so it uses the updated configuration
+                self.logger.debug("Re-creating the filter instance...")
+                filter_cls = load_class(self.crawler.settings.get('FILTER_CLASS'))
+                self.dupe_filter = filter_cls.create_instance(self.crawler)
+                self.logger.info(f"✅ Filter instance updated to: {type(self.dupe_filter).__name__}")
+            else:
+                self.logger.debug("Filter configuration unchanged; skipping re-creation")
+
+            # Log queue status and configuration information
+            status = self.queue_manager.get_status()
+            current_filter = self.crawler.settings.get('FILTER_CLASS')
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
+
+            self.logger.info(f'Queue type: {status["type"]}, status: {status["health"]}')
+            self.logger.info(f'Current filter: {type(self.dupe_filter).__name__} ({current_filter})')
+            self.logger.info(f'Current dedup pipeline: {current_dedup_pipeline}')
+            self.logger.info("Scheduler initialization complete")
+        except Exception as e:
+            self.logger.error(f"❌ Scheduler initialization failed: {e}")
+            self.logger.debug(f"Detailed error:\n{traceback.format_exc()}")
+            raise
+
+    def _update_filter_config_if_needed(self):
+        """Update the filter configuration when the queue type switches to memory mode"""
+        if self.queue_manager and self.queue_manager._queue_type == QueueType.MEMORY:
+            # Check whether the current filter is a Redis filter
+            current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+            if 'aioredis_filter' in current_filter_class or 'redis_filter' in current_filter_class:
+                # Switch to the memory filter
+                self.crawler.settings.set('FILTER_CLASS', 'crawlo.filters.memory_filter.MemoryFilter')
+                self.logger.info("✅ Filter configuration updated to memory mode")
+
+            # Check whether the current dedup pipeline is the Redis dedup pipeline
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+            if 'redis_dedup_pipeline' in current_dedup_pipeline:
+                # Switch to the memory dedup pipeline
+                self.crawler.settings.set('DEFAULT_DEDUP_PIPELINE', 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline')
+                # Also update the dedup pipeline in the PIPELINES list
+                pipelines = self.crawler.settings.get('PIPELINES', [])
+                if current_dedup_pipeline in pipelines:
+                    # Find and replace the Redis dedup pipeline with the memory dedup pipeline
+                    index = pipelines.index(current_dedup_pipeline)
+                    pipelines[index] = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
+                    self.crawler.settings.set('PIPELINES', pipelines)
+                self.logger.info("✅ Dedup pipeline configuration updated to memory mode")
+
+    def _update_filter_config_for_redis(self):
+        """Update the filter configuration to the Redis implementation when the queue type is Redis"""
+        if self.queue_manager and self.queue_manager._queue_type == QueueType.REDIS:
+            # Check whether the current filter is the memory filter
+            current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+            if 'memory_filter' in current_filter_class:
+                # Switch to the Redis filter
+                self.crawler.settings.set('FILTER_CLASS', 'crawlo.filters.aioredis_filter.AioRedisFilter')
+                self.logger.info("✅ Filter configuration updated to Redis mode")
+
+            # Check whether the current dedup pipeline is the memory dedup pipeline
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+            if 'memory_dedup_pipeline' in current_dedup_pipeline:
+                # Switch to the Redis dedup pipeline
+                self.crawler.settings.set('DEFAULT_DEDUP_PIPELINE', 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline')
+                # Also update the dedup pipeline in the PIPELINES list
+                pipelines = self.crawler.settings.get('PIPELINES', [])
+                if current_dedup_pipeline in pipelines:
+                    # Find and replace the memory dedup pipeline with the Redis dedup pipeline
+                    index = pipelines.index(current_dedup_pipeline)
+                    pipelines[index] = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
+                    self.crawler.settings.set('PIPELINES', pipelines)
+                self.logger.info("✅ Dedup pipeline configuration updated to Redis mode")
+
+    async def next_request(self):
+        """Get the next request"""
+        if not self.queue_manager:
+            return None
+
+        try:
+            request = await self.queue_manager.get()
+
+            # Restore the callback (when the request comes out of the Redis queue)
+            if request:
+                spider = getattr(self.crawler, 'spider', None)
+                request = self.request_serializer.restore_after_deserialization(request, spider)
+
+            return request
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="Failed to get next request",
+                raise_error=False
+            )
+            return None
+
+    async def enqueue_request(self, request):
+        """Put a request into the queue"""
+        if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
+            self.dupe_filter.log_stats(request)
+            return False
+
+        if not self.queue_manager:
+            self.logger.error("Queue manager is not initialized")
+            return False
+
+        set_request(request, self.priority)
+
+        try:
+            # Use the unified queue interface
+            success = await self.queue_manager.put(request, priority=getattr(request, 'priority', 0))
+
+            if success:
+                self.logger.debug(f"✅ Request enqueued: {request.url}")
+
+            return success
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="Failed to enqueue request",
+                raise_error=False
+            )
+            return False
+
+    def idle(self) -> bool:
+        """Check whether the queue is empty"""
+        return len(self) == 0
+
+    async def async_idle(self) -> bool:
+        """Asynchronously check whether the queue is empty (more accurate)"""
+        if not self.queue_manager:
+            return True
+        # Use the queue manager's asynchronous empty method
+        return await self.queue_manager.async_empty()
+
+    async def close(self):
+        """Close the scheduler"""
+        try:
+            if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
+                await closed()
+
+            if self.queue_manager:
+                await self.queue_manager.close()
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="Failed to close scheduler",
+                raise_error=False
+            )
+
+    def __len__(self):
+        """Get the queue size"""
+        if not self.queue_manager:
+            return 0
+        # Return a synchronous approximation; the exact size requires an asynchronous call
+        return 0 if self.queue_manager.empty() else 1
```
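Taken together, the new scheduler hides the duplicate filter and the queue manager behind a small request interface. A hypothetical usage sketch of that lifecycle, assuming a `crawler` object with `settings`, `stats`, and `spider` attributes and an iterable of Crawlo `Request` objects (neither is defined here):

```python
import asyncio

from crawlo.core.scheduler import Scheduler


async def drain(crawler, requests):
    """Enqueue some requests, then pull them back out until the queue is idle."""
    scheduler = Scheduler.create_instance(crawler)
    await scheduler.open()                         # picks the queue type and a matching filter
    for request in requests:
        await scheduler.enqueue_request(request)   # returns False for filtered duplicates
    while not await scheduler.async_idle():
        request = await scheduler.next_request()
        if request is not None:
            print("next:", request.url)
    await scheduler.close()

# asyncio.run(drain(crawler, requests))  # run with a real crawler and Request objects
```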
crawlo/filters/aioredis_filter.py

```diff
@@ -1,18 +1,6 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-Redis filter implementation
-=================
-Provides Redis-based distributed request deduplication.
-
-Features:
-- Distributed support: multiple nodes share deduplication data
-- TTL support: automatic expiry and cleanup
-- High performance: optimized with Redis pipelines
-- Fault tolerance: automatic retry on network errors
-"""
-import redis.asyncio as aioredis
 from typing import Optional
+import redis.asyncio as aioredis
+
 from crawlo.filters import BaseFilter
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import request_fingerprint
@@ -70,6 +58,9 @@ class AioRedisFilter(BaseFilter):
         # Performance counters
         self._redis_operations = 0
         self._pipeline_operations = 0
+
+        # Connection-state flag, so a Redis connection that already failed is not retried repeatedly
+        self._connection_failed = False
 
     @classmethod
     def create_instance(cls, crawler) -> 'BaseFilter':
@@ -123,8 +114,17 @@ class AioRedisFilter(BaseFilter):
 
     async def _get_redis_client(self):
         """Get the Redis client instance (lazy initialization)"""
+        # If a previous connection attempt failed, return None immediately
+        if self._connection_failed:
+            return None
+
         if self.redis is None and self._redis_pool is not None:
-            ...
+            try:
+                self.redis = await self._redis_pool.get_connection()
+            except Exception as e:
+                self._connection_failed = True
+                self.logger.error(f"Redis connection failed, falling back to local deduplication: {e}")
+                return None
         return self.redis
 
     async def requested(self, request) -> bool:
@@ -136,13 +136,17 @@ class AioRedisFilter(BaseFilter):
         """
         try:
             # Make sure the Redis client is initialized
-            await self._get_redis_client()
+            redis_client = await self._get_redis_client()
+
+            # If Redis is unavailable, return False (not a duplicate) so requests are not lost
+            if redis_client is None:
+                return False
 
             fp = str(request_fingerprint(request))
             self._redis_operations += 1
 
             # Use a pipeline to optimize performance
-            pipe =
+            pipe = redis_client.pipeline()
             pipe.sismember(self.redis_key, fp)
 
             results = await pipe.execute()
@@ -173,12 +177,16 @@ class AioRedisFilter(BaseFilter):
         """
         try:
             # Make sure the Redis client is initialized
-            await self._get_redis_client()
+            redis_client = await self._get_redis_client()
+
+            # If Redis is unavailable, return False to indicate the add failed
+            if redis_client is None:
+                return False
 
             fp = str(fp)
 
             # Use a pipeline to optimize performance
-            pipe =
+            pipe = redis_client.pipeline()
             pipe.sadd(self.redis_key, fp)
 
             if self.ttl and self.ttl > 0:
@@ -197,85 +205,30 @@ class AioRedisFilter(BaseFilter):
         except Exception as e:
             self.logger.error(f"Failed to add fingerprint: {fp[:20]}... - {e}")
             return False
-
-    def __contains__(self,
+
+    async def __contains__(self, fp: str) -> bool:
         """
-        ...
+        Check whether a fingerprint exists in the Redis set
 
-        :param
-        :return:
+        :param fp: request fingerprint string
+        :return: whether it exists
         """
-        # This is a synchronous method and cannot call asynchronous Redis operations directly
-        # Use the requested() method instead
-        raise NotImplementedError("Please use the requested() method for asynchronous checks")
-
-    async def get_stats(self) -> dict:
-        """Get detailed filter statistics"""
         try:
             # Make sure the Redis client is initialized
-            await self._get_redis_client()
-
-            count = await self.redis.scard(self.redis_key)
+            redis_client = await self._get_redis_client()
 
-            #
-            ...
-            remaining_ttl = await self.redis.ttl(self.redis_key)
-            if remaining_ttl > 0:
-                ttl_info = f"remaining {remaining_ttl} seconds"
-            else:
-                ttl_info = f"configured {self.ttl} seconds"
-
-            stats = {
-                'filter_type': 'AioRedisFilter',
-                'Total fingerprints': count,
-                'Redis key': self.redis_key,
-                'TTL config': ttl_info,
-                'Redis operations': self._redis_operations,
-                'Pipeline operations': self._pipeline_operations,
-                'Optimization rate': f"{self._pipeline_operations / max(1, self._redis_operations) * 100:.1f}%"
-            }
-
-            # Merge base-class statistics
-            base_stats = super().get_stats()
-            stats.update(base_stats)
-
-            return stats
+            # If Redis is unavailable, return False (not present)
+            if redis_client is None:
+                return False
 
+            # Check whether the fingerprint exists
+            exists = await redis_client.sismember(self.redis_key, str(fp))
+            return exists
         except Exception as e:
-            self.logger.error(f"
-
-    async def clear_all(self) -> int:
-        """Clear all fingerprint data"""
-        try:
-            # Make sure the Redis client is initialized
-            await self._get_redis_client()
-
-            deleted = await self.redis.delete(self.redis_key)
-            self.logger.info(f"Fingerprints cleared: {deleted}")
-            return deleted
-        except Exception as e:
-            self.logger.error("Failed to clear fingerprints")
-            raise
+            self.logger.error(f"Failed to check fingerprint existence: {fp[:20]}... - {e}")
+            # Return False on network errors so requests are not lost
+            return False
 
-    async def closed(self, reason: Optional[str] = None) -> None:
-        """Cleanup when the spider closes"""
-        try:
-            # Make sure the Redis client is initialized
-            await self._get_redis_client()
-
-            if self.cleanup_fp:
-                deleted = await self.redis.delete(self.redis_key)
-                self.logger.info(f"Spider shutdown cleanup: deleted {deleted} fingerprints")
-            else:
-                count = await self.redis.scard(self.redis_key)
-                ttl_info = f"{self.ttl}s" if self.ttl else "persistent"
-                self.logger.info(f"Fingerprints kept: {count} (TTL: {ttl_info})")
-        finally:
-            await self._close_redis()
 
-
-
-        # The connection pool manages connections automatically; no explicit close is needed here
-        self.logger.debug("Redis connection released")
+# Exported explicitly for compatibility
+__all__ = ['AioRedisFilter']
```
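The recurring change in this filter is a graceful-degradation pattern: every Redis call first asks `_get_redis_client()`, and once a connection attempt has failed the client stays `None` and lookups report "not seen" instead of raising, so requests are not lost. A condensed sketch of that pattern (the class and key names are illustrative, not the package's API):

```python
import redis.asyncio as aioredis


class DegradingDedup:
    """Deduplicate via a Redis set, but fall back to 'never seen' if Redis is down."""

    def __init__(self, url: str, key: str = "example:fingerprints"):
        self._url = url
        self._key = key
        self._client = None
        self._connection_failed = False

    async def _get_client(self):
        if self._connection_failed:
            return None            # do not keep retrying a dead connection
        if self._client is None:
            try:
                self._client = aioredis.from_url(self._url)
                await self._client.ping()
            except Exception:
                self._connection_failed = True
                return None
        return self._client

    async def seen(self, fingerprint: str) -> bool:
        client = await self._get_client()
        if client is None:
            return False           # Redis unavailable: treat as new so requests are not dropped
        return bool(await client.sismember(self._key, fingerprint))
```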
crawlo/queue/queue_manager.py

```diff
@@ -4,11 +4,11 @@
 Unified queue manager
 Provides a clean, consistent queue interface and handles the differences between queue types automatically
 """
-
-from enum import Enum
+import os
 import asyncio
 import traceback
-import
+from typing import Optional, Dict, Any, Union
+from enum import Enum
 
 from crawlo.utils.log import get_logger
 from crawlo.utils.request_serializer import RequestSerializer
@@ -103,11 +103,24 @@ class QueueManager:
             self._queue_type = queue_type
 
             # Test queue health
-            await self._health_check()
+            health_check_result = await self._health_check()
 
             self.logger.info(f"✅ Queue initialized successfully: {queue_type.value}")
-
-
+            # Log the detailed configuration only in debug mode
+            self.logger.debug(f"📊 Queue configuration: {self._get_queue_info()}")
+
+            # If the health check returned True, the queue type was switched and the configuration must be updated
+            if health_check_result:
+                return True
+
+            # If the queue type is Redis, check whether the configuration needs updating
+            if queue_type == QueueType.REDIS:
+                # This check has to happen in the scheduler, because the queue manager cannot access crawler.settings
+                # But we do not always return True; only when an update is really needed
+                # The scheduler performs the more detailed check
+                pass
+
+            return False  # by default no configuration update is needed
 
         except Exception as e:
             # Log the detailed error message and stack trace
@@ -265,7 +278,15 @@ class QueueManager:
                 raise RuntimeError("Redis queue unavailable: the redis dependency is not installed")
             if not self.config.redis_url:
                 raise RuntimeError("Redis queue unavailable: REDIS_URL is not configured")
-            ...
+            # Test the Redis connection
+            try:
+                test_queue = RedisPriorityQueue(self.config.redis_url)
+                await test_queue.connect()
+                await test_queue.close()
+                return QueueType.REDIS
+            except Exception as e:
+                # If Redis is forced but the connection fails, raise an exception
+                raise RuntimeError(f"Redis queue unavailable: cannot connect to Redis ({e})")
 
         elif self.config.queue_type == QueueType.MEMORY:
             return QueueType.MEMORY
@@ -307,7 +328,7 @@ class QueueManager:
         else:
             raise ValueError(f"Unsupported queue type: {queue_type}")
 
-    async def _health_check(self) ->
+    async def _health_check(self) -> bool:
         """Health check"""
         try:
             if self._queue_type == QueueType.REDIS:
@@ -317,9 +338,27 @@ class QueueManager:
             else:
                 # The memory queue is always healthy
                 self._health_status = "healthy"
+                return False  # the memory queue never requires a configuration update
         except Exception as e:
             self.logger.warning(f"Queue health check failed: {e}")
             self._health_status = "unhealthy"
+            # If this is a Redis queue and the health check failed, try switching to the memory queue
+            if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
+                self.logger.info("Redis queue unavailable, trying to switch to the memory queue...")
+                try:
+                    await self._queue.close()
+                except:
+                    pass
+                self._queue = None
+                # Re-create the queue as a memory queue
+                self._queue = await self._create_queue(QueueType.MEMORY)
+                self._queue_type = QueueType.MEMORY
+                self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
+                self._health_status = "healthy"
+                self.logger.info("✅ Switched to the memory queue")
+                # Return a signal indicating that the filter and dedup pipeline configuration must be updated
+                return True
+            return False
 
     def _get_queue_info(self) -> Dict[str, Any]:
         """Get queue configuration information"""
```
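With these changes, `initialize()` doubles as a signal: it returns True only when the queue silently fell back from Redis to memory, telling the scheduler to swap the filter and dedup pipeline as well. A toy sketch of that try-Redis-then-fall-back contract (the function and its return convention are illustrative, not the package's API):

```python
import asyncio
from typing import Optional

import redis.asyncio as aioredis


async def pick_queue(redis_url: Optional[str]):
    """Try Redis first; fall back to an in-process queue and report the downgrade."""
    if redis_url:
        try:
            client = aioredis.from_url(redis_url)
            await client.ping()          # same health probe the new resolution logic performs
            return client, False         # False: Redis is healthy, keep the Redis filter/pipeline
        except Exception:
            pass
    return asyncio.Queue(), True         # True: caller should switch to the memory filter/pipeline

# queue, needs_config_update = asyncio.run(pick_queue("redis://localhost:6379"))
```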
crawlo/queue/redis_priority_queue.py

```diff
@@ -77,7 +77,13 @@ class RedisPriorityQueue:
         """Asynchronously connect to Redis, with retry support"""
         async with self._lock:
             if self._redis is not None:
-                ...
+                # If already connected, test whether the connection is still valid
+                try:
+                    await self._redis.ping()
+                    return self._redis
+                except Exception:
+                    # The connection went stale; reconnect
+                    self._redis = None
 
             for attempt in range(max_retries):
                 try:
@@ -97,7 +103,8 @@
 
                     # Test the connection
                     await self._redis.ping()
-
+                    # Log detailed connection information only in debug mode
+                    logger.debug(f"✅ Redis connection established (Module: {self.module_name})")
                     return self._redis
                 except Exception as e:
                     error_msg = f"⚠️ Redis connection failed (attempt {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"
```