crawlo 1.3.5__tar.gz → 1.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- {crawlo-1.3.5/crawlo.egg-info → crawlo-1.3.7}/PKG-INFO +74 -1
- {crawlo-1.3.5 → crawlo-1.3.7}/README.md +73 -0
- crawlo-1.3.7/crawlo/__version__.py +1 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/framework.py +3 -2
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/queue/queue_manager.py +26 -7
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/queue/redis_priority_queue.py +43 -2
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/settings/default_settings.py +8 -8
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings.py.tmpl +3 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_distributed.py.tmpl +3 -0
- {crawlo-1.3.5 → crawlo-1.3.7/crawlo.egg-info}/PKG-INFO +74 -1
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/SOURCES.txt +4 -0
- crawlo-1.3.7/tests/simple_queue_type_test.py +42 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_crawlo_proxy_integration.py +1 -1
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_double_crawlo_fix.py +10 -13
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_queue_manager_redis_key.py +39 -36
- crawlo-1.3.7/tests/test_queue_naming.py +155 -0
- crawlo-1.3.7/tests/test_queue_type.py +107 -0
- crawlo-1.3.7/tests/test_redis_queue_name_fix.py +176 -0
- crawlo-1.3.5/crawlo/__version__.py +0 -1
- {crawlo-1.3.5 → crawlo-1.3.7}/LICENSE +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/MANIFEST.in +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/cli.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/check.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/help.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/list.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/run.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/stats.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/utils.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/config_validator.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/core/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/core/engine.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/core/processor.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/core/scheduler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/crawler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/data/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/data/user_agents.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/event.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/exceptions.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/logging_extension.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/factories/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/factories/base.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/factories/crawler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/factories/registry.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/filters/aioredis_filter.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/built_in.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/context.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/core.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/phases.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/registry.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/items/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/items/base.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/items/fields.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/items/items.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/logging/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/logging/config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/logging/factory.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/logging/manager.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/middleware_manager.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/offsite.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/simple_proxy.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/mode_manager.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/network/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/network/request.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/network/response.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/mysql_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/pipeline_manager.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/project.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/settings/setting_manager.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/stats_collector.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/subscriber.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/task_manager.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/run.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/spiders_init.py.tmpl +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/data_formatter.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/encoding_converter.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/network_diagnostic.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/request_tools.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/text_cleaner.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/class_loader.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/error_handler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/log.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/redis_connection_pool.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/request.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/system.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/tools.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/url.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/examples/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/pyproject.toml +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/requirements.txt +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/setup.cfg +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/__init__.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/baidu_performance_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/baidu_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/cleaners_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/comprehensive_framework_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/comprehensive_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/comprehensive_testing_summary.md +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/config_validation_demo.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/date_tools_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_configure.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_framework_logger.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_log_config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_log_levels.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_pipelines.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/detailed_log_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/distributed_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/distributed_test_debug.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/env_config_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/error_handling_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/final_command_test_report.md +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/final_comprehensive_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/final_log_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/final_validation_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/fix_log_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/framework_performance_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/log_buffering_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/log_generation_timing_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/optimized_performance_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/performance_comparison.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/queue_blocking_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/queue_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/request_params_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/response_improvements_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/scrapy_comparison/ofweek_scrapy.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/scrapy_comparison/scrapy_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_command_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_crawlo_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_log_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_log_test2.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_optimization_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_spider_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/spider_log_timing_test.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_all_commands.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_batch_processor.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_cleaners.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_component_factory.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_comprehensive.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_config_consistency.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_config_merge.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_config_validator.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_controlled_spider_mixin.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_date_tools.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_default_header_middleware.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_distributed.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_download_delay_middleware.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_downloader_proxy_compatibility.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_edge_cases.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_enhanced_error_handler_comprehensive.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_env_config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_factories.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_final_validation.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_framework_logger.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_framework_startup.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_get_component_logger.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_integration.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_large_scale_config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_large_scale_helper.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_logging_system.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_mode_change.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_mode_consistency.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_offsite_middleware.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_parsel.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_performance.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_performance_monitor.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_api.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_middleware.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_middleware_enhanced.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_middleware_refactored.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_queue_empty_check.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_random_user_agent.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_real_scenario_proxy.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_config.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_queue.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_request_ignore_middleware.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_request_params.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_request_serialization.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_response_code_middleware.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_response_filter_middleware.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_response_improvements.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_retry_middleware.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_scheduler.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_scheduler_config_update.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_simple_response.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_template_content.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_tools.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_user_agents.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/tools_example.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/untested_features_report.md +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/verify_debug.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/verify_distributed.py +0 -0
- {crawlo-1.3.5 → crawlo-1.3.7}/tests/verify_log_fix.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: crawlo
|
|
3
|
-
Version: 1.3.
|
|
3
|
+
Version: 1.3.7
|
|
4
4
|
Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
|
|
5
5
|
Home-page: https://github.com/crawl-coder/Crawlo.git
|
|
6
6
|
Author: crawl-coder
|
|
@@ -630,6 +630,51 @@ Crawlo支持三种队列类型,可通过`QUEUE_TYPE`配置项设置:
|
|
|
630
630
|
|
|
631
631
|
推荐使用`auto`模式,让框架根据环境自动选择最适合的队列类型。
|
|
632
632
|
|
|
633
|
+
#### Redis Key 命名规范
|
|
634
|
+
|
|
635
|
+
在分布式模式下,Crawlo框架使用Redis作为队列和去重存储。为了确保不同项目和爬虫之间的数据隔离,框架采用统一的Redis Key命名规范:
|
|
636
|
+
|
|
637
|
+
##### 默认命名规则
|
|
638
|
+
Redis Key遵循以下命名格式:`crawlo:{PROJECT_NAME}:{component}:{identifier}`
|
|
639
|
+
|
|
640
|
+
其中:
|
|
641
|
+
- `PROJECT_NAME`:项目名称,用于区分不同项目
|
|
642
|
+
- `component`:组件类型,如`queue`、`filter`、`item`
|
|
643
|
+
- `identifier`:具体标识符,如`requests`、`processing`、`failed`、`fingerprint`
|
|
644
|
+
|
|
645
|
+
##### 具体Key格式
|
|
646
|
+
1. **请求队列**:`crawlo:{PROJECT_NAME}:queue:requests`
|
|
647
|
+
- 用于存储待处理的请求任务
|
|
648
|
+
|
|
649
|
+
2. **处理中队列**:`crawlo:{PROJECT_NAME}:queue:processing`
|
|
650
|
+
- 用于存储正在处理的请求任务
|
|
651
|
+
|
|
652
|
+
3. **失败队列**:`crawlo:{PROJECT_NAME}:queue:failed`
|
|
653
|
+
- 用于存储处理失败的请求任务
|
|
654
|
+
|
|
655
|
+
4. **请求去重**:`crawlo:{PROJECT_NAME}:filter:fingerprint`
|
|
656
|
+
- 用于存储请求URL的指纹,实现去重功能
|
|
657
|
+
|
|
658
|
+
5. **数据项去重**:`crawlo:{PROJECT_NAME}:item:fingerprint`
|
|
659
|
+
- 用于存储数据项的指纹,防止重复存储
|
|
660
|
+
|
|
661
|
+
##### 自定义队列名称
|
|
662
|
+
用户可以通过`SCHEDULER_QUEUE_NAME`配置项自定义请求队列名称。处理中队列和失败队列会基于请求队列名称自动生成:
|
|
663
|
+
- 处理中队列:将`:queue:requests`替换为`:queue:processing`
|
|
664
|
+
- 失败队列:将`:queue:requests`替换为`:queue:failed`
|
|
665
|
+
|
|
666
|
+
示例配置:
|
|
667
|
+
```python
|
|
668
|
+
# settings.py
|
|
669
|
+
SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
|
|
670
|
+
```
|
|
671
|
+
|
|
672
|
+
##### 命名规范优势
|
|
673
|
+
1. **命名空间隔离**:通过项目名称实现不同项目间的数据隔离
|
|
674
|
+
2. **组件分类清晰**:通过组件类型区分不同功能模块
|
|
675
|
+
3. **易于监控和管理**:统一的命名格式便于Redis监控和管理
|
|
676
|
+
4. **防止命名冲突**:避免不同项目或组件间的Key冲突
|
|
677
|
+
|
|
633
678
|
<!-- 配置系统 section -->
|
|
634
679
|
<h2 align="center">🎛️ 配置系统</h2>
|
|
635
680
|
|
|
@@ -1095,6 +1140,34 @@ asyncio.run(process.crawl('my_spider_name'))
|
|
|
1095
1140
|
|
|
1096
1141
|
---
|
|
1097
1142
|
|
|
1143
|
+
<!-- Redis键名修复说明 section -->
|
|
1144
|
+
<h2 align="center">🔧 Redis键名修复说明</h2>
|
|
1145
|
+
|
|
1146
|
+
在早期版本中,Crawlo框架存在Redis队列键名生成的双重前缀问题。具体表现为:
|
|
1147
|
+
|
|
1148
|
+
- **问题现象**:Redis队列键名出现双重"crawlo"前缀,如`crawlo:crawlo:queue:requests`而不是正确的`crawlo:{project_name}:queue:requests`
|
|
1149
|
+
- **影响范围**:影响分布式模式下的请求队列、处理队列和失败队列的正确识别和使用
|
|
1150
|
+
- **根本原因**:队列管理器中的项目名称提取逻辑未能正确处理不同格式的队列名称
|
|
1151
|
+
|
|
1152
|
+
**修复内容**:
|
|
1153
|
+
|
|
1154
|
+
1. **队列管理器优化**:
|
|
1155
|
+
- 改进了[QueueConfig.from_settings](file:///Users/oscar/projects/Crawlo/crawlo/queue/queue_manager.py#L148-L180)方法,使其在`SCHEDULER_QUEUE_NAME`未设置时能正确使用基于项目名称的默认值
|
|
1156
|
+
- 修复了队列管理器中从队列名称提取项目名称的逻辑,确保能正确处理各种前缀情况
|
|
1157
|
+
|
|
1158
|
+
2. **Redis队列实现改进**:
|
|
1159
|
+
- 在[RedisPriorityQueue](file:///Users/oscar/projects/Crawlo/crawlo/queue/redis_priority_queue.py#L39-L76)中添加了`_normalize_queue_name`方法来规范化队列名称
|
|
1160
|
+
- 处理了多重"crawlo"前缀的情况,确保队列名称符合统一规范
|
|
1161
|
+
|
|
1162
|
+
3. **配置文件调整**:
|
|
1163
|
+
- 将`SCHEDULER_QUEUE_NAME`设置为注释状态,提供更大的配置灵活性
|
|
1164
|
+
- 在所有模板和示例项目的配置文件中保持了一致性
|
|
1165
|
+
|
|
1166
|
+
**验证测试**:
|
|
1167
|
+
通过专门的测试脚本验证了修复效果,确保在各种队列命名情况下都能正确生成和识别Redis键名。
|
|
1168
|
+
|
|
1169
|
+
---
|
|
1170
|
+
|
|
1098
1171
|
<!-- 文档 section -->
|
|
1099
1172
|
<h2 align="center">📚 文档</h2>
|
|
1100
1173
|
|
|
@@ -580,6 +580,51 @@ Crawlo支持三种队列类型,可通过`QUEUE_TYPE`配置项设置:
|
|
|
580
580
|
|
|
581
581
|
推荐使用`auto`模式,让框架根据环境自动选择最适合的队列类型。
|
|
582
582
|
|
|
583
|
+
#### Redis Key 命名规范
|
|
584
|
+
|
|
585
|
+
在分布式模式下,Crawlo框架使用Redis作为队列和去重存储。为了确保不同项目和爬虫之间的数据隔离,框架采用统一的Redis Key命名规范:
|
|
586
|
+
|
|
587
|
+
##### 默认命名规则
|
|
588
|
+
Redis Key遵循以下命名格式:`crawlo:{PROJECT_NAME}:{component}:{identifier}`
|
|
589
|
+
|
|
590
|
+
其中:
|
|
591
|
+
- `PROJECT_NAME`:项目名称,用于区分不同项目
|
|
592
|
+
- `component`:组件类型,如`queue`、`filter`、`item`
|
|
593
|
+
- `identifier`:具体标识符,如`requests`、`processing`、`failed`、`fingerprint`
|
|
594
|
+
|
|
595
|
+
##### 具体Key格式
|
|
596
|
+
1. **请求队列**:`crawlo:{PROJECT_NAME}:queue:requests`
|
|
597
|
+
- 用于存储待处理的请求任务
|
|
598
|
+
|
|
599
|
+
2. **处理中队列**:`crawlo:{PROJECT_NAME}:queue:processing`
|
|
600
|
+
- 用于存储正在处理的请求任务
|
|
601
|
+
|
|
602
|
+
3. **失败队列**:`crawlo:{PROJECT_NAME}:queue:failed`
|
|
603
|
+
- 用于存储处理失败的请求任务
|
|
604
|
+
|
|
605
|
+
4. **请求去重**:`crawlo:{PROJECT_NAME}:filter:fingerprint`
|
|
606
|
+
- 用于存储请求URL的指纹,实现去重功能
|
|
607
|
+
|
|
608
|
+
5. **数据项去重**:`crawlo:{PROJECT_NAME}:item:fingerprint`
|
|
609
|
+
- 用于存储数据项的指纹,防止重复存储
|
|
610
|
+
|
|
611
|
+
##### 自定义队列名称
|
|
612
|
+
用户可以通过`SCHEDULER_QUEUE_NAME`配置项自定义请求队列名称。处理中队列和失败队列会基于请求队列名称自动生成:
|
|
613
|
+
- 处理中队列:将`:queue:requests`替换为`:queue:processing`
|
|
614
|
+
- 失败队列:将`:queue:requests`替换为`:queue:failed`
|
|
615
|
+
|
|
616
|
+
示例配置:
|
|
617
|
+
```python
|
|
618
|
+
# settings.py
|
|
619
|
+
SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
|
|
620
|
+
```
|
|
621
|
+
|
|
622
|
+
##### 命名规范优势
|
|
623
|
+
1. **命名空间隔离**:通过项目名称实现不同项目间的数据隔离
|
|
624
|
+
2. **组件分类清晰**:通过组件类型区分不同功能模块
|
|
625
|
+
3. **易于监控和管理**:统一的命名格式便于Redis监控和管理
|
|
626
|
+
4. **防止命名冲突**:避免不同项目或组件间的Key冲突
|
|
627
|
+
|
|
583
628
|
<!-- 配置系统 section -->
|
|
584
629
|
<h2 align="center">🎛️ 配置系统</h2>
|
|
585
630
|
|
|
@@ -1045,6 +1090,34 @@ asyncio.run(process.crawl('my_spider_name'))
|
|
|
1045
1090
|
|
|
1046
1091
|
---
|
|
1047
1092
|
|
|
1093
|
+
<!-- Redis键名修复说明 section -->
|
|
1094
|
+
<h2 align="center">🔧 Redis键名修复说明</h2>
|
|
1095
|
+
|
|
1096
|
+
在早期版本中,Crawlo框架存在Redis队列键名生成的双重前缀问题。具体表现为:
|
|
1097
|
+
|
|
1098
|
+
- **问题现象**:Redis队列键名出现双重"crawlo"前缀,如`crawlo:crawlo:queue:requests`而不是正确的`crawlo:{project_name}:queue:requests`
|
|
1099
|
+
- **影响范围**:影响分布式模式下的请求队列、处理队列和失败队列的正确识别和使用
|
|
1100
|
+
- **根本原因**:队列管理器中的项目名称提取逻辑未能正确处理不同格式的队列名称
|
|
1101
|
+
|
|
1102
|
+
**修复内容**:
|
|
1103
|
+
|
|
1104
|
+
1. **队列管理器优化**:
|
|
1105
|
+
- 改进了[QueueConfig.from_settings](file:///Users/oscar/projects/Crawlo/crawlo/queue/queue_manager.py#L148-L180)方法,使其在`SCHEDULER_QUEUE_NAME`未设置时能正确使用基于项目名称的默认值
|
|
1106
|
+
- 修复了队列管理器中从队列名称提取项目名称的逻辑,确保能正确处理各种前缀情况
|
|
1107
|
+
|
|
1108
|
+
2. **Redis队列实现改进**:
|
|
1109
|
+
- 在[RedisPriorityQueue](file:///Users/oscar/projects/Crawlo/crawlo/queue/redis_priority_queue.py#L39-L76)中添加了`_normalize_queue_name`方法来规范化队列名称
|
|
1110
|
+
- 处理了多重"crawlo"前缀的情况,确保队列名称符合统一规范
|
|
1111
|
+
|
|
1112
|
+
3. **配置文件调整**:
|
|
1113
|
+
- 将`SCHEDULER_QUEUE_NAME`设置为注释状态,提供更大的配置灵活性
|
|
1114
|
+
- 在所有模板和示例项目的配置文件中保持了一致性
|
|
1115
|
+
|
|
1116
|
+
**验证测试**:
|
|
1117
|
+
通过专门的测试脚本验证了修复效果,确保在各种队列命名情况下都能正确生成和识别Redis键名。
|
|
1118
|
+
|
|
1119
|
+
---
|
|
1120
|
+
|
|
1048
1121
|
<!-- 文档 section -->
|
|
1049
1122
|
<h2 align="center">📚 文档</h2>
|
|
1050
1123
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '1.3.7'
|
|
@@ -57,9 +57,10 @@ class CrawloFramework:
|
|
|
57
57
|
|
|
58
58
|
self._logger.info(f"Crawlo Framework Started {version}")
|
|
59
59
|
|
|
60
|
-
#
|
|
60
|
+
# 获取运行模式和队列类型并记录日志
|
|
61
61
|
run_mode = self._settings.get('RUN_MODE', 'unknown')
|
|
62
|
-
self.
|
|
62
|
+
queue_type = self._settings.get('QUEUE_TYPE', 'unknown')
|
|
63
|
+
self._logger.info(f"RunMode: {run_mode}, QueueType: {queue_type}")
|
|
63
64
|
|
|
64
65
|
# 记录项目名称
|
|
65
66
|
project_name = self._settings.get('PROJECT_NAME', 'unknown')
|
|
@@ -146,6 +146,17 @@ class QueueConfig:
|
|
|
146
146
|
@classmethod
|
|
147
147
|
def from_settings(cls, settings) -> 'QueueConfig':
|
|
148
148
|
"""Create configuration from settings"""
|
|
149
|
+
# 获取项目名称,用于生成默认队列名称
|
|
150
|
+
project_name = settings.get('PROJECT_NAME', 'default')
|
|
151
|
+
default_queue_name = f"crawlo:{project_name}:queue:requests"
|
|
152
|
+
|
|
153
|
+
# 如果设置了SCHEDULER_QUEUE_NAME,则使用该值,否则使用基于项目名称的默认值
|
|
154
|
+
scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
|
|
155
|
+
if scheduler_queue_name is not None:
|
|
156
|
+
queue_name = scheduler_queue_name
|
|
157
|
+
else:
|
|
158
|
+
queue_name = default_queue_name
|
|
159
|
+
|
|
149
160
|
return cls(
|
|
150
161
|
queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
|
|
151
162
|
redis_url=settings.get('REDIS_URL'),
|
|
@@ -153,7 +164,7 @@ class QueueConfig:
|
|
|
153
164
|
redis_port=settings.get_int('REDIS_PORT', 6379),
|
|
154
165
|
redis_password=settings.get('REDIS_PASSWORD'),
|
|
155
166
|
redis_db=settings.get_int('REDIS_DB', 0),
|
|
156
|
-
queue_name=
|
|
167
|
+
queue_name=queue_name,
|
|
157
168
|
max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
|
|
158
169
|
max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
|
|
159
170
|
timeout=settings.get_int('QUEUE_TIMEOUT', 300)
|
|
@@ -423,15 +434,23 @@ class QueueManager:
|
|
|
423
434
|
except ImportError as e:
|
|
424
435
|
raise RuntimeError(f"Redis队列不可用:未能导入RedisPriorityQueue ({e})")
|
|
425
436
|
|
|
426
|
-
#
|
|
437
|
+
# 修复项目名称提取逻辑,严格按照测试文件中的逻辑实现
|
|
427
438
|
project_name = "default"
|
|
428
439
|
if ':' in self.config.queue_name:
|
|
429
440
|
parts = self.config.queue_name.split(':')
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
if
|
|
433
|
-
|
|
434
|
-
|
|
441
|
+
if len(parts) >= 2:
|
|
442
|
+
# 处理可能的双重 crawlo 前缀
|
|
443
|
+
if parts[0] == "crawlo" and parts[1] == "crawlo":
|
|
444
|
+
# 双重 crawlo 前缀,取"crawlo"作为项目名称
|
|
445
|
+
project_name = "crawlo"
|
|
446
|
+
elif parts[0] == "crawlo":
|
|
447
|
+
# 正常的 crawlo 前缀,取第二个部分作为项目名称
|
|
448
|
+
project_name = parts[1]
|
|
449
|
+
else:
|
|
450
|
+
# 没有 crawlo 前缀,使用第一个部分作为项目名称
|
|
451
|
+
project_name = parts[0]
|
|
452
|
+
else:
|
|
453
|
+
project_name = self.config.queue_name or "default"
|
|
435
454
|
else:
|
|
436
455
|
project_name = self.config.queue_name or "default"
|
|
437
456
|
|
|
@@ -63,8 +63,8 @@ class RedisPriorityQueue:
|
|
|
63
63
|
if queue_name is None:
|
|
64
64
|
self.queue_name = f"crawlo:{module_name}:queue:requests"
|
|
65
65
|
else:
|
|
66
|
-
#
|
|
67
|
-
self.queue_name = queue_name
|
|
66
|
+
# 处理多重 crawlo 前缀,规范化队列名称
|
|
67
|
+
self.queue_name = self._normalize_queue_name(queue_name)
|
|
68
68
|
|
|
69
69
|
# 如果未提供 processing_queue,则根据 queue_name 自动生成
|
|
70
70
|
if processing_queue is None:
|
|
@@ -92,6 +92,47 @@ class RedisPriorityQueue:
|
|
|
92
92
|
self._lock = asyncio.Lock() # 用于连接初始化的锁
|
|
93
93
|
self.request_serializer = RequestSerializer() # 处理序列化
|
|
94
94
|
|
|
95
|
+
def _normalize_queue_name(self, queue_name: str) -> str:
|
|
96
|
+
"""
|
|
97
|
+
规范化队列名称,处理多重 crawlo 前缀
|
|
98
|
+
|
|
99
|
+
:param queue_name: 原始队列名称
|
|
100
|
+
:return: 规范化后的队列名称
|
|
101
|
+
"""
|
|
102
|
+
# 如果队列名称已经符合规范(以 crawlo: 开头且不是 crawlo:crawlo:),则保持不变
|
|
103
|
+
if queue_name.startswith("crawlo:") and not queue_name.startswith("crawlo:crawlo:"):
|
|
104
|
+
return queue_name
|
|
105
|
+
|
|
106
|
+
# 处理三重 crawlo 前缀,简化为标准格式
|
|
107
|
+
if queue_name.startswith("crawlo:crawlo:crawlo:"):
|
|
108
|
+
# 三重 crawlo 前缀,简化为标准 crawlo: 格式
|
|
109
|
+
remaining = queue_name[21:] # 去掉 "crawlo:crawlo:crawlo:" 前缀
|
|
110
|
+
if remaining:
|
|
111
|
+
return f"crawlo:{remaining}"
|
|
112
|
+
else:
|
|
113
|
+
return "crawlo:requests" # 默认名称
|
|
114
|
+
|
|
115
|
+
# 处理双重 crawlo 前缀
|
|
116
|
+
elif queue_name.startswith("crawlo:crawlo:"):
|
|
117
|
+
# 双重 crawlo 前缀,简化为标准 crawlo: 格式
|
|
118
|
+
remaining = queue_name[14:] # 去掉 "crawlo:crawlo:" 前缀
|
|
119
|
+
if remaining:
|
|
120
|
+
return f"crawlo:{remaining}"
|
|
121
|
+
else:
|
|
122
|
+
return "crawlo:requests" # 默认名称
|
|
123
|
+
|
|
124
|
+
# 处理无 crawlo 前缀的情况
|
|
125
|
+
elif not queue_name.startswith("crawlo:"):
|
|
126
|
+
# 无 crawlo 前缀,添加 crawlo: 前缀
|
|
127
|
+
if queue_name:
|
|
128
|
+
return f"crawlo:{queue_name}"
|
|
129
|
+
else:
|
|
130
|
+
return "crawlo:requests" # 默认名称
|
|
131
|
+
|
|
132
|
+
# 其他情况,保持不变
|
|
133
|
+
else:
|
|
134
|
+
return queue_name
|
|
135
|
+
|
|
95
136
|
async def connect(self, max_retries=3, delay=1):
|
|
96
137
|
"""异步连接 Redis,支持重试"""
|
|
97
138
|
async with self._lock:
|
|
@@ -60,7 +60,8 @@ REQUEST_GENERATION_INTERVAL = 0.01 # 请求生成间隔(秒)
|
|
|
60
60
|
ENABLE_CONTROLLED_REQUEST_GENERATION = False # 是否启用受控请求生成
|
|
61
61
|
|
|
62
62
|
# 调度器队列名称(遵循统一命名规范)
|
|
63
|
-
|
|
63
|
+
# 当使用Redis队列时,取消注释并设置此值,或在项目配置文件中设置
|
|
64
|
+
# SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
|
|
64
65
|
|
|
65
66
|
# 队列类型:memory/redis/auto
|
|
66
67
|
QUEUE_TYPE = 'auto'
|
|
@@ -97,13 +98,12 @@ if REDIS_PASSWORD:
|
|
|
97
98
|
else:
|
|
98
99
|
REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
|
|
99
100
|
|
|
100
|
-
#
|
|
101
|
-
#
|
|
102
|
-
# crawlo:{PROJECT_NAME}:
|
|
103
|
-
# crawlo:{PROJECT_NAME}:
|
|
104
|
-
# crawlo:{PROJECT_NAME}:queue:
|
|
105
|
-
# crawlo:{PROJECT_NAME}:queue:
|
|
106
|
-
# crawlo:{PROJECT_NAME}:queue:failed (失败队列)
|
|
101
|
+
# Redis key命名规范已封装到框架内部组件中,用户无需手动配置:
|
|
102
|
+
# - 请求去重: crawlo:{PROJECT_NAME}:filter:fingerprint
|
|
103
|
+
# - 数据项去重: crawlo:{PROJECT_NAME}:item:fingerprint
|
|
104
|
+
# - 请求队列: crawlo:{PROJECT_NAME}:queue:requests
|
|
105
|
+
# - 处理中队列: crawlo:{PROJECT_NAME}:queue:processing
|
|
106
|
+
# - 失败队列: crawlo:{PROJECT_NAME}:queue:failed
|
|
107
107
|
|
|
108
108
|
REDIS_TTL = 0 # 指纹过期时间(0 表示永不过期)
|
|
109
109
|
CLEANUP_FP = 0 # 程序结束时是否清理指纹(0=不清理)
|
|
@@ -29,6 +29,9 @@ DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
|
|
|
29
29
|
# ============================== 队列配置 ==============================
|
|
30
30
|
# 队列类型: 'memory', 'redis', 'auto'
|
|
31
31
|
QUEUE_TYPE = 'memory'
|
|
32
|
+
# 当使用Redis队列时,可自定义队列名称
|
|
33
|
+
# 队列名称遵循统一命名规范: crawlo:{PROJECT_NAME}:queue:requests
|
|
34
|
+
# SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
|
|
32
35
|
|
|
33
36
|
# ============================== 去重过滤器 ==============================
|
|
34
37
|
FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
|
|
@@ -28,6 +28,9 @@ DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
|
|
|
28
28
|
|
|
29
29
|
# ============================== 队列配置 ==============================
|
|
30
30
|
QUEUE_TYPE = 'redis'
|
|
31
|
+
# 当使用Redis队列时,可自定义队列名称
|
|
32
|
+
# 队列名称遵循统一命名规范: crawlo:{PROJECT_NAME}:queue:requests
|
|
33
|
+
# SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
|
|
31
34
|
|
|
32
35
|
# ============================== 去重过滤器 ==============================
|
|
33
36
|
FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: crawlo
|
|
3
|
-
Version: 1.3.
|
|
3
|
+
Version: 1.3.7
|
|
4
4
|
Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
|
|
5
5
|
Home-page: https://github.com/crawl-coder/Crawlo.git
|
|
6
6
|
Author: crawl-coder
|
|
@@ -630,6 +630,51 @@ Crawlo支持三种队列类型,可通过`QUEUE_TYPE`配置项设置:
|
|
|
630
630
|
|
|
631
631
|
推荐使用`auto`模式,让框架根据环境自动选择最适合的队列类型。
|
|
632
632
|
|
|
633
|
+
#### Redis Key 命名规范
|
|
634
|
+
|
|
635
|
+
在分布式模式下,Crawlo框架使用Redis作为队列和去重存储。为了确保不同项目和爬虫之间的数据隔离,框架采用统一的Redis Key命名规范:
|
|
636
|
+
|
|
637
|
+
##### 默认命名规则
|
|
638
|
+
Redis Key遵循以下命名格式:`crawlo:{PROJECT_NAME}:{component}:{identifier}`
|
|
639
|
+
|
|
640
|
+
其中:
|
|
641
|
+
- `PROJECT_NAME`:项目名称,用于区分不同项目
|
|
642
|
+
- `component`:组件类型,如`queue`、`filter`、`item`
|
|
643
|
+
- `identifier`:具体标识符,如`requests`、`processing`、`failed`、`fingerprint`
|
|
644
|
+
|
|
645
|
+
##### 具体Key格式
|
|
646
|
+
1. **请求队列**:`crawlo:{PROJECT_NAME}:queue:requests`
|
|
647
|
+
- 用于存储待处理的请求任务
|
|
648
|
+
|
|
649
|
+
2. **处理中队列**:`crawlo:{PROJECT_NAME}:queue:processing`
|
|
650
|
+
- 用于存储正在处理的请求任务
|
|
651
|
+
|
|
652
|
+
3. **失败队列**:`crawlo:{PROJECT_NAME}:queue:failed`
|
|
653
|
+
- 用于存储处理失败的请求任务
|
|
654
|
+
|
|
655
|
+
4. **请求去重**:`crawlo:{PROJECT_NAME}:filter:fingerprint`
|
|
656
|
+
- 用于存储请求URL的指纹,实现去重功能
|
|
657
|
+
|
|
658
|
+
5. **数据项去重**:`crawlo:{PROJECT_NAME}:item:fingerprint`
|
|
659
|
+
- 用于存储数据项的指纹,防止重复存储
|
|
660
|
+
|
|
661
|
+
##### 自定义队列名称
|
|
662
|
+
用户可以通过`SCHEDULER_QUEUE_NAME`配置项自定义请求队列名称。处理中队列和失败队列会基于请求队列名称自动生成:
|
|
663
|
+
- 处理中队列:将`:queue:requests`替换为`:queue:processing`
|
|
664
|
+
- 失败队列:将`:queue:requests`替换为`:queue:failed`
|
|
665
|
+
|
|
666
|
+
示例配置:
|
|
667
|
+
```python
|
|
668
|
+
# settings.py
|
|
669
|
+
SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
|
|
670
|
+
```
|
|
671
|
+
|
|
672
|
+
##### 命名规范优势
|
|
673
|
+
1. **命名空间隔离**:通过项目名称实现不同项目间的数据隔离
|
|
674
|
+
2. **组件分类清晰**:通过组件类型区分不同功能模块
|
|
675
|
+
3. **易于监控和管理**:统一的命名格式便于Redis监控和管理
|
|
676
|
+
4. **防止命名冲突**:避免不同项目或组件间的Key冲突
|
|
677
|
+
|
|
633
678
|
<!-- 配置系统 section -->
|
|
634
679
|
<h2 align="center">🎛️ 配置系统</h2>
|
|
635
680
|
|
|
@@ -1095,6 +1140,34 @@ asyncio.run(process.crawl('my_spider_name'))
|
|
|
1095
1140
|
|
|
1096
1141
|
---
|
|
1097
1142
|
|
|
1143
|
+
<!-- Redis键名修复说明 section -->
|
|
1144
|
+
<h2 align="center">🔧 Redis键名修复说明</h2>
|
|
1145
|
+
|
|
1146
|
+
在早期版本中,Crawlo框架存在Redis队列键名生成的双重前缀问题。具体表现为:
|
|
1147
|
+
|
|
1148
|
+
- **问题现象**:Redis队列键名出现双重"crawlo"前缀,如`crawlo:crawlo:queue:requests`而不是正确的`crawlo:{project_name}:queue:requests`
|
|
1149
|
+
- **影响范围**:影响分布式模式下的请求队列、处理队列和失败队列的正确识别和使用
|
|
1150
|
+
- **根本原因**:队列管理器中的项目名称提取逻辑未能正确处理不同格式的队列名称
|
|
1151
|
+
|
|
1152
|
+
**修复内容**:
|
|
1153
|
+
|
|
1154
|
+
1. **队列管理器优化**:
|
|
1155
|
+
- 改进了[QueueConfig.from_settings](file:///Users/oscar/projects/Crawlo/crawlo/queue/queue_manager.py#L148-L180)方法,使其在`SCHEDULER_QUEUE_NAME`未设置时能正确使用基于项目名称的默认值
|
|
1156
|
+
- 修复了队列管理器中从队列名称提取项目名称的逻辑,确保能正确处理各种前缀情况
|
|
1157
|
+
|
|
1158
|
+
2. **Redis队列实现改进**:
|
|
1159
|
+
- 在[RedisPriorityQueue](file:///Users/oscar/projects/Crawlo/crawlo/queue/redis_priority_queue.py#L39-L76)中添加了`_normalize_queue_name`方法来规范化队列名称
|
|
1160
|
+
- 处理了多重"crawlo"前缀的情况,确保队列名称符合统一规范
|
|
1161
|
+
|
|
1162
|
+
3. **配置文件调整**:
|
|
1163
|
+
- 将`SCHEDULER_QUEUE_NAME`设置为注释状态,提供更大的配置灵活性
|
|
1164
|
+
- 在所有模板和示例项目的配置文件中保持了一致性
|
|
1165
|
+
|
|
1166
|
+
**验证测试**:
|
|
1167
|
+
通过专门的测试脚本验证了修复效果,确保在各种队列命名情况下都能正确生成和识别Redis键名。
|
|
1168
|
+
|
|
1169
|
+
---
|
|
1170
|
+
|
|
1098
1171
|
<!-- 文档 section -->
|
|
1099
1172
|
<h2 align="center">📚 文档</h2>
|
|
1100
1173
|
|
|
@@ -203,6 +203,7 @@ tests/simple_crawlo_test.py
|
|
|
203
203
|
tests/simple_log_test.py
|
|
204
204
|
tests/simple_log_test2.py
|
|
205
205
|
tests/simple_optimization_test.py
|
|
206
|
+
tests/simple_queue_type_test.py
|
|
206
207
|
tests/simple_spider_test.py
|
|
207
208
|
tests/simple_test.py
|
|
208
209
|
tests/spider_log_timing_test.py
|
|
@@ -264,6 +265,8 @@ tests/test_proxy_strategies.py
|
|
|
264
265
|
tests/test_queue_empty_check.py
|
|
265
266
|
tests/test_queue_manager_double_crawlo.py
|
|
266
267
|
tests/test_queue_manager_redis_key.py
|
|
268
|
+
tests/test_queue_naming.py
|
|
269
|
+
tests/test_queue_type.py
|
|
267
270
|
tests/test_random_user_agent.py
|
|
268
271
|
tests/test_real_scenario_proxy.py
|
|
269
272
|
tests/test_redis_config.py
|
|
@@ -271,6 +274,7 @@ tests/test_redis_connection_pool.py
|
|
|
271
274
|
tests/test_redis_key_naming.py
|
|
272
275
|
tests/test_redis_key_validator.py
|
|
273
276
|
tests/test_redis_queue.py
|
|
277
|
+
tests/test_redis_queue_name_fix.py
|
|
274
278
|
tests/test_request_ignore_middleware.py
|
|
275
279
|
tests/test_request_params.py
|
|
276
280
|
tests/test_request_serialization.py
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
简单测试 QUEUE_TYPE 配置获取
|
|
5
|
+
验证我们的日志格式修改是否正常工作
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
# 添加项目根目录到路径
|
|
12
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
13
|
+
|
|
14
|
+
from crawlo.config import CrawloConfig
|
|
15
|
+
from crawlo.framework import CrawloFramework
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_log_format():
|
|
19
|
+
"""测试日志格式修改是否正常工作"""
|
|
20
|
+
print("=== 测试日志格式修改 ===")
|
|
21
|
+
|
|
22
|
+
# 创建单机模式配置
|
|
23
|
+
config = CrawloConfig.standalone(concurrency=4)
|
|
24
|
+
|
|
25
|
+
# 创建框架实例,这会触发日志输出
|
|
26
|
+
framework = CrawloFramework(config.to_dict())
|
|
27
|
+
|
|
28
|
+
# 获取配置信息
|
|
29
|
+
run_mode = framework.settings.get('RUN_MODE', 'not found')
|
|
30
|
+
queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
|
|
31
|
+
|
|
32
|
+
print(f"从配置中获取到的信息:")
|
|
33
|
+
print(f" RunMode: {run_mode}")
|
|
34
|
+
print(f" QueueType: {queue_type}")
|
|
35
|
+
|
|
36
|
+
print("\n✅ 日志格式修改测试完成")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
if __name__ == "__main__":
|
|
40
|
+
print("开始简单测试 QUEUE_TYPE 配置获取...")
|
|
41
|
+
test_log_format()
|
|
42
|
+
print("\n测试结束!")
|
|
@@ -34,18 +34,18 @@ async def test_redis_queue_naming():
|
|
|
34
34
|
{
|
|
35
35
|
"name": "双重 crawlo 前缀",
|
|
36
36
|
"queue_name": "crawlo:crawlo:queue:requests",
|
|
37
|
-
"expected_module": "
|
|
38
|
-
"expected_queue": "crawlo:
|
|
39
|
-
"expected_processing": "crawlo:
|
|
40
|
-
"expected_failed": "crawlo:
|
|
37
|
+
"expected_module": "test_project",
|
|
38
|
+
"expected_queue": "crawlo:queue:requests", # 修复后的期望值
|
|
39
|
+
"expected_processing": "crawlo:queue:processing",
|
|
40
|
+
"expected_failed": "crawlo:queue:failed"
|
|
41
41
|
},
|
|
42
42
|
{
|
|
43
43
|
"name": "三重 crawlo 前缀",
|
|
44
44
|
"queue_name": "crawlo:crawlo:crawlo:queue:requests",
|
|
45
|
-
"expected_module": "
|
|
46
|
-
"expected_queue": "crawlo:
|
|
47
|
-
"expected_processing": "crawlo:
|
|
48
|
-
"expected_failed": "crawlo:
|
|
45
|
+
"expected_module": "test_project",
|
|
46
|
+
"expected_queue": "crawlo:queue:requests", # 修复后的期望值
|
|
47
|
+
"expected_processing": "crawlo:queue:processing",
|
|
48
|
+
"expected_failed": "crawlo:queue:failed"
|
|
49
49
|
},
|
|
50
50
|
{
|
|
51
51
|
"name": "无 crawlo 前缀",
|
|
@@ -138,11 +138,8 @@ async def test_queue_manager_naming():
|
|
|
138
138
|
if len(parts) >= 2:
|
|
139
139
|
# 处理可能的双重 crawlo 前缀
|
|
140
140
|
if parts[0] == "crawlo" and parts[1] == "crawlo":
|
|
141
|
-
# 双重 crawlo
|
|
142
|
-
|
|
143
|
-
project_name = parts[2]
|
|
144
|
-
else:
|
|
145
|
-
project_name = "default"
|
|
141
|
+
# 双重 crawlo 前缀,取"crawlo"作为项目名称
|
|
142
|
+
project_name = "crawlo"
|
|
146
143
|
elif parts[0] == "crawlo":
|
|
147
144
|
# 正常的 crawlo 前缀,取第二个部分作为项目名称
|
|
148
145
|
project_name = parts[1]
|