crawlo 1.3.0__tar.gz → 1.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crawlo-1.3.0/crawlo.egg-info → crawlo-1.3.2}/PKG-INFO +13 -4
- {crawlo-1.3.0 → crawlo-1.3.2}/README.md +12 -3
- crawlo-1.3.2/crawlo/__version__.py +1 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/utils.py +12 -2
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/crawler.py +91 -20
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/offsite.py +12 -3
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/mode_manager.py +21 -46
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/mysql_pipeline.py +5 -4
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/pipeline_manager.py +2 -1
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/project.py +16 -3
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/settings/default_settings.py +13 -4
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/settings/setting_manager.py +29 -6
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/settings.py.tmpl +5 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/settings_distributed.py.tmpl +6 -1
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/settings_gentle.py.tmpl +5 -40
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/settings_minimal.py.tmpl +5 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/settings_simple.py.tmpl +5 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/run.py.tmpl +0 -9
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/log.py +4 -4
- {crawlo-1.3.0 → crawlo-1.3.2/crawlo.egg-info}/PKG-INFO +13 -4
- crawlo-1.3.0/crawlo/__version__.py +0 -1
- {crawlo-1.3.0 → crawlo-1.3.2}/LICENSE +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/MANIFEST.in +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/cli.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/check.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/help.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/list.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/run.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/stats.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/config.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/config_validator.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/core/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/core/engine.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/core/processor.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/core/scheduler.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/data/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/data/user_agents.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/event.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/exceptions.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/extension/logging_extension.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/filters/aioredis_filter.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/items/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/items/base.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/items/fields.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/items/items.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/middleware_manager.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/simple_proxy.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/network/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/network/request.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/network/response.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/queue/queue_manager.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/queue/redis_priority_queue.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/stats_collector.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/subscriber.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/task_manager.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/data_formatter.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/encoding_converter.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/request_tools.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/tools/text_cleaner.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/error_handler.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/redis_connection_pool.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/request.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/system.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/tools.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo/utils/url.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo.egg-info/SOURCES.txt +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/examples/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/pyproject.toml +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/requirements.txt +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/setup.cfg +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/__init__.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/cleaners_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/config_validation_demo.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/date_tools_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/debug_pipelines.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/env_config_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/error_handling_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/request_params_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/response_improvements_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_cleaners.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_comprehensive.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_config_consistency.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_config_merge.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_config_validator.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_crawlo_proxy_integration.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_date_tools.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_default_header_middleware.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_distributed.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_double_crawlo_fix.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_download_delay_middleware.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_downloader_proxy_compatibility.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_edge_cases.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_env_config.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_final_validation.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_integration.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_mode_consistency.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_offsite_middleware.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_parsel.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_performance.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_api.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_middleware.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_middleware_enhanced.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_middleware_refactored.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_random_user_agent.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_real_scenario_proxy.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_redis_config.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_redis_queue.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_request_ignore_middleware.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_request_params.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_request_serialization.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_response_code_middleware.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_response_filter_middleware.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_response_improvements.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_retry_middleware.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_scheduler.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_scheduler_config_update.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_simple_response.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_template_content.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_tools.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/test_user_agents.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/tools_example.py +0 -0
- {crawlo-1.3.0 → crawlo-1.3.2}/tests/verify_distributed.py +0 -0
{crawlo-1.3.0/crawlo.egg-info → crawlo-1.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.3.0
+Version: 1.3.2
 Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder

@@ -56,7 +56,7 @@ Requires-Dist: selenium>=3.141.0; extra == "all"
 
 <p align="center">
 <a href="https://www.python.org/downloads/">
-<img src="https://img.shields.io/badge/python
+<img src="https://img.shields.io/badge/python-%3C%3D3.12-blue" alt="Python Version">
 </a>
 <a href="LICENSE">
 <img src="https://img.shields.io/badge/license-MIT-green" alt="License">

@@ -429,13 +429,22 @@ Crawlo框架的中间件、管道和扩展组件采用模块化设计,框架
 
 用户可以通过`CUSTOM_MIDDLEWARES`配置自定义中间件:
 
-
+``python
 # settings.py
 CUSTOM_MIDDLEWARES = [
     'myproject.middlewares.CustomMiddleware',
 ]
 ```
 
+> **注意**:DefaultHeaderMiddleware 和 OffsiteMiddleware 需要相应的配置才能启用:
+> - DefaultHeaderMiddleware 需要配置 `DEFAULT_REQUEST_HEADERS` 或 `USER_AGENT` 参数
+> - OffsiteMiddleware 需要配置 `ALLOWED_DOMAINS` 参数
+>
+> 如果未配置相应参数,这些中间件会因为 NotConfiguredError 而被禁用。
+
+> **注意**:中间件的顺序很重要。SimpleProxyMiddleware 通常放在列表末尾,
+> 这样可以在所有默认中间件处理后再应用代理设置。
+
 #### 管道配置
 
 框架默认加载以下管道:

@@ -930,7 +939,7 @@ Crawlo框架内置了多种中间件,其中代理中间件有两种实现:
 
 如果需要使用简化版代理中间件,可以在配置文件中替换默认的代理中间件:
 
-
+``python
 # settings.py
 MIDDLEWARES = [
     # 注释掉复杂版代理中间件
{crawlo-1.3.0 → crawlo-1.3.2}/README.md

@@ -6,7 +6,7 @@
 
 <p align="center">
 <a href="https://www.python.org/downloads/">
-<img src="https://img.shields.io/badge/python
+<img src="https://img.shields.io/badge/python-%3C%3D3.12-blue" alt="Python Version">
 </a>
 <a href="LICENSE">
 <img src="https://img.shields.io/badge/license-MIT-green" alt="License">

@@ -379,13 +379,22 @@ Crawlo框架的中间件、管道和扩展组件采用模块化设计,框架
 
 用户可以通过`CUSTOM_MIDDLEWARES`配置自定义中间件:
 
-
+``python
 # settings.py
 CUSTOM_MIDDLEWARES = [
     'myproject.middlewares.CustomMiddleware',
 ]
 ```
 
+> **注意**:DefaultHeaderMiddleware 和 OffsiteMiddleware 需要相应的配置才能启用:
+> - DefaultHeaderMiddleware 需要配置 `DEFAULT_REQUEST_HEADERS` 或 `USER_AGENT` 参数
+> - OffsiteMiddleware 需要配置 `ALLOWED_DOMAINS` 参数
+>
+> 如果未配置相应参数,这些中间件会因为 NotConfiguredError 而被禁用。
+
+> **注意**:中间件的顺序很重要。SimpleProxyMiddleware 通常放在列表末尾,
+> 这样可以在所有默认中间件处理后再应用代理设置。
+
 #### 管道配置
 
 框架默认加载以下管道:

@@ -880,7 +889,7 @@ Crawlo框架内置了多种中间件,其中代理中间件有两种实现:
 
 如果需要使用简化版代理中间件,可以在配置文件中替换默认的代理中间件:
 
-
+``python
 # settings.py
 MIDDLEWARES = [
     # 注释掉复杂版代理中间件
crawlo-1.3.2/crawlo/__version__.py

@@ -0,0 +1 @@
+__version__ = '1.3.2'
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/commands/utils.py

@@ -133,8 +133,11 @@ def validate_spider_name(spider_name: str) -> bool:
         bool: 是否有效
     """
     import re
+    # 清理爬虫名称中的不可见字符
+    cleaned_name = ''.join(c for c in spider_name if not unicodedata.category(c).startswith('C'))
+
     # 爬虫名称应该是有效的Python标识符
-    return
+    return cleaned_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', cleaned_name)
 
 
 def format_file_size(size_bytes: int) -> str:

@@ -181,7 +184,14 @@ def is_valid_domain(domain: str) -> bool:
         bool: 是否有效
     """
     import re
+    # 清理域名中的不可见字符
+    cleaned_domain = ''.join(c for c in domain if not unicodedata.category(c).startswith('C'))
+
     pattern = re.compile(
         r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
     )
-    return bool(pattern.match(
+    return bool(pattern.match(cleaned_domain))
+
+
+# 添加导入
+import unicodedata
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/crawler.py

@@ -36,13 +36,12 @@ from .spider import Spider, get_global_spider_registry
 from .core.engine import Engine
 from .subscriber import Subscriber
 from .extension import ExtensionManager
+from crawlo.utils.log import get_logger
 from .stats_collector import StatsCollector
 from .event import spider_opened, spider_closed
 from .settings.setting_manager import SettingManager
 from crawlo.project import merge_settings, get_settings
 
-# 使用自定义日志系统
-from crawlo.utils.log import get_logger
 logger = get_logger(__name__)
 
 

@@ -112,7 +111,12 @@ class Crawler:
     - Exception handling and cleanup
     """
 
-    def __init__(
+    def __init__(
+        self,
+        spider_cls: Type[Spider],
+        settings: SettingManager,
+        context: Optional[CrawlerContext] = None
+    ):
         self.spider_cls = spider_cls
         self.spider: Optional[Spider] = None
         self.engine: Optional[Engine] = None

@@ -137,6 +141,22 @@ class Crawler:
             'error_count': 0
         }
 
+        # Initialize components
+        self.subscriber = self._create_subscriber()
+        self.spider = self._create_spider()
+        self.engine = self._create_engine()
+        self.stats = self._create_stats()
+        # Note: Do not initialize extension manager here, let it initialize in the engine
+
+        # Validate crawler state
+        self._validate_crawler_state()
+
+        # 打印启动信息,确保在日志系统配置之后打印
+        self._log_startup_info()
+
+        # 将启动爬虫名称的日志移到这里,确保在日志系统配置之后打印
+        logger.info(f"Starting running {self.spider.name}")
+
     async def crawl(self):
         """
         Start the crawler core process

@@ -233,6 +253,52 @@ class Crawler:
             return self._end_time - self._start_time
         return 0.0
 
+    def _log_startup_info(self):
+        """Print startup information, including run mode and key configuration checks"""
+        # Get run mode
+        run_mode = self.settings.get('RUN_MODE', 'standalone')
+
+        # Get version number
+        version = self.settings.get('VERSION', '1.0.0')
+        if not version or version == 'None':
+            version = '1.0.0'
+
+        # Print framework start info
+        logger.info(f"Crawlo Framework Started {version}")
+
+        # Add mode info if available
+        mode_info = self.settings.get('_mode_info')
+        if mode_info:
+            logger.info(mode_info)
+        else:
+            # 如果没有_mode_info,添加默认信息
+            logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
+
+        # Get actual queue type
+        queue_type = self.settings.get('QUEUE_TYPE', 'memory')
+
+        # Display information based on run mode and queue type combination
+        if run_mode == 'distributed':
+            logger.info("Run Mode: distributed")
+            logger.info("Distributed Mode - Multi-node collaboration supported")
+            # Show Redis configuration
+            redis_host = self.settings.get('REDIS_HOST', 'localhost')
+            redis_port = self.settings.get('REDIS_PORT', 6379)
+            logger.info(f"Redis Address: {redis_host}:{redis_port}")
+        elif run_mode == 'standalone':
+            if queue_type == 'redis':
+                logger.info("Run Mode: standalone+redis")
+                # Show Redis configuration
+                redis_host = self.settings.get('REDIS_HOST', 'localhost')
+                redis_port = self.settings.get('REDIS_PORT', 6379)
+                logger.info(f"Redis Address: {redis_host}:{redis_port}")
+            elif queue_type == 'auto':
+                logger.info("Run Mode: standalone+auto")
+            else:  # memory
+                logger.info("Run Mode: standalone")
+        else:
+            logger.info(f"Run Mode: {run_mode}")
+
     async def _ensure_cleanup(self):
         """Ensure resource cleanup"""
         try:

@@ -483,7 +549,10 @@ class CrawlerProcess:
         signal.signal(signal.SIGINT, self._shutdown)
         signal.signal(signal.SIGTERM, self._shutdown)
 
-
+        # 注意:移除在这里调用_log_startup_info(),因为这时候日志系统可能还没有被正确配置
+        # 日志系统的配置是在project.py的get_settings函数中进行的,而CrawlerProcess的实例化
+        # 是在get_settings函数返回之前进行的,所以这时候调用_log_startup_info()可能会导致
+        # 日志信息没有被正确写入到日志文件中
 
         logger.debug(
             f"CrawlerProcess initialized successfully\n"

@@ -983,39 +1052,41 @@ class CrawlerProcess:
         if not version or version == 'None':
             version = '1.0.0'
 
-        #
-
-
-
+        # Print framework start info
+        logger.info(f"Crawlo Framework Started {version}")
+
+        # Add mode info if available
+        mode_info = self.settings.get('_mode_info')
+        if mode_info:
+            logger.info(mode_info)
+        else:
+            # 如果没有_mode_info,添加默认信息
+            logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
 
         # Get actual queue type
         queue_type = self.settings.get('QUEUE_TYPE', 'memory')
 
         # Display information based on run mode and queue type combination
         if run_mode == 'distributed':
-
-
+            logger.info("Run Mode: distributed")
+            logger.info("Distributed Mode - Multi-node collaboration supported")
             # Show Redis configuration
             redis_host = self.settings.get('REDIS_HOST', 'localhost')
             redis_port = self.settings.get('REDIS_PORT', 6379)
-
+            logger.info(f"Redis Address: {redis_host}:{redis_port}")
         elif run_mode == 'standalone':
             if queue_type == 'redis':
-
+                logger.info("Run Mode: standalone+redis")
                 # Show Redis configuration
                 redis_host = self.settings.get('REDIS_HOST', 'localhost')
                 redis_port = self.settings.get('REDIS_PORT', 6379)
-
+                logger.info(f"Redis Address: {redis_host}:{redis_port}")
             elif queue_type == 'auto':
-
+                logger.info("Run Mode: standalone+auto")
             else: # memory
-
+                logger.info("Run Mode: standalone")
         else:
-
-
-            # Print startup information at INFO level
-            for info in startup_info:
-                logger.info(info)
+            logger.info(f"Run Mode: {run_mode}")
 
 
 # === Utility functions ===
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/middleware/offsite.py

@@ -28,8 +28,16 @@ class OffsiteMiddleware:
         创建中间件实例
         从爬虫设置中获取允许的域名列表
         """
-        #
-        allowed_domains =
+        # 优先使用 Spider 实例的 allowed_domains,回退到全局设置中的 ALLOWED_DOMAINS
+        allowed_domains = []
+
+        # 检查当前爬虫实例是否有 allowed_domains 属性
+        if hasattr(crawler, 'spider') and crawler.spider and hasattr(crawler.spider, 'allowed_domains'):
+            allowed_domains = getattr(crawler.spider, 'allowed_domains', [])
+
+        # 如果 Spider 实例没有设置 allowed_domains,则从全局设置中获取
+        if not allowed_domains:
+            allowed_domains = crawler.settings.get_list('ALLOWED_DOMAINS')
 
         # 如果没有配置允许的域名,则禁用此中间件
         if not allowed_domains:

@@ -45,7 +53,8 @@ class OffsiteMiddleware:
         # 编译域名正则表达式以提高性能
         o._compile_domains()
 
-        crawler.logger
+        # 使用中间件自己的logger而不是crawler.logger
+        o.logger.info(f"OffsiteMiddleware已启用,允许的域名: {allowed_domains}")
         return o
 
     def _compile_domains(self):
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/mode_manager.py

@@ -14,8 +14,6 @@ import os
 from enum import Enum
 from typing import Dict, Any, Optional
 
-from crawlo.utils.log import get_logger
-
 
 class RunMode(Enum):
     """运行模式枚举"""

@@ -28,7 +26,7 @@ class ModeManager:
     """运行模式管理器"""
 
     def __init__(self):
-
+        pass
 
     @staticmethod
     def get_standalone_settings() -> Dict[str, Any]:

@@ -40,7 +38,6 @@ class ModeManager:
             'CONCURRENCY': 8,
             'MAX_RUNNING_SPIDERS': 1,
             'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
         }
 
     @staticmethod

@@ -48,45 +45,39 @@ class ModeManager:
         redis_host: str = '127.0.0.1',
         redis_port: int = 6379,
         redis_password: Optional[str] = None,
-        redis_db: int = 0,
+        redis_db: int = 0,
         project_name: str = 'crawlo'
     ) -> Dict[str, Any]:
         """获取分布式模式配置"""
-        # 构建 Redis URL
+        # 构建 Redis URL
         if redis_password:
             redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
         else:
             redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
-
+
         return {
-            'PROJECT_NAME': project_name,  # 添加项目名称到配置中
             'QUEUE_TYPE': 'redis',
             'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
+            'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
             'REDIS_HOST': redis_host,
             'REDIS_PORT': redis_port,
             'REDIS_PASSWORD': redis_password,
-            'REDIS_DB': redis_db,
+            'REDIS_DB': redis_db,
             'REDIS_URL': redis_url,
-            '
-
-            # crawlo:{project_name}:filter:fingerprint (请求去重)
+            'PROJECT_NAME': project_name,
+            'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',
             'CONCURRENCY': 16,
-            'MAX_RUNNING_SPIDERS':
+            'MAX_RUNNING_SPIDERS': 10,
             'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
         }
 
     @staticmethod
     def get_auto_settings() -> Dict[str, Any]:
         """获取自动检测模式配置"""
-
-
-
-
-            'MAX_RUNNING_SPIDERS': 1,
-            'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
-        }
+        # 默认使用内存队列和过滤器
+        settings = ModeManager.get_standalone_settings()
+        settings['QUEUE_TYPE'] = 'auto'
+        return settings
 
     def resolve_mode_settings(
         self,

@@ -104,13 +95,14 @@ class ModeManager:
             Dict[str, Any]: 配置字典
         """
         mode = RunMode(mode.lower())
+        mode_info = None
 
         if mode == RunMode.STANDALONE:
-
+            mode_info = "使用单机模式 - 简单快速,适合开发和中小规模爬取"
             settings = self.get_standalone_settings()
 
         elif mode == RunMode.DISTRIBUTED:
-
+            mode_info = "使用分布式模式 - 支持多节点扩展,适合大规模爬取"
             settings = self.get_distributed_settings(
                 redis_host=kwargs.get('redis_host', '127.0.0.1'),
                 redis_port=kwargs.get('redis_port', 6379),

@@ -120,7 +112,7 @@ class ModeManager:
             )
 
         elif mode == RunMode.AUTO:
-
+            mode_info = "使用自动检测模式 - 智能选择最佳运行方式"
             settings = self.get_auto_settings()
 
         else:

@@ -131,6 +123,9 @@ class ModeManager:
                          if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
         settings.update(user_settings)
 
+        # 将模式信息添加到配置中,供后续使用
+        settings['_mode_info'] = mode_info
+
         return settings
 
     def from_environment(self) -> Dict[str, Any]:

@@ -190,24 +185,4 @@ def auto_mode(**kwargs) -> Dict[str, Any]:
 def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
     """从环境变量创建配置"""
     # 移除直接使用 os.getenv(),要求通过 settings 配置
-    raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
-
-    # 保留原有代码作为参考
-    # mode = os.getenv('CRAWLO_MODE', default_mode).lower()
-    #
-    # if mode == 'distributed':
-    #     return distributed_mode(
-    #         redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
-    #         redis_port=int(os.getenv('REDIS_PORT', 6379)),
-    #         redis_password=os.getenv('REDIS_PASSWORD'),
-    #         project_name=os.getenv('PROJECT_NAME', 'crawlo'),
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
-    #     )
-    # elif mode == 'auto':
-    #     return auto_mode(
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
-    #     )
-    # else: # standalone
-    #     return standalone_mode(
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
-    #     )
+    raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/mysql_pipeline.py

@@ -7,6 +7,7 @@ from typing import Optional, List, Dict
 from crawlo.exceptions import ItemDiscard
 from crawlo.utils.db_helper import make_insert_sql, make_batch_sql
 from crawlo.utils.log import get_logger
+from . import BasePipeline
 
 
 class AsyncmyMySQLPipeline:

@@ -200,7 +201,7 @@ class AiomysqlMySQLPipeline:
         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
 
     @classmethod
-    def
+    def from_crawler(cls, crawler):
         return cls(crawler)
 
     async def _init_pool(self):

@@ -213,12 +214,12 @@ class AiomysqlMySQLPipeline:
         try:
             self.pool = await aiomysql.create_pool(
                 host=self.settings.get('MYSQL_HOST', 'localhost'),
-                port=self.settings.
+                port=self.settings.get_int('MYSQL_PORT', 3306),
                 user=self.settings.get('MYSQL_USER', 'root'),
                 password=self.settings.get('MYSQL_PASSWORD', ''),
                 db=self.settings.get('MYSQL_DB', 'scrapy_db'),
-                minsize=self.settings.
-                maxsize=self.settings.
+                minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
+                maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5),
                 cursorclass=aiomysql.DictCursor,
                 autocommit=False
             )
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/pipelines/pipeline_manager.py

@@ -30,6 +30,7 @@ class PipelineManager:
         # 移除所有去重管道实例(如果存在)
         pipelines = [item for item in pipelines if item != dedup_pipeline]
         # 在开头插入去重管道
+        self.logger.debug(f"{dedup_pipeline} insert successful")
         pipelines.insert(0, dedup_pipeline)
 
         self._add_pipelines(pipelines)

@@ -46,7 +47,7 @@ class PipelineManager:
                 pipeline_cls = load_class(pipeline)
                 if not hasattr(pipeline_cls, 'from_crawler'):
                     raise PipelineInitError(
-                        f"Pipeline init failed, must inherit from `BasePipeline` or have a `
+                        f"Pipeline init failed, must inherit from `BasePipeline` or have a `from_crawler` method"
                     )
                 self.pipelines.append(pipeline_cls.from_crawler(self.crawler))
             except Exception as e:
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/project.py

@@ -268,12 +268,25 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
     except Exception as e:
         raise ImportError(f"加载 settings 模块失败 '{settings_module_path}': {e}")
 
-    # 5.
+    # 5. 根据 RUN_MODE 获取相应配置
+    run_mode = settings.get('RUN_MODE', 'standalone')
+    if run_mode:
+        from crawlo.mode_manager import ModeManager
+        mode_manager = ModeManager()
+        mode_settings = mode_manager.resolve_mode_settings(run_mode)
+        # 合并模式配置,但不覆盖用户已设置的配置
+        for key, value in mode_settings.items():
+            # 只有当用户没有设置该配置项时才应用模式配置
+            if key not in settings.attributes:
+                settings.set(key, value)
+        logger.debug(f"🔧 已应用 {run_mode} 模式配置")
+
+    # 6. 合并运行时配置
     if custom_settings:
         settings.update_attributes(custom_settings)
         logger.debug(f"🔧 已应用运行时自定义配置: {list(custom_settings.keys())}")
 
-    #
+    # 7. 显示核心配置摘要(INFO级别)
     # _log_settings_summary(settings)
 
     # 配置日志系统

@@ -281,4 +294,4 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
 
     # 将项目初始化完成的消息改为DEBUG级别
     logger.debug("🎉 Crawlo 项目配置初始化完成!")
-    return settings
+    return settings
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/settings/default_settings.py

@@ -48,7 +48,18 @@ QUEUE_TYPE = 'auto'
 # 默认使用内存过滤器和去重管道,确保在无Redis环境下也能正常运行
 # 在auto模式下,如果Redis可用,框架会自动更新为Redis实现以提供更好的去重能力
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
-FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+
+
+MYSQL_HOST = '127.0.0.1'
+MYSQL_PORT = 3306
+MYSQL_USER = 'root'
+MYSQL_PASSWORD = '123456'
+MYSQL_DB = 'crawl_pro'
+MYSQL_TABLE = 'crawlo'
+MYSQL_BATCH_SIZE = 100
+MYSQL_USE_BATCH = False  # 是否启用批量插入
+
 
 # --- Redis 过滤器配置 ---
 # 使用环境变量配置工具获取 Redis 配置

@@ -85,7 +96,6 @@ MIDDLEWARES = [
     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',  # 1. 忽略无效请求
     'crawlo.middleware.download_delay.DownloadDelayMiddleware',  # 2. 控制请求频率
     'crawlo.middleware.default_header.DefaultHeaderMiddleware',  # 3. 添加默认请求头
-    # 'crawlo.middleware.proxy.ProxyMiddleware',  # 4. 设置代理(默认不启用)
     'crawlo.middleware.offsite.OffsiteMiddleware',  # 5. 站外请求过滤
 
     # === 响应处理阶段 ===

@@ -98,8 +108,7 @@ MIDDLEWARES = [
 
 # 框架数据处理管道列表(框架默认管道 + 用户自定义管道)
 PIPELINES = [
-    'crawlo.pipelines.console_pipeline.ConsolePipeline',
-    # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL 存储(可选)
+    'crawlo.pipelines.console_pipeline.ConsolePipeline',
 ]
 
 # 明确添加默认去重管道到管道列表开头
{crawlo-1.3.0 → crawlo-1.3.2}/crawlo/settings/setting_manager.py

@@ -27,7 +27,14 @@ class SettingManager(MutableMapping):
             user_middlewares = user_config['MIDDLEWARES']
             # 如果用户配置了空列表,则仍然使用默认配置
             if user_middlewares:
-
+                # 过滤掉空值和注释
+                user_middlewares = [middleware for middleware in user_middlewares if middleware and not middleware.strip().startswith('#')]
+                # 合并默认中间件和用户中间件,去重但保持顺序
+                merged_middlewares = default_middlewares[:]
+                for middleware in user_middlewares:
+                    if middleware not in merged_middlewares:
+                        merged_middlewares.append(middleware)
+                self.attributes['MIDDLEWARES'] = merged_middlewares
 
         # 合并管道配置
         if 'PIPELINES' in user_config:

@@ -37,8 +44,12 @@ class SettingManager(MutableMapping):
             if user_pipelines:
                 # 过滤掉空值和注释
                 user_pipelines = [pipeline for pipeline in user_pipelines if pipeline and not pipeline.strip().startswith('#')]
-
-
+                # 合并默认管道和用户管道,去重但保持顺序
+                merged_pipelines = default_pipelines[:]
+                for pipeline in user_pipelines:
+                    if pipeline not in merged_pipelines:
+                        merged_pipelines.append(pipeline)
+                self.attributes['PIPELINES'] = merged_pipelines
 
         # 特殊处理PIPELINES,确保去重管道在最前面
         dedup_pipeline = self.attributes.get('DEFAULT_DEDUP_PIPELINE')

@@ -56,8 +67,14 @@ class SettingManager(MutableMapping):
             user_extensions = user_config['EXTENSIONS']
             # 如果用户配置了空列表,则仍然使用默认配置
             if user_extensions:
-
-
+                # 过滤掉空值和注释
+                user_extensions = [extension for extension in user_extensions if extension and not extension.strip().startswith('#')]
+                # 合并默认扩展和用户扩展,去重但保持顺序
+                merged_extensions = default_extensions[:]
+                for extension in user_extensions:
+                    if extension not in merged_extensions:
+                        merged_extensions.append(extension)
+                self.attributes['EXTENSIONS'] = merged_extensions
 
         # 更新其他用户配置
         for key, value in user_config.items():

@@ -119,9 +136,15 @@ class SettingManager(MutableMapping):
     def set_settings(self, module):
         if isinstance(module, str):
             module = import_module(module)
+
+        # 收集模块中的所有配置项
+        module_settings = {}
         for key in dir(module):
             if key.isupper():
-
+                module_settings[key] = getattr(module, key)
+
+        # 使用合并逻辑而不是直接设置
+        self._merge_config(module_settings)
 
     # 实现 MutableMapping 必须的方法
     def __getitem__(self, item):