crawlo 1.4.0__tar.gz → 1.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- {crawlo-1.4.0/crawlo.egg-info → crawlo-1.4.2}/PKG-INFO +1 -1
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/__init__.py +9 -4
- crawlo-1.4.2/crawlo/__version__.py +1 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/core/__init__.py +8 -2
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/core/scheduler.py +2 -2
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/downloader/aiohttp_downloader.py +7 -2
- crawlo-1.4.2/crawlo/extension/log_interval.py +95 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/initialization/__init__.py +6 -2
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/middleware_manager.py +1 -1
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/mode_manager.py +13 -7
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/bloom_dedup_pipeline.py +5 -15
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/database_dedup_pipeline.py +5 -8
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/memory_dedup_pipeline.py +5 -15
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/redis_dedup_pipeline.py +2 -15
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/project.py +18 -7
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/settings/default_settings.py +114 -150
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/settings/setting_manager.py +14 -9
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/distributed_coordinator.py +4 -8
- crawlo-1.4.2/crawlo/utils/fingerprint.py +123 -0
- {crawlo-1.4.0 → crawlo-1.4.2/crawlo.egg-info}/PKG-INFO +1 -1
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo.egg-info/SOURCES.txt +16 -0
- crawlo-1.4.2/examples/test_project/__init__.py +7 -0
- crawlo-1.4.2/examples/test_project/run.py +35 -0
- crawlo-1.4.2/examples/test_project/test_project/__init__.py +4 -0
- crawlo-1.4.2/examples/test_project/test_project/items.py +18 -0
- crawlo-1.4.2/examples/test_project/test_project/middlewares.py +119 -0
- crawlo-1.4.2/examples/test_project/test_project/pipelines.py +97 -0
- crawlo-1.4.2/examples/test_project/test_project/settings.py +170 -0
- crawlo-1.4.2/examples/test_project/test_project/spiders/__init__.py +10 -0
- crawlo-1.4.2/examples/test_project/test_project/spiders/of_week_dis.py +144 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/debug_framework_logger.py +1 -1
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/debug_log_levels.py +1 -1
- crawlo-1.4.2/tests/test_all_pipeline_fingerprints.py +134 -0
- crawlo-1.4.2/tests/test_default_header_middleware.py +314 -0
- crawlo-1.4.2/tests/test_fingerprint_consistency.py +136 -0
- crawlo-1.4.2/tests/test_fingerprint_simple.py +52 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_framework_logger.py +1 -1
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_framework_startup.py +1 -1
- crawlo-1.4.2/tests/test_hash_performance.py +100 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_mode_change.py +1 -1
- crawlo-1.4.2/tests/test_offsite_middleware.py +245 -0
- crawlo-1.4.2/tests/test_offsite_middleware_simple.py +204 -0
- crawlo-1.4.2/tests/test_pipeline_fingerprint_consistency.py +87 -0
- crawlo-1.4.0/crawlo/__version__.py +0 -1
- crawlo-1.4.0/crawlo/extension/log_interval.py +0 -58
- crawlo-1.4.0/tests/test_default_header_middleware.py +0 -159
- crawlo-1.4.0/tests/test_offsite_middleware.py +0 -222
- {crawlo-1.4.0 → crawlo-1.4.2}/LICENSE +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/MANIFEST.in +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/README.md +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/cli.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/check.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/help.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/list.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/run.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/stats.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/commands/utils.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/config_validator.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/core/engine.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/core/processor.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/crawler.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/data/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/data/user_agents.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/event.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/exceptions.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/extension/logging_extension.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/factories/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/factories/base.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/factories/crawler.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/factories/registry.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/filters/aioredis_filter.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/framework.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/initialization/built_in.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/initialization/context.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/initialization/core.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/initialization/phases.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/initialization/registry.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/items/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/items/base.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/items/fields.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/items/items.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/logging/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/logging/config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/logging/factory.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/logging/manager.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/offsite.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/middleware/simple_proxy.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/network/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/network/request.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/network/response.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/mysql_pipeline.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/pipelines/pipeline_manager.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/queue/queue_manager.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/queue/redis_priority_queue.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/stats_collector.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/subscriber.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/task_manager.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/settings.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/run.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/templates/spiders_init.py.tmpl +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/data_formatter.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/encoding_converter.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/network_diagnostic.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/request_tools.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/tools/text_cleaner.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/class_loader.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/error_handler.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/log.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/redis_connection_pool.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/request.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/system.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/tools.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo/utils/url.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/examples/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/pyproject.toml +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/requirements.txt +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/setup.cfg +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/__init__.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/baidu_performance_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/baidu_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/cleaners_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/comprehensive_framework_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/comprehensive_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/comprehensive_testing_summary.md +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/config_validation_demo.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/date_tools_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/debug_configure.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/debug_log_config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/debug_pipelines.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/detailed_log_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/distributed_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/distributed_test_debug.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/env_config_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/error_handling_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/final_command_test_report.md +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/final_comprehensive_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/final_log_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/final_validation_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/fix_log_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/framework_performance_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/log_buffering_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/log_generation_timing_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/optimized_performance_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/performance_comparison.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/queue_blocking_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/queue_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/request_params_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/response_improvements_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/scrapy_comparison/ofweek_scrapy.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/scrapy_comparison/scrapy_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/simple_command_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/simple_crawlo_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/simple_log_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/simple_log_test2.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/simple_optimization_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/simple_queue_type_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/simple_spider_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/simple_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/spider_log_timing_test.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_all_commands.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_batch_processor.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_cleaners.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_component_factory.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_comprehensive.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_config_consistency.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_config_merge.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_config_validator.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_controlled_spider_mixin.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_crawlo_proxy_integration.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_date_tools.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_dedup_fix.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_distributed.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_double_crawlo_fix.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_download_delay_middleware.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_downloader_proxy_compatibility.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_edge_cases.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_enhanced_error_handler_comprehensive.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_env_config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_factories.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_final_validation.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_get_component_logger.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_integration.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_large_scale_config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_large_scale_helper.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_logging_system.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_mode_consistency.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_parsel.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_performance.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_performance_monitor.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_api.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_middleware.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_middleware_enhanced.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_middleware_refactored.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_queue_empty_check.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_queue_naming.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_queue_type.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_random_user_agent.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_real_scenario_proxy.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_redis_config.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_redis_queue.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_redis_queue_name_fix.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_request_ignore_middleware.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_request_params.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_request_serialization.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_response_code_middleware.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_response_filter_middleware.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_response_improvements.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_retry_middleware.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_scheduler.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_scheduler_config_update.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_simple_response.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_template_content.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_tools.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/test_user_agents.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/tools_example.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/untested_features_report.md +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/verify_debug.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/verify_distributed.py +0 -0
- {crawlo-1.4.0 → crawlo-1.4.2}/tests/verify_log_fix.py +0 -0
crawlo/__init__.py
@@ -28,30 +28,35 @@ from crawlo import tools
 
 # Framework core modules - use TYPE_CHECKING to avoid circular imports
 if TYPE_CHECKING:
-    from crawlo.
+    from crawlo.initialization import get_framework_initializer, initialize_framework
 
 # For backward compatibility, import the cleaners-related functionality from tools
 import crawlo.tools as cleaners
 
+
 # Helper functions for lazy imports
 def get_framework_initializer():
     """Lazily import get_framework_initializer to avoid circular dependencies"""
-    from crawlo.
+    from crawlo.initialization import get_framework_initializer as _get_framework_initializer
     return _get_framework_initializer()
 
+
 def initialize_framework(custom_settings=None):
     """Lazily import initialize_framework to avoid circular dependencies"""
-    from crawlo.
+    from crawlo.initialization import initialize_framework as _initialize_framework
     return _initialize_framework(custom_settings)
 
+
 # Backward-compatible alias
 def get_bootstrap_manager():
     """Backward-compatible alias"""
     return get_framework_initializer()
 
+
 # Version number: prefer reading from package metadata
 try:
     from importlib.metadata import version
+
     __version__ = version("crawlo")
 except Exception:
     # May not be installed in development mode; fall back to __version__.py or dev
@@ -85,4 +90,4 @@ __all__ = [
     'get_framework_initializer',
     'get_bootstrap_manager',
     '__version__',
-]
+]
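The try/except above pairs with the new crawlo/__version__.py module added in this release. A minimal sketch of the full resolution order; the except branch is cut off in this diff, so its body here is an assumption based on the comment ("fall back to __version__.py or dev"):

try:
    from importlib.metadata import version

    __version__ = version("crawlo")
except Exception:
    # Assumed fallback: bundled version module first, then a dev placeholder
    try:
        from crawlo.__version__ import __version__
    except ImportError:
        __version__ = "dev"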
crawlo/__version__.py (new file)
@@ -0,0 +1 @@
+__version__ = '1.4.2'
crawlo/core/__init__.py
@@ -10,37 +10,43 @@ from ..initialization import (
     is_framework_ready
 )
 
+
 # Backward-compatible aliases
 def async_initialize_framework(*args, **kwargs):
     """Async wrapper for framework initialization"""
     return initialize_framework(*args, **kwargs)
 
+
 def get_framework_initializer():
     """Get framework initializer - compatibility function"""
     from ..initialization.core import CoreInitializer
     return CoreInitializer()
 
+
 def get_framework_logger(name='crawlo.core'):
     """Get framework logger - compatibility function"""
     from ..logging import get_logger
     return get_logger(name)
 
+
 # Backward compatibility
 def bootstrap_framework(*args, **kwargs):
     """Bootstrap framework - compatibility function"""
     return initialize_framework(*args, **kwargs)
 
+
 def get_bootstrap_manager():
     """Get bootstrap manager - compatibility function"""
     return get_framework_initializer()
 
+
 __all__ = [
     'initialize_framework',
-    'async_initialize_framework',
+    'async_initialize_framework',
     'get_framework_initializer',
     'is_framework_ready',
     'get_framework_logger',
     # Backward compatibility
     'bootstrap_framework',
     'get_bootstrap_manager'
-]
+]
crawlo/core/scheduler.py
@@ -77,8 +77,8 @@ class Scheduler:
         # Only re-create the filter instance when the configuration truly needs updating
         # Check whether a configuration update actually happened
         filter_updated = (
-            (self.queue_manager._queue_type == QueueType.REDIS and '
-            (self.queue_manager._queue_type == QueueType.MEMORY and
+            (self.queue_manager._queue_type == QueueType.REDIS and 'aioredis_filter' in self.crawler.settings.get('FILTER_CLASS', '')) or
+            (self.queue_manager._queue_type == QueueType.MEMORY and 'memory_filter' in self.crawler.settings.get('FILTER_CLASS', ''))
         )
 
         if needs_config_update or filter_updated:
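The rewritten condition pairs each queue type with its matching filter class by substring. For instance, with the Redis filter path from this package's own file list:

FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
assert 'aioredis_filter' in FILTER_CLASS    # Redis queue + Redis filter: consistent
assert 'memory_filter' not in FILTER_CLASS  # would indicate a memory-filter setup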
crawlo/downloader/aiohttp_downloader.py
@@ -55,8 +55,13 @@ class AioHttpDownloader(DownloaderBase):
             family=0,  # allow both IPv4 and IPv6
         )
 
-        # Timeout control
-        timeout = ClientTimeout(
+        # Timeout control - add more granular timeout settings
+        timeout = ClientTimeout(
+            total=timeout_secs,
+            connect=timeout_secs / 2,      # connection timeout
+            sock_read=timeout_secs,        # read timeout
+            sock_connect=timeout_secs / 2  # socket connect timeout
+        )
 
         # Request tracing
         trace_config = TraceConfig()
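For reference, the four fields above are standard aiohttp.ClientTimeout parameters: total caps the entire request, connect covers acquiring a connection from the pool, sock_connect covers the TCP connect itself, and sock_read bounds the gap between reads of the response body. A minimal usage sketch (the timeout_secs value is illustrative, not taken from the package):

import aiohttp
from aiohttp import ClientTimeout

timeout_secs = 30
timeout = ClientTimeout(
    total=timeout_secs,             # whole request, redirects included
    connect=timeout_secs / 2,       # acquiring a connection
    sock_connect=timeout_secs / 2,  # TCP connect
    sock_read=timeout_secs,         # max gap between body reads
)

async def fetch(url: str) -> str:
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as resp:
            return await resp.text()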
crawlo/extension/log_interval.py (new file)
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+import asyncio
+from typing import Any, Optional
+
+from crawlo.utils.log import get_logger
+from crawlo.event import spider_opened, spider_closed
+
+
+class LogIntervalExtension(object):
+
+    def __init__(self, crawler: Any):
+        self.task: Optional[asyncio.Task] = None
+        self.stats = crawler.stats
+        self.item_count = 0
+        self.response_count = 0
+        self.seconds = crawler.settings.get('INTERVAL', 60)  # default: 60 seconds
+
+        # Fixed time-unit calculation logic
+        if self.seconds % 60 == 0:
+            self.interval = int(self.seconds / 60)
+            self.unit = 'min'
+        else:
+            self.interval = self.seconds
+            self.unit = 's'
+
+        # Handle the singular case
+        if self.interval == 1 and self.unit == 'min':
+            self.interval_display = ""
+        else:
+            self.interval_display = str(self.interval)
+
+        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self.logger.info(f"LogIntervalExtension initialized with interval: {self.seconds} seconds")
+
+    @classmethod
+    def create_instance(cls, crawler: Any) -> 'LogIntervalExtension':
+        o = cls(crawler)
+        crawler.subscriber.subscribe(o.spider_opened, event=spider_opened)
+        crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
+        return o
+
+    async def spider_opened(self) -> None:
+        self.logger.info("Spider opened, starting interval logging task")
+        self.task = asyncio.create_task(self.interval_log())
+        self.logger.info("Interval logging task started")
+
+    async def spider_closed(self) -> None:
+        self.logger.info("Spider closed, stopping interval logging task")
+        if self.task:
+            self.task.cancel()
+            try:
+                await self.task
+            except asyncio.CancelledError:
+                pass
+            self.task = None
+
+    async def interval_log(self) -> None:
+        iteration = 0
+        while True:
+            try:
+                iteration += 1
+                self.logger.debug(f"Interval log iteration {iteration} starting")
+                last_item_count = self.stats.get_value('item_successful_count', default=0)
+                last_response_count = self.stats.get_value('response_received_count', default=0)
+                item_rate = last_item_count - self.item_count
+                response_rate = last_response_count - self.response_count
+
+                # Debug information
+                self.logger.debug(f"Debug info - Iteration: {iteration}, Last item count: {last_item_count}, Last response count: {last_response_count}")
+                self.logger.debug(f"Debug info - Previous item count: {self.item_count}, Previous response count: {self.response_count}")
+                self.logger.debug(f"Debug info - Item rate: {item_rate}, Response rate: {response_rate}")
+
+                self.item_count, self.response_count = last_item_count, last_response_count
+
+                # Fixed rate calculation to make sure the correct unit is used
+                if self.unit == 'min' and self.seconds > 0:
+                    # Convert to per-minute rates
+                    pages_per_min = response_rate * 60 / self.seconds if self.seconds > 0 else 0
+                    items_per_min = item_rate * 60 / self.seconds if self.seconds > 0 else 0
+                    self.logger.info(
+                        f'Crawled {last_response_count} pages (at {pages_per_min:.0f} pages/min),'
+                        f' Got {last_item_count} items (at {items_per_min:.0f} items/min).'
+                    )
+                else:
+                    # Use the original unit
+                    self.logger.info(
+                        f'Crawled {last_response_count} pages (at {response_rate} pages/{self.interval_display}{self.unit}),'
+                        f' Got {last_item_count} items (at {item_rate} items/{self.interval_display}{self.unit}).'
+                    )
+                self.logger.debug(f"Interval log iteration {iteration} completed, sleeping for {self.seconds} seconds")
+                await asyncio.sleep(self.seconds)
+            except Exception as e:
+                self.logger.error(f"Error in interval logging: {e}")
+                await asyncio.sleep(self.seconds)  # keep running even if an error occurs
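The per-minute branch above scales each window's delta by 60 / INTERVAL, so any interval that is a whole number of minutes reports comparable rates. A worked example with illustrative numbers:

seconds = 120                  # INTERVAL = 120 -> interval 2, unit 'min'
response_rate = 300            # responses counted in the last window
pages_per_min = response_rate * 60 / seconds
assert pages_per_min == 150.0  # logged as "at 150 pages/min"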
crawlo/initialization/__init__.py
@@ -16,25 +16,29 @@ from .context import InitializationContext
 from .core import CoreInitializer
 from .phases import InitializationPhase
 
+
 # Public interface
 def initialize_framework(settings=None, **kwargs):
     """Main entry point for framework initialization"""
     return CoreInitializer().initialize(settings, **kwargs)
 
+
 def is_framework_ready():
     """Check whether the framework is ready"""
     return CoreInitializer().is_ready
 
+
 def get_framework_context():
     """Get the framework initialization context"""
     return CoreInitializer().context
 
+
 __all__ = [
     'InitializerRegistry',
-    'InitializationContext',
+    'InitializationContext',
     'CoreInitializer',
     'InitializationPhase',
     'initialize_framework',
     'is_framework_ready',
     'get_framework_context'
-]
+]
crawlo/middleware/middleware_manager.py
@@ -86,7 +86,7 @@ class MiddlewareManager:
             response = await self._process_exception(request, exp)
         else:
             create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
-
+            self._stats.inc_value('response_received_count')
         if isinstance(response, Response):
             response = await self._process_response(request, response)
         if isinstance(response, Request):
crawlo/mode_manager.py
@@ -7,7 +7,7 @@
 
 Supported run modes:
 1. standalone - standalone mode (default)
-2. distributed - distributed mode
+2. distributed - distributed mode
 3. auto - auto-detect mode
 """
 import os
@@ -29,7 +29,7 @@ class ModeManager:
         # Initialize the logger lazily to avoid circular dependencies
         self._logger = None
         self._debug("Run-mode manager initialized")
-
+
     def _get_logger(self):
         """Fetch the logger instance lazily"""
         if self._logger is None:
@@ -40,7 +40,7 @@ class ModeManager:
             # If the logging system is not yet initialized, return None
             pass
         return self._logger
-
+
     def _debug(self, message: str):
         """Debug logging"""
         logger = self._get_logger()
@@ -73,7 +73,7 @@ class ModeManager:
             redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
         else:
             redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
-
+
         return {
             'QUEUE_TYPE': 'redis',
             'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
@@ -119,6 +119,7 @@ class ModeManager:
 
         if mode == RunMode.STANDALONE:
             mode_info = "Using standalone mode - simple and fast, suited to development and small-to-medium crawls"
+            # In standalone mode, preserve a user-supplied QUEUE_TYPE of 'auto'
             settings = self.get_standalone_settings()
             self._debug("Applied standalone-mode configuration")
 
@@ -142,8 +143,13 @@ class ModeManager:
             raise ValueError(f"Unsupported run mode: {mode}")
 
         # Merge user-defined settings
-        user_settings = {
-
+        user_settings = {
+            k: v for k,
+            v in kwargs.items() if k not in [
+                'redis_host',
+                'redis_port',
+                'redis_password',
+                'project_name']}
         settings.update(user_settings)
         self._debug(f"Merged user-defined settings: {list(user_settings.keys())}")
 
@@ -210,4 +216,4 @@ def auto_mode(**kwargs) -> Dict[str, Any]:
 def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
     """Create configuration from environment variables"""
     # Direct os.getenv() usage has been removed; configure these parameters via settings
-    raise RuntimeError("Environment-variable configuration has been removed; please configure these parameters in settings")
+    raise RuntimeError("Environment-variable configuration has been removed; please configure these parameters in settings")
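The dict comprehension in the merge step strips the connection-level keyword arguments that the mode manager consumes itself, so they never leak into the merged settings. A small self-contained illustration (keys and values hypothetical, apart from the reserved list shown in the diff):

kwargs = {
    'redis_host': '127.0.0.1',  # consumed by the mode manager, not merged
    'redis_port': 6379,         # consumed by the mode manager, not merged
    'CONCURRENCY': 16,          # passed through to settings
}
reserved = ['redis_host', 'redis_port', 'redis_password', 'project_name']
user_settings = {k: v for k, v in kwargs.items() if k not in reserved}
assert user_settings == {'CONCURRENCY': 16}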
crawlo/pipelines/bloom_dedup_pipeline.py
@@ -38,6 +38,7 @@ except ImportError:
 
 from crawlo import Item
 from crawlo.spider import Spider
+from crawlo.utils.fingerprint import FingerprintGenerator
 from crawlo.utils.log import get_logger
 from crawlo.exceptions import DropItem, ItemDiscard
 
@@ -109,6 +110,9 @@ class BloomDedupPipeline:
             self.logger.debug(f"Processing new item: {fingerprint[:20]}...")
             return item
 
+        except ItemDiscard:
+            # Re-raise ItemDiscard so the pipeline manager can handle it correctly
+            raise
         except Exception as e:
             self.logger.error(f"Error processing item: {e}")
             # Continue processing on errors to avoid losing data
@@ -123,21 +127,7 @@ class BloomDedupPipeline:
         :param item: the data item
         :return: the fingerprint string
         """
-
-        try:
-            item_dict = item.to_dict()
-        except AttributeError:
-            # Compatible with Item implementations without a to_dict method
-            item_dict = dict(item)
-
-        # Sort the dict to ensure consistency
-        sorted_items = sorted(item_dict.items())
-
-        # Build the fingerprint string
-        fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
-
-        # Use SHA256 to generate a fixed-length fingerprint
-        return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+        return FingerprintGenerator.item_fingerprint(item)
 
     def close_spider(self, spider: Spider) -> None:
         """
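All four dedup pipelines now delegate fingerprinting to the new crawlo/utils/fingerprint.py (123 lines, not shown in this diff). A minimal sketch of what FingerprintGenerator.item_fingerprint plausibly centralizes, reconstructed from the inline logic removed above rather than from the actual module:

import hashlib

class FingerprintGenerator:
    @staticmethod
    def item_fingerprint(item) -> str:
        # Normalize the item to a plain dict
        try:
            item_dict = item.to_dict()
        except AttributeError:
            item_dict = dict(item)
        # Sort keys so logically equal items always hash identically
        sorted_items = sorted(item_dict.items())
        fingerprint_string = '|'.join(f"{k}={v}" for k, v in sorted_items if v is not None)
        return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()

Centralizing this logic keeps fingerprints identical across the memory, Bloom, database, and Redis backends; the new tests/test_fingerprint_consistency.py and tests/test_all_pipeline_fingerprints.py appear to cover exactly that.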
crawlo/pipelines/database_dedup_pipeline.py
@@ -17,6 +17,7 @@ import aiomysql
 from crawlo import Item
 from crawlo.exceptions import DropItem, ItemDiscard
 from crawlo.spider import Spider
+from crawlo.utils.fingerprint import FingerprintGenerator
 from crawlo.utils.log import get_logger
 
 
@@ -140,6 +141,9 @@ class DatabaseDedupPipeline:
             self.logger.debug(f"Processing new item: {fingerprint[:20]}...")
             return item
 
+        except ItemDiscard:
+            # Re-raise ItemDiscard so the pipeline manager can handle it correctly
+            raise
         except Exception as e:
             self.logger.error(f"Error processing item: {e}")
             # Continue processing on errors to avoid losing data
@@ -190,11 +194,4 @@ class DatabaseDedupPipeline:
         :param item: the data item
         :return: the fingerprint string
         """
-
-        try:
-            item_dict = item.to_dict()
-        except AttributeError:
-            # Compatible with Item implementations without a to_dict method
-            item_dict = dict(item)
-
-        # Sort the dict to ensure consistency
+        return FingerprintGenerator.item_fingerprint(item)
crawlo/pipelines/memory_dedup_pipeline.py
@@ -18,6 +18,7 @@ from typing import Set
 from crawlo import Item
 from crawlo.exceptions import DropItem, ItemDiscard
 from crawlo.spider import Spider
+from crawlo.utils.fingerprint import FingerprintGenerator
 from crawlo.utils.log import get_logger
 
 
@@ -71,6 +72,9 @@ class MemoryDedupPipeline:
             self.logger.debug(f"Processing new item: {fingerprint[:20]}...")
             return item
 
+        except ItemDiscard:
+            # Re-raise ItemDiscard so the pipeline manager can handle it correctly
+            raise
         except Exception as e:
             self.logger.error(f"Error processing item: {e}")
             # Continue processing on errors to avoid losing data
@@ -85,21 +89,7 @@ class MemoryDedupPipeline:
         :param item: the data item
         :return: the fingerprint string
         """
-
-        try:
-            item_dict = item.to_dict()
-        except AttributeError:
-            # Compatible with Item implementations without a to_dict method
-            item_dict = dict(item)
-
-        # Sort the dict to ensure consistency
-        sorted_items = sorted(item_dict.items())
-
-        # Build the fingerprint string
-        fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
-
-        # Use SHA256 to generate a fixed-length fingerprint
-        return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+        return FingerprintGenerator.item_fingerprint(item)
 
     def close_spider(self, spider: Spider) -> None:
         """
crawlo/pipelines/redis_dedup_pipeline.py
@@ -18,6 +18,7 @@ from typing import Optional
 from crawlo import Item
 from crawlo.spider import Spider
 from crawlo.exceptions import DropItem, ItemDiscard
+from crawlo.utils.fingerprint import FingerprintGenerator
 from crawlo.utils.log import get_logger
 
 
@@ -132,21 +133,7 @@ class RedisDedupPipeline:
         :param item: the data item
         :return: the fingerprint string
         """
-
-        try:
-            item_dict = item.to_dict()
-        except AttributeError:
-            # Compatible with Item implementations without a to_dict method
-            item_dict = dict(item)
-
-        # Sort the dict to ensure consistency
-        sorted_items = sorted(item_dict.items())
-
-        # Build the fingerprint string
-        fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
-
-        # Use SHA256 to generate a fixed-length fingerprint
-        return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+        return FingerprintGenerator.item_fingerprint(item)
 
     def close_spider(self, spider: Spider) -> None:
         """
crawlo/project.py
@@ -289,11 +289,22 @@ def _load_project_settings(custom_settings: Optional[dict] = None) -> SettingManager:
     if run_mode:
         from crawlo.mode_manager import ModeManager
         mode_manager = ModeManager()
-
-
+        # Get the project name and pass it into the mode configuration
+        project_name = settings.get('PROJECT_NAME', 'crawlo')
+        mode_settings = mode_manager.resolve_mode_settings(run_mode, project_name=project_name)
+
+        # Special handling: if the user explicitly set QUEUE_TYPE to 'auto' in settings.py,
+        # it should be kept even in standalone mode
+        user_queue_type = settings.get('QUEUE_TYPE')
+        if user_queue_type == 'auto' and run_mode == 'standalone':
+            mode_settings['QUEUE_TYPE'] = 'auto'
+
+        # Merge the mode configuration
         for key, value in mode_settings.items():
-            #
-
+            # For specific settings, the mode configuration should take precedence over user configuration,
+            # especially settings closely tied to the run mode
+            priority_keys = ['QUEUE_TYPE', 'FILTER_CLASS', 'DEFAULT_DEDUP_PIPELINE']
+            if key in priority_keys or key not in settings.attributes:
                 settings.set(key, value)
         _temp_debug(f"🔧 Applied {run_mode} mode configuration")
 
@@ -311,9 +322,9 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
     Get the settings manager instance (main entry function)
 
     Note: this function now serves as a backward-compatible entry point; the actual initialization logic has moved to
-    crawlo.
+    the crawlo.initialization module. The new initialization approach is recommended:
 
-    >>> from crawlo.
+    >>> from crawlo.initialization import initialize_framework
     >>> settings = initialize_framework(custom_settings)
 
     Args:
@@ -323,5 +334,5 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
         SettingManager: instance with configuration loaded
     """
     # Use the new unified initialization manager
-    from crawlo.
+    from crawlo.initialization import initialize_framework
     return initialize_framework(custom_settings)
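The merge loop in _load_project_settings gives mode-derived values precedence for run-mode-coupled keys while leaving every other key user-controlled. A self-contained illustration of that precedence rule (values hypothetical):

mode_settings = {'QUEUE_TYPE': 'memory', 'CONCURRENCY': 8}
user_settings = {'QUEUE_TYPE': 'redis', 'CONCURRENCY': 32}  # already in settings
priority_keys = ['QUEUE_TYPE', 'FILTER_CLASS', 'DEFAULT_DEDUP_PIPELINE']

merged = dict(user_settings)
for key, value in mode_settings.items():
    # Mode wins for priority keys; otherwise only fill in missing keys
    if key in priority_keys or key not in merged:
        merged[key] = value
assert merged == {'QUEUE_TYPE': 'memory', 'CONCURRENCY': 32}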