crawlo 1.2.9.tar.gz → 1.3.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- {crawlo-1.2.9/crawlo.egg-info → crawlo-1.3.0}/PKG-INFO +1 -1
- crawlo-1.3.0/crawlo/__version__.py +1 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/run.py +26 -35
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/engine.py +1 -2
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/crawler.py +48 -53
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/logging_extension.py +4 -2
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/middleware_manager.py +1 -1
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/mode_manager.py +37 -36
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/pipeline_manager.py +13 -1
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/project.py +28 -34
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/settings/setting_manager.py +31 -19
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/log.py +20 -61
- {crawlo-1.2.9 → crawlo-1.3.0/crawlo.egg-info}/PKG-INFO +1 -1
- crawlo-1.2.9/crawlo/__version__.py +0 -1
- {crawlo-1.2.9 → crawlo-1.3.0}/LICENSE +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/MANIFEST.in +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/README.md +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/cli.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/check.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/help.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/list.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/stats.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/utils.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/config_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/processor.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/scheduler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/data/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/data/user_agents.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/event.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/exceptions.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/filters/aioredis_filter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/items/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/items/base.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/items/fields.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/items/items.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/offsite.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/simple_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/network/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/network/request.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/network/response.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/mysql_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/queue/queue_manager.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/queue/redis_priority_queue.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/settings/default_settings.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/stats_collector.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/subscriber.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/task_manager.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/run.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/data_formatter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/encoding_converter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/request_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/text_cleaner.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/error_handler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/redis_connection_pool.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/request.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/system.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/url.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/SOURCES.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/examples/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/pyproject.toml +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/requirements.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/setup.cfg +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/cleaners_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/config_validation_demo.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/date_tools_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/debug_pipelines.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/env_config_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/error_handling_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/request_params_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/response_improvements_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_cleaners.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_comprehensive.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_config_consistency.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_config_merge.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_config_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_crawlo_proxy_integration.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_date_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_default_header_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_distributed.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_double_crawlo_fix.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_download_delay_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_downloader_proxy_compatibility.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_edge_cases.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_env_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_final_validation.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_integration.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_mode_consistency.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_offsite_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_parsel.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_performance.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_api.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_middleware_enhanced.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_middleware_refactored.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_random_user_agent.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_real_scenario_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_queue.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_request_ignore_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_request_params.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_request_serialization.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_response_code_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_response_filter_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_response_improvements.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_retry_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_scheduler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_scheduler_config_update.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_simple_response.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_template_content.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_user_agents.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/tools_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.0}/tests/verify_distributed.py +0 -0
crawlo-1.3.0/crawlo/__version__.py

@@ -0,0 +1 @@
+__version__ = '1.3.0'

{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/run.py

@@ -5,26 +5,27 @@
 # @Author : crawl-coder
 # @Desc : 命令行入口:crawlo run <spider_name>|all,用于运行指定爬虫。
 """
+import os
 import sys
 import asyncio
 import configparser
-import os
-from pathlib import Path
 from importlib import import_module
 
+from rich import box
 from rich.console import Console
 from rich.panel import Panel
+from rich.progress import Progress, SpinnerColumn, TextColumn
 from rich.table import Table
 from rich.text import Text
-from rich import box
-from rich.progress import Progress, SpinnerColumn, TextColumn
 
+from crawlo.commands.stats import record_stats
 from crawlo.crawler import CrawlerProcess
-from crawlo.utils.log import get_logger
 from crawlo.project import get_settings, _find_project_root
-
+# 使用自定义日志系统
+from crawlo.utils.log import get_logger
 
 logger = get_logger(__name__)
+
 console = Console()
 
 
@@ -77,6 +78,9 @@ def main(args):
     用法:
         crawlo run <spider_name>|all [--json] [--no-stats]
     """
+    # 添加调试信息
+    logger.debug("DEBUG: 进入main函数")
+
     if len(args) < 1:
         console.print("[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
         console.print("示例:")
@@ -187,21 +191,7 @@ def main(args):
             return 1
 
         # 显示即将运行的爬虫列表
-
-            title=f"启动全部 {len(spider_names)} 个爬虫",
-            box=box.ROUNDED,
-            show_header=True,
-            header_style="bold magenta"
-        )
-        table.add_column("名称", style="cyan")
-        table.add_column("类名", style="green")
-
-        for name in sorted(spider_names):
-            cls = process.get_spider_class(name)
-            table.add_row(name, cls.__name__)
-
-        console.print(table)
-        console.print()
+        # 根据用户要求,不再显示详细的爬虫列表信息
 
         # 注册 stats 记录(除非 --no-stats)
         if not no_stats:
@@ -260,20 +250,21 @@ def main(args):
         spider_class = process.get_spider_class(spider_name)
 
         # 显示启动信息
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # 根据用户要求,不再显示项目启动信息
+        # if not show_json:
+        #     info_table = Table(
+        #         title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
+        #         box=box.SIMPLE,
+        #         show_header=False,
+        #         title_style="bold green"
+        #     )
+        #     info_table.add_column("Key", style="yellow")
+        #     info_table.add_column("Value", style="cyan")
+        #     info_table.add_row("Project", project_package)
+        #     info_table.add_row("Class", spider_class.__name__)
+        #     info_table.add_row("Module", spider_class.__module__)
+        #     console.print(info_table)
+        #     console.print()
 
         # 注册 stats 记录
         if not no_stats:

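For context on the listing removed above: it built a rich Table and printed it to the console. A minimal standalone sketch of that same pattern follows; the spider names, title, and the class lookup here are illustrative stand-ins, not crawlo's exact strings or API.

```python
# Minimal rich.Table example mirroring the removed spider-listing block;
# spider names and the class-name column are made up for illustration.
from rich import box
from rich.console import Console
from rich.table import Table

console = Console()
spider_names = ["news_spider", "price_spider"]  # placeholder names

table = Table(
    title=f"Starting all {len(spider_names)} spiders",
    box=box.ROUNDED,
    show_header=True,
    header_style="bold magenta",
)
table.add_column("Name", style="cyan")
table.add_column("Class", style="green")

for name in sorted(spider_names):
    # The real code resolved the class via process.get_spider_class(name).
    table.add_row(name, name.title().replace("_", ""))

console.print(table)
```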
{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/engine.py

@@ -75,8 +75,7 @@ class Engine(object):
         version = '1.0.0'
         # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
         self.logger.debug(
-            f"Crawlo Started version {version}
-            # f"(project name : {self.settings.get('PROJECT_NAME')})"
+            f"Crawlo Started version {version}"
         )
 
     async def start_spider(self, spider):

{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/crawler.py

@@ -21,7 +21,7 @@ Example Usage:
     # Single crawler run
     crawler = Crawler(MySpider, settings)
     await crawler.crawl()
-
+
     # Multi-crawler concurrent management
     process = CrawlerProcess()
     await process.crawl([Spider1, Spider2])
@@ -34,7 +34,6 @@ import threading
 from typing import Type, Optional, Set, List, Union, Dict, Any
 from .spider import Spider, get_global_spider_registry
 from .core.engine import Engine
-from .utils.log import get_logger
 from .subscriber import Subscriber
 from .extension import ExtensionManager
 from .stats_collector import StatsCollector
@@ -42,16 +41,9 @@ from .event import spider_opened, spider_closed
 from .settings.setting_manager import SettingManager
 from crawlo.project import merge_settings, get_settings
 
-#
-
-
-
-def _get_logger():
-    """延迟获取logger实例,确保在配置加载后创建"""
-    global logger
-    if logger is None:
-        logger = get_logger(__name__)
-    return logger
+# 使用自定义日志系统
+from crawlo.utils.log import get_logger
+logger = get_logger(__name__)
 
 
 class CrawlerContext:
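For comparison, here is a minimal, self-contained sketch of the lazy-initialization pattern removed in this hunk versus the module-level logger that replaces it. The stdlib logging module stands in for crawlo.utils.log.get_logger, whose internals are not part of this diff.

```python
# Illustrative sketch only: stdlib logging stands in for crawlo's get_logger.
import logging
from typing import Optional


def get_logger(name: str) -> logging.Logger:
    """Stand-in factory for crawlo.utils.log.get_logger."""
    return logging.getLogger(name)


# Old pattern (removed in 1.3.0): create the logger on first use,
# so configuration loaded later can still shape it.
_logger: Optional[logging.Logger] = None


def _get_logger() -> logging.Logger:
    global _logger
    if _logger is None:
        _logger = get_logger(__name__)
    return _logger


# New pattern (added in 1.3.0): one module-level logger, created at import time.
logger = get_logger(__name__)

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    _get_logger().debug("lazy logger (old style)")
    logger.debug("module-level logger (new style)")
```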
{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/crawler.py (continued)

@@ -110,7 +102,7 @@ class CrawlerContext:
 class Crawler:
     """
     Single crawler runtime instance, managing Spider and engine lifecycle
-
+
     Provides functionality:
     - Spider lifecycle management (initialization, running, closing)
     - Engine component coordination management
@@ -148,7 +140,7 @@ class Crawler:
     async def crawl(self):
         """
         Start the crawler core process
-
+
         Includes the following stages:
         1. Initialization stage: Create all components
         2. Validation stage: Check configuration and state
@@ -190,12 +182,12 @@
             # Update context status
             self.context.increment_completed()
 
-
+            logger.info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")
 
         except Exception as e:
             self._performance_metrics['error_count'] += 1
             self.context.increment_failed(str(e))
-
+            logger.error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
             raise
         finally:
             self.context.decrement_active()
@@ -213,7 +205,7 @@
         else:
             spider_name = 'Unknown'
 
-
+        logger.info(f"Starting running {spider_name}")
 
     def _validate_crawler_state(self):
         """
@@ -233,7 +225,7 @@
         if not self.spider.name:
             raise ValueError("Spider name cannot be empty")
 
-
+        logger.debug(f"Spider {self.spider.name} state validation passed")
 
     def _get_total_duration(self) -> float:
         """Get total runtime"""
@@ -247,7 +239,7 @@
             if not self._closed:
                 await self.close()
         except Exception as e:
-
+            logger.warning(f"Error cleaning up resources: {e}")
 
     def get_performance_metrics(self) -> Dict[str, Any]:
         """Get performance metrics"""
@@ -267,7 +259,7 @@
     def _create_spider(self) -> Spider:
         """
         Create and validate spider instance (enhanced version)
-
+
         Performs the following validations:
         - Spider name must exist
         - start_requests method must be callable
@@ -300,7 +292,7 @@
 
         # parse method check (warning instead of error)
         if not callable(getattr(spider, 'parse', None)):
-
+            logger.warning(
                 f"Spider '{spider.name}' does not define 'parse' method.\n"
                 f"Ensure all Requests specify a callback function, otherwise responses will be ignored."
             )
@@ -308,27 +300,29 @@
         # Set spider configuration
         self._set_spider(spider)
 
-
+        logger.debug(f"Spider '{spider.name}' initialized successfully")
         return spider
 
     def _create_engine(self) -> Engine:
         """Create and initialize engine"""
         engine = Engine(self)
         engine.engine_start()
-
+        logger.debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
         return engine
 
     def _create_stats(self) -> StatsCollector:
         """Create stats collector"""
         stats = StatsCollector(self)
-
+        logger.debug(
+            f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
         return stats
 
     def _create_extension(self) -> ExtensionManager:
         """Create extension manager"""
         # Modify extension manager creation method, delay initialization until needed
         extension = ExtensionManager.create_instance(self)
-
+        logger.debug(
+            f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
         return extension
 
     def _set_spider(self, spider: Spider):
@@ -343,12 +337,12 @@
         # Merge spider custom configuration
         merge_settings(spider, self.settings)
 
-
+        logger.debug(f"Spider '{spider.name}' configuration merged successfully")
 
     async def close(self, reason='finished') -> None:
         """
         Close crawler and clean up resources (enhanced version)
-
+
         Ensure closing only once and handle all cleanup operations
         """
         async with self._close_lock:
@@ -371,15 +365,15 @@
                 from crawlo.commands.stats import record_stats
                 record_stats(self)
             except ImportError:
-
+                logger.debug("Statistics recording module does not exist, skipping statistics recording")
 
-
+            logger.info(
                 f"Spider '{getattr(self.spider, 'name', 'Unknown')}' closed, "
                 f"reason: {reason}, took: {self._get_total_duration():.2f} seconds"
             )
 
         except Exception as e:
-
+            logger.error(f"Error closing crawler: {e}", exc_info=True)
         finally:
             # Ensure resource cleanup
             await self._cleanup_resources()
@@ -413,13 +407,13 @@
         if cleanup_tasks:
             await asyncio.gather(*cleanup_tasks, return_exceptions=True)
 
-
+        logger.debug("Resource cleanup completed")
 
 
 class CrawlerProcess:
     """
     Crawler process manager
-
+
     Supported features:
     - Multi-crawler concurrent scheduling and resource management
     - Automatic module discovery and spider registration
@@ -428,15 +422,15 @@ class CrawlerProcess:
     - Real-time status monitoring and statistics
     - Error recovery and retry mechanism
     - Large-scale crawler optimization support
-
+
     Usage example:
         # Basic usage
         process = CrawlerProcess()
         await process.crawl(MySpider)
-
+
         # Multi-crawler concurrency
         await process.crawl([Spider1, Spider2, 'spider_name'])
-
+
         # Custom concurrency
         process = CrawlerProcess(max_concurrency=8)
     """
@@ -563,7 +557,7 @@
     def auto_discover(modules: List[str]):
         """
         Automatically import modules, trigger Spider class definition and registration (enhanced version)
-
+
         Supports recursive scanning and error recovery
         """
         import importlib
@@ -617,7 +611,7 @@
     async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
         """
         Start one or more crawlers
-
+
         Enhanced features:
         - Intelligent concurrency control
         - Real-time monitoring and statistics
@@ -639,7 +633,7 @@
         await self.start_monitoring()
 
         try:
-            # Phase 3:
+            # Phase 3: Initialize context and monitoring
             spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())
 
             logger.debug(
@@ -738,7 +732,7 @@
     ) -> List[Type[Spider]]:
         """
         Resolve input to spider class list
-
+
         Supports various input formats and validates uniqueness
         """
         inputs = self._normalize_inputs(spiders_input)
@@ -762,7 +756,8 @@
                 seen_spider_names.add(spider_name)
                 spider_classes.append(spider_cls)
 
-                logger.debug(
+                logger.debug(
+                    f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")
 
             except Exception as e:
                 logger.error(f"Failed to resolve spider: {item} - {e}")
@@ -774,7 +769,7 @@
     def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
         """
         Normalize input to list
-
+
         Supports more input types and provides better error information
         """
         if isinstance(spiders_input, (type, str)):
@@ -793,7 +788,7 @@
     def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
         """
         Resolve single input item to spider class
-
+
         Provides better error prompts and debugging information
         """
         if isinstance(item, type) and issubclass(item, Spider):
@@ -820,7 +815,7 @@
     async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
         """
         Spider running function limited by semaphore
-
+
         Includes enhanced error handling and monitoring functionality
         """
         task = asyncio.current_task()
@@ -888,7 +883,7 @@
     def _shutdown(self, _signum, _frame):
         """
         Graceful shutdown signal handling
-
+
         Provides better shutdown experience and resource cleanup
         """
         signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
@@ -913,7 +908,7 @@
     async def _wait_for_shutdown(self):
         """
         Wait for all active tasks to complete
-
+
         Provides better shutdown time control and progress feedback
         """
         try:
@@ -967,15 +962,15 @@
     def _get_default_settings(cls) -> SettingManager:
         """
         Load default configuration
-
+
         Provides better error handling and fallback strategy
         """
         try:
             settings = get_settings()
-
+            logger.debug("Default configuration loaded successfully")
             return settings
         except Exception as e:
-
+            logger.warning(f"Unable to load default configuration: {e}, using empty configuration")
             return SettingManager()
 
     def _log_startup_info(self):
@@ -990,7 +985,7 @@
 
         # Build startup info log
         startup_info = [
-            f"Crawlo Framework Started
+            f"Crawlo Framework Started {version}"
         ]
 
         # Get actual queue type
@@ -1018,7 +1013,7 @@
         else:
             startup_info.append(f"Run Mode: {run_mode}")
 
-        # Print startup information
+        # Print startup information at INFO level
        for info in startup_info:
             logger.info(info)
 
@@ -1032,7 +1027,7 @@ def create_crawler_with_optimizations(
 ) -> Crawler:
     """
     Create an optimized crawler instance
-
+
     :param spider_cls: Spider class
     :param settings: Settings manager
     :param optimization_kwargs: Optimization parameters
@@ -1056,7 +1051,7 @@ def create_process_with_large_scale_config(
 ) -> CrawlerProcess:
     """
     Create a process manager that supports large-scale optimization
-
+
     :param config_type: Configuration type ('conservative', 'balanced', 'aggressive', 'memory_optimized')
     :param concurrency: Concurrency count
     :param kwargs: Other parameters
@@ -1100,4 +1095,4 @@ __all__ = [
     'CrawlerContext',
     'create_crawler_with_optimizations',
     'create_process_with_large_scale_config'
-]
+]

{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/logging_extension.py

@@ -1,8 +1,10 @@
 from typing import Any
 from crawlo.exceptions import NotConfigured
-from crawlo.utils.log import get_logger
 from crawlo.utils.log import LoggerManager
 
+# 使用全局logger,避免每个模块都创建自己的延迟初始化函数
+logger = LoggerManager.get_logger(__name__)
+
 
 class CustomLoggerExtension:
     """
@@ -32,7 +34,7 @@ class CustomLoggerExtension:
         return cls(crawler.settings)
 
     def spider_opened(self, spider: Any) -> None:
-        logger = get_logger(__name__)
+        logger = LoggerManager.get_logger(__name__)
         try:
             logger.info(
                 f"CustomLoggerExtension: Logging initialized. "
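For context, a minimal sketch of the module-level LoggerManager.get_logger pattern the extension now uses. The LoggerManager below is a hypothetical stand-in built on the stdlib logging module; crawlo's actual class may apply its own handlers, formats, and level configuration.

```python
# Hypothetical stand-in for crawlo.utils.log.LoggerManager; the real class is
# not shown in this diff and likely wires in crawlo-specific settings.
import logging


class LoggerManager:
    @classmethod
    def get_logger(cls, name: str) -> logging.Logger:
        # Hand out named loggers from one place instead of per-module lazy init.
        return logging.getLogger(name)


# Module-level logger, mirroring the line added at the top of logging_extension.py.
logger = LoggerManager.get_logger(__name__)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger.info("module-level logger obtained via LoggerManager")
```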
{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/middleware_manager.py

@@ -133,4 +133,4 @@ class MiddlewareManager:
     def _validate_middleware_method(method_name, middleware) -> bool:
         method = getattr(type(middleware), method_name)
         base_method = getattr(BaseMiddleware, method_name)
-        return False if method == base_method else True
+        return False if method == base_method else True
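The only textual change here is the restored newline at the end of the file. For reference, a small self-contained sketch of the override check _validate_middleware_method performs; BaseMiddleware below is a hypothetical stand-in for crawlo's class, and the `False if ... else True` conditional is equivalent to a plain inequality.

```python
# Sketch of the override check with a hypothetical BaseMiddleware; crawlo's
# real base class defines more hook methods than this single example.
class BaseMiddleware:
    def process_request(self, request, spider):
        return None


class LoggingMiddleware(BaseMiddleware):
    def process_request(self, request, spider):
        print(f"fetching {request}")  # overridden hook
        return None


def _validate_middleware_method(method_name: str, middleware) -> bool:
    method = getattr(type(middleware), method_name)
    base_method = getattr(BaseMiddleware, method_name)
    # "return False if method == base_method else True" simplifies to:
    return method != base_method


print(_validate_middleware_method("process_request", LoggingMiddleware()))  # True
print(_validate_middleware_method("process_request", BaseMiddleware()))     # False
```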