crawlo 1.2.9__tar.gz → 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- {crawlo-1.2.9/crawlo.egg-info → crawlo-1.3.1}/PKG-INFO +13 -4
- {crawlo-1.2.9 → crawlo-1.3.1}/README.md +12 -3
- crawlo-1.3.1/crawlo/__version__.py +1 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/run.py +26 -35
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/utils.py +12 -2
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/core/engine.py +1 -2
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/crawler.py +135 -69
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/logging_extension.py +4 -2
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/middleware_manager.py +1 -1
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/offsite.py +2 -1
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/mode_manager.py +37 -100
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/mysql_pipeline.py +5 -4
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/pipeline_manager.py +15 -2
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/project.py +44 -37
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/settings/default_settings.py +13 -4
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/settings/setting_manager.py +55 -20
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/log.py +21 -62
- {crawlo-1.2.9 → crawlo-1.3.1/crawlo.egg-info}/PKG-INFO +13 -4
- crawlo-1.2.9/crawlo/__version__.py +0 -1
- {crawlo-1.2.9 → crawlo-1.3.1}/LICENSE +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/MANIFEST.in +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/cli.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/check.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/help.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/list.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/stats.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/config_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/core/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/core/processor.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/core/scheduler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/data/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/data/user_agents.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/event.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/exceptions.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/filters/aioredis_filter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/items/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/items/base.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/items/fields.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/items/items.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/simple_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/network/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/network/request.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/network/response.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/queue/queue_manager.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/queue/redis_priority_queue.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/stats_collector.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/subscriber.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/task_manager.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/run.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/data_formatter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/encoding_converter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/request_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/text_cleaner.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/error_handler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/redis_connection_pool.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/request.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/system.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/url.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/SOURCES.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/examples/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/pyproject.toml +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/requirements.txt +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/setup.cfg +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/__init__.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/cleaners_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/config_validation_demo.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/date_tools_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/debug_pipelines.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/env_config_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/error_handling_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/request_params_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/response_improvements_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_cleaners.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_comprehensive.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_config_consistency.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_config_merge.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_config_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_crawlo_proxy_integration.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_date_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_default_header_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_distributed.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_double_crawlo_fix.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_download_delay_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_downloader_proxy_compatibility.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_edge_cases.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_env_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_final_validation.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_integration.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_mode_consistency.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_offsite_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_parsel.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_performance.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_api.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_middleware_enhanced.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_middleware_refactored.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_random_user_agent.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_real_scenario_proxy.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_config.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_queue.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_request_ignore_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_request_params.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_request_serialization.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_response_code_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_response_filter_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_response_improvements.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_retry_middleware.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_scheduler.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_scheduler_config_update.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_simple_response.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_template_content.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_tools.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_user_agents.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/tools_example.py +0 -0
- {crawlo-1.2.9 → crawlo-1.3.1}/tests/verify_distributed.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: crawlo
|
|
3
|
-
Version: 1.2.9
|
|
3
|
+
Version: 1.3.1
|
|
4
4
|
Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
|
|
5
5
|
Home-page: https://github.com/crawl-coder/Crawlo.git
|
|
6
6
|
Author: crawl-coder
|
|
@@ -56,7 +56,7 @@ Requires-Dist: selenium>=3.141.0; extra == "all"
|
|
|
56
56
|
|
|
57
57
|
<p align="center">
|
|
58
58
|
<a href="https://www.python.org/downloads/">
|
|
59
|
-
<img src="https://img.shields.io/badge/python
|
|
59
|
+
<img src="https://img.shields.io/badge/python-%3C%3D3.12-blue" alt="Python Version">
|
|
60
60
|
</a>
|
|
61
61
|
<a href="LICENSE">
|
|
62
62
|
<img src="https://img.shields.io/badge/license-MIT-green" alt="License">
|
|
@@ -429,13 +429,22 @@ Crawlo框架的中间件、管道和扩展组件采用模块化设计,框架
|
|
|
429
429
|
|
|
430
430
|
用户可以通过`CUSTOM_MIDDLEWARES`配置自定义中间件:
|
|
431
431
|
|
|
432
|
-
|
|
432
|
+
``python
|
|
433
433
|
# settings.py
|
|
434
434
|
CUSTOM_MIDDLEWARES = [
|
|
435
435
|
'myproject.middlewares.CustomMiddleware',
|
|
436
436
|
]
|
|
437
437
|
```
|
|
438
438
|
|
|
439
|
+
> **注意**:DefaultHeaderMiddleware 和 OffsiteMiddleware 需要相应的配置才能启用:
|
|
440
|
+
> - DefaultHeaderMiddleware 需要配置 `DEFAULT_REQUEST_HEADERS` 或 `USER_AGENT` 参数
|
|
441
|
+
> - OffsiteMiddleware 需要配置 `ALLOWED_DOMAINS` 参数
|
|
442
|
+
>
|
|
443
|
+
> 如果未配置相应参数,这些中间件会因为 NotConfiguredError 而被禁用。
|
|
444
|
+
|
|
445
|
+
> **注意**:中间件的顺序很重要。SimpleProxyMiddleware 通常放在列表末尾,
|
|
446
|
+
> 这样可以在所有默认中间件处理后再应用代理设置。
|
|
447
|
+
|
|
439
448
|
#### 管道配置
|
|
440
449
|
|
|
441
450
|
框架默认加载以下管道:
|
|
@@ -930,7 +939,7 @@ Crawlo框架内置了多种中间件,其中代理中间件有两种实现:
|
|
|
930
939
|
|
|
931
940
|
如果需要使用简化版代理中间件,可以在配置文件中替换默认的代理中间件:
|
|
932
941
|
|
|
933
|
-
|
|
942
|
+
``python
|
|
934
943
|
# settings.py
|
|
935
944
|
MIDDLEWARES = [
|
|
936
945
|
# 注释掉复杂版代理中间件
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
<p align="center">
|
|
8
8
|
<a href="https://www.python.org/downloads/">
|
|
9
|
-
<img src="https://img.shields.io/badge/python
|
|
9
|
+
<img src="https://img.shields.io/badge/python-%3C%3D3.12-blue" alt="Python Version">
|
|
10
10
|
</a>
|
|
11
11
|
<a href="LICENSE">
|
|
12
12
|
<img src="https://img.shields.io/badge/license-MIT-green" alt="License">
|
|
@@ -379,13 +379,22 @@ Crawlo框架的中间件、管道和扩展组件采用模块化设计,框架
|
|
|
379
379
|
|
|
380
380
|
用户可以通过`CUSTOM_MIDDLEWARES`配置自定义中间件:
|
|
381
381
|
|
|
382
|
-
|
|
382
|
+
``python
|
|
383
383
|
# settings.py
|
|
384
384
|
CUSTOM_MIDDLEWARES = [
|
|
385
385
|
'myproject.middlewares.CustomMiddleware',
|
|
386
386
|
]
|
|
387
387
|
```
|
|
388
388
|
|
|
389
|
+
> **注意**:DefaultHeaderMiddleware 和 OffsiteMiddleware 需要相应的配置才能启用:
|
|
390
|
+
> - DefaultHeaderMiddleware 需要配置 `DEFAULT_REQUEST_HEADERS` 或 `USER_AGENT` 参数
|
|
391
|
+
> - OffsiteMiddleware 需要配置 `ALLOWED_DOMAINS` 参数
|
|
392
|
+
>
|
|
393
|
+
> 如果未配置相应参数,这些中间件会因为 NotConfiguredError 而被禁用。
|
|
394
|
+
|
|
395
|
+
> **注意**:中间件的顺序很重要。SimpleProxyMiddleware 通常放在列表末尾,
|
|
396
|
+
> 这样可以在所有默认中间件处理后再应用代理设置。
|
|
397
|
+
|
|
389
398
|
#### 管道配置
|
|
390
399
|
|
|
391
400
|
框架默认加载以下管道:
|
|
@@ -880,7 +889,7 @@ Crawlo框架内置了多种中间件,其中代理中间件有两种实现:
|
|
|
880
889
|
|
|
881
890
|
如果需要使用简化版代理中间件,可以在配置文件中替换默认的代理中间件:
|
|
882
891
|
|
|
883
|
-
|
|
892
|
+
``python
|
|
884
893
|
# settings.py
|
|
885
894
|
MIDDLEWARES = [
|
|
886
895
|
# 注释掉复杂版代理中间件
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '1.3.1'
|
|
@@ -5,26 +5,27 @@
|
|
|
5
5
|
# @Author : crawl-coder
|
|
6
6
|
# @Desc : 命令行入口:crawlo run <spider_name>|all,用于运行指定爬虫。
|
|
7
7
|
"""
|
|
8
|
+
import os
|
|
8
9
|
import sys
|
|
9
10
|
import asyncio
|
|
10
11
|
import configparser
|
|
11
|
-
import os
|
|
12
|
-
from pathlib import Path
|
|
13
12
|
from importlib import import_module
|
|
14
13
|
|
|
14
|
+
from rich import box
|
|
15
15
|
from rich.console import Console
|
|
16
16
|
from rich.panel import Panel
|
|
17
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
17
18
|
from rich.table import Table
|
|
18
19
|
from rich.text import Text
|
|
19
|
-
from rich import box
|
|
20
|
-
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
21
20
|
|
|
21
|
+
from crawlo.commands.stats import record_stats
|
|
22
22
|
from crawlo.crawler import CrawlerProcess
|
|
23
|
-
from crawlo.utils.log import get_logger
|
|
24
23
|
from crawlo.project import get_settings, _find_project_root
|
|
25
|
-
|
|
24
|
+
# 使用自定义日志系统
|
|
25
|
+
from crawlo.utils.log import get_logger
|
|
26
26
|
|
|
27
27
|
logger = get_logger(__name__)
|
|
28
|
+
|
|
28
29
|
console = Console()
|
|
29
30
|
|
|
30
31
|
|
|
@@ -77,6 +78,9 @@ def main(args):
|
|
|
77
78
|
用法:
|
|
78
79
|
crawlo run <spider_name>|all [--json] [--no-stats]
|
|
79
80
|
"""
|
|
81
|
+
# 添加调试信息
|
|
82
|
+
logger.debug("DEBUG: 进入main函数")
|
|
83
|
+
|
|
80
84
|
if len(args) < 1:
|
|
81
85
|
console.print("[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
|
|
82
86
|
console.print("示例:")
|
|
@@ -187,21 +191,7 @@ def main(args):
|
|
|
187
191
|
return 1
|
|
188
192
|
|
|
189
193
|
# 显示即将运行的爬虫列表
|
|
190
|
-
|
|
191
|
-
title=f"启动全部 {len(spider_names)} 个爬虫",
|
|
192
|
-
box=box.ROUNDED,
|
|
193
|
-
show_header=True,
|
|
194
|
-
header_style="bold magenta"
|
|
195
|
-
)
|
|
196
|
-
table.add_column("名称", style="cyan")
|
|
197
|
-
table.add_column("类名", style="green")
|
|
198
|
-
|
|
199
|
-
for name in sorted(spider_names):
|
|
200
|
-
cls = process.get_spider_class(name)
|
|
201
|
-
table.add_row(name, cls.__name__)
|
|
202
|
-
|
|
203
|
-
console.print(table)
|
|
204
|
-
console.print()
|
|
194
|
+
# 根据用户要求,不再显示详细的爬虫列表信息
|
|
205
195
|
|
|
206
196
|
# 注册 stats 记录(除非 --no-stats)
|
|
207
197
|
if not no_stats:
|
|
@@ -260,20 +250,21 @@ def main(args):
|
|
|
260
250
|
spider_class = process.get_spider_class(spider_name)
|
|
261
251
|
|
|
262
252
|
# 显示启动信息
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
253
|
+
# 根据用户要求,不再显示项目启动信息
|
|
254
|
+
# if not show_json:
|
|
255
|
+
# info_table = Table(
|
|
256
|
+
# title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
|
|
257
|
+
# box=box.SIMPLE,
|
|
258
|
+
# show_header=False,
|
|
259
|
+
# title_style="bold green"
|
|
260
|
+
# )
|
|
261
|
+
# info_table.add_column("Key", style="yellow")
|
|
262
|
+
# info_table.add_column("Value", style="cyan")
|
|
263
|
+
# info_table.add_row("Project", project_package)
|
|
264
|
+
# info_table.add_row("Class", spider_class.__name__)
|
|
265
|
+
# info_table.add_row("Module", spider_class.__module__)
|
|
266
|
+
# console.print(info_table)
|
|
267
|
+
# console.print()
|
|
277
268
|
|
|
278
269
|
# 注册 stats 记录
|
|
279
270
|
if not no_stats:
|
|
@@ -133,8 +133,11 @@ def validate_spider_name(spider_name: str) -> bool:
|
|
|
133
133
|
bool: 是否有效
|
|
134
134
|
"""
|
|
135
135
|
import re
|
|
136
|
+
# 清理爬虫名称中的不可见字符
|
|
137
|
+
cleaned_name = ''.join(c for c in spider_name if not unicodedata.category(c).startswith('C'))
|
|
138
|
+
|
|
136
139
|
# 爬虫名称应该是有效的Python标识符
|
|
137
|
-
return
|
|
140
|
+
return cleaned_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', cleaned_name)
|
|
138
141
|
|
|
139
142
|
|
|
140
143
|
def format_file_size(size_bytes: int) -> str:
|
|
@@ -181,7 +184,14 @@ def is_valid_domain(domain: str) -> bool:
|
|
|
181
184
|
bool: 是否有效
|
|
182
185
|
"""
|
|
183
186
|
import re
|
|
187
|
+
# 清理域名中的不可见字符
|
|
188
|
+
cleaned_domain = ''.join(c for c in domain if not unicodedata.category(c).startswith('C'))
|
|
189
|
+
|
|
184
190
|
pattern = re.compile(
|
|
185
191
|
r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
|
|
186
192
|
)
|
|
187
|
-
return bool(pattern.match(domain))
|
|
193
|
+
return bool(pattern.match(cleaned_domain))
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# 添加导入
|
|
197
|
+
import unicodedata
|
|
@@ -75,8 +75,7 @@ class Engine(object):
|
|
|
75
75
|
version = '1.0.0'
|
|
76
76
|
# Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
|
|
77
77
|
self.logger.debug(
|
|
78
|
-
f"Crawlo Started version {version}
|
|
79
|
-
# f"(project name : {self.settings.get('PROJECT_NAME')})"
|
|
78
|
+
f"Crawlo Started version {version}"
|
|
80
79
|
)
|
|
81
80
|
|
|
82
81
|
async def start_spider(self, spider):
|