crawlo 1.2.1.tar.gz → 1.2.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- {crawlo-1.2.1/crawlo.egg-info → crawlo-1.2.3}/PKG-INFO +1 -1
- crawlo-1.2.3/crawlo/__version__.py +1 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/cli.py +16 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/check.py +69 -69
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/genspider.py +25 -25
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/help.py +4 -4
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/list.py +23 -23
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/run.py +34 -34
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/startproject.py +35 -36
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/stats.py +21 -21
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/utils.py +4 -4
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/config.py +4 -4
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/config_validator.py +1 -2
- crawlo-1.2.3/crawlo/data/__init__.py +6 -0
- crawlo-1.2.3/crawlo/data/user_agents.py +108 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/downloader/hybrid_downloader.py +0 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/downloader/selenium_downloader.py +1 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/extension/health_check.py +2 -2
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/items/fields.py +0 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/offsite.py +0 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/proxy.py +3 -2
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/request_ignore.py +0 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/response_code.py +0 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/response_filter.py +0 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/retry.py +1 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/mode_manager.py +3 -3
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/database_dedup_pipeline.py +1 -3
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/memory_dedup_pipeline.py +2 -2
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/mysql_pipeline.py +4 -3
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/redis_dedup_pipeline.py +2 -4
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/project.py +2 -2
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/subscriber.py +1 -2
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/settings.py.tmpl +1 -3
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/settings_distributed.py.tmpl +2 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/batch_processor.py +2 -3
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/controlled_spider_mixin.py +1 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/enhanced_error_handler.py +3 -6
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/env_config.py +1 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/error_handler.py +2 -4
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/large_scale_helper.py +2 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/performance_monitor.py +5 -4
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/redis_connection_pool.py +4 -4
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/redis_key_validator.py +1 -2
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/request_serializer.py +1 -2
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/spider_loader.py +0 -1
- {crawlo-1.2.1 → crawlo-1.2.3/crawlo.egg-info}/PKG-INFO +1 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo.egg-info/SOURCES.txt +2 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/requirements.txt +5 -1
- crawlo-1.2.1/crawlo/__version__.py +0 -1
- {crawlo-1.2.1 → crawlo-1.2.3}/LICENSE +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/MANIFEST.in +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/README.md +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/cleaners/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/cleaners/data_formatter.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/cleaners/encoding_converter.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/cleaners/text_cleaner.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/core/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/core/engine.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/core/processor.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/core/scheduler.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/crawler.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/event.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/exceptions.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/extension/logging_extension.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/filters/aioredis_filter.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/items/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/items/base.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/items/items.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/middleware/middleware_manager.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/network/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/network/request.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/network/response.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/pipelines/pipeline_manager.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/queue/queue_manager.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/queue/redis_priority_queue.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/settings/default_settings.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/settings/setting_manager.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/stats_collector.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/task_manager.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/run.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/date_tools.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/log.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/request.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/system.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/tools.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo/utils/url.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/aiohttp_settings.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/curl_cffi_settings.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/default_header_middleware_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/default_header_spider_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/download_delay_middleware_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/httpx_settings.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/multi_downloader_proxy_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/offsite_middleware_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/offsite_spider_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/proxy_spider_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/request_ignore_middleware_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/request_ignore_spider_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/response_code_middleware_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/response_filter_middleware_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/tong_hua_shun_settings.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/examples/tong_hua_shun_spider.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/pyproject.toml +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/setup.cfg +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/__init__.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/cleaners_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/config_validation_demo.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/date_tools_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/env_config_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/error_handling_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/response_improvements_example.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_cleaners.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_comprehensive.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_config_validator.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_crawlo_proxy_integration.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_date_tools.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_default_header_middleware.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_double_crawlo_fix.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_download_delay_middleware.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_downloader_proxy_compatibility.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_edge_cases.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_env_config.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_final_validation.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_integration.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_offsite_middleware.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_parsel.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_performance.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_proxy_api.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_proxy_middleware.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_proxy_middleware_enhanced.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_real_scenario_proxy.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_redis_config.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_redis_queue.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_request_ignore_middleware.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_request_serialization.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_response_code_middleware.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_response_filter_middleware.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_response_improvements.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_retry_middleware.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_scheduler.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_simple_response.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_template_content.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/test_tools.py +0 -0
- {crawlo-1.2.1 → crawlo-1.2.3}/tests/tools_example.py +0 -0
crawlo-1.2.3/crawlo/__version__.py (new file)

@@ -0,0 +1 @@
+__version__ = "1.2.3"
{crawlo-1.2.1 → crawlo-1.2.3}/crawlo/cli.py

@@ -3,10 +3,20 @@
 # -*- coding: UTF-8 -*-
 import sys
 import argparse
+import os
 from crawlo.commands import get_commands


 def main():
+    # 获取框架版本号
+    version_file = os.path.join(os.path.dirname(__file__), '__version__.py')
+    if os.path.exists(version_file):
+        with open(version_file, 'r') as f:
+            exec(f.read())
+        VERSION = locals().get('__version__', '1.0.0')
+    else:
+        VERSION = '1.0.0'
+
     # 获取所有可用命令
     commands = get_commands()

@@ -19,11 +29,17 @@ def main():

     # 添加帮助参数
     parser.add_argument('-h', '--help', action='store_true', help='显示帮助信息')
+    parser.add_argument('-v', '--version', action='store_true', help='显示版本信息')
     parser.add_argument('command', nargs='?', help='可用命令: ' + ', '.join(commands.keys()))

     # 解析已知参数
     args, unknown = parser.parse_known_args()

+    # 处理版本参数
+    if args.version:
+        print(f"Crawlo {VERSION}")
+        sys.exit(0)
+
     # 处理帮助参数
     if args.help or (args.command is None and not unknown):
         # 导入并运行帮助命令
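The cli.py change above reads the framework version by exec-ing `__version__.py` and surfaces it through a new `-v/--version` flag, so on the 1.2.3 sdist `crawlo -v` would print "Crawlo 1.2.3". Below is a minimal standalone sketch of that lookup, assuming the same file layout as in the diff; the `read_version` helper is hypothetical and not part of the package.

import os


def read_version(package_dir: str) -> str:
    """Read __version__ from <package_dir>/__version__.py, falling back to '1.0.0'."""
    version_file = os.path.join(package_dir, '__version__.py')
    if not os.path.exists(version_file):
        return '1.0.0'
    namespace = {}
    with open(version_file, 'r') as f:
        # the diff exec()s into main()'s local scope; a throwaway dict is used here instead
        exec(f.read(), namespace)
    return namespace.get('__version__', '1.0.0')


if __name__ == '__main__':
    print(f"Crawlo {read_version(os.path.dirname(os.path.abspath(__file__)))}")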
{crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/check.py

@@ -63,7 +63,7 @@ def auto_fix_spider_file(spider_cls, file_path: Path):
             break

     if not class_node:
-        return False, "
+        return False, "在文件中找不到类定义。"

     # 1. 修复 name 为空或缺失
     name_assign = None
@@ -221,12 +221,12 @@ def auto_fix_spider_file(spider_cls, file_path: Path):
             fixed_source = astor.to_source(tree)
             with open(file_path, "w", encoding="utf-8") as f:
                 f.write(fixed_source)
-            return True, "
+            return True, "文件自动修复成功。"
         else:
-            return False, "
+            return False, "未找到可修复的问题。"

     except Exception as e:
-        return False, f"
+        return False, f"自动修复失败: {e}"


 class SpiderChangeHandler(FileSystemEventHandler):
@@ -242,14 +242,14 @@ class SpiderChangeHandler(FileSystemEventHandler):
         if event.src_path.endswith(".py") and "spiders" in event.src_path:
             file_path = Path(event.src_path)
             spider_name = file_path.stem
-            self.console.print(f"\n:eyes: [bold blue]
+            self.console.print(f"\n:eyes: [bold blue]检测到变更[/bold blue] [cyan]{file_path}[/cyan]")
             self.check_and_fix_spider(spider_name)

     def check_and_fix_spider(self, spider_name):
         try:
             process = CrawlerProcess(spider_modules=self.spider_modules)
             if spider_name not in process.get_spider_names():
-                self.console.print(f"[yellow]⚠️ {spider_name}
+                self.console.print(f"[yellow]⚠️ {spider_name} 不是已注册的爬虫。[/yellow]")
                 return

             cls = process.get_spider_class(spider_name)
@@ -257,23 +257,23 @@ class SpiderChangeHandler(FileSystemEventHandler):

             # 简化检查
             if not getattr(cls, "name", None):
-                issues.append("
+                issues.append("缺少或为空的 'name' 属性")
             if not callable(getattr(cls, "start_requests", None)):
-                issues.append("
+                issues.append("缺少 'start_requests' 方法")
             if hasattr(cls, "start_urls") and isinstance(cls.start_urls, str):
-                issues.append("'start_urls'
+                issues.append("'start_urls' 是字符串")
             if hasattr(cls, "allowed_domains") and isinstance(cls.allowed_domains, str):
-                issues.append("'allowed_domains'
+                issues.append("'allowed_domains' 是字符串")

             try:
                 spider = cls.create_instance(None)
                 if not callable(getattr(spider, "parse", None)):
-                    issues.append("
+                    issues.append("缺少 'parse' 方法")
             except Exception:
-                issues.append("
+                issues.append("实例化失败")

             if issues:
-                self.console.print(f"[red]❌ {spider_name}
+                self.console.print(f"[red]❌ {spider_name} 存在问题:[/red]")
                 for issue in issues:
                     self.console.print(f" • {issue}")

@@ -281,21 +281,21 @@ class SpiderChangeHandler(FileSystemEventHandler):
                 file_path = Path(cls.__file__)
                 fixed, msg = auto_fix_spider_file(cls, file_path)
                 if fixed:
-                    self.console.print(f"[green]✅
+                    self.console.print(f"[green]✅ 自动修复: {msg}[/green]")
                 else:
-                    self.console.print(f"[yellow]⚠️
+                    self.console.print(f"[yellow]⚠️ 无法修复: {msg}[/yellow]")
             else:
-                self.console.print(f"[green]✅ {spider_name}
+                self.console.print(f"[green]✅ {spider_name} 合规。[/green]")

         except Exception as e:
-            self.console.print(f"[red]❌
+            self.console.print(f"[red]❌ 检查 {spider_name} 时出错: {e}[/red]")


-def watch_spiders(project_root, project_package, show_fix
-
+def watch_spiders(project_root: Path, project_package: str, show_fix: bool):
+    """监听 spiders 目录变化并自动检查"""
     spider_path = project_root / project_package / "spiders"
     if not spider_path.exists():
-        console.print(f"[red]❌
+        console.print(f"[bold red]❌ Spider 目录未找到:[/bold red] {spider_path}")
         return

     spider_modules = [f"{project_package}.spiders"]
@@ -304,9 +304,9 @@ def watch_spiders(project_root, project_package, show_fix=False):
     observer.schedule(event_handler, str(spider_path), recursive=False)

     console.print(Panel(
-        f":eyes: [bold blue]
-        "
-        title="🚀
+        f":eyes: [bold blue]监听[/bold blue] [cyan]{spider_path}[/cyan] 中的变更\n"
+        "编辑任何爬虫文件以触发自动检查...",
+        title="🚀 已启动监听模式",
         border_style="blue"
     ))

@@ -315,7 +315,7 @@ def watch_spiders(project_root, project_package, show_fix=False):
         while True:
             time.sleep(1)
     except KeyboardInterrupt:
-        console.print("\n[bold red]🛑
+        console.print("\n[bold red]🛑 监听模式已停止。[/bold red]")
         observer.stop()
         observer.join()

@@ -337,24 +337,24 @@ def main(args):

     valid_args = {"--fix", "-f", "--ci", "--json", "--watch"}
     if any(arg not in valid_args for arg in args):
-        console.print("[bold red]❌
+        console.print("[bold red]❌ 错误:[/bold red] 用法: [blue]crawlo check[/blue] [--fix] [--ci] [--json] [--watch]")
         return 1

     try:
         # 1. 查找项目根目录
         project_root = get_project_root()
         if not project_root:
-            msg = ":cross_mark: [bold red]
+            msg = ":cross_mark: [bold red]找不到 'crawlo.cfg'[/bold red]\n💡 请在项目目录中运行此命令。"
             if show_json:
-                console.print_json(data={"success": False, "error": "
+                console.print_json(data={"success": False, "error": "未找到项目根目录"})
                 return 1
             elif show_ci:
-                console.print("❌
+                console.print("❌ 未找到项目根目录。缺少 crawlo.cfg。")
                 return 1
             else:
                 console.print(Panel(
                     Text.from_markup(msg),
-                    title="❌
+                    title="❌ 非Crawlo项目",
                     border_style="red",
                     padding=(1, 2)
                 ))
@@ -367,7 +367,7 @@ def main(args):
         # 2. 读取 crawlo.cfg
         cfg_file = project_root / "crawlo.cfg"
         if not cfg_file.exists():
-            msg = f"
+            msg = f"配置文件未找到: {cfg_file}"
             if show_json:
                 console.print_json(data={"success": False, "error": msg})
                 return 1
@@ -375,14 +375,14 @@ def main(args):
                 console.print(f"❌ {msg}")
                 return 1
             else:
-                console.print(Panel(msg, title="❌
+                console.print(Panel(msg, title="❌ 缺少配置文件", border_style="red"))
                 return 1

         config = configparser.ConfigParser()
         config.read(cfg_file, encoding="utf-8")

         if not config.has_section("settings") or not config.has_option("settings", "default"):
-            msg = "
+            msg = "crawlo.cfg 中缺少 [settings] 部分或 'default' 选项"
             if show_json:
                 console.print_json(data={"success": False, "error": msg})
                 return 1
@@ -390,7 +390,7 @@ def main(args):
                 console.print(f"❌ {msg}")
                 return 1
             else:
-                console.print(Panel(msg, title="❌
+                console.print(Panel(msg, title="❌ 无效配置", border_style="red"))
                 return 1

         settings_module = config.get("settings", "default")
@@ -400,7 +400,7 @@ def main(args):
         try:
             import_module(project_package)
         except ImportError as e:
-            msg = f"
+            msg = f"导入项目包 '{project_package}' 失败: {e}"
             if show_json:
                 console.print_json(data={"success": False, "error": msg})
                 return 1
@@ -408,7 +408,7 @@ def main(args):
                 console.print(f"❌ {msg}")
                 return 1
             else:
-                console.print(Panel(msg, title="❌
+                console.print(Panel(msg, title="❌ 导入错误", border_style="red"))
                 return 1

         # 4. 加载爬虫
@@ -417,23 +417,23 @@ def main(args):
         spider_names = process.get_spider_names()

         if not spider_names:
-            msg = "
+            msg = "未找到爬虫。"
             if show_json:
                 console.print_json(data={"success": True, "warning": msg})
                 return 0
             elif show_ci:
-                console.print("📭
+                console.print("📭 未找到爬虫。")
                 return 0
             else:
                 console.print(Panel(
                     Text.from_markup(
-                        ":envelope_with_arrow: [bold]
-                        "[bold]💡
-                        " •
-                        " •
-                        " •
+                        ":envelope_with_arrow: [bold]未找到爬虫[/bold]\n\n"
+                        "[bold]💡 确保:[/bold]\n"
+                        " • 爬虫定义于 '[cyan]spiders[/cyan]' 模块\n"
+                        " • 具有 [green]`name`[/green] 属性\n"
+                        " • 模块已正确导入"
                     ),
-                    title="📭
+                    title="📭 未找到爬虫",
                     border_style="yellow",
                     padding=(1, 2)
                 ))
@@ -441,13 +441,13 @@ def main(args):

         # 5. 如果启用 watch 模式,启动监听
         if show_watch:
-            console.print("[bold blue]:eyes:
+            console.print("[bold blue]:eyes: 启动监听模式...[/bold blue]")
             watch_spiders(project_root, project_package, show_fix)
             return 0  # watch 是长期运行,不返回

         # 6. 开始检查(非 watch 模式)
         if not show_ci and not show_json:
-            console.print(f":mag: [bold]
+            console.print(f":mag: [bold]正在检查 {len(spider_names)} 个爬虫...[/bold]\n")

         issues_found = False
         results = []
@@ -458,29 +458,29 @@ def main(args):

             # 检查 name 属性
             if not getattr(cls, "name", None):
-                issues.append("
+                issues.append("缺少或为空的 'name' 属性")
             elif not isinstance(cls.name, str):
-                issues.append("'name'
+                issues.append("'name' 不是字符串")

             # 检查 start_requests 是否可调用
             if not callable(getattr(cls, "start_requests", None)):
-                issues.append("
+                issues.append("缺少或不可调用的 'start_requests' 方法")

             # 检查 start_urls 类型(不应是字符串)
             if hasattr(cls, "start_urls") and isinstance(cls.start_urls, str):
-                issues.append("'start_urls'
+                issues.append("'start_urls' 是字符串;应为列表或元组")

             # 检查 allowed_domains 类型
             if hasattr(cls, "allowed_domains") and isinstance(cls.allowed_domains, str):
-                issues.append("'allowed_domains'
+                issues.append("'allowed_domains' 是字符串;应为列表或元组")

             # 实例化并检查 parse 方法
             try:
                 spider = cls.create_instance(None)
                 if not callable(getattr(spider, "parse", None)):
-                    issues.append("
+                    issues.append("未定义 'parse' 方法(推荐)")
             except Exception as e:
-                issues.append(f"
+                issues.append(f"实例化爬虫失败: {e}")

             # 自动修复(如果启用)
             if issues and show_fix:
@@ -489,14 +489,14 @@ def main(args):
                     fixed, msg = auto_fix_spider_file(cls, file_path)
                     if fixed:
                         if not show_ci and not show_json:
-                            console.print(f"[green]🔧
+                            console.print(f"[green]🔧 已自动修复 {name} → {msg}[/green]")
                         issues = []  # 认为已修复
                     else:
                         if not show_ci and not show_json:
-                            console.print(f"[yellow]⚠️
+                            console.print(f"[yellow]⚠️ 无法自动修复 {name}: {msg}[/yellow]")
                 except Exception as e:
                     if not show_ci and not show_json:
-                        console.print(f"[yellow]⚠️
+                        console.print(f"[yellow]⚠️ 找不到 {name} 的源文件: {e}[/yellow]")

             results.append({
                 "name": name,
@@ -525,26 +525,26 @@ def main(args):

         if show_ci:
             if issues_found:
-                console.print("❌
+                console.print("❌ 合规性检查失败。")
                 for r in results:
                     if r["issues"]:
                         console.print(f" • {r['name']}: {', '.join(r['issues'])}")
             else:
-                console.print("✅
+                console.print("✅ 所有爬虫合规。")
             return 1 if issues_found else 0

         # 9. 默认 rich 输出
         table = Table(
-            title="🔍
+            title="🔍 爬虫合规性检查结果",
             box=box.ROUNDED,
             show_header=True,
             header_style="bold magenta",
             title_style="bold green"
         )
-        table.add_column("
-        table.add_column("
-        table.add_column("
-        table.add_column("
+        table.add_column("状态", style="bold", width=4)
+        table.add_column("名称", style="cyan")
+        table.add_column("类名", style="green")
+        table.add_column("问题", style="yellow", overflow="fold")

         for res in results:
             if res["issues"]:
@@ -561,29 +561,29 @@ def main(args):

         if issues_found:
             console.print(Panel(
-                ":warning: [bold red]
-                title="⚠️
+                ":warning: [bold red]一些爬虫存在问题。[/bold red]\n请在运行前修复这些问题。",
+                title="⚠️ 合规性检查失败",
                 border_style="red",
                 padding=(1, 2)
             ))
             return 1
         else:
             console.print(Panel(
-                ":tada: [bold green]
-                title="🎉
+                ":tada: [bold green]所有爬虫都合规且定义良好![/bold green]\n准备开始爬取! 🕷️🚀",
+                title="🎉 检查通过",
                 border_style="green",
                 padding=(1, 2)
             ))
             return 0

     except Exception as e:
-        logger.exception("
+        logger.exception("执行 'crawlo check' 时发生异常")
         if show_json:
             console.print_json(data={"success": False, "error": str(e)})
         elif show_ci:
-            console.print(f"❌
+            console.print(f"❌ 意外错误: {e}")
         else:
-            console.print(f"[bold red]❌
+            console.print(f"[bold red]❌ 检查过程中发生意外错误:[/bold red] {e}")
         return 1
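The check.py hunks above validate a handful of spider attributes: a non-empty string `name`, a callable `start_requests`, list-typed `start_urls` and `allowed_domains`, and a `parse` method. A spider that would pass those checks looks roughly like the sketch below; the `crawlo.spider.Spider` import path follows the hint in the list.py diff further down, the class itself is purely illustrative, and it assumes the base class supplies `start_requests`.

from crawlo.spider import Spider


class ExampleSpider(Spider):
    name = "example"                      # non-empty string, required
    start_urls = ["https://example.com"]  # a list, not a bare string
    allowed_domains = ["example.com"]     # likewise a list, not a string

    def parse(self, response):
        # extraction logic would go here; the checker only verifies the method exists
        return []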
{crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/genspider.py

@@ -37,8 +37,8 @@ def _render_template(tmpl_path, context):

 def main(args):
     if len(args) < 2:
-        console.print("[bold red]
-        console.print("💡
+        console.print("[bold red]错误:[/bold red] 用法: [blue]crawlo genspider[/blue] <爬虫名称> <域名>")
+        console.print("💡 示例:")
         console.print(" [blue]crawlo genspider[/blue] news_spider news.example.com")
         console.print(" [blue]crawlo genspider[/blue] product_spider shop.example.com")
         return 1
@@ -49,28 +49,28 @@ def main(args):
     # 验证爬虫名称
     if not validate_spider_name(spider_name):
         show_error_panel(
-            "
-            f"
-            "💡
-            " •
-            " •
-            " •
+            "无效的爬虫名称",
+            f"爬虫名称 '[cyan]{spider_name}[/cyan]' 无效。\n"
+            "💡 爬虫名称应:\n"
+            " • 以小写字母开头\n"
+            " • 只能包含小写字母、数字和下划线\n"
+            " • 是有效的Python标识符"
         )
         return 1

     # 验证域名格式
     if not is_valid_domain(domain):
         show_error_panel(
-            "
-            f"
-            "💡
+            "无效的域名",
+            f"域名 '[cyan]{domain}[/cyan]' 格式无效。\n"
+            "💡 请提供有效的域名,如 'example.com'"
         )
         return 1

     # 验证项目环境
     is_valid, project_package, error_msg = validate_project_environment()
     if not is_valid:
-        show_error_panel("
+        show_error_panel("非Crawlo项目", error_msg)
         return 1

     project_root = get_project_root()
@@ -91,10 +91,10 @@ def main(args):
             if item_classes:
                 default_item_class = item_classes[0].__name__
             else:
-                console.print("[yellow]:warning:
+                console.print("[yellow]:warning: 警告:[/yellow] 在 [cyan]items.py[/cyan] 中未找到项目类,使用 [green]ExampleItem[/green]。")

         except ImportError as e:
-            console.print(f"[yellow]:warning:
+            console.print(f"[yellow]:warning: 警告:[/yellow] 导入 [cyan]{items_module_path}[/cyan] 失败: {e}")
             # 仍使用默认 ExampleItem,不中断流程

         # 创建爬虫文件
@@ -104,8 +104,8 @@ def main(args):
         spider_file = spiders_dir / f'{spider_name}.py'
         if spider_file.exists():
             show_error_panel(
-                "
-                f"
+                "爬虫已存在",
+                f"爬虫 '[cyan]{spider_name}[/cyan]' 已存在于\n[green]{spider_file}[/green]"
             )
             return 1

@@ -113,8 +113,8 @@ def main(args):
         tmpl_path = TEMPLATES_DIR / 'spider' / 'spider.py.tmpl'
         if not tmpl_path.exists():
             show_error_panel(
-                "
-                f"
+                "模板未找到",
+                f"模板文件未找到于 [cyan]{tmpl_path}[/cyan]"
             )
             return 1

@@ -134,11 +134,11 @@ def main(args):
         with open(spider_file, 'w', encoding='utf-8') as f:
             f.write(content)

-        console.print(f":white_check_mark: [green]
-        console.print(f" →
-        console.print(f" →
-        console.print(f" →
-        console.print("\n[bold]
+        console.print(f":white_check_mark: [green]爬虫 '[bold]{spider_name}[/bold]' 创建成功![/green]")
+        console.print(f" → 位置: [cyan]{spider_file}[/cyan]")
+        console.print(f" → 类名: [yellow]{class_name}[/yellow]")
+        console.print(f" → 域名: [blue]{domain}[/blue]")
+        console.print("\n[bold]下一步操作:[/bold]")
         console.print(f" [blue]crawlo run[/blue] {spider_name}")
         console.print(f" [blue]crawlo check[/blue] {spider_name}")

@@ -146,7 +146,7 @@ def main(args):

     except Exception as e:
         show_error_panel(
-            "
-            f"
+            "创建失败",
+            f"创建爬虫失败: {e}"
         )
         return 1
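The genspider.py error text above spells out the spider-naming rules: start with a lowercase letter, use only lowercase letters, digits, and underscores, and form a valid Python identifier. `validate_spider_name` is imported from the command utilities; the implementation below is only an assumed sketch of those rules, not the package's code.

import re


def validate_spider_name(name: str) -> bool:
    """Assumed check mirroring the rules listed in the genspider error message."""
    return name.isidentifier() and re.fullmatch(r"[a-z][a-z0-9_]*", name) is not None


assert validate_spider_name("news_spider")
assert not validate_spider_name("NewsSpider")  # uppercase letters are rejected
assert not validate_spider_name("1spider")     # must start with a lowercase letter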
{crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/help.py

@@ -137,7 +137,7 @@ def show_help():
     console.print()

     # 显示更多信息
-    console.print("[bold green]更多信息:[/bold green]")
-    console.print(" 文档: https://crawlo.readthedocs.io/")
-    console.print(" 源码: https://github.com/crawl-coder/Crawlo")
-    console.print(" 问题: https://github.com/crawl-coder/Crawlo/issues")
+    # console.print("[bold green]更多信息:[/bold green]")
+    # console.print(" 文档: https://crawlo.readthedocs.io/")
+    # console.print(" 源码: https://github.com/crawl-coder/Crawlo")
+    # console.print(" 问题: https://github.com/crawl-coder/Crawlo/issues")
{crawlo-1.2.1 → crawlo-1.2.3}/crawlo/commands/list.py

@@ -34,9 +34,9 @@ def main(args):
     filtered_args = [arg for arg in args if not arg.startswith('--')]
     if filtered_args:
         if show_json:
-            console.print_json(data={"success": False, "error": "
+            console.print_json(data={"success": False, "error": "用法: crawlo list [--json]"})
         else:
-            console.print("[bold red]❌
+            console.print("[bold red]❌ 错误:[/bold red] 用法: [blue]crawlo list[/blue] [--json]")
         return 1

     try:
@@ -46,7 +46,7 @@ def main(args):
         if show_json:
             console.print_json(data={"success": False, "error": error_msg})
         else:
-            show_error_panel("
+            show_error_panel("非Crawlo项目", error_msg)
         return 1

     # 初始化 CrawlerProcess 并加载爬虫模块
@@ -60,18 +60,18 @@ def main(args):
             console.print_json(data={
                 "success": True,
                 "spiders": [],
-                "message": "
+                "message": "项目中未找到爬虫"
             })
         else:
             console.print(Panel(
                 Text.from_markup(
-                    ":envelope_with_arrow: [bold]
-                    "[bold]💡
-                    " •
-                    " •
-                    " •
+                    ":envelope_with_arrow: [bold]未找到爬虫[/bold] 于 '[cyan]spiders/[/cyan]' 目录。\n\n"
+                    "[bold]💡 确保:[/bold]\n"
+                    " • 爬虫类继承自 [blue]`crawlo.spider.Spider`[/blue]\n"
+                    " • 每个爬虫都有 [green]`name`[/green] 属性\n"
+                    " • 爬虫已在 [cyan]`spiders/__init__.py`[/cyan] 中导入 (如果使用包)"
                 ),
-                title="📭
+                title="📭 未找到爬虫",
                 border_style="yellow",
                 padding=(1, 2)
             ))
@@ -108,18 +108,18 @@ def main(args):

         # 表格输出
         table = Table(
-            title=f"📋
+            title=f"📋 找到 {len(spider_names)} 个爬虫",
             box=box.ROUNDED,
             show_header=True,
             header_style="bold magenta",
             title_style="bold green"
         )
-        table.add_column("
-        table.add_column("
-        table.add_column("
-        table.add_column("
-        table.add_column("
-        table.add_column("
+        table.add_column("名称", style="cyan", no_wrap=True)
+        table.add_column("类名", style="green")
+        table.add_column("模块", style="dim")
+        table.add_column("URL数", style="blue", justify="center")
+        table.add_column("域名", style="yellow")
+        table.add_column("自定义设置", style="magenta", justify="center")

         for info in spider_info:
             domains_display = ", ".join(info["allowed_domains"][:2])  # 显示前2个域名
@@ -140,10 +140,10 @@ def main(args):
         console.print(table)

         # 显示使用提示
-        console.print("\n[bold]🚀
-        console.print(" [blue]crawlo run[/blue]
-        console.print(" [blue]crawlo run[/blue] all #
-        console.print(" [blue]crawlo check[/blue]
+        console.print("\n[bold]🚀 下一步操作:[/bold]")
+        console.print(" [blue]crawlo run[/blue] <爬虫名称> # 运行指定爬虫")
+        console.print(" [blue]crawlo run[/blue] all # 运行所有爬虫")
+        console.print(" [blue]crawlo check[/blue] <爬虫名称> # 检查爬虫有效性")

         return 0

@@ -151,6 +151,6 @@ def main(args):
         if show_json:
             console.print_json(data={"success": False, "error": str(e)})
         else:
-            console.print(f"[bold red]❌
-            logger.exception("
+            console.print(f"[bold red]❌ 意外错误:[/bold red] {e}")
+            logger.exception("执行 'crawlo list' 时发生异常")
         return 1