crawlo 1.3.2.tar.gz → 1.3.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- {crawlo-1.3.2/crawlo.egg-info → crawlo-1.3.4}/PKG-INFO +120 -14
- {crawlo-1.3.2 → crawlo-1.3.4}/README.md +119 -13
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/__init__.py +24 -0
- crawlo-1.3.4/crawlo/__version__.py +1 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/run.py +58 -32
- crawlo-1.3.4/crawlo/core/__init__.py +46 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/core/engine.py +119 -45
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/core/scheduler.py +4 -3
- crawlo-1.3.4/crawlo/crawler.py +639 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/aiohttp_downloader.py +4 -2
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/__init__.py +1 -1
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/logging_extension.py +23 -7
- crawlo-1.3.4/crawlo/factories/__init__.py +28 -0
- crawlo-1.3.4/crawlo/factories/base.py +69 -0
- crawlo-1.3.4/crawlo/factories/crawler.py +104 -0
- crawlo-1.3.4/crawlo/factories/registry.py +85 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/filters/aioredis_filter.py +25 -2
- crawlo-1.3.4/crawlo/framework.py +292 -0
- crawlo-1.3.4/crawlo/initialization/__init__.py +40 -0
- crawlo-1.3.4/crawlo/initialization/built_in.py +426 -0
- crawlo-1.3.4/crawlo/initialization/context.py +142 -0
- crawlo-1.3.4/crawlo/initialization/core.py +194 -0
- crawlo-1.3.4/crawlo/initialization/phases.py +149 -0
- crawlo-1.3.4/crawlo/initialization/registry.py +146 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/items/base.py +2 -1
- crawlo-1.3.4/crawlo/logging/__init__.py +38 -0
- crawlo-1.3.4/crawlo/logging/config.py +97 -0
- crawlo-1.3.4/crawlo/logging/factory.py +129 -0
- crawlo-1.3.4/crawlo/logging/manager.py +112 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/middleware_manager.py +1 -1
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/offsite.py +1 -1
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/mode_manager.py +26 -1
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/pipeline_manager.py +2 -1
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/project.py +76 -46
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/queue/pqueue.py +11 -5
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/queue/queue_manager.py +143 -19
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/queue/redis_priority_queue.py +69 -49
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/settings/default_settings.py +110 -14
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/settings/setting_manager.py +29 -13
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/spider/__init__.py +34 -16
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/stats_collector.py +17 -3
- crawlo-1.3.4/crawlo/task_manager.py +139 -0
- crawlo-1.3.4/crawlo/templates/project/settings.py.tmpl +168 -0
- crawlo-1.3.4/crawlo/templates/project/settings_distributed.py.tmpl +167 -0
- crawlo-1.3.4/crawlo/templates/project/settings_gentle.py.tmpl +167 -0
- crawlo-1.3.4/crawlo/templates/project/settings_high_performance.py.tmpl +168 -0
- crawlo-1.3.4/crawlo/templates/project/settings_minimal.py.tmpl +66 -0
- crawlo-1.3.4/crawlo/templates/project/settings_simple.py.tmpl +165 -0
- crawlo-1.3.4/crawlo/templates/project/spiders/__init__.py.tmpl +10 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/run.py.tmpl +10 -14
- crawlo-1.3.4/crawlo/templates/spiders_init.py.tmpl +10 -0
- crawlo-1.3.4/crawlo/tools/network_diagnostic.py +365 -0
- crawlo-1.3.4/crawlo/utils/class_loader.py +26 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/error_handler.py +76 -35
- crawlo-1.3.4/crawlo/utils/log.py +44 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/redis_connection_pool.py +43 -6
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/request_serializer.py +8 -1
- {crawlo-1.3.2 → crawlo-1.3.4/crawlo.egg-info}/PKG-INFO +120 -14
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/SOURCES.txt +61 -2
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/authenticated_proxy_example.py +2 -2
- crawlo-1.3.4/tests/baidu_performance_test.py +109 -0
- crawlo-1.3.4/tests/baidu_test.py +60 -0
- crawlo-1.3.4/tests/comprehensive_framework_test.py +213 -0
- crawlo-1.3.4/tests/comprehensive_test.py +82 -0
- crawlo-1.3.4/tests/comprehensive_testing_summary.md +187 -0
- crawlo-1.3.4/tests/debug_configure.py +70 -0
- crawlo-1.3.4/tests/debug_framework_logger.py +85 -0
- crawlo-1.3.4/tests/debug_log_levels.py +64 -0
- crawlo-1.3.4/tests/distributed_test.py +67 -0
- crawlo-1.3.4/tests/distributed_test_debug.py +77 -0
- crawlo-1.3.4/tests/final_command_test_report.md +0 -0
- crawlo-1.3.4/tests/final_comprehensive_test.py +152 -0
- crawlo-1.3.4/tests/final_validation_test.py +183 -0
- crawlo-1.3.4/tests/framework_performance_test.py +203 -0
- crawlo-1.3.4/tests/optimized_performance_test.py +212 -0
- crawlo-1.3.4/tests/performance_comparison.py +246 -0
- crawlo-1.3.4/tests/queue_blocking_test.py +114 -0
- crawlo-1.3.4/tests/queue_test.py +90 -0
- crawlo-1.3.4/tests/scrapy_comparison/ofweek_scrapy.py +139 -0
- crawlo-1.3.4/tests/scrapy_comparison/scrapy_test.py +134 -0
- crawlo-1.3.4/tests/simple_command_test.py +120 -0
- crawlo-1.3.4/tests/simple_crawlo_test.py +128 -0
- crawlo-1.3.4/tests/simple_log_test.py +58 -0
- crawlo-1.3.4/tests/simple_optimization_test.py +129 -0
- crawlo-1.3.4/tests/simple_spider_test.py +50 -0
- crawlo-1.3.4/tests/simple_test.py +48 -0
- crawlo-1.3.4/tests/test_all_commands.py +231 -0
- crawlo-1.3.4/tests/test_batch_processor.py +179 -0
- crawlo-1.3.4/tests/test_component_factory.py +175 -0
- crawlo-1.3.4/tests/test_controlled_spider_mixin.py +80 -0
- crawlo-1.3.4/tests/test_enhanced_error_handler_comprehensive.py +246 -0
- crawlo-1.3.4/tests/test_factories.py +253 -0
- crawlo-1.3.4/tests/test_framework_logger.py +67 -0
- crawlo-1.3.4/tests/test_framework_startup.py +65 -0
- crawlo-1.3.4/tests/test_large_scale_config.py +113 -0
- crawlo-1.3.4/tests/test_large_scale_helper.py +236 -0
- crawlo-1.3.4/tests/test_mode_change.py +73 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_mode_consistency.py +1 -1
- crawlo-1.3.4/tests/test_performance_monitor.py +116 -0
- crawlo-1.3.4/tests/test_queue_empty_check.py +42 -0
- crawlo-1.3.4/tests/untested_features_report.md +139 -0
- crawlo-1.3.4/tests/verify_debug.py +52 -0
- crawlo-1.3.4/tests/verify_log_fix.py +112 -0
- crawlo-1.3.2/crawlo/__version__.py +0 -1
- crawlo-1.3.2/crawlo/core/__init__.py +0 -2
- crawlo-1.3.2/crawlo/crawler.py +0 -1169
- crawlo-1.3.2/crawlo/task_manager.py +0 -30
- crawlo-1.3.2/crawlo/templates/project/settings.py.tmpl +0 -267
- crawlo-1.3.2/crawlo/templates/project/settings_distributed.py.tmpl +0 -180
- crawlo-1.3.2/crawlo/templates/project/settings_gentle.py.tmpl +0 -61
- crawlo-1.3.2/crawlo/templates/project/settings_high_performance.py.tmpl +0 -131
- crawlo-1.3.2/crawlo/templates/project/settings_minimal.py.tmpl +0 -35
- crawlo-1.3.2/crawlo/templates/project/settings_simple.py.tmpl +0 -102
- crawlo-1.3.2/crawlo/templates/project/spiders/__init__.py.tmpl +0 -6
- crawlo-1.3.2/crawlo/utils/log.py +0 -147
- crawlo-1.3.2/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
- {crawlo-1.3.2 → crawlo-1.3.4}/LICENSE +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/MANIFEST.in +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/cli.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/check.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/help.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/list.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/stats.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/utils.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/config.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/config_validator.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/core/processor.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/data/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/data/user_agents.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/event.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/exceptions.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/items/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/items/fields.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/items/items.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/simple_proxy.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/network/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/network/request.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/network/response.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/mysql_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/subscriber.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/project/middlewares.py.tmpl +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/project/pipelines.py.tmpl +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/data_formatter.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/encoding_converter.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/request_tools.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/text_cleaner.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/request.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/system.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/tools.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/url.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/examples/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/pyproject.toml +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/requirements.txt +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/setup.cfg +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/__init__.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/cleaners_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/config_validation_demo.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/date_tools_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/debug_pipelines.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/env_config_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/error_handling_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/request_params_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/response_improvements_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_cleaners.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_comprehensive.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_config_consistency.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_config_merge.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_config_validator.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_crawlo_proxy_integration.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_date_tools.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_default_header_middleware.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_distributed.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_double_crawlo_fix.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_download_delay_middleware.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_downloader_proxy_compatibility.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_edge_cases.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_env_config.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_final_validation.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_integration.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_offsite_middleware.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_parsel.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_performance.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_api.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_middleware.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_middleware_enhanced.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_middleware_refactored.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_random_user_agent.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_real_scenario_proxy.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_config.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_queue.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_request_ignore_middleware.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_request_params.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_request_serialization.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_response_code_middleware.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_response_filter_middleware.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_response_improvements.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_retry_middleware.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_scheduler.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_scheduler_config_update.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_simple_response.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_template_content.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_tools.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_user_agents.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/tools_example.py +0 -0
- {crawlo-1.3.2 → crawlo-1.3.4}/tests/verify_distributed.py +0 -0
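
The sections that follow reconstruct the changed hunks for a subset of the files listed above. To reproduce the underlying comparison locally, a minimal sketch is shown below; the `pip download` invocation, directory names, and the closing `diff` suggestion are illustrative assumptions, not part of this report.

```python
# Minimal sketch: fetch and unpack both sdists so the trees can be diffed locally.
# Assumes `pip` is on PATH and both versions are still available on the registry.
import pathlib
import subprocess
import tarfile

def fetch_sdist(version: str, dest: str) -> pathlib.Path:
    """Download and unpack the crawlo sdist for `version` into `dest`."""
    subprocess.run(
        ["pip", "download", f"crawlo=={version}", "--no-deps",
         "--no-binary", ":all:", "-d", dest],
        check=True,
    )
    archive = next(pathlib.Path(dest).glob(f"crawlo-{version}.tar.gz"))
    with tarfile.open(archive) as tar:
        tar.extractall(dest)
    return pathlib.Path(dest) / f"crawlo-{version}"

old_tree = fetch_sdist("1.3.2", "old")
new_tree = fetch_sdist("1.3.4", "new")
# Compare the two trees with any diff tool, e.g. `diff -ruN old/crawlo-1.3.2 new/crawlo-1.3.4`.
```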
{crawlo-1.3.2/crawlo.egg-info → crawlo-1.3.4}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.3.2
+Version: 1.3.4
 Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
@@ -132,13 +132,13 @@ Requires-Dist: selenium>=3.141.0; extra == "all"
 
 ### 安装
 
-
+```bash
 pip install crawlo
 ```
 
 ### 创建项目
 
-
+```bash
 # 创建默认项目
 crawlo startproject myproject
 
@@ -153,7 +153,7 @@ cd myproject
 
 ### 生成爬虫
 
-
+```bash
 # 在项目目录中生成爬虫
 crawlo genspider news_spider news.example.com
 ```
@@ -182,7 +182,7 @@ class MySpider(Spider):
 
 ### 运行爬虫
 
-
+```bash
 # 使用命令行工具运行爬虫(推荐)
 crawlo run myspider
 
@@ -289,7 +289,7 @@ Crawlo 提供了多种灵活的配置方式,以适应不同的使用场景和
 
 使用 `CrawloConfig` 配置工厂是推荐的配置方式,它提供了类型安全和智能提示。
 
-
+```python
 from crawlo.config import CrawloConfig
 from crawlo.crawler import CrawlerProcess
 
@@ -363,7 +363,7 @@ process = CrawlerProcess(settings=config.to_dict())
 适用于开发调试、小规模数据采集、个人项目。
 
 **推荐配置方式:**
-
+```python
 from crawlo.config import CrawloConfig
 config = CrawloConfig.standalone(concurrency=4, download_delay=1.0)
 process = CrawlerProcess(settings=config.to_dict())
@@ -379,7 +379,7 @@ process = CrawlerProcess(settings=config.to_dict())
 适用于大规模数据采集、多节点协同工作、高并发需求。
 
 **推荐配置方式:**
-
+```python
 from crawlo.config import CrawloConfig
 config = CrawloConfig.distributed(
     redis_host='your_redis_host',
@@ -400,7 +400,7 @@ process = CrawlerProcess(settings=config.to_dict())
 适用于希望根据环境自动选择最佳运行方式。
 
 **推荐配置方式:**
-
+```python
 from crawlo.config import CrawloConfig
 config = CrawloConfig.auto(concurrency=12)
 process = CrawlerProcess(settings=config.to_dict())
@@ -453,7 +453,7 @@ CUSTOM_MIDDLEWARES = [
 
 用户可以通过`CUSTOM_PIPELINES`配置自定义管道:
 
-
+```python
 # settings.py
 CUSTOM_PIPELINES = [
     'crawlo.pipelines.json_pipeline.JsonPipeline',
@@ -839,7 +839,7 @@ request = Request(
 
 可以同时使用多种参数类型,框架会自动处理:
 
-
+```python
 # GET请求同时使用params和form_data(都会作为查询参数)
 request = Request(
     url='https://api.example.com/search',
@@ -881,7 +881,7 @@ request = Request(
 
 Request类支持链式调用来简化配置:
 
-
+```python
 request = Request('https://example.com')\
     .add_header('User-Agent', 'Crawlo Bot')\
     .set_proxy('http://proxy.example.com:8080')\
@@ -894,7 +894,7 @@ request = Request('https://example.com')\
 
 Crawlo提供了多种预定义的请求优先级:
 
-
+```python
 from crawlo import Request, RequestPriority
 
 # 设置不同的优先级
@@ -909,7 +909,7 @@ background_request = Request('https://example.com', priority=RequestPriority.BAC
 
 对于需要JavaScript渲染的页面,可以启用动态加载器:
 
-
+```python
 # 启用动态加载器
 request = Request('https://example.com')\
     .set_dynamic_loader(use_dynamic=True)
@@ -980,12 +980,118 @@ PROXY_LIST = [
 
 ---
 
+<!-- 高级工具 section -->
+<h2 align="center">🛠️ 高级工具</h2>
+
+Crawlo 框架提供了一系列高级工具,帮助开发者更好地处理大规模爬虫任务和复杂场景。
+
+### 1. 工厂模式相关模块
+
+**功能**:
+- 组件创建和依赖注入
+- 单例模式支持
+- 统一的组件管理机制
+
+**使用场景**:
+- 需要统一管理组件创建过程
+- 需要依赖注入功能
+- 需要单例组件实例
+
+### 2. 批处理工具
+
+**功能**:
+- 大规模数据处理
+- 并发控制
+- 内存使用优化
+
+**使用场景**:
+- 处理大量数据项
+- 需要控制并发数量
+- 内存敏感的数据处理任务
+
+### 3. 受控爬虫混入类
+
+**功能**:
+- 控制大规模请求生成
+- 防止内存溢出
+- 动态并发控制
+
+**使用场景**:
+- 需要生成大量请求的爬虫
+- 内存受限的环境
+- 需要精确控制并发的场景
+
+### 4. 大规模配置工具
+
+**功能**:
+- 针对不同场景的优化配置
+- 简化配置过程
+- 提高爬取效率和稳定性
+
+**配置类型**:
+- **保守型**: 资源受限环境
+- **平衡型**: 一般生产环境
+- **激进型**: 高性能服务器
+- **内存优化型**: 内存受限但要处理大量请求
+
+**使用场景**:
+- 处理数万+请求的大规模爬取
+- 不同性能环境的适配
+- 快速配置优化
+
+### 5. 大规模爬虫辅助工具
+
+**功能**:
+- 批量数据处理
+- 进度管理和断点续传
+- 内存使用优化
+- 多种数据源支持
+
+**组件**:
+- **LargeScaleHelper**: 批量迭代大量数据
+- **ProgressManager**: 进度管理
+- **MemoryOptimizer**: 内存优化
+- **DataSourceAdapter**: 数据源适配器
+
+**使用场景**:
+- 处理数万+ URL的爬虫
+- 需要断点续传的功能
+- 内存敏感的大规模处理任务
+
+### 6. 自动爬虫模块导入
+
+**功能**:
+- 自动发现和导入爬虫模块
+- 无需手动导入即可注册爬虫
+- 智能扫描项目中的爬虫文件
+
+**使用方式**:
+框架会自动扫描指定的`spider_modules`路径,导入其中的所有爬虫模块并自动注册爬虫类。用户只需在创建`CrawlerProcess`时指定`spider_modules`参数:
+
+```python
+# 指定爬虫模块路径,框架会自动导入并注册所有爬虫
+spider_modules = ['myproject.spiders']
+process = CrawlerProcess(spider_modules=spider_modules)
+
+# 运行指定的爬虫(无需手动导入)
+asyncio.run(process.crawl('my_spider_name'))
+```
+
+**优势**:
+- 简化项目结构,减少样板代码
+- 自动化管理爬虫注册过程
+- 提高开发效率,降低出错概率
+- 保持代码整洁和一致性
+
+有关这些高级工具的详细使用方法和实际案例,请参考 [高级工具示例项目](examples/advanced_tools_example/)。
+
 <!-- 示例项目 section -->
 <h2 align="center">📦 示例项目</h2>
 
 - [OFweek分布式爬虫](examples/ofweek_distributed/) - 复杂的分布式爬虫示例,包含Redis去重功能
 - [OFweek独立爬虫](examples/ofweek_standalone/) - 独立运行的爬虫示例
 - [OFweek混合模式爬虫](examples/ofweek_spider/) - 支持单机和分布式模式切换的爬虫示例
+- [高级工具示例](examples/advanced_tools_example/) - 展示Crawlo框架中各种高级工具的使用方法,包括工厂模式、批处理工具、受控爬虫混入类、大规模配置工具和大规模爬虫辅助工具
 
 ---
 
````
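The new 自动爬虫模块导入 section above registers spiders by scanning `spider_modules` instead of requiring manual imports, and the README's `CrawloConfig` examples show the recommended settings factory. A self-contained sketch combining the two is given below; the `myproject.spiders` package and the `news_spider` name are assumptions for illustration, while the combined `settings=` and `spider_modules=` keyword arguments match the call added to `crawlo/commands/run.py` further down in this diff.

```python
# Hypothetical run script for a project created with `crawlo startproject myproject`
# and `crawlo genspider news_spider news.example.com`.
import asyncio

from crawlo.config import CrawloConfig
from crawlo.crawler import CrawlerProcess

# Standalone-mode settings via the CrawloConfig factory, as recommended in the README.
config = CrawloConfig.standalone(concurrency=4, download_delay=1.0)

# The framework scans these modules and registers every spider it finds,
# so no spider class needs to be imported here by hand.
process = CrawlerProcess(
    settings=config.to_dict(),
    spider_modules=["myproject.spiders"],
)

asyncio.run(process.crawl("news_spider"))
```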
{crawlo-1.3.2 → crawlo-1.3.4}/README.md

The README.md hunks are a verbatim repeat of the README content embedded in PKG-INFO above, shifted up by the 50 metadata lines: @@ -82,13 +82,13 @@, @@ -103,7 +103,7 @@, @@ -132,7 +132,7 @@, @@ -239,7 +239,7 @@, @@ -313,7 +313,7 @@, @@ -329,7 +329,7 @@, @@ -350,7 +350,7 @@, @@ -403,7 +403,7 @@, @@ -789,7 +789,7 @@, @@ -831,7 +831,7 @@, @@ -844,7 +844,7 @@, @@ -859,7 +859,7 @@ and @@ -930,12 +930,118 @@ apply the same code-fence changes, add the same 高级工具 section, and append the same 高级工具示例 entry to the 示例项目 list.
{crawlo-1.3.2 → crawlo-1.3.4}/crawlo/__init__.py

````diff
@@ -3,6 +3,8 @@
 """
 Crawlo - 一个异步爬虫框架
 """
+from typing import TYPE_CHECKING
+
 from crawlo.spider import Spider
 from crawlo.items import Item, Field
 from crawlo.network.request import Request
@@ -24,9 +26,29 @@ from crawlo.utils import (
 )
 from crawlo import tools
 
+# 框架核心模块 - 使用TYPE_CHECKING避免循环导入
+if TYPE_CHECKING:
+    from crawlo.core.framework_initializer import get_framework_initializer, initialize_framework
+
 # 为了向后兼容,从tools中导入cleaners相关的功能
 import crawlo.tools as cleaners
 
+# 延迟导入的辅助函数
+def get_framework_initializer():
+    """延迟导入get_framework_initializer以避免循环依赖"""
+    from crawlo.core.framework_initializer import get_framework_initializer as _get_framework_initializer
+    return _get_framework_initializer()
+
+def initialize_framework(custom_settings=None):
+    """延迟导入initialize_framework以避免循环依赖"""
+    from crawlo.core.framework_initializer import initialize_framework as _initialize_framework
+    return _initialize_framework(custom_settings)
+
+# 向后兼容的别名
+def get_bootstrap_manager():
+    """向后兼容的别名"""
+    return get_framework_initializer()
+
 # 版本号:优先从元数据读取
 try:
     from importlib.metadata import version
@@ -60,5 +82,7 @@ __all__ = [
     'from_timestamp_with_tz',
     'cleaners',
     'tools',
+    'get_framework_initializer',
+    'get_bootstrap_manager',
     '__version__',
 ]
````
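The `__init__.py` changes above defer the import of `crawlo.core.framework_initializer` into function bodies (with a `TYPE_CHECKING` guard for type checkers) so that importing `crawlo` itself cannot trigger a circular import. A minimal, runnable toy illustration of the same pattern follows; the stdlib `json` module merely stands in for the deferred framework module.

```python
# Toy illustration of the deferred-import pattern used in crawlo/__init__.py.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; nothing is imported at module load time,
    # so no import cycle can form while the package is initializing.
    import json

def get_encoder():
    """Import the heavy/cyclic module only when the helper is first called."""
    import json  # the real import happens here, after package init has finished
    return json.JSONEncoder()

print(type(get_encoder()).__name__)  # -> JSONEncoder
```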
crawlo-1.3.4/crawlo/__version__.py

````diff
@@ -0,0 +1 @@
+__version__ = '1.3.4'
````
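The new `__version__.py` pairs with the "版本号:优先从元数据读取" logic shown in the `__init__.py` hunk: installed package metadata first, file constant as fallback. The exact fallback code is not visible in this diff, so the following is only a sketch of that resolution order.

```python
# Sketch of the assumed version-resolution order; not the verbatim crawlo implementation.
try:
    # Preferred source: metadata of the installed distribution (Python 3.8+).
    from importlib.metadata import version
    __version__ = version("crawlo")
except Exception:
    # Assumed fallback when the distribution metadata is unavailable:
    # the constant added in crawlo/__version__.py, e.g. '1.3.4'.
    from crawlo.__version__ import __version__
```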
{crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/run.py

````diff
@@ -21,10 +21,23 @@ from rich.text import Text
 from crawlo.commands.stats import record_stats
 from crawlo.crawler import CrawlerProcess
 from crawlo.project import get_settings, _find_project_root
-#
+# 使用新的统一初始化系统
+from crawlo.initialization import initialize_framework
+from crawlo.core import get_framework_initializer
 from crawlo.utils.log import get_logger
 
-logger
+# 延迟获取logger,确保在日志系统配置之后获取
+_logger = None
+
+
+def logger():
+    """延迟获取logger实例,确保在日志系统配置之后获取"""
+    global _logger
+    if _logger is None:
+        # 使用改进后的日志系统,可以安全地在任何时候创建
+        _logger = get_logger(__name__)
+    return _logger
+
 
 console = Console()
 
@@ -35,15 +48,15 @@ def check_redis_connection(settings):
     # 检查是否为分布式模式
     run_mode = settings.get('RUN_MODE', 'standalone')
     queue_type = settings.get('QUEUE_TYPE', 'memory')
-
+
     if run_mode == 'distributed' or queue_type == 'redis':
         import redis.asyncio as redis
         redis_url = settings.get('REDIS_URL', 'redis://127.0.0.1:6379/0')
         redis_host = settings.get('REDIS_HOST', '127.0.0.1')
         redis_port = settings.get('REDIS_PORT', 6379)
-
+
         console.print(f"检查 Redis 连接: {redis_host}:{redis_port}")
-
+
         # 创建Redis连接进行测试
         async def _test_redis():
             try:
@@ -54,11 +67,11 @@ def check_redis_connection(settings):
             except Exception as e:
                 console.print(f"Redis 连接失败: {e}")
                 return False
-
+
         # 运行异步测试
         if not asyncio.run(_test_redis()):
             raise ConnectionError(f"无法连接到 Redis 服务器 {redis_host}:{redis_port}")
-
+
         console.print("Redis 连接正常")
         return True
     else:
@@ -78,11 +91,15 @@ def main(args):
     用法:
         crawlo run <spider_name>|all [--json] [--no-stats]
     """
+    # 确保框架已初始化
+    init_manager = get_framework_initializer()
+
     # 添加调试信息
-    logger.debug("DEBUG: 进入main函数")
-
+    logger().debug("DEBUG: 进入main函数")
+
     if len(args) < 1:
-        console.print(
+        console.print(
+            "[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
         console.print("示例:")
         console.print(" [blue]crawlo run baidu[/blue]")
         console.print(" [blue]crawlo run all[/blue]")
@@ -153,9 +170,9 @@ def main(args):
         console.print(Panel(msg, title="导入错误", border_style="red"))
         return 1
 
-    # 4.
-    settings =
-
+    # 4. 启动框架并加载 settings
+    settings = initialize_framework()
+
     # 检查Redis连接(如果是分布式模式)
     if not check_redis_connection(settings):
         if show_json:
@@ -163,9 +180,22 @@ def main(args):
             return 1
         else:
             return 1
-
-
+
+    # 从配置中获取SPIDER_MODULES
+    spider_modules = settings.get('SPIDER_MODULES', [f"{project_package}.spiders"])
+    logger().debug(f"SPIDER_MODULES from settings: {spider_modules}")
     process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+    # 不再需要手动导入爬虫模块,框架内部会自动处理
+    # 检查注册表中的爬虫
+    from crawlo.spider import get_global_spider_registry
+    registry = get_global_spider_registry()
+    spider_names = list(registry.keys())
+    logger().debug(f"Registered spiders after import: {spider_names}")
+
+    # 调试信息
+    logger().debug(f"SPIDER_MODULES: {spider_modules}")
+    logger().debug(f"Available spiders: {process.get_spider_names()}")
 
     # === 情况1:运行所有爬虫 ===
     if spider_arg.lower() == "all":
@@ -193,19 +223,14 @@ def main(args):
         # 显示即将运行的爬虫列表
         # 根据用户要求,不再显示详细的爬虫列表信息
 
-        # 注册 stats 记录(除非 --no-stats)
-        if not no_stats:
-            for crawler in process.crawlers:
-                crawler.signals.connect(record_stats, signal="spider_closed")
-
         # 并行运行所有爬虫
         with Progress(
-
-
-
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            transient=True,
         ) as progress:
             task = progress.add_task("正在运行所有爬虫...", total=None)
-            asyncio.run(process.
+            asyncio.run(process.crawl_multiple(spider_names))
 
         if show_json:
             console.print_json(data={"success": True, "spiders": spider_names})
@@ -267,15 +292,16 @@ def main(args):
         # console.print()
 
         # 注册 stats 记录
-
-
-
+        # 注意:CrawlerProcess没有crawlers属性,我们需要在运行时注册
+        # if not no_stats:
+        #     for crawler in process.crawlers:
+        #         crawler.signals.connect(record_stats, signal="spider_closed")
 
         # 运行爬虫
         with Progress(
-
-
-
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            transient=True,
         ) as progress:
             task = progress.add_task(f"正在运行 {spider_name}...", total=None)
             asyncio.run(process.crawl(spider_name))
@@ -298,7 +324,7 @@ def main(args):
         console.print(f"[bold yellow]{msg}[/bold yellow]")
         return 1
     except Exception as e:
-        logger.exception("Exception during 'crawlo run'")
+        logger().exception("Exception during 'crawlo run'")
         msg = f"意外错误: {e}"
         if show_json:
             console.print_json(data={"success": False, "error": msg})
@@ -312,4 +338,4 @@ if __name__ == "__main__":
     支持直接运行:
         python -m crawlo.commands.run spider_name
     """
-    sys.exit(main(sys.argv[1:]))
+    sys.exit(main(sys.argv[1:]))
````