crawlo 1.1.6.tar.gz → 1.1.8.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- {crawlo-1.1.6/crawlo.egg-info → crawlo-1.1.8}/PKG-INFO +237 -12
- {crawlo-1.1.6 → crawlo-1.1.8}/README.md +237 -12
- crawlo-1.1.8/crawlo/__version__.py +1 -0
- crawlo-1.1.8/crawlo/cli.py +66 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/commands/__init__.py +2 -1
- crawlo-1.1.8/crawlo/commands/help.py +133 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/middlewares.py.tmpl +0 -1
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/pipelines.py.tmpl +0 -1
- crawlo-1.1.8/crawlo/templates/project/run.py.tmpl +46 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/settings.py.tmpl +2 -2
- {crawlo-1.1.6 → crawlo-1.1.8/crawlo.egg-info}/PKG-INFO +237 -12
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo.egg-info/SOURCES.txt +1 -0
- crawlo-1.1.6/crawlo/__version__.py +0 -1
- crawlo-1.1.6/crawlo/cli.py +0 -41
- crawlo-1.1.6/crawlo/templates/project/run.py.tmpl +0 -252
- {crawlo-1.1.6 → crawlo-1.1.8}/LICENSE +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/MANIFEST.in +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/cleaners/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/cleaners/data_formatter.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/cleaners/encoding_converter.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/cleaners/text_cleaner.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/commands/check.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/commands/list.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/commands/run.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/commands/stats.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/commands/utils.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/config.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/config_validator.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/core/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/core/engine.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/core/processor.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/core/scheduler.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/crawler.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/downloader/httpx_downloader.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/downloader/hybrid_downloader.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/downloader/playwright_downloader.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/downloader/selenium_downloader.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/event.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/exceptions.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/extension/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/extension/health_check.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/extension/log_interval.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/extension/log_stats.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/extension/logging_extension.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/extension/memory_monitor.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/extension/performance_profiler.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/extension/request_recorder.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/filters/aioredis_filter.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/items/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/items/base.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/items/fields.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/items/items.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/middleware_manager.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/middleware/retry.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/mode_manager.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/network/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/network/request.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/network/response.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/mongo_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/mysql_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/pipeline_manager.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/project.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/queue/queue_manager.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/queue/redis_priority_queue.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/settings/default_settings.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/settings/setting_manager.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/stats_collector.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/subscriber.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/task_manager.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/items.py.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/templates/spider/spider.py.tmpl +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/tools/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/tools/anti_crawler.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/tools/authenticated_proxy.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/tools/data_validator.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/tools/date_tools.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/tools/distributed_coordinator.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/tools/retry_mechanism.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/tools/scenario_adapter.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/batch_processor.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/controlled_spider_mixin.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/date_tools.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/enhanced_error_handler.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/env_config.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/error_handler.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/log.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/performance_monitor.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/redis_connection_pool.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/redis_key_validator.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/request.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/system.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/tools.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo/utils/url.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/examples/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/pyproject.toml +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/requirements.txt +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/setup.cfg +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/__init__.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/advanced_tools_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/authenticated_proxy_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/cleaners_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/config_validation_demo.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/controlled_spider_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/date_tools_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/dynamic_loading_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/dynamic_loading_test.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/env_config_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/error_handling_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/redis_key_validation_demo.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/response_improvements_example.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_advanced_tools.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_all_redis_key_configs.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_authenticated_proxy.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_cleaners.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_comprehensive.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_config_validator.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_date_tools.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_double_crawlo_fix.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_double_crawlo_fix_simple.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_dynamic_downloaders_proxy.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_dynamic_proxy.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_dynamic_proxy_config.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_dynamic_proxy_real.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_edge_cases.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_enhanced_error_handler.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_env_config.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_error_handler_compatibility.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_final_validation.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_framework_env_usage.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_integration.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_item_dedup_redis_key.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_parsel.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_performance.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_queue_manager_double_crawlo.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_queue_manager_redis_key.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_redis_config.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_redis_connection_pool.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_redis_key_naming.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_redis_key_validator.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_redis_queue.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_request_serialization.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_response_improvements.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_scheduler.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_simple_response.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_telecom_spider_redis_key.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_template_content.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_template_redis_key.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/test_tools.py +0 -0
- {crawlo-1.1.6 → crawlo-1.1.8}/tests/tools_example.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.1.6
+Version: 1.1.8
 Summary: Crawlo is a high-performance Python crawler framework based on asynchronous IO, with support for distributed crawling.
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
@@ -80,10 +80,25 @@ pip install crawlo
 ### Create a project

 ```bash
+# Create a default project
 crawlo startproject myproject
+
+# Create a project from the distributed template
+crawlo startproject myproject distributed
+
+# Create a project and select specific modules
+crawlo startproject myproject --modules mysql,redis,proxy
+
 cd myproject
 ```

+### Generate a spider
+
+```bash
+# Generate a spider inside the project directory
+crawlo genspider news_spider news.example.com
+```
+
 ### Write a spider

 ```python
@@ -109,9 +124,158 @@ class MySpider(Spider):
 ### Run spiders

 ```bash
-
+# Run a spider with the command-line tool (recommended)
+crawlo run myspider
+
+# Run with the project's bundled run.py script
+python run.py
+
+# Run all spiders
+crawlo run all
+
+# Also works correctly from a project subdirectory
+cd subdirectory
+crawlo run myspider
+```
+
+## 📜 Command-line tools
+
+Crawlo provides a rich set of command-line tools to help you develop and manage crawler projects:
+
+### Getting help
+
+```bash
+# Show help information
+crawlo -h
+crawlo --help
+crawlo help
+```
+
+### crawlo startproject
+
+Create a new crawler project.
+
+```bash
+# Basic usage
+crawlo startproject <project_name> [template_type] [--modules module1,module2]
+
+# Examples
+crawlo startproject my_spider_project
+crawlo startproject news_crawler simple
+crawlo startproject ecommerce_spider distributed --modules mysql,proxy
+```
+
+**Arguments:**
+- `project_name`: project name (must be a valid Python identifier)
+- `template_type`: template type (optional)
+  - `default`: default template - general-purpose configuration suitable for most projects
+  - `simple`: simplified template - minimal configuration for getting started quickly
+  - `distributed`: distributed template - optimized for distributed crawling
+  - `high-performance`: high-performance template - optimized for large-scale, high-concurrency crawls
+  - `gentle`: gentle template - low-load configuration that is friendly to target sites
+- `--modules`: module components to include (optional)
+  - `mysql`: MySQL database support
+  - `mongodb`: MongoDB database support
+  - `redis`: Redis support (distributed queue and deduplication)
+  - `proxy`: proxy support
+  - `monitoring`: monitoring and performance profiling
+  - `dedup`: deduplication
+  - `httpx`: HttpX downloader
+  - `aiohttp`: AioHttp downloader
+  - `curl`: CurlCffi downloader
+
+### crawlo genspider
+
+Generate a new spider in an existing project.
+
+```bash
+# Basic usage
+crawlo genspider <spider_name> <domain>
+
+# Examples
+crawlo genspider news_spider news.example.com
+crawlo genspider product_spider shop.example.com
+```
+
+**Arguments:**
+- `spider_name`: spider name (must be a valid Python identifier)
+- `domain`: target domain
+
+### crawlo run
+
+Run spiders.
+
+```bash
+# Basic usage
+crawlo run <spider_name>|all [--json] [--no-stats]
+
+# Examples
+crawlo run myspider
+crawlo run all
+crawlo run all --json --no-stats
 ```

+**Arguments:**
+- `spider_name`: name of the spider to run
+- `all`: run all spiders
+- `--json`: output results in JSON format
+- `--no-stats`: do not record statistics
+
+### crawlo list
+
+List all spiders available in the project.
+
+```bash
+# Basic usage
+crawlo list [--json]
+
+# Examples
+crawlo list
+crawlo list --json
+```
+
+**Arguments:**
+- `--json`: output results in JSON format
+
+### crawlo check
+
+Check spider definitions for compliance.
+
+```bash
+# Basic usage
+crawlo check [--fix] [--ci] [--json] [--watch]
+
+# Examples
+crawlo check
+crawlo check --fix
+crawlo check --ci
+crawlo check --watch
+```
+
+**Arguments:**
+- `--fix`: automatically fix common problems
+- `--ci`: CI-mode output (concise format)
+- `--json`: output results in JSON format
+- `--watch`: watch mode, re-check automatically on file changes
+
+### crawlo stats
+
+View spider run statistics.
+
+```bash
+# Basic usage
+crawlo stats [spider_name] [--all]
+
+# Examples
+crawlo stats
+crawlo stats myspider
+crawlo stats myspider --all
+```
+
+**Arguments:**
+- `spider_name`: name of the spider whose statistics to show
+- `--all`: show all historical runs for the given spider
+
 ## 🏗️ Architecture

 ### Component interaction diagram
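The hunk above recommends either the `crawlo run` command or a project-level `python run.py`. A new `run.py.tmpl` appears in the 1.1.8 file list, but its body is not shown in this diff, so the following is only a neutral stand-in that delegates to the documented CLI command rather than Crawlo's actual template:

```python
#!/usr/bin/env python
"""Hypothetical minimal run.py; not the run.py.tmpl shipped with crawlo 1.1.8."""
import subprocess
import sys

if __name__ == "__main__":
    # Run the spider named on the command line, or all spiders by default,
    # by delegating to the documented `crawlo run` CLI entry point.
    spider = sys.argv[1] if len(sys.argv) > 1 else "all"
    raise SystemExit(subprocess.run(["crawlo", "run", spider]).returncode)
```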
@@ -176,6 +340,7 @@ crawlo crawl myspider
 │ │ │ - ValidationPipeline │ │ │
 │ │ │ - ProcessingPipeline │ │ │
 │ │ │ - StoragePipeline │ │ │
+│ │ │ - DeduplicationPipeline │ │ │
 │ │ └─────────────────────────┘ │ │
 │ └──────────────────────────────┘ │
 └─────────────────────────────────────┘
@@ -229,12 +394,13 @@ crawlo crawl myspider
 ▼ ▼
 ┌─────────────────┐ 7. generate data ┌─────────────┐
 │ Processor ├────────────────►│ Pipeline │
-└─────────────────┘
-│ 8. store data
-▼
-┌─────────────────┐
-│ Items │
-└─────────────────┘
+└─────────────────┘ └──────┬──────┘
+│ 8. store data │ 9. deduplicate
+▼ ▼
+┌─────────────────┐ ┌─────────────────┐
+│ Items │◄─────────────┤ Deduplication │
+└─────────────────┘ │ Pipeline │
+ └─────────────────┘
 ```

 ### Module hierarchy diagram
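The revised flow above routes items through a deduplication pipeline. The `RedisDedupPipeline` source itself is not part of this diff, so the sketch below only illustrates the underlying idea, a set-membership check on an item fingerprint, using plain redis-py; the key name and fingerprint scheme are assumptions, not Crawlo's implementation:

```python
import hashlib
import json

import redis

# Illustrative sketch only: the key name and fingerprinting below are assumptions,
# not the actual behavior of crawlo's RedisDedupPipeline.
r = redis.Redis(host="localhost", port=6379, db=0)
DEDUP_KEY = "myproject:item_fingerprints"  # hypothetical key name


def item_fingerprint(item: dict) -> str:
    # Stable hash over the item's fields; real pipelines often hash selected fields only.
    payload = json.dumps(item, sort_keys=True, ensure_ascii=False).encode("utf-8")
    return hashlib.sha1(payload).hexdigest()


def is_duplicate(item: dict) -> bool:
    # SADD returns 1 when the member is new and 0 when it is already in the set,
    # giving an atomic "seen before?" check that all crawler nodes can share.
    return r.sadd(DEDUP_KEY, item_fingerprint(item)) == 0


if __name__ == "__main__":
    item = {"title": "Example", "url": "https://news.example.com/1"}
    print(is_duplicate(item))  # False on first run against an empty set, True afterwards
```

Dropping an item when such a check reports a duplicate is what the Deduplication Pipeline box in the diagram stands for.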
@@ -298,6 +464,8 @@ crawlo/
 │ ├── pipeline_manager.py # pipeline manager
 │ ├── base_pipeline.py # pipeline base class
 │ ├── console_pipeline.py # console output pipeline
+│ ├── json_pipeline.py # JSON storage pipeline
+│ ├── redis_dedup_pipeline.py # Redis deduplication pipeline
 │ └── mysql_pipeline.py # MySQL storage pipeline
 │
 ├── extension/ # extension components
@@ -335,7 +503,7 @@ crawlo/
 - **QueueManager**: unified queue manager with automatic switching between in-memory and Redis queues
 - **Filter**: request deduplication filter, with both in-memory and Redis implementations
 - **Middleware**: middleware system for pre- and post-processing of requests and responses
-- **Pipeline**: data-processing pipelines supporting multiple storage backends (console, database, etc.)
+- **Pipeline**: data-processing pipelines supporting multiple storage backends (console, database, etc.) plus deduplication
 - **Spider**: spider base class that defines the crawling logic

 ### Run modes
@@ -356,12 +524,64 @@ CONCURRENCY = 16
 DOWNLOAD_DELAY = 1.0
 QUEUE_TYPE = 'memory'  # standalone mode
 # QUEUE_TYPE = 'redis'  # distributed mode
+
+# Redis configuration (used in distributed mode)
+REDIS_HOST = 'localhost'
+REDIS_PORT = 6379
+REDIS_DB = 0
+REDIS_PASSWORD = ''
+
+# Data pipeline configuration
+PIPELINES = [
+    'crawlo.pipelines.console_pipeline.ConsolePipeline',
+    'crawlo.pipelines.json_pipeline.JsonPipeline',
+    'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',  # Redis deduplication pipeline
+    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage pipeline
+]
 ```

-###
+### MySQL pipeline configuration

+Crawlo ships with a ready-made MySQL pipeline that makes it easy to store scraped data in a MySQL database:
+
+```python
+# Enable the MySQL pipeline in settings.py
+PIPELINES = [
+    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
+]
+
+# MySQL database configuration
+MYSQL_HOST = 'localhost'
+MYSQL_PORT = 3306
+MYSQL_USER = 'your_username'
+MYSQL_PASSWORD = 'your_password'
+MYSQL_DB = 'your_database'
+MYSQL_TABLE = 'your_table_name'
+
+# Optional batch-insert configuration
+MYSQL_BATCH_SIZE = 100
+MYSQL_USE_BATCH = True
 ```
-
+
+MySQL pipeline features:
+- **Asynchronous operation**: built on the asyncmy driver for high-performance async database access
+- **Connection pooling**: database connections are managed automatically for better efficiency
+- **Batch inserts**: supports batched inserts for higher throughput
+- **Transaction support**: ensures data consistency
+- **Flexible configuration**: custom table name, batch size, and other parameters
+
+### Command-line configuration
+
+```bash
+# Run a single spider
+crawlo run myspider
+
+# Run all spiders
+crawlo run all
+
+# Also works correctly from a project subdirectory
+cd subdirectory
+crawlo run myspider
 ```

 ## 🧩 Core components
@@ -370,7 +590,11 @@ crawlo crawl myspider --concurrency=32 --delay=0.5
 A flexible middleware system supporting request preprocessing, response handling, and exception handling.

 ### Pipeline system
-
+An extensible data-processing pipeline supporting multiple storage backends (console, database, etc.) and deduplication:
+- **ConsolePipeline**: console output pipeline
+- **JsonPipeline**: JSON file storage pipeline
+- **RedisDedupPipeline**: Redis deduplication pipeline, implementing distributed dedup on top of Redis sets
+- **AsyncmyMySQLPipeline**: MySQL storage pipeline built on the asyncmy driver

 ### Extension components
 Feature-enhancing extensions, including logging, monitoring, and performance profiling.
@@ -382,6 +606,7 @@ crawlo crawl myspider --concurrency=32 --delay=0.5

 - [API data collection](examples/api_data_collection/) - a simple API data-collection example
 - [Telecom equipment licenses](examples/telecom_licenses_distributed/) - a distributed crawling example
+- [OFweek distributed crawler](examples/ofweek_distributed/) - a more complex distributed crawler example with Redis deduplication

 ## 📚 Documentation

@@ -348,4 +573,4 @@

 ## 📄 License

-This project is licensed under the MIT License; see the [LICENSE](LICENSE) file for details.
+This project is licensed under the MIT License; see the [LICENSE](LICENSE) file for details.
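The settings hunks above introduce `MYSQL_BATCH_SIZE` and `MYSQL_USE_BATCH` for the `AsyncmyMySQLPipeline`, whose code is likewise not shown in this diff. As a rough illustration of what batched inserts with an asyncmy-style driver can look like (connection keyword arguments, table columns, and class names below are assumptions, not Crawlo's implementation):

```python
import asyncio

import asyncmy  # async MySQL driver named in the README; connect/cursor usage assumed from its docs

BATCH_SIZE = 100  # mirrors the MYSQL_BATCH_SIZE setting


class BatchWriter:
    """Hypothetical sketch of batched inserts, not the AsyncmyMySQLPipeline itself."""

    def __init__(self, conn, table: str):
        self.conn = conn
        self.table = table
        self.buffer: list[tuple] = []

    async def add(self, row: tuple) -> None:
        # Buffer rows and flush once the batch size is reached.
        self.buffer.append(row)
        if len(self.buffer) >= BATCH_SIZE:
            await self.flush()

    async def flush(self) -> None:
        if not self.buffer:
            return
        sql = f"INSERT INTO {self.table} (title, url) VALUES (%s, %s)"  # assumed columns
        async with self.conn.cursor() as cursor:
            await cursor.executemany(sql, self.buffer)
        await self.conn.commit()  # one commit per batch keeps each flush transactional
        self.buffer.clear()


async def main() -> None:
    conn = await asyncmy.connect(host="localhost", port=3306, user="your_username",
                                 password="your_password", database="your_database")
    writer = BatchWriter(conn, "your_table_name")
    await writer.add(("Example", "https://news.example.com/1"))
    await writer.flush()  # flush any remainder before shutdown


asyncio.run(main())
```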
@@ -0,0 +1 @@
+__version__ = "1.1.8"