crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
@@ -30,12 +30,10 @@ locals().update(config.to_dict())
 # 爬虫模块配置
 SPIDER_MODULES = ['{{project_name}}.spiders']
 
-#
-# 为DefaultHeaderMiddleware配置默认请求头
+# 默认请求头
 # DEFAULT_REQUEST_HEADERS = {}
 
 # 允许的域名
-# 为OffsiteMiddleware配置允许的域名
 # ALLOWED_DOMAINS = []
 
 # 数据管道

@@ -63,6 +61,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
+LOG_MAX_BYTES = 20 * 1024 * 1024 # 20MB,推荐值
+LOG_BACKUP_COUNT = 10 # 10个备份文件,推荐值
+# 如果不想要日志轮转,可以设置 LOG_MAX_BYTES = 0
+# 当LOG_MAX_BYTES或LOG_BACKUP_COUNT为0时,日志轮转将被禁用,文件会持续增长
 STATS_DUMP = True
 
 # 输出配置

@@ -108,10 +110,10 @@ MONGO_USE_BATCH = True # 是否启用批量插入
 
 # =================================== 浏览器指纹模拟 ===================================
 
-# 浏览器指纹模拟(仅
-CURL_BROWSER_TYPE = "chrome" #
+# 浏览器指纹模拟(仅CurlCffiDownloader有效)
+CURL_BROWSER_TYPE = "chrome" # 可选:chrome/edge/safari/firefox
 
-#
+# 自定义浏览器版本映射
 CURL_BROWSER_VERSION_MAP = {
     "chrome": "chrome136",
     "edge": "edge101",

@@ -129,11 +131,11 @@ HEALTH_CHECK_INTERVAL = 60 # 健康检查间隔(秒)
 REQUEST_STATS_ENABLED = True # 是否启用请求统计
 STATS_RESET_ON_START = False # 启动时是否重置统计
 
-# HttpX
+# HttpX专用配置
 HTTPX_HTTP2 = True # 是否启用HTTP/2支持
 HTTPX_FOLLOW_REDIRECTS = True # 是否自动跟随重定向
 
-# AioHttp
+# AioHttp专用配置
 AIOHTTP_AUTO_DECOMPRESS = True # 是否自动解压响应
 AIOHTTP_FORCE_CLOSE = False # 是否强制关闭连接
 

@@ -143,17 +145,16 @@ CONNECTION_KEEPALIVE_TIMEOUT = 15 # Keep-Alive超时(秒)
 
 # =================================== 代理配置 ===================================
 
-#
-#
+# 简单代理(SimpleProxyMiddleware)
+# 配置代理列表后中间件自动启用
 # PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]
 
-#
-#
+# 动态代理(ProxyMiddleware)
+# 配置代理API URL后中间件自动启用
 # PROXY_API_URL = "http://your-proxy-api.com/get-proxy"
 
 # =================================== 内存监控配置 ===================================
 
-# 内存监控扩展默认不启用,如需使用请在项目配置文件中启用
 MEMORY_MONITOR_ENABLED = False # 是否启用内存监控
 MEMORY_MONITOR_INTERVAL = 60 # 内存监控检查间隔(秒)
 MEMORY_WARNING_THRESHOLD = 80.0 # 内存使用率警告阈值(百分比)

@@ -11,7 +11,7 @@
 # 项目基本信息
 PROJECT_NAME = '{{project_name}}'
 
-#
+# 运行模式:standalone/distributed/auto
 RUN_MODE = 'standalone'
 
 # 并发配置

@@ -23,16 +23,16 @@ RANDOM_RANGE = [0.5, 1.5] # 随机延迟范围因子
 
 # =================================== 核心组件配置 ===================================
 
-#
+# 下载器:AioHttpDownloader/HttpXDownloader/CurlCffiDownloader
 DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
 
-#
+# 队列类型:memory/redis/auto
 QUEUE_TYPE = 'memory'
 
-#
+# 去重过滤器:MemoryFilter/AioRedisFilter
 FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
 
-#
+# 默认去重管道:MemoryDedupPipeline/RedisDedupPipeline
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
 
 # =================================== 爬虫配置 ===================================
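The annotated options above (RUN_MODE, DOWNLOADER, QUEUE_TYPE, FILTER_CLASS, DEFAULT_DEDUP_PIPELINE) are combined in a project's settings.py. A minimal sketch of a distributed-style override follows; only the option values and dotted class paths that appear verbatim in these hunks are used, everything else is an assumption rather than template output.

# Minimal sketch of a project settings.py switching to the Redis-backed
# components named in the templates above (illustrative, not from the package).
RUN_MODE = 'distributed'   # standalone/distributed/auto
QUEUE_TYPE = 'redis'       # memory/redis/auto
DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'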
@@ -40,12 +40,10 @@ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipe
 # 爬虫模块配置
 SPIDER_MODULES = ['{{project_name}}.spiders']
 
-#
-# 为DefaultHeaderMiddleware配置默认请求头
+# 默认请求头
 # DEFAULT_REQUEST_HEADERS = {}
 
 # 允许的域名
-# 为OffsiteMiddleware配置允许的域名
 # ALLOWED_DOMAINS = []
 
 # 数据管道

@@ -73,6 +71,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
+LOG_MAX_BYTES = 20 * 1024 * 1024 # 20MB,推荐值
+LOG_BACKUP_COUNT = 10 # 10个备份文件,推荐值
+# 如果不想要日志轮转,可以设置 LOG_MAX_BYTES = 0
+# 当LOG_MAX_BYTES或LOG_BACKUP_COUNT为0时,日志轮转将被禁用,文件会持续增长
 STATS_DUMP = True
 
 # 输出配置

@@ -119,28 +121,25 @@ MONGO_USE_BATCH = False # 是否启用批量插入
 # =================================== 网络配置 ===================================
 
 # 代理配置
-# 代理功能默认不启用,如需使用请在项目配置文件中启用并配置相关参数
 PROXY_ENABLED = False # 是否启用代理
 
-#
-PROXY_LIST = [] #
+# 简单代理(SimpleProxyMiddleware)
+PROXY_LIST = [] # 代理列表
 
-#
-PROXY_API_URL = "" #
+# 动态代理(ProxyMiddleware)
+PROXY_API_URL = "" # 代理API地址
 
-#
-# 示例: "proxy" 适用于 {"proxy": "http://1.1.1.1:8080"}
-# 示例: "data.proxy" 适用于 {"data": {"proxy": "http://1.1.1.1:8080"}}
+# 代理提取方式:"proxy" 或 "data.proxy"
 PROXY_EXTRACTOR = "proxy"
 
 # 代理刷新控制
 PROXY_REFRESH_INTERVAL = 60 # 代理刷新间隔(秒)
-PROXY_API_TIMEOUT = 10 #
+PROXY_API_TIMEOUT = 10 # API超时时间
 
-# 浏览器指纹模拟(仅
-CURL_BROWSER_TYPE = "chrome" #
+# 浏览器指纹模拟(仅CurlCffiDownloader有效)
+CURL_BROWSER_TYPE = "chrome" # 可选:chrome/edge/safari/firefox
 
-#
+# 自定义浏览器版本映射
 CURL_BROWSER_VERSION_MAP = {
     "chrome": "chrome136",
     "edge": "edge101",

@@ -157,11 +156,11 @@ HEALTH_CHECK_INTERVAL = 60 # 健康检查间隔(秒)
 REQUEST_STATS_ENABLED = True # 是否启用请求统计
 STATS_RESET_ON_START = False # 启动时是否重置统计
 
-# HttpX
+# HttpX专用配置
 HTTPX_HTTP2 = True # 是否启用HTTP/2支持
 HTTPX_FOLLOW_REDIRECTS = True # 是否自动跟随重定向
 
-# AioHttp
+# AioHttp专用配置
 AIOHTTP_AUTO_DECOMPRESS = True # 是否自动解压响应
 AIOHTTP_FORCE_CLOSE = False # 是否强制关闭连接
 

@@ -170,7 +169,6 @@ CONNECTION_TTL_DNS_CACHE = 300 # DNS缓存TTL(秒)
 CONNECTION_KEEPALIVE_TIMEOUT = 15 # Keep-Alive超时(秒)
 
 # 内存监控配置
-# 内存监控扩展默认不启用,如需使用请在项目配置文件中启用
 MEMORY_MONITOR_ENABLED = False # 是否启用内存监控
 MEMORY_MONITOR_INTERVAL = 60 # 内存监控检查间隔(秒)
 MEMORY_WARNING_THRESHOLD = 80.0 # 内存使用率警告阈值(百分比)

@@ -11,7 +11,7 @@
 # 项目基本信息
 PROJECT_NAME = '{{project_name}}'
 
-#
+# 运行模式:standalone/distributed/auto
 RUN_MODE = 'standalone'
 
 # 并发配置

@@ -22,17 +22,17 @@ RANDOMNESS = False # 禁用随机延迟以保证性能
 
 # =================================== 核心组件配置 ===================================
 
-#
+# 下载器:AioHttpDownloader/HttpXDownloader/CurlCffiDownloader
 DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
 
-#
+# 队列类型:memory/redis/auto
 QUEUE_TYPE = 'auto'
 
-#
+# 去重过滤器:MemoryFilter/AioRedisFilter
 # 高性能模式下,如果Redis可用则使用Redis去重,否则使用内存去重
 FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
 
-#
+# 默认去重管道:MemoryDedupPipeline/RedisDedupPipeline
 # 高性能模式下,如果Redis可用则使用Redis去重,否则使用内存去重
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
 

@@ -41,12 +41,10 @@ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeli
 # 爬虫模块配置
 SPIDER_MODULES = ['{{project_name}}.spiders']
 
-#
-# 为DefaultHeaderMiddleware配置默认请求头
+# 默认请求头
 # DEFAULT_REQUEST_HEADERS = {}
 
 # 允许的域名
-# 为OffsiteMiddleware配置允许的域名
 # ALLOWED_DOMAINS = []
 
 # 数据管道

@@ -74,6 +72,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
+LOG_MAX_BYTES = 50 * 1024 * 1024 # 50MB,适用于高负载场景
+LOG_BACKUP_COUNT = 20 # 20个备份文件,适用于高负载场景
+# 如果不想要日志轮转,可以设置 LOG_MAX_BYTES = 0
+# 当LOG_MAX_BYTES或LOG_BACKUP_COUNT为0时,日志轮转将被禁用,文件会持续增长
 STATS_DUMP = True
 
 # 输出配置

@@ -120,28 +122,25 @@ MONGO_USE_BATCH = True # 是否启用批量插入
 # =================================== 网络配置 ===================================
 
 # 代理配置
-# 代理功能默认不启用,如需使用请在项目配置文件中启用并配置相关参数
 PROXY_ENABLED = False # 是否启用代理
 
-#
-PROXY_LIST = [] #
+# 简单代理(SimpleProxyMiddleware)
+PROXY_LIST = [] # 代理列表
 
-#
-PROXY_API_URL = "" #
+# 动态代理(ProxyMiddleware)
+PROXY_API_URL = "" # 代理API地址
 
-#
-# 示例: "proxy" 适用于 {"proxy": "http://1.1.1.1:8080"}
-# 示例: "data.proxy" 适用于 {"data": {"proxy": "http://1.1.1.1:8080"}}
+# 代理提取方式:"proxy" 或 "data.proxy"
 PROXY_EXTRACTOR = "proxy"
 
 # 代理刷新控制
 PROXY_REFRESH_INTERVAL = 60 # 代理刷新间隔(秒)
-PROXY_API_TIMEOUT = 10 #
+PROXY_API_TIMEOUT = 10 # API超时时间
 
-# 浏览器指纹模拟(仅
-CURL_BROWSER_TYPE = "chrome" #
+# 浏览器指纹模拟(仅CurlCffiDownloader有效)
+CURL_BROWSER_TYPE = "chrome" # 可选:chrome/edge/safari/firefox
 
-#
+# 自定义浏览器版本映射
 CURL_BROWSER_VERSION_MAP = {
     "chrome": "chrome136",
     "edge": "edge101",

@@ -158,11 +157,11 @@ HEALTH_CHECK_INTERVAL = 30 # 健康检查间隔(秒)
 REQUEST_STATS_ENABLED = True # 是否启用请求统计
 STATS_RESET_ON_START = False # 启动时是否重置统计
 
-# HttpX
+# HttpX专用配置
 HTTPX_HTTP2 = True # 是否启用HTTP/2支持
 HTTPX_FOLLOW_REDIRECTS = True # 是否自动跟随重定向
 
-# AioHttp
+# AioHttp专用配置
 AIOHTTP_AUTO_DECOMPRESS = True # 是否自动解压响应
 AIOHTTP_FORCE_CLOSE = False # 是否强制关闭连接
 

@@ -171,7 +170,6 @@ CONNECTION_TTL_DNS_CACHE = 300 # DNS缓存TTL(秒)
 CONNECTION_KEEPALIVE_TIMEOUT = 15 # Keep-Alive超时(秒)
 
 # 内存监控配置
-# 内存监控扩展默认不启用,如需使用请在项目配置文件中启用
 MEMORY_MONITOR_ENABLED = False # 是否启用内存监控
 MEMORY_MONITOR_INTERVAL = 60 # 内存监控检查间隔(秒)
 MEMORY_WARNING_THRESHOLD = 80.0 # 内存使用率警告阈值(百分比)

@@ -11,7 +11,7 @@
 # 项目基本信息
 PROJECT_NAME = '{{project_name}}'
 
-#
+# 运行模式:standalone/distributed/auto
 RUN_MODE = 'standalone'
 
 # 并发配置

@@ -21,16 +21,16 @@ DOWNLOAD_DELAY = 1.0
 
 # =================================== 核心组件配置 ===================================
 
-#
+# 下载器:AioHttpDownloader/HttpXDownloader/CurlCffiDownloader
 DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
 
-#
+# 队列类型:memory/redis/auto
 QUEUE_TYPE = 'memory'
 
-#
+# 去重过滤器:MemoryFilter/AioRedisFilter
 FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
 
-#
+# 默认去重管道:MemoryDedupPipeline/RedisDedupPipeline
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
 
 # =================================== 爬虫配置 ===================================

@@ -38,12 +38,10 @@ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipe
 # 爬虫模块配置
 SPIDER_MODULES = ['{{project_name}}.spiders']
 
-#
-# 为DefaultHeaderMiddleware配置默认请求头
+# 默认请求头
 # DEFAULT_REQUEST_HEADERS = {}
 
 # 允许的域名
-# 为OffsiteMiddleware配置允许的域名
 # ALLOWED_DOMAINS = []
 
 # 数据管道

@@ -71,6 +69,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
+LOG_MAX_BYTES = 20 * 1024 * 1024 # 20MB,推荐值
+LOG_BACKUP_COUNT = 10 # 10个备份文件,推荐值
+# 如果不想要日志轮转,可以设置 LOG_MAX_BYTES = 0
+# 当LOG_MAX_BYTES或LOG_BACKUP_COUNT为0时,日志轮转将被禁用,文件会持续增长
 STATS_DUMP = True
 
 # 输出配置

@@ -11,7 +11,7 @@
 # 项目基本信息
 PROJECT_NAME = '{{project_name}}'
 
-#
+# 运行模式:standalone/distributed/auto
 RUN_MODE = 'standalone'
 
 # 并发配置

@@ -21,16 +21,16 @@ DOWNLOAD_DELAY = 1.0
 
 # =================================== 核心组件配置 ===================================
 
-#
+# 下载器:AioHttpDownloader/HttpXDownloader/CurlCffiDownloader
 DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
 
-#
+# 队列类型:memory/redis/auto
 QUEUE_TYPE = 'memory'
 
-#
+# 去重过滤器:MemoryFilter/AioRedisFilter
 FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
 
-#
+# 默认去重管道:MemoryDedupPipeline/RedisDedupPipeline
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
 
 # =================================== 爬虫配置 ===================================

@@ -38,12 +38,10 @@ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipe
 # 爬虫模块配置
 SPIDER_MODULES = ['{{project_name}}.spiders']
 
-#
-# 为DefaultHeaderMiddleware配置默认请求头
+# 默认请求头
 # DEFAULT_REQUEST_HEADERS = {}
 
 # 允许的域名
-# 为OffsiteMiddleware配置允许的域名
 # ALLOWED_DOMAINS = []
 
 # 数据管道

@@ -71,6 +69,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
+LOG_MAX_BYTES = 20 * 1024 * 1024 # 20MB,推荐值
+LOG_BACKUP_COUNT = 10 # 10个备份文件,推荐值
+# 如果不想要日志轮转,可以设置 LOG_MAX_BYTES = 0
+# 当LOG_MAX_BYTES或LOG_BACKUP_COUNT为0时,日志轮转将被禁用,文件会持续增长
 STATS_DUMP = True
 
 # 输出配置

@@ -117,28 +119,25 @@ MONGO_USE_BATCH = False # 是否启用批量插入
 # =================================== 网络配置 ===================================
 
 # 代理配置
-# 代理功能默认不启用,如需使用请在项目配置文件中启用并配置相关参数
 PROXY_ENABLED = False # 是否启用代理
 
-#
-PROXY_LIST = [] #
+# 简单代理(SimpleProxyMiddleware)
+PROXY_LIST = [] # 代理列表
 
-#
-PROXY_API_URL = "" #
+# 动态代理(ProxyMiddleware)
+PROXY_API_URL = "" # 代理API地址
 
-#
-# 示例: "proxy" 适用于 {"proxy": "http://1.1.1.1:8080"}
-# 示例: "data.proxy" 适用于 {"data": {"proxy": "http://1.1.1.1:8080"}}
+# 代理提取方式:"proxy" 或 "data.proxy"
 PROXY_EXTRACTOR = "proxy"
 
 # 代理刷新控制
 PROXY_REFRESH_INTERVAL = 60 # 代理刷新间隔(秒)
-PROXY_API_TIMEOUT = 10 #
+PROXY_API_TIMEOUT = 10 # API超时时间
 
-# 浏览器指纹模拟(仅
-CURL_BROWSER_TYPE = "chrome" #
+# 浏览器指纹模拟(仅CurlCffiDownloader有效)
+CURL_BROWSER_TYPE = "chrome" # 可选:chrome/edge/safari/firefox
 
-#
+# 自定义浏览器版本映射
 CURL_BROWSER_VERSION_MAP = {
     "chrome": "chrome136",
     "edge": "edge101",

@@ -155,11 +154,11 @@ HEALTH_CHECK_INTERVAL = 60 # 健康检查间隔(秒)
 REQUEST_STATS_ENABLED = True # 是否启用请求统计
 STATS_RESET_ON_START = False # 启动时是否重置统计
 
-# HttpX
+# HttpX专用配置
 HTTPX_HTTP2 = True # 是否启用HTTP/2支持
 HTTPX_FOLLOW_REDIRECTS = True # 是否自动跟随重定向
 
-# AioHttp
+# AioHttp专用配置
 AIOHTTP_AUTO_DECOMPRESS = True # 是否自动解压响应
 AIOHTTP_FORCE_CLOSE = False # 是否强制关闭连接
 

@@ -168,7 +167,6 @@ CONNECTION_TTL_DNS_CACHE = 300 # DNS缓存TTL(秒)
 CONNECTION_KEEPALIVE_TIMEOUT = 15 # Keep-Alive超时(秒)
 
 # 内存监控配置
-# 内存监控扩展默认不启用,如需使用请在项目配置文件中启用
 MEMORY_MONITOR_ENABLED = False # 是否启用内存监控
 MEMORY_MONITOR_INTERVAL = 60 # 内存监控检查间隔(秒)
 MEMORY_WARNING_THRESHOLD = 80.0 # 内存使用率警告阈值(百分比)
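All of the settings templates above now set LOG_MAX_BYTES and LOG_BACKUP_COUNT explicitly, and note that rotation is disabled (and the log file grows without bound) when either value is 0. As a hedged sketch only: this is how such values conventionally map onto Python's standard rotating handler; crawlo's own handler wiring lives in crawlo/logging/config.py, which is not shown in this diff.

# Illustrative only: standard-library equivalent of the template's
# LOG_MAX_BYTES / LOG_BACKUP_COUNT semantics (assumed, not crawlo's code).
import logging
import os
from logging.handlers import RotatingFileHandler

LOG_FILE = 'logs/myproject.log'      # hypothetical project name
LOG_ENCODING = 'utf-8'
LOG_MAX_BYTES = 20 * 1024 * 1024     # 20MB, the recommended template default
LOG_BACKUP_COUNT = 10                # keep 10 rotated backups

os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

if LOG_MAX_BYTES and LOG_BACKUP_COUNT:
    # Rotation enabled: roll over at LOG_MAX_BYTES, keep LOG_BACKUP_COUNT files.
    handler = RotatingFileHandler(
        LOG_FILE, maxBytes=LOG_MAX_BYTES,
        backupCount=LOG_BACKUP_COUNT, encoding=LOG_ENCODING,
    )
else:
    # Either value set to 0 disables rotation; the file keeps growing.
    handler = logging.FileHandler(LOG_FILE, encoding=LOG_ENCODING)

logging.getLogger('crawlo').addHandler(handler)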
crawlo/templates/run.py.tmpl
CHANGED
@@ -1,8 +1,6 @@
 # -*- coding: UTF-8 -*-
 """
-{{
-=======================================
-由 `crawlo genspider` 命令生成的爬虫。
+爬虫:{{spider_name}}
 """
 
 from crawlo.spider import Spider

@@ -11,9 +9,7 @@ from ..items import {{item_class}}
 
 
 class {{class_name}}(Spider):
-    """
-    爬虫:{{spider_name}}
-    """
+    """{{spider_name}} 爬虫"""
     name = '{{spider_name}}'
     allowed_domains = ['{{domain}}']
     start_urls = ['https://{{domain}}/']

@@ -22,16 +18,12 @@ class {{class_name}}(Spider):
     custom_settings = {}
 
     def start_requests(self):
-        """
-        生成初始请求。
-        """
+        """生成初始请求"""
         for url in self.start_urls:
             yield Request(url=url, callback=self.parse)
 
     def parse(self, response):
-        """
-        解析响应的主方法。
-        """
+        """解析响应"""
         self.logger.info(f'正在解析页面: {response.url}')
 
         yield {
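For reference, a spider rendered from the trimmed 1.4.8 template would look roughly like the sketch below. The placeholder values ('news', 'example.com') and the Request import path are illustrative assumptions; only the template body shown in the hunks above is taken from the package.

# Hypothetical rendering of the template with {{spider_name}} = 'news'
# and {{domain}} = 'example.com'; the Request import path is assumed.
from crawlo.spider import Spider
from crawlo import Request


class NewsSpider(Spider):
    """news 爬虫"""
    name = 'news'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/']

    custom_settings = {}

    def start_requests(self):
        """生成初始请求"""
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        """解析响应"""
        self.logger.info(f'正在解析页面: {response.url}')
        yield {'url': response.url}  # item fields beyond this hunk are not shown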
crawlo/tools/__init__.py
CHANGED
@@ -35,60 +35,6 @@ from .text_cleaner import (
     extract_urls
 )
 
-from .data_formatter import (
-    DataFormatter,
-    format_number,
-    format_currency,
-    format_percentage,
-    format_phone_number,
-    format_chinese_id_card,
-    capitalize_words
-)
-
-from .encoding_converter import (
-    EncodingConverter,
-    detect_encoding,
-    to_utf8,
-    convert_encoding
-)
-
-# 数据验证工具封装
-from .data_validator import (
-    DataValidator,
-    validate_email,
-    validate_phone,
-    validate_url,
-    validate_chinese_id_card,
-    validate_date,
-    validate_number_range,
-    check_data_integrity
-)
-
-# 请求处理工具
-from .request_tools import (
-    build_url,
-    add_query_params,
-    merge_headers
-)
-
-# 重试机制封装
-from .retry_mechanism import (
-    RetryMechanism,
-    retry,
-    should_retry,
-    exponential_backoff
-)
-
-# 带认证代理工具
-from .authenticated_proxy import (
-    AuthenticatedProxy,
-    create_proxy_config,
-    format_proxy_for_request,
-    parse_proxy_url,
-    validate_proxy_url,
-    get_proxy_info
-)
-
 # 分布式协调工具
 from .distributed_coordinator import (
     TaskDistributor,

@@ -118,8 +64,6 @@ __all__ = [
 
     # 数据清洗工具
     "TextCleaner",
-    "DataFormatter",
-    "EncodingConverter",
     "remove_html_tags",
     "decode_html_entities",
     "remove_extra_whitespace",

@@ -129,53 +73,6 @@ __all__ = [
     "extract_numbers",
     "extract_emails",
     "extract_urls",
-    "format_number",
-    "format_currency",
-    "format_percentage",
-    "format_phone_number",
-    "format_chinese_id_card",
-    "capitalize_words",
-    "detect_encoding",
-    "to_utf8",
-    "convert_encoding",
-
-    # 数据验证工具
-    "DataValidator",
-    "validate_email",
-    "validate_phone",
-    "validate_url",
-    "validate_chinese_id_card",
-    "validate_date",
-    "validate_number_range",
-    "check_data_integrity",
-
-    # 请求处理工具
-    "build_url",
-    "add_query_params",
-    "merge_headers",
-
-    # 重试机制封装
-    "RetryMechanism",
-    "retry",
-    "should_retry",
-    "exponential_backoff",
-
-    # 反爬虫应对工具
-    "ProxyPoolManager",
-    "CaptchaHandler",
-    "AntiCrawler",
-    "get_random_user_agent",
-    "rotate_proxy",
-    "handle_captcha",
-    "detect_rate_limiting",
-
-    # 带认证代理工具
-    "AuthenticatedProxy",
-    "create_proxy_config",
-    "format_proxy_for_request",
-    "parse_proxy_url",
-    "validate_proxy_url",
-    "get_proxy_info",
 
     # 分布式协调工具
     "TaskDistributor",
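crawlo.tools no longer re-exports the data_formatter, encoding_converter, data_validator, request_tools, retry_mechanism, and authenticated_proxy helpers, and the corresponding modules are deleted in 1.4.8 (see the file list above). A hedged sketch of how a project still importing one of the removed names might guard against the upgrade; the fallback is a local stand-in, not an official replacement shipped by the framework.

# Guarded import for a helper that existed in crawlo <= 1.4.6 but is gone
# from crawlo.tools in 1.4.8. The fallback is a hypothetical local substitute.
try:
    from crawlo.tools import validate_url      # re-exported in 1.4.6
except ImportError:
    from urllib.parse import urlparse

    def validate_url(url: str) -> bool:
        """Local stand-in: accept http(s) URLs that have a network location."""
        parsed = urlparse(url)
        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

print(validate_url('https://example.com/'))    # -> True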