crawlo-1.4.3-py3-none-any.whl → crawlo-1.4.4-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +52 -17
- crawlo/crawler.py +6 -0
- crawlo/queue/pqueue.py +2 -6
- crawlo/queue/queue_manager.py +1 -2
- crawlo/settings/default_settings.py +11 -30
- crawlo/templates/project/settings.py.tmpl +51 -65
- crawlo/templates/project/settings_distributed.py.tmpl +59 -67
- crawlo/templates/project/settings_gentle.py.tmpl +45 -40
- crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
- crawlo/templates/project/settings_minimal.py.tmpl +37 -26
- crawlo/templates/project/settings_simple.py.tmpl +45 -40
- crawlo/templates/run.py.tmpl +3 -7
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/METADATA +1 -1
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/RECORD +21 -27
- tests/test_multi_directory.py +68 -0
- tests/test_multiple_spider_modules.py +81 -0
- tests/test_spider_modules.py +85 -0
- examples/test_project/__init__.py +0 -7
- examples/test_project/run.py +0 -35
- examples/test_project/test_project/__init__.py +0 -4
- examples/test_project/test_project/items.py +0 -18
- examples/test_project/test_project/middlewares.py +0 -119
- examples/test_project/test_project/pipelines.py +0 -97
- examples/test_project/test_project/settings.py +0 -170
- examples/test_project/test_project/spiders/__init__.py +0 -10
- examples/test_project/test_project/spiders/of_week_dis.py +0 -144
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/WHEEL +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/top_level.txt +0 -0
|
@@ -4,86 +4,77 @@
|
|
|
4
4
|
=============================
|
|
5
5
|
基于 Crawlo 框架的分布式爬虫项目配置。
|
|
6
6
|
适合大规模数据采集和多节点部署。
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import os
|
|
10
|
-
|
|
11
|
-
# ============================== 项目基本信息 ==============================
|
|
12
|
-
PROJECT_NAME = '{{project_name}}'
|
|
13
|
-
|
|
14
|
-
# ============================== 运行模式 ==============================
|
|
15
|
-
RUN_MODE = 'distributed'
|
|
16
7
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
DOWNLOAD_DELAY = 1.0
|
|
8
|
+
此配置使用 CrawloConfig.distributed() 工厂方法创建分布式模式配置,
|
|
9
|
+
支持多节点协同工作,适用于大规模数据采集任务。
|
|
10
|
+
"""
|
|
21
11
|
|
|
22
|
-
|
|
23
|
-
# 可选下载器:
|
|
24
|
-
# DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
|
|
25
|
-
# DOWNLOADER = 'crawlo.downloader.httpx_downloader.HttpXDownloader'
|
|
26
|
-
# DOWNLOADER = 'crawlo.downloader.cffi_downloader.CurlCffiDownloader'
|
|
27
|
-
DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
|
|
12
|
+
from crawlo.config import CrawloConfig
|
|
28
13
|
|
|
29
|
-
#
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
14
|
+
# 使用分布式模式配置工厂创建配置
|
|
15
|
+
config = CrawloConfig.distributed(
|
|
16
|
+
project_name='{{project_name}}',
|
|
17
|
+
redis_host='127.0.0.1',
|
|
18
|
+
redis_port=6379,
|
|
19
|
+
redis_password='',
|
|
20
|
+
redis_db=0,
|
|
21
|
+
concurrency=16,
|
|
22
|
+
download_delay=1.0
|
|
23
|
+
)
|
|
34
24
|
|
|
35
|
-
#
|
|
36
|
-
|
|
25
|
+
# 将配置转换为当前模块的全局变量
|
|
26
|
+
locals().update(config.to_dict())
|
|
37
27
|
|
|
38
|
-
#
|
|
39
|
-
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
|
|
28
|
+
# =================================== 爬虫配置 ===================================
|
|
40
29
|
|
|
41
|
-
#
|
|
30
|
+
# 爬虫模块配置
|
|
42
31
|
SPIDER_MODULES = ['{{project_name}}.spiders']
|
|
43
32
|
|
|
44
|
-
#
|
|
45
|
-
# MIDDLEWARES = [
|
|
46
|
-
# 'crawlo.middleware.simple_proxy.SimpleProxyMiddleware',
|
|
47
|
-
# ]
|
|
48
|
-
|
|
49
|
-
# ============================== 默认请求头配置 ==============================
|
|
33
|
+
# 默认请求头配置
|
|
50
34
|
# 为DefaultHeaderMiddleware配置默认请求头
|
|
51
|
-
DEFAULT_REQUEST_HEADERS = {
|
|
52
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
53
|
-
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
|
54
|
-
'Accept-Encoding': 'gzip, deflate, br',
|
|
55
|
-
}
|
|
35
|
+
# DEFAULT_REQUEST_HEADERS = {}
|
|
56
36
|
|
|
57
|
-
#
|
|
37
|
+
# 允许的域名
|
|
58
38
|
# 为OffsiteMiddleware配置允许的域名
|
|
59
|
-
# ALLOWED_DOMAINS = [
|
|
39
|
+
# ALLOWED_DOMAINS = []
|
|
60
40
|
|
|
61
|
-
#
|
|
41
|
+
# 数据管道
|
|
42
|
+
# 如需添加自定义管道,请取消注释并添加
|
|
62
43
|
# PIPELINES = [
|
|
63
|
-
# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
|
|
44
|
+
# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline', # MySQL 存储(使用asyncmy异步库)
|
|
45
|
+
# # '{{project_name}}.pipelines.CustomPipeline', # 用户自定义管道示例
|
|
64
46
|
# ]
|
|
65
47
|
|
|
66
|
-
#
|
|
48
|
+
# =================================== 系统配置 ===================================
|
|
49
|
+
|
|
50
|
+
# 扩展组件
|
|
51
|
+
# 如需添加自定义扩展,请取消注释并添加
|
|
67
52
|
# EXTENSIONS = [
|
|
68
|
-
# '
|
|
69
|
-
# 'crawlo.extension.log_stats.LogStats',
|
|
70
|
-
# 'crawlo.extension.logging_extension.CustomLoggerExtension',
|
|
53
|
+
# # '{{project_name}}.extensions.CustomExtension', # 用户自定义扩展示例
|
|
71
54
|
# ]
|
|
72
55
|
|
|
73
|
-
#
|
|
56
|
+
# 中间件
|
|
57
|
+
# 如需添加自定义中间件,请取消注释并添加
|
|
58
|
+
# MIDDLEWARES = [
|
|
59
|
+
# # '{{project_name}}.middlewares.CustomMiddleware', # 用户自定义中间件示例
|
|
60
|
+
# ]
|
|
61
|
+
|
|
62
|
+
# 日志配置
|
|
74
63
|
LOG_LEVEL = 'INFO'
|
|
75
64
|
LOG_FILE = 'logs/{{project_name}}.log'
|
|
76
65
|
LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
|
|
77
66
|
STATS_DUMP = True
|
|
78
67
|
|
|
79
|
-
#
|
|
68
|
+
# 输出配置
|
|
80
69
|
OUTPUT_DIR = 'output'
|
|
81
70
|
|
|
82
|
-
#
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
71
|
+
# =================================== 数据库配置 ===================================
|
|
72
|
+
|
|
73
|
+
# Redis配置
|
|
74
|
+
REDIS_HOST = '127.0.0.1'
|
|
75
|
+
REDIS_PORT = 6379
|
|
76
|
+
REDIS_PASSWORD = ''
|
|
77
|
+
REDIS_DB = 0
|
|
87
78
|
|
|
88
79
|
# 根据是否有密码生成 URL
|
|
89
80
|
if REDIS_PASSWORD:
|
|
@@ -91,18 +82,18 @@ if REDIS_PASSWORD:
|
|
|
91
82
|
else:
|
|
92
83
|
REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
|
|
93
84
|
|
|
94
|
-
#
|
|
95
|
-
MYSQL_HOST =
|
|
96
|
-
MYSQL_PORT =
|
|
97
|
-
MYSQL_USER =
|
|
98
|
-
MYSQL_PASSWORD =
|
|
99
|
-
MYSQL_DB =
|
|
85
|
+
# MySQL配置
|
|
86
|
+
MYSQL_HOST = '127.0.0.1'
|
|
87
|
+
MYSQL_PORT = 3306
|
|
88
|
+
MYSQL_USER = 'root'
|
|
89
|
+
MYSQL_PASSWORD = '123456'
|
|
90
|
+
MYSQL_DB = '{{project_name}}'
|
|
100
91
|
MYSQL_TABLE = '{{project_name}}_data'
|
|
101
92
|
MYSQL_BATCH_SIZE = 100
|
|
102
93
|
MYSQL_USE_BATCH = True # 是否启用批量插入
|
|
103
94
|
|
|
104
|
-
#
|
|
105
|
-
MONGO_URI =
|
|
95
|
+
# MongoDB配置
|
|
96
|
+
MONGO_URI = 'mongodb://localhost:27017'
|
|
106
97
|
MONGO_DATABASE = '{{project_name}}_db'
|
|
107
98
|
MONGO_COLLECTION = '{{project_name}}_items'
|
|
108
99
|
MONGO_MAX_POOL_SIZE = 200
|
|
@@ -110,7 +101,9 @@ MONGO_MIN_POOL_SIZE = 20
|
|
|
110
101
|
MONGO_BATCH_SIZE = 100 # 批量插入条数
|
|
111
102
|
MONGO_USE_BATCH = True # 是否启用批量插入
|
|
112
103
|
|
|
113
|
-
#
|
|
104
|
+
# =================================== 网络配置 ===================================
|
|
105
|
+
|
|
106
|
+
# 代理配置
|
|
114
107
|
# 代理功能默认不启用,如需使用请在项目配置文件中启用并配置相关参数
|
|
115
108
|
PROXY_ENABLED = False # 是否启用代理
|
|
116
109
|
|
|
@@ -129,7 +122,6 @@ PROXY_EXTRACTOR = "proxy"
|
|
|
129
122
|
PROXY_REFRESH_INTERVAL = 60 # 代理刷新间隔(秒)
|
|
130
123
|
PROXY_API_TIMEOUT = 10 # 请求代理 API 超时时间
|
|
131
124
|
|
|
132
|
-
# ============================== Curl-Cffi 特有配置 ==============================
|
|
133
125
|
# 浏览器指纹模拟(仅 CurlCffi 下载器有效)
|
|
134
126
|
CURL_BROWSER_TYPE = "chrome" # 可选: chrome, edge, safari, firefox 或版本如 chrome136
|
|
135
127
|
|
|
@@ -141,7 +133,7 @@ CURL_BROWSER_VERSION_MAP = {
|
|
|
141
133
|
"firefox": "firefox135",
|
|
142
134
|
}
|
|
143
135
|
|
|
144
|
-
#
|
|
136
|
+
# 下载器优化配置
|
|
145
137
|
# 下载器健康检查
|
|
146
138
|
DOWNLOADER_HEALTH_CHECK = True # 是否启用下载器健康检查
|
|
147
139
|
HEALTH_CHECK_INTERVAL = 60 # 健康检查间隔(秒)
|
|
@@ -162,7 +154,7 @@ AIOHTTP_FORCE_CLOSE = False # 是否强制关闭连接
|
|
|
162
154
|
CONNECTION_TTL_DNS_CACHE = 300 # DNS缓存TTL(秒)
|
|
163
155
|
CONNECTION_KEEPALIVE_TIMEOUT = 15 # Keep-Alive超时(秒)
|
|
164
156
|
|
|
165
|
-
#
|
|
157
|
+
# 内存监控配置
|
|
166
158
|
# 内存监控扩展默认不启用,如需使用请在项目配置文件中启用
|
|
167
159
|
MEMORY_MONITOR_ENABLED = False # 是否启用内存监控
|
|
168
160
|
MEMORY_MONITOR_INTERVAL = 60 # 内存监控检查间隔(秒)
|
|
@@ -6,77 +6,81 @@
|
|
|
6
6
|
对目标网站友好,适合长期运行。
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
#
|
|
9
|
+
# =================================== 基础配置 ===================================
|
|
10
|
+
|
|
11
|
+
# 项目基本信息
|
|
10
12
|
PROJECT_NAME = '{{project_name}}'
|
|
11
13
|
|
|
12
|
-
#
|
|
14
|
+
# 运行模式
|
|
13
15
|
RUN_MODE = 'standalone'
|
|
14
16
|
|
|
15
|
-
#
|
|
17
|
+
# 并发配置
|
|
16
18
|
CONCURRENCY = 2
|
|
17
19
|
MAX_RUNNING_SPIDERS = 1
|
|
18
20
|
DOWNLOAD_DELAY = 3.0 # 增加延迟以降低目标网站压力
|
|
19
21
|
RANDOMNESS = True # 启用随机延迟
|
|
20
22
|
RANDOM_RANGE = [0.5, 1.5] # 随机延迟范围因子
|
|
21
23
|
|
|
22
|
-
#
|
|
23
|
-
|
|
24
|
-
#
|
|
25
|
-
# DOWNLOADER = 'crawlo.downloader.httpx_downloader.HttpXDownloader'
|
|
26
|
-
# DOWNLOADER = 'crawlo.downloader.cffi_downloader.CurlCffiDownloader'
|
|
24
|
+
# =================================== 核心组件配置 ===================================
|
|
25
|
+
|
|
26
|
+
# 下载器配置
|
|
27
27
|
DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
|
|
28
28
|
|
|
29
|
-
#
|
|
29
|
+
# 队列配置
|
|
30
30
|
QUEUE_TYPE = 'memory'
|
|
31
31
|
|
|
32
|
-
#
|
|
32
|
+
# 去重过滤器
|
|
33
33
|
FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
|
|
34
34
|
|
|
35
|
-
#
|
|
35
|
+
# 默认去重管道
|
|
36
36
|
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
|
|
37
37
|
|
|
38
|
-
#
|
|
39
|
-
SPIDER_MODULES = ['{{project_name}}.spiders']
|
|
38
|
+
# =================================== 爬虫配置 ===================================
|
|
40
39
|
|
|
41
|
-
#
|
|
42
|
-
|
|
43
|
-
# 'crawlo.middleware.simple_proxy.SimpleProxyMiddleware',
|
|
44
|
-
# ]
|
|
40
|
+
# 爬虫模块配置
|
|
41
|
+
SPIDER_MODULES = ['{{project_name}}.spiders']
|
|
45
42
|
|
|
46
|
-
#
|
|
43
|
+
# 默认请求头配置
|
|
47
44
|
# 为DefaultHeaderMiddleware配置默认请求头
|
|
48
|
-
DEFAULT_REQUEST_HEADERS = {
|
|
49
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
50
|
-
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
|
51
|
-
'Accept-Encoding': 'gzip, deflate, br',
|
|
52
|
-
}
|
|
45
|
+
# DEFAULT_REQUEST_HEADERS = {}
|
|
53
46
|
|
|
54
|
-
#
|
|
47
|
+
# 允许的域名
|
|
55
48
|
# 为OffsiteMiddleware配置允许的域名
|
|
56
|
-
# ALLOWED_DOMAINS = [
|
|
49
|
+
# ALLOWED_DOMAINS = []
|
|
57
50
|
|
|
58
|
-
#
|
|
51
|
+
# 数据管道
|
|
52
|
+
# 如需添加自定义管道,请取消注释并添加
|
|
59
53
|
# PIPELINES = [
|
|
60
|
-
# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
|
|
54
|
+
# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline', # MySQL 存储(使用asyncmy异步库)
|
|
55
|
+
# # '{{project_name}}.pipelines.CustomPipeline', # 用户自定义管道示例
|
|
61
56
|
# ]
|
|
62
57
|
|
|
63
|
-
#
|
|
58
|
+
# =================================== 系统配置 ===================================
|
|
59
|
+
|
|
60
|
+
# 扩展组件
|
|
61
|
+
# 如需添加自定义扩展,请取消注释并添加
|
|
64
62
|
# EXTENSIONS = [
|
|
65
|
-
# '
|
|
66
|
-
#
|
|
67
|
-
|
|
63
|
+
# # '{{project_name}}.extensions.CustomExtension', # 用户自定义扩展示例
|
|
64
|
+
# ]
|
|
65
|
+
|
|
66
|
+
# 中间件
|
|
67
|
+
# 如需添加自定义中间件,请取消注释并添加
|
|
68
|
+
# MIDDLEWARES = [
|
|
69
|
+
# # '{{project_name}}.middlewares.CustomMiddleware', # 用户自定义中间件示例
|
|
68
70
|
# ]
|
|
69
71
|
|
|
70
|
-
#
|
|
72
|
+
# 日志配置
|
|
71
73
|
LOG_LEVEL = 'INFO'
|
|
72
74
|
LOG_FILE = 'logs/{{project_name}}.log'
|
|
73
75
|
LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
|
|
74
76
|
STATS_DUMP = True
|
|
75
77
|
|
|
76
|
-
#
|
|
78
|
+
# 输出配置
|
|
77
79
|
OUTPUT_DIR = 'output'
|
|
78
80
|
|
|
79
|
-
#
|
|
81
|
+
# =================================== 数据库配置 ===================================
|
|
82
|
+
|
|
83
|
+
# Redis配置
|
|
80
84
|
REDIS_HOST = '127.0.0.1'
|
|
81
85
|
REDIS_PORT = 6379
|
|
82
86
|
REDIS_PASSWORD = ''
|
|
@@ -88,7 +92,7 @@ if REDIS_PASSWORD:
|
|
|
88
92
|
else:
|
|
89
93
|
REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
|
|
90
94
|
|
|
91
|
-
#
|
|
95
|
+
# MySQL配置
|
|
92
96
|
MYSQL_HOST = '127.0.0.1'
|
|
93
97
|
MYSQL_PORT = 3306
|
|
94
98
|
MYSQL_USER = 'root'
|
|
@@ -98,7 +102,7 @@ MYSQL_TABLE = '{{project_name}}_data'
|
|
|
98
102
|
MYSQL_BATCH_SIZE = 100
|
|
99
103
|
MYSQL_USE_BATCH = False # 是否启用批量插入
|
|
100
104
|
|
|
101
|
-
#
|
|
105
|
+
# MongoDB配置
|
|
102
106
|
MONGO_URI = 'mongodb://localhost:27017'
|
|
103
107
|
MONGO_DATABASE = '{{project_name}}_db'
|
|
104
108
|
MONGO_COLLECTION = '{{project_name}}_items'
|
|
@@ -107,7 +111,9 @@ MONGO_MIN_POOL_SIZE = 20
|
|
|
107
111
|
MONGO_BATCH_SIZE = 100 # 批量插入条数
|
|
108
112
|
MONGO_USE_BATCH = False # 是否启用批量插入
|
|
109
113
|
|
|
110
|
-
#
|
|
114
|
+
# =================================== 网络配置 ===================================
|
|
115
|
+
|
|
116
|
+
# 代理配置
|
|
111
117
|
# 代理功能默认不启用,如需使用请在项目配置文件中启用并配置相关参数
|
|
112
118
|
PROXY_ENABLED = False # 是否启用代理
|
|
113
119
|
|
|
@@ -126,7 +132,6 @@ PROXY_EXTRACTOR = "proxy"
|
|
|
126
132
|
PROXY_REFRESH_INTERVAL = 60 # 代理刷新间隔(秒)
|
|
127
133
|
PROXY_API_TIMEOUT = 10 # 请求代理 API 超时时间
|
|
128
134
|
|
|
129
|
-
# ============================== Curl-Cffi 特有配置 ==============================
|
|
130
135
|
# 浏览器指纹模拟(仅 CurlCffi 下载器有效)
|
|
131
136
|
CURL_BROWSER_TYPE = "chrome" # 可选: chrome, edge, safari, firefox 或版本如 chrome136
|
|
132
137
|
|
|
@@ -138,7 +143,7 @@ CURL_BROWSER_VERSION_MAP = {
|
|
|
138
143
|
"firefox": "firefox135",
|
|
139
144
|
}
|
|
140
145
|
|
|
141
|
-
#
|
|
146
|
+
# 下载器优化配置
|
|
142
147
|
# 下载器健康检查
|
|
143
148
|
DOWNLOADER_HEALTH_CHECK = True # 是否启用下载器健康检查
|
|
144
149
|
HEALTH_CHECK_INTERVAL = 60 # 健康检查间隔(秒)
|
|
@@ -159,7 +164,7 @@ AIOHTTP_FORCE_CLOSE = False # 是否强制关闭连接
|
|
|
159
164
|
CONNECTION_TTL_DNS_CACHE = 300 # DNS缓存TTL(秒)
|
|
160
165
|
CONNECTION_KEEPALIVE_TIMEOUT = 15 # Keep-Alive超时(秒)
|
|
161
166
|
|
|
162
|
-
#
|
|
167
|
+
# 内存监控配置
|
|
163
168
|
# 内存监控扩展默认不启用,如需使用请在项目配置文件中启用
|
|
164
169
|
MEMORY_MONITOR_ENABLED = False # 是否启用内存监控
|
|
165
170
|
MEMORY_MONITOR_INTERVAL = 60 # 内存监控检查间隔(秒)
|
|
@@ -6,78 +6,82 @@
|
|
|
6
6
|
适合大规模高并发场景。
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
#
|
|
9
|
+
# =================================== 基础配置 ===================================
|
|
10
|
+
|
|
11
|
+
# 项目基本信息
|
|
10
12
|
PROJECT_NAME = '{{project_name}}'
|
|
11
13
|
|
|
12
|
-
#
|
|
14
|
+
# 运行模式
|
|
13
15
|
RUN_MODE = 'standalone'
|
|
14
16
|
|
|
15
|
-
#
|
|
17
|
+
# 并发配置
|
|
16
18
|
CONCURRENCY = 32
|
|
17
19
|
MAX_RUNNING_SPIDERS = 10
|
|
18
20
|
DOWNLOAD_DELAY = 0.1
|
|
19
21
|
RANDOMNESS = False # 禁用随机延迟以保证性能
|
|
20
22
|
|
|
21
|
-
#
|
|
22
|
-
|
|
23
|
-
#
|
|
24
|
-
# DOWNLOADER = 'crawlo.downloader.httpx_downloader.HttpXDownloader'
|
|
25
|
-
# DOWNLOADER = 'crawlo.downloader.cffi_downloader.CurlCffiDownloader'
|
|
23
|
+
# =================================== 核心组件配置 ===================================
|
|
24
|
+
|
|
25
|
+
# 下载器配置
|
|
26
26
|
DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
|
|
27
27
|
|
|
28
|
-
#
|
|
28
|
+
# 队列配置
|
|
29
29
|
QUEUE_TYPE = 'auto'
|
|
30
30
|
|
|
31
|
-
#
|
|
31
|
+
# 去重过滤器
|
|
32
32
|
# 高性能模式下,如果Redis可用则使用Redis去重,否则使用内存去重
|
|
33
33
|
FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
|
|
34
34
|
|
|
35
|
-
#
|
|
35
|
+
# 默认去重管道
|
|
36
36
|
# 高性能模式下,如果Redis可用则使用Redis去重,否则使用内存去重
|
|
37
37
|
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
|
|
38
38
|
|
|
39
|
-
#
|
|
40
|
-
SPIDER_MODULES = ['{{project_name}}.spiders']
|
|
39
|
+
# =================================== 爬虫配置 ===================================
|
|
41
40
|
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
# 'crawlo.middleware.simple_proxy.SimpleProxyMiddleware',
|
|
45
|
-
# ]
|
|
41
|
+
# 爬虫模块配置
|
|
42
|
+
SPIDER_MODULES = ['{{project_name}}.spiders']
|
|
46
43
|
|
|
47
|
-
#
|
|
44
|
+
# 默认请求头配置
|
|
48
45
|
# 为DefaultHeaderMiddleware配置默认请求头
|
|
49
|
-
DEFAULT_REQUEST_HEADERS = {
|
|
50
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
51
|
-
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
|
52
|
-
'Accept-Encoding': 'gzip, deflate, br',
|
|
53
|
-
}
|
|
46
|
+
# DEFAULT_REQUEST_HEADERS = {}
|
|
54
47
|
|
|
55
|
-
#
|
|
48
|
+
# 允许的域名
|
|
56
49
|
# 为OffsiteMiddleware配置允许的域名
|
|
57
|
-
# ALLOWED_DOMAINS = [
|
|
50
|
+
# ALLOWED_DOMAINS = []
|
|
58
51
|
|
|
59
|
-
#
|
|
52
|
+
# 数据管道
|
|
53
|
+
# 如需添加自定义管道,请取消注释并添加
|
|
60
54
|
# PIPELINES = [
|
|
61
|
-
# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
|
|
55
|
+
# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline', # MySQL 存储(使用asyncmy异步库)
|
|
56
|
+
# # '{{project_name}}.pipelines.CustomPipeline', # 用户自定义管道示例
|
|
62
57
|
# ]
|
|
63
58
|
|
|
64
|
-
#
|
|
59
|
+
# =================================== 系统配置 ===================================
|
|
60
|
+
|
|
61
|
+
# 扩展组件
|
|
62
|
+
# 如需添加自定义扩展,请取消注释并添加
|
|
65
63
|
# EXTENSIONS = [
|
|
66
|
-
# '
|
|
67
|
-
#
|
|
68
|
-
|
|
64
|
+
# # '{{project_name}}.extensions.CustomExtension', # 用户自定义扩展示例
|
|
65
|
+
# ]
|
|
66
|
+
|
|
67
|
+
# 中间件
|
|
68
|
+
# 如需添加自定义中间件,请取消注释并添加
|
|
69
|
+
# MIDDLEWARES = [
|
|
70
|
+
# # '{{project_name}}.middlewares.CustomMiddleware', # 用户自定义中间件示例
|
|
69
71
|
# ]
|
|
70
72
|
|
|
71
|
-
#
|
|
73
|
+
# 日志配置
|
|
72
74
|
LOG_LEVEL = 'INFO'
|
|
73
75
|
LOG_FILE = 'logs/{{project_name}}.log'
|
|
74
76
|
LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
|
|
75
77
|
STATS_DUMP = True
|
|
76
78
|
|
|
77
|
-
#
|
|
79
|
+
# 输出配置
|
|
78
80
|
OUTPUT_DIR = 'output'
|
|
79
81
|
|
|
80
|
-
#
|
|
82
|
+
# =================================== 数据库配置 ===================================
|
|
83
|
+
|
|
84
|
+
# Redis配置
|
|
81
85
|
REDIS_HOST = '127.0.0.1'
|
|
82
86
|
REDIS_PORT = 6379
|
|
83
87
|
REDIS_PASSWORD = ''
|
|
@@ -89,7 +93,7 @@ if REDIS_PASSWORD:
|
|
|
89
93
|
else:
|
|
90
94
|
REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
|
|
91
95
|
|
|
92
|
-
#
|
|
96
|
+
# MySQL配置
|
|
93
97
|
MYSQL_HOST = '127.0.0.1'
|
|
94
98
|
MYSQL_PORT = 3306
|
|
95
99
|
MYSQL_USER = 'root'
|
|
@@ -99,7 +103,7 @@ MYSQL_TABLE = '{{project_name}}_data'
|
|
|
99
103
|
MYSQL_BATCH_SIZE = 100
|
|
100
104
|
MYSQL_USE_BATCH = True # 是否启用批量插入
|
|
101
105
|
|
|
102
|
-
#
|
|
106
|
+
# MongoDB配置
|
|
103
107
|
MONGO_URI = 'mongodb://localhost:27017'
|
|
104
108
|
MONGO_DATABASE = '{{project_name}}_db'
|
|
105
109
|
MONGO_COLLECTION = '{{project_name}}_items'
|
|
@@ -108,7 +112,9 @@ MONGO_MIN_POOL_SIZE = 20
|
|
|
108
112
|
MONGO_BATCH_SIZE = 100 # 批量插入条数
|
|
109
113
|
MONGO_USE_BATCH = True # 是否启用批量插入
|
|
110
114
|
|
|
111
|
-
#
|
|
115
|
+
# =================================== 网络配置 ===================================
|
|
116
|
+
|
|
117
|
+
# 代理配置
|
|
112
118
|
# 代理功能默认不启用,如需使用请在项目配置文件中启用并配置相关参数
|
|
113
119
|
PROXY_ENABLED = False # 是否启用代理
|
|
114
120
|
|
|
@@ -127,7 +133,6 @@ PROXY_EXTRACTOR = "proxy"
|
|
|
127
133
|
PROXY_REFRESH_INTERVAL = 60 # 代理刷新间隔(秒)
|
|
128
134
|
PROXY_API_TIMEOUT = 10 # 请求代理 API 超时时间
|
|
129
135
|
|
|
130
|
-
# ============================== Curl-Cffi 特有配置 ==============================
|
|
131
136
|
# 浏览器指纹模拟(仅 CurlCffi 下载器有效)
|
|
132
137
|
CURL_BROWSER_TYPE = "chrome" # 可选: chrome, edge, safari, firefox 或版本如 chrome136
|
|
133
138
|
|
|
@@ -139,7 +144,7 @@ CURL_BROWSER_VERSION_MAP = {
|
|
|
139
144
|
"firefox": "firefox135",
|
|
140
145
|
}
|
|
141
146
|
|
|
142
|
-
#
|
|
147
|
+
# 下载器优化配置
|
|
143
148
|
# 下载器健康检查
|
|
144
149
|
DOWNLOADER_HEALTH_CHECK = True # 是否启用下载器健康检查
|
|
145
150
|
HEALTH_CHECK_INTERVAL = 30 # 健康检查间隔(秒)
|
|
@@ -160,7 +165,7 @@ AIOHTTP_FORCE_CLOSE = False # 是否强制关闭连接
|
|
|
160
165
|
CONNECTION_TTL_DNS_CACHE = 300 # DNS缓存TTL(秒)
|
|
161
166
|
CONNECTION_KEEPALIVE_TIMEOUT = 15 # Keep-Alive超时(秒)
|
|
162
167
|
|
|
163
|
-
#
|
|
168
|
+
# 内存监控配置
|
|
164
169
|
# 内存监控扩展默认不启用,如需使用请在项目配置文件中启用
|
|
165
170
|
MEMORY_MONITOR_ENABLED = False # 是否启用内存监控
|
|
166
171
|
MEMORY_MONITOR_INTERVAL = 60 # 内存监控检查间隔(秒)
|
|
@@ -6,61 +6,72 @@
|
|
|
6
6
|
仅包含最基本和常用的配置项。
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
#
|
|
9
|
+
# =================================== 基础配置 ===================================
|
|
10
|
+
|
|
11
|
+
# 项目基本信息
|
|
10
12
|
PROJECT_NAME = '{{project_name}}'
|
|
11
13
|
|
|
12
|
-
#
|
|
14
|
+
# 运行模式
|
|
13
15
|
RUN_MODE = 'standalone'
|
|
14
16
|
|
|
15
|
-
#
|
|
17
|
+
# 并发配置
|
|
16
18
|
CONCURRENCY = 4
|
|
17
19
|
MAX_RUNNING_SPIDERS = 1
|
|
18
20
|
DOWNLOAD_DELAY = 1.0
|
|
19
21
|
|
|
20
|
-
#
|
|
22
|
+
# =================================== 核心组件配置 ===================================
|
|
23
|
+
|
|
24
|
+
# 下载器配置
|
|
21
25
|
DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
|
|
22
26
|
|
|
23
|
-
#
|
|
27
|
+
# 队列配置
|
|
24
28
|
QUEUE_TYPE = 'memory'
|
|
25
29
|
|
|
26
|
-
#
|
|
30
|
+
# 去重过滤器
|
|
27
31
|
FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
|
|
28
32
|
|
|
29
|
-
#
|
|
33
|
+
# 默认去重管道
|
|
30
34
|
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
|
|
31
35
|
|
|
32
|
-
#
|
|
36
|
+
# =================================== 爬虫配置 ===================================
|
|
37
|
+
|
|
38
|
+
# 爬虫模块配置
|
|
33
39
|
SPIDER_MODULES = ['{{project_name}}.spiders']
|
|
34
40
|
|
|
35
|
-
#
|
|
41
|
+
# 默认请求头配置
|
|
36
42
|
# 为DefaultHeaderMiddleware配置默认请求头
|
|
37
|
-
DEFAULT_REQUEST_HEADERS = {
|
|
38
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
39
|
-
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
|
40
|
-
'Accept-Encoding': 'gzip, deflate, br',
|
|
41
|
-
}
|
|
43
|
+
# DEFAULT_REQUEST_HEADERS = {}
|
|
42
44
|
|
|
43
|
-
#
|
|
45
|
+
# 允许的域名
|
|
44
46
|
# 为OffsiteMiddleware配置允许的域名
|
|
45
|
-
# ALLOWED_DOMAINS = [
|
|
47
|
+
# ALLOWED_DOMAINS = []
|
|
46
48
|
|
|
47
|
-
#
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
49
|
+
# 数据管道
|
|
50
|
+
# 如需添加自定义管道,请取消注释并添加
|
|
51
|
+
# PIPELINES = [
|
|
52
|
+
# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline', # MySQL 存储(使用asyncmy异步库)
|
|
53
|
+
# # '{{project_name}}.pipelines.CustomPipeline', # 用户自定义管道示例
|
|
54
|
+
# ]
|
|
51
55
|
|
|
52
|
-
#
|
|
56
|
+
# =================================== 系统配置 ===================================
|
|
57
|
+
|
|
58
|
+
# 扩展组件
|
|
59
|
+
# 如需添加自定义扩展,请取消注释并添加
|
|
53
60
|
# EXTENSIONS = [
|
|
54
|
-
# '
|
|
55
|
-
#
|
|
56
|
-
|
|
61
|
+
# # '{{project_name}}.extensions.CustomExtension', # 用户自定义扩展示例
|
|
62
|
+
# ]
|
|
63
|
+
|
|
64
|
+
# 中间件
|
|
65
|
+
# 如需添加自定义中间件,请取消注释并添加
|
|
66
|
+
# MIDDLEWARES = [
|
|
67
|
+
# # '{{project_name}}.middlewares.CustomMiddleware', # 用户自定义中间件示例
|
|
57
68
|
# ]
|
|
58
69
|
|
|
59
|
-
#
|
|
70
|
+
# 日志配置
|
|
60
71
|
LOG_LEVEL = 'INFO'
|
|
61
72
|
LOG_FILE = 'logs/{{project_name}}.log'
|
|
62
73
|
LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
|
|
63
74
|
STATS_DUMP = True
|
|
64
75
|
|
|
65
|
-
#
|
|
76
|
+
# 输出配置
|
|
66
77
|
OUTPUT_DIR = 'output'
|