aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff compares the contents of the two publicly released package versions as they appear in their public registry and is provided for informational purposes only.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/settings/default_settings.py
@@ -1,183 +1,483 @@
 """
-
+Default settings for aioscrapy.
+aioscrapy的默认设置。
+
+This module contains the default values for all settings used by aioscrapy.
+It defines configuration for downloaders, middlewares, extensions, and other
+components of the crawling system.
+此模块包含aioscrapy使用的所有设置的默认值。
+它为下载器、中间件、扩展和爬取系统的其他组件定义配置。

 For more information about these settings you can read the settings
 documentation in docs/topics/settings.rst
+有关这些设置的更多信息,您可以阅读docs/topics/settings.rst中的设置文档。

 Aioscrapy developers, if you add a setting here remember to:
+Aioscrapy开发人员,如果您在此处添加设置,请记住:

 * add it in alphabetical order
+按字母顺序添加
 * group similar settings without leaving blank lines
+分组类似设置,不留空行
 * add its documentation to the available settings documentation
+将其文档添加到可用的设置文档中
 (docs/topics/settings.rst)
-
 """

 import sys
 from os.path import join, abspath, dirname

+# Auto throttle settings
+# 自动限流设置
+
+# Whether to enable the AutoThrottle extension
+# 是否启用AutoThrottle扩展
 AUTOTHROTTLE_ENABLED = False
+
+# Whether to enable AutoThrottle debugging (displays adjustment decisions)
+# 是否启用AutoThrottle调试(显示调整决策)
 AUTOTHROTTLE_DEBUG = False
+
+# Maximum delay in seconds for throttled requests
+# 限流请求的最大延迟(秒)
 AUTOTHROTTLE_MAX_DELAY = 60.0
+
+# Initial delay in seconds for throttled requests
+# 限流请求的初始延迟(秒)
 AUTOTHROTTLE_START_DELAY = 5.0
+
+# Target average number of concurrent requests per domain
+# 每个域的目标平均并发请求数
 AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

+# Default bot name used for the User-Agent header and logging
+# 用于User-Agent头和日志记录的默认机器人名称
 BOT_NAME = 'aioscrapybot'

+# Close spider settings
+# 关闭爬虫设置
+
+# Number of seconds after which the spider will be closed
+# 爬虫将被关闭的秒数(0表示禁用)
 CLOSESPIDER_TIMEOUT = 0
+
+# Number of pages after which the spider will be closed
+# 爬虫将被关闭的页面数(0表示禁用)
 CLOSESPIDER_PAGECOUNT = 0
+
+# Number of items after which the spider will be closed
+# 爬虫将被关闭的项目数(0表示禁用)
 CLOSESPIDER_ITEMCOUNT = 0
+
+# Number of errors after which the spider will be closed
+# 爬虫将被关闭的错误数(0表示禁用)
 CLOSESPIDER_ERRORCOUNT = 0

+# Module where custom commands are defined
+# 定义自定义命令的模块
 COMMANDS_MODULE = ''

+# Number of concurrent parsers for processing responses
+# 用于处理响应的并发解析器数量
 CONCURRENT_PARSER = 1

+# Concurrency settings
+# 并发设置
+
+# Maximum number of concurrent requests across all domains
+# 所有域的最大并发请求数
 CONCURRENT_REQUESTS = 16
+
+# Maximum number of concurrent requests per domain
+# 每个域的最大并发请求数
 CONCURRENT_REQUESTS_PER_DOMAIN = 8
+
+# Maximum number of concurrent requests per IP address (0 means unlimited)
+# 每个IP地址的最大并发请求数(0表示无限制)
 CONCURRENT_REQUESTS_PER_IP = 0

+# Default headers used for all requests
+# 用于所有请求的默认头
 DEFAULT_REQUEST_HEADERS = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Language': 'en',
 }

+# Depth settings
+# 深度设置
+
+# Maximum depth to crawl (0 means no limit)
+# 爬取的最大深度(0表示无限制)
 DEPTH_LIMIT = 0
+
+# Whether to log verbose depth stats
+# 是否记录详细的深度统计信息
 DEPTH_STATS_VERBOSE = False
+
+# Priority adjustment based on depth (-1 means decrease priority with depth)
+# 基于深度的优先级调整(-1表示随着深度增加而降低优先级)
 DEPTH_PRIORITY = 0

+# Download settings
+# 下载设置
+
+# Delay in seconds between consecutive requests to the same domain
+# 对同一域的连续请求之间的延迟(秒)
 DOWNLOAD_DELAY = 0

+# Custom download handlers for different schemes (http, https, etc.)
+# 不同协议(http、https等)的自定义下载处理程序
 DOWNLOAD_HANDLERS = {}
+
+# Base download handlers for http and https
+# http和https的基本下载处理程序
 DOWNLOAD_HANDLERS_BASE = {
     'http': 'aioscrapy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler',
     'https': 'aioscrapy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler',
 }

+# Mapping of different HTTP client libraries to their download handlers
+# 不同HTTP客户端库到其下载处理程序的映射
 DOWNLOAD_HANDLERS_MAP = {
+    # aiohttp handlers (default)
+    # aiohttp处理程序(默认)
     'aiohttp': DOWNLOAD_HANDLERS_BASE,
+
+    # httpx handlers
+    # httpx处理程序
     'httpx': {
         'http': 'aioscrapy.core.downloader.handlers.httpx.HttpxDownloadHandler',
         'https': 'aioscrapy.core.downloader.handlers.httpx.HttpxDownloadHandler',
     },
+
+    # requests handlers
+    # requests处理程序
     'requests': {
         'http': 'aioscrapy.core.downloader.handlers.requests.RequestsDownloadHandler',
         'https': 'aioscrapy.core.downloader.handlers.requests.RequestsDownloadHandler',
     },
+
+    # pyhttpx handlers
+    # pyhttpx处理程序
     'pyhttpx': {
         'http': 'aioscrapy.core.downloader.handlers.pyhttpx.PyhttpxDownloadHandler',
         'https': 'aioscrapy.core.downloader.handlers.pyhttpx.PyhttpxDownloadHandler',
     },
+
+    # playwright handlers (for JavaScript rendering)
+    # playwright处理程序(用于JavaScript渲染)
     'playwright': {
-        'http': 'aioscrapy.core.downloader.handlers.playwright.
-        'https': 'aioscrapy.core.downloader.handlers.playwright.
+        'http': 'aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler',
+        'https': 'aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler',
+    },
+
+    # DrissionPage handlers (for JavaScript rendering)
+    # DrissionPage处理程序(用于JavaScript渲染)
+    'dp': {
+        'http': 'aioscrapy.core.downloader.handlers.webdriver.drissionpage.DrissionPageDownloadHandler',
+        'https': 'aioscrapy.core.downloader.handlers.webdriver.drissionpage.DrissionPageDownloadHandler',
     },
+
+    # curl_cffi handlers
+    # curl_cffi处理程序
     'curl_cffi': {
         'http': 'aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler',
         'https': 'aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler',
     },
 }

+# Download timeout in seconds (3 minutes)
+# 下载超时时间(秒)(3分钟)
 DOWNLOAD_TIMEOUT = 180 # 3mins

+# Downloader class to use
+# 要使用的下载器类
 DOWNLOADER = 'aioscrapy.core.downloader.Downloader'

+# Custom downloader middlewares
+# 自定义下载器中间件
 DOWNLOADER_MIDDLEWARES = {}

+# Base downloader middlewares with their priorities
+# 基本下载器中间件及其优先级
 DOWNLOADER_MIDDLEWARES_BASE = {
-    # Engine side
-
-    'aioscrapy.libs.downloader.
-    'aioscrapy.libs.downloader.
-    'aioscrapy.libs.downloader.
-    'aioscrapy.libs.downloader.
-    'aioscrapy.libs.downloader.
-    #
+    # Engine side middlewares
+    # 引擎端中间件
+    'aioscrapy.libs.downloader.downloadtimeout.DownloadTimeoutMiddleware': 350, # Handles download timeouts
+    'aioscrapy.libs.downloader.defaultheaders.DefaultHeadersMiddleware': 400, # Adds default headers
+    'aioscrapy.libs.downloader.useragent.UserAgentMiddleware': 500, # Sets User-Agent
+    'aioscrapy.libs.downloader.retry.RetryMiddleware': 550, # Retries failed requests
+    'aioscrapy.libs.downloader.stats.DownloaderStats': 850, # Collects download stats
+    'aioscrapy.libs.downloader.ja3fingerprint.TLSCiphersMiddleware': 950, # Manages TLS fingerprints
+    # Downloader side middlewares
+    # 下载器端中间件
 }

+# Whether to collect downloader statistics
+# 是否收集下载器统计信息
 DOWNLOADER_STATS = True

+# Duplicate filter settings (commented out by default)
+# 重复过滤器设置(默认注释掉)
+
+# Class to use for filtering duplicate requests
+# 用于过滤重复请求的类
 # DUPEFILTER_CLASS = 'aioscrapy.dupefilters.disk.RFPDupeFilter'
+
+# Whether to enable debug logging for the duplicate filter
+# 是否为重复过滤器启用调试日志记录
 # DUPEFILTER_DEBUG = False

+# Editor to use when editing spiders with the 'edit' command
+# 使用'edit'命令编辑爬虫时使用的编辑器
 EDITOR = 'vi'
 if sys.platform == 'win32':
     EDITOR = '%s -m idlelibs.idle'

+# Extensions settings
+# 扩展设置
+
+# Custom extensions to enable
+# 要启用的自定义扩展
 EXTENSIONS = {}

+# Base extensions with their priorities
+# 基本扩展及其优先级
 EXTENSIONS_BASE = {
+    # Core statistics extension
+    # 核心统计扩展
     'aioscrapy.libs.extensions.corestats.CoreStats': 0,
+
+    # Close spider extension (handles automatic closing)
+    # 关闭爬虫扩展(处理自动关闭)
     'aioscrapy.libs.extensions.closespider.CloseSpider': 0,
+
+    # Log statistics extension
+    # 日志统计扩展
     'aioscrapy.libs.extensions.logstats.LogStats': 0,
+
+    # Auto throttle extension (commented out by default)
+    # 自动限流扩展(默认注释掉)
     # 'aioscrapy.libs.extensions.throttle.AutoThrottle': 0,
 }

+# File storage settings
+# 文件存储设置
+
+# Access control list for Amazon S3 file storage
+# Amazon S3文件存储的访问控制列表
 FILES_STORE_S3_ACL = 'private'
+
+# Access control list for Google Cloud Storage file storage
+# Google Cloud Storage文件存储的访问控制列表
 FILES_STORE_GCS_ACL = ''

+# HTTP proxy settings
+# HTTP代理设置
+
+# Whether to enable HTTP proxy support
+# 是否启用HTTP代理支持
 HTTPPROXY_ENABLED = True
+
+# Encoding used for proxy authentication
+# 用于代理认证的编码
 HTTPPROXY_AUTH_ENCODING = 'latin-1'

+# Item processing settings
+# 项目处理设置
+
+# Class to use for processing items
+# 用于处理项目的类
 ITEM_PROCESSOR = 'aioscrapy.middleware.ItemPipelineManager'

+# Custom item pipelines to enable
+# 要启用的自定义项目管道
 ITEM_PIPELINES = {}
+
+# Base item pipelines
+# 基本项目管道
 ITEM_PIPELINES_BASE = {}

+# Logging settings
+# 日志设置
+
+# Whether to enable logging
+# 是否启用日志记录
 LOG_ENABLED = True
+
+# Encoding used for log files
+# 用于日志文件的编码
 LOG_ENCODING = 'utf-8'
+
+# Log file rotation size
+# 日志文件轮转大小
 LOG_ROTATION = '20MB'
+
+# Number of log files to keep
+# 要保留的日志文件数量
 LOG_RETENTION = 10
+
+# Class to use for formatting log messages
+# 用于格式化日志消息的类
 LOG_FORMATTER = 'aioscrapy.logformatter.LogFormatter'
+
+# Format string for log messages
+# 日志消息的格式字符串
 LOG_FORMAT = "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{process}</cyan> | <cyan>{extra[spidername]}</cyan> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | <level>{message}</level>"
+
+# Whether to log to standard output
+# 是否记录到标准输出
 LOG_STDOUT = True
+
+# Minimum level of messages to log
+# 要记录的消息的最低级别
 LOG_LEVEL = 'DEBUG'
+
+# Log file path (None means no log file)
+# 日志文件路径(None表示没有日志文件)
 LOG_FILE = None

+# Whether to enable debug logging for the scheduler
+# 是否为调度器启用调试日志记录
 SCHEDULER_DEBUG = False

+# Interval in seconds between logging of crawl stats
+# 爬取统计信息日志记录之间的间隔(秒)
 LOGSTATS_INTERVAL = 60.0

+# Module where newly created spiders will be placed
+# 新创建的爬虫将被放置的模块
 NEWSPIDER_MODULE = ''

+# Whether to randomize the download delay (between 0.5 * DOWNLOAD_DELAY and 1.5 * DOWNLOAD_DELAY)
+# 是否随机化下载延迟(在0.5 * DOWNLOAD_DELAY和1.5 * DOWNLOAD_DELAY之间)
 RANDOMIZE_DOWNLOAD_DELAY = True

+# Redirect settings
+# 重定向设置
+
+# Whether to follow redirects
+# 是否跟随重定向
 REDIRECT_ENABLED = True
+
+# Maximum number of redirects to follow for a request
+# 一个请求要跟随的最大重定向次数
 REDIRECT_MAX_TIMES = 20

+# Referer settings
+# 引用设置
+
+# Whether to enable referer middleware
+# 是否启用引用中间件
 REFERER_ENABLED = True
+
+# Policy for setting the Referer header
+# 设置Referer头的策略
 REFERRER_POLICY = 'aioscrapy.libs.spider.referer.DefaultReferrerPolicy'

+# Retry settings
+# 重试设置
+
+# Whether to retry failed requests
+# 是否重试失败的请求
 RETRY_ENABLED = True
-
+
+# Number of times to retry a failed request (initial response + 2 retries = 3 requests)
+# 重试失败请求的次数(初始响应 + 2次重试 = 3个请求)
+RETRY_TIMES = 2
+
+# HTTP status codes to retry
+# 要重试的HTTP状态码
 RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
+
+# Priority adjustment for retried requests (negative means lower priority)
+# 重试请求的优先级调整(负数表示较低优先级)
 RETRY_PRIORITY_ADJUST = -1

+# Scheduler settings
+# 调度器设置
+
+# Scheduler class to use
+# 要使用的调度器类
 SCHEDULER = 'aioscrapy.core.scheduler.Scheduler'
+
+# Queue class used by the scheduler
+# 调度器使用的队列类
 SCHEDULER_QUEUE_CLASS = 'aioscrapy.queue.memory.SpiderPriorityQueue'
+
+# Serializer class used by the scheduler for serializing requests
+# 调度器用于序列化请求的序列化器类
 SCHEDULER_SERIALIZER = 'aioscrapy.serializer.JsonSerializer'

+# Maximum size in bytes for the scraper slot (controls memory usage)
+# 刮取器槽的最大大小(字节)(控制内存使用)
 SCRAPER_SLOT_MAX_ACTIVE_SIZE = 5000000

+# Spider loader settings
+# 爬虫加载器设置
+
+# Class to use for loading spiders
+# 用于加载爬虫的类
 SPIDER_LOADER_CLASS = 'aioscrapy.spiderloader.SpiderLoader'
+
+# Whether to only warn (instead of error) when a spider module cannot be imported
+# 当爬虫模块无法导入时是否只发出警告(而不是错误)
 SPIDER_LOADER_WARN_ONLY = False

+# Spider middleware settings
+# 爬虫中间件设置
+
+# Custom spider middlewares to enable
+# 要启用的自定义爬虫中间件
 SPIDER_MIDDLEWARES = {}
+
+# Base spider middlewares with their priorities
+# 基本爬虫中间件及其优先级
 SPIDER_MIDDLEWARES_BASE = {
+    # Handles HTTP errors (e.g., 404, 500)
+    # 处理HTTP错误(例如,404、500)
     'aioscrapy.libs.spider.httperror.HttpErrorMiddleware': 50,
+
+    # Filters out requests to URLs outside the domains allowed by the spider
+    # 过滤掉对爬虫允许的域之外的URL的请求
     'aioscrapy.libs.spider.offsite.OffsiteMiddleware': 500,
+
+    # Sets the Referer header
+    # 设置Referer头
     'aioscrapy.libs.spider.referer.RefererMiddleware': 700,
+
+    # Filters out requests with URLs longer than URLLENGTH_LIMIT
+    # 过滤掉URL长度超过URLLENGTH_LIMIT的请求
     'aioscrapy.libs.spider.urllength.UrlLengthMiddleware': 800,
+
+    # Tracks request depth
+    # 跟踪请求深度
     'aioscrapy.libs.spider.depth.DepthMiddleware': 900,
 }

+# List of modules where spiders are expected to be defined
+# 预期定义爬虫的模块列表
 SPIDER_MODULES = []

+# Statistics collection settings
+# 统计收集设置
+
+# Class to use for collecting crawler stats
+# 用于收集爬虫统计信息的类
 STATS_CLASS = 'aioscrapy.statscollectors.MemoryStatsCollector'
+
+# Whether to dump stats when the spider finishes
+# 爬虫完成时是否转储统计信息
 STATS_DUMP = True

+# Directory where project templates are stored
+# 存储项目模板的目录
 TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))

+# Maximum allowed length for URLs
+# URL的最大允许长度
 URLLENGTH_LIMIT = 2083

-
+# Whether to close the spider when it becomes idle (no more requests)
+# 当爬虫变为空闲状态(没有更多请求)时是否关闭爬虫
+CLOSE_SPIDER_ON_IDLE = True
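The most visible change in these defaults is that the browser-rendering handlers moved from aioscrapy.core.downloader.handlers.playwright to the new aioscrapy.core.downloader.handlers.webdriver package, with a DrissionPage-based handler added under the 'dp' key. Below is a minimal sketch of how a project might point at the relocated Playwright handler through the DOWNLOAD_HANDLERS override shown above; the class paths come from the diff, but treating DOWNLOAD_HANDLERS as a per-scheme override of DOWNLOAD_HANDLERS_BASE is an assumption based on the comments, not on the handler-loading code.

# Hypothetical project settings.py; a sketch only, not taken from the package.
DOWNLOAD_HANDLERS = {
    # 2.1.7 location of the Playwright handler (formerly handlers.playwright.*)
    'http': 'aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler',
    'https': 'aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler',
    # For DrissionPage instead, the 'dp' entries above map both schemes to:
    # 'aioscrapy.core.downloader.handlers.webdriver.drissionpage.DrissionPageDownloadHandler'
}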
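LOG_ROTATION, LOG_RETENTION, and the tag syntax in LOG_FORMAT (<green>...</green>, {extra[spidername]}) follow loguru's format language. The following is a standalone sketch of what that format string produces when handed to loguru; the wiring is illustrative and not taken from aioscrapy's own log setup.

import sys
from loguru import logger

# Same format string as the LOG_FORMAT default above.
LOG_FORMAT = (
    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | "
    "<cyan>{process}</cyan> | <cyan>{extra[spidername]}</cyan> | "
    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | <level>{message}</level>"
)

logger.remove()  # drop loguru's default sink
logger.add(sys.stderr, format=LOG_FORMAT, level="DEBUG")  # mirrors LOG_LEVEL = 'DEBUG'
# The format references extra[spidername], so a value must be bound before logging.
logger.bind(spidername="demo").info("spider started")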