aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/settings/default_settings.py
@@ -1,183 +1,483 @@
  """
- This module contains the default values for all settings used by Aioscrapy.
+ Default settings for aioscrapy.
+ aioscrapy的默认设置。
+
+ This module contains the default values for all settings used by aioscrapy.
+ It defines configuration for downloaders, middlewares, extensions, and other
+ components of the crawling system.
+ 此模块包含aioscrapy使用的所有设置的默认值。
+ 它为下载器、中间件、扩展和爬取系统的其他组件定义配置。

  For more information about these settings you can read the settings
  documentation in docs/topics/settings.rst
+ 有关这些设置的更多信息,您可以阅读docs/topics/settings.rst中的设置文档。

  Aioscrapy developers, if you add a setting here remember to:
+ Aioscrapy开发人员,如果您在此处添加设置,请记住:

  * add it in alphabetical order
+ 按字母顺序添加
  * group similar settings without leaving blank lines
+ 分组类似设置,不留空行
  * add its documentation to the available settings documentation
+ 将其文档添加到可用的设置文档中
    (docs/topics/settings.rst)
-
  """

  import sys
  from os.path import join, abspath, dirname

+ # Auto throttle settings
+ # 自动限流设置
+
+ # Whether to enable the AutoThrottle extension
+ # 是否启用AutoThrottle扩展
  AUTOTHROTTLE_ENABLED = False
+
+ # Whether to enable AutoThrottle debugging (displays adjustment decisions)
+ # 是否启用AutoThrottle调试(显示调整决策)
  AUTOTHROTTLE_DEBUG = False
+
+ # Maximum delay in seconds for throttled requests
+ # 限流请求的最大延迟(秒)
  AUTOTHROTTLE_MAX_DELAY = 60.0
+
+ # Initial delay in seconds for throttled requests
+ # 限流请求的初始延迟(秒)
  AUTOTHROTTLE_START_DELAY = 5.0
+
+ # Target average number of concurrent requests per domain
+ # 每个域的目标平均并发请求数
  AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

+ # Default bot name used for the User-Agent header and logging
+ # 用于User-Agent头和日志记录的默认机器人名称
  BOT_NAME = 'aioscrapybot'

+ # Close spider settings
+ # 关闭爬虫设置
+
+ # Number of seconds after which the spider will be closed
+ # 爬虫将被关闭的秒数(0表示禁用)
  CLOSESPIDER_TIMEOUT = 0
+
+ # Number of pages after which the spider will be closed
+ # 爬虫将被关闭的页面数(0表示禁用)
  CLOSESPIDER_PAGECOUNT = 0
+
+ # Number of items after which the spider will be closed
+ # 爬虫将被关闭的项目数(0表示禁用)
  CLOSESPIDER_ITEMCOUNT = 0
+
+ # Number of errors after which the spider will be closed
+ # 爬虫将被关闭的错误数(0表示禁用)
  CLOSESPIDER_ERRORCOUNT = 0

+ # Module where custom commands are defined
+ # 定义自定义命令的模块
  COMMANDS_MODULE = ''

+ # Number of concurrent parsers for processing responses
+ # 用于处理响应的并发解析器数量
  CONCURRENT_PARSER = 1

+ # Concurrency settings
+ # 并发设置
+
+ # Maximum number of concurrent requests across all domains
+ # 所有域的最大并发请求数
  CONCURRENT_REQUESTS = 16
+
+ # Maximum number of concurrent requests per domain
+ # 每个域的最大并发请求数
  CONCURRENT_REQUESTS_PER_DOMAIN = 8
+
+ # Maximum number of concurrent requests per IP address (0 means unlimited)
+ # 每个IP地址的最大并发请求数(0表示无限制)
  CONCURRENT_REQUESTS_PER_IP = 0

+ # Default headers used for all requests
+ # 用于所有请求的默认头
  DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
  }

+ # Depth settings
+ # 深度设置
+
+ # Maximum depth to crawl (0 means no limit)
+ # 爬取的最大深度(0表示无限制)
  DEPTH_LIMIT = 0
+
+ # Whether to log verbose depth stats
+ # 是否记录详细的深度统计信息
  DEPTH_STATS_VERBOSE = False
+
+ # Priority adjustment based on depth (-1 means decrease priority with depth)
+ # 基于深度的优先级调整(-1表示随着深度增加而降低优先级)
  DEPTH_PRIORITY = 0

+ # Download settings
+ # 下载设置
+
+ # Delay in seconds between consecutive requests to the same domain
+ # 对同一域的连续请求之间的延迟(秒)
  DOWNLOAD_DELAY = 0

+ # Custom download handlers for different schemes (http, https, etc.)
+ # 不同协议(http、https等)的自定义下载处理程序
  DOWNLOAD_HANDLERS = {}
+
+ # Base download handlers for http and https
+ # http和https的基本下载处理程序
  DOWNLOAD_HANDLERS_BASE = {
      'http': 'aioscrapy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler',
      'https': 'aioscrapy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler',
  }

+ # Mapping of different HTTP client libraries to their download handlers
+ # 不同HTTP客户端库到其下载处理程序的映射
  DOWNLOAD_HANDLERS_MAP = {
+     # aiohttp handlers (default)
+     # aiohttp处理程序(默认)
      'aiohttp': DOWNLOAD_HANDLERS_BASE,
+
+     # httpx handlers
+     # httpx处理程序
      'httpx': {
          'http': 'aioscrapy.core.downloader.handlers.httpx.HttpxDownloadHandler',
          'https': 'aioscrapy.core.downloader.handlers.httpx.HttpxDownloadHandler',
      },
+
+     # requests handlers
+     # requests处理程序
      'requests': {
          'http': 'aioscrapy.core.downloader.handlers.requests.RequestsDownloadHandler',
          'https': 'aioscrapy.core.downloader.handlers.requests.RequestsDownloadHandler',
      },
+
+     # pyhttpx handlers
+     # pyhttpx处理程序
      'pyhttpx': {
          'http': 'aioscrapy.core.downloader.handlers.pyhttpx.PyhttpxDownloadHandler',
          'https': 'aioscrapy.core.downloader.handlers.pyhttpx.PyhttpxDownloadHandler',
      },
+
+     # playwright handlers (for JavaScript rendering)
+     # playwright处理程序(用于JavaScript渲染)
      'playwright': {
-         'http': 'aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler',
-         'https': 'aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler',
+         'http': 'aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler',
+         'https': 'aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler',
+     },
+
+     # DrissionPage handlers (for JavaScript rendering)
+     # DrissionPage处理程序(用于JavaScript渲染)
+     'dp': {
+         'http': 'aioscrapy.core.downloader.handlers.webdriver.drissionpage.DrissionPageDownloadHandler',
+         'https': 'aioscrapy.core.downloader.handlers.webdriver.drissionpage.DrissionPageDownloadHandler',
      },
+
+     # curl_cffi handlers
+     # curl_cffi处理程序
      'curl_cffi': {
          'http': 'aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler',
          'https': 'aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler',
      },
  }

+ # Download timeout in seconds (3 minutes)
+ # 下载超时时间(秒)(3分钟)
  DOWNLOAD_TIMEOUT = 180 # 3mins

+ # Downloader class to use
+ # 要使用的下载器类
  DOWNLOADER = 'aioscrapy.core.downloader.Downloader'

+ # Custom downloader middlewares
+ # 自定义下载器中间件
  DOWNLOADER_MIDDLEWARES = {}

+ # Base downloader middlewares with their priorities
+ # 基本下载器中间件及其优先级
  DOWNLOADER_MIDDLEWARES_BASE = {
-     # Engine side
-     'aioscrapy.libs.downloader.downloadtimeout.DownloadTimeoutMiddleware': 350,
-     'aioscrapy.libs.downloader.defaultheaders.DefaultHeadersMiddleware': 400,
-     'aioscrapy.libs.downloader.useragent.UserAgentMiddleware': 500,
-     'aioscrapy.libs.downloader.retry.RetryMiddleware': 550,
-     'aioscrapy.libs.downloader.stats.DownloaderStats': 850,
-     'aioscrapy.libs.downloader.ja3fingerprint.TLSCiphersMiddleware': 950,
-     # Downloader side
+     # Engine side middlewares
+     # 引擎端中间件
+     'aioscrapy.libs.downloader.downloadtimeout.DownloadTimeoutMiddleware': 350, # Handles download timeouts
+     'aioscrapy.libs.downloader.defaultheaders.DefaultHeadersMiddleware': 400, # Adds default headers
+     'aioscrapy.libs.downloader.useragent.UserAgentMiddleware': 500, # Sets User-Agent
+     'aioscrapy.libs.downloader.retry.RetryMiddleware': 550, # Retries failed requests
+     'aioscrapy.libs.downloader.stats.DownloaderStats': 850, # Collects download stats
+     'aioscrapy.libs.downloader.ja3fingerprint.TLSCiphersMiddleware': 950, # Manages TLS fingerprints
+     # Downloader side middlewares
+     # 下载器端中间件
  }

+ # Whether to collect downloader statistics
+ # 是否收集下载器统计信息
  DOWNLOADER_STATS = True

+ # Duplicate filter settings (commented out by default)
+ # 重复过滤器设置(默认注释掉)
+
+ # Class to use for filtering duplicate requests
+ # 用于过滤重复请求的类
  # DUPEFILTER_CLASS = 'aioscrapy.dupefilters.disk.RFPDupeFilter'
+
+ # Whether to enable debug logging for the duplicate filter
+ # 是否为重复过滤器启用调试日志记录
  # DUPEFILTER_DEBUG = False

+ # Editor to use when editing spiders with the 'edit' command
+ # 使用'edit'命令编辑爬虫时使用的编辑器
  EDITOR = 'vi'
  if sys.platform == 'win32':
      EDITOR = '%s -m idlelibs.idle'

+ # Extensions settings
+ # 扩展设置
+
+ # Custom extensions to enable
+ # 要启用的自定义扩展
  EXTENSIONS = {}

+ # Base extensions with their priorities
+ # 基本扩展及其优先级
  EXTENSIONS_BASE = {
+     # Core statistics extension
+     # 核心统计扩展
      'aioscrapy.libs.extensions.corestats.CoreStats': 0,
+
+     # Close spider extension (handles automatic closing)
+     # 关闭爬虫扩展(处理自动关闭)
      'aioscrapy.libs.extensions.closespider.CloseSpider': 0,
+
+     # Log statistics extension
+     # 日志统计扩展
      'aioscrapy.libs.extensions.logstats.LogStats': 0,
+
+     # Auto throttle extension (commented out by default)
+     # 自动限流扩展(默认注释掉)
      # 'aioscrapy.libs.extensions.throttle.AutoThrottle': 0,
  }

+ # File storage settings
+ # 文件存储设置
+
+ # Access control list for Amazon S3 file storage
+ # Amazon S3文件存储的访问控制列表
  FILES_STORE_S3_ACL = 'private'
+
+ # Access control list for Google Cloud Storage file storage
+ # Google Cloud Storage文件存储的访问控制列表
  FILES_STORE_GCS_ACL = ''

+ # HTTP proxy settings
+ # HTTP代理设置
+
+ # Whether to enable HTTP proxy support
+ # 是否启用HTTP代理支持
  HTTPPROXY_ENABLED = True
+
+ # Encoding used for proxy authentication
+ # 用于代理认证的编码
  HTTPPROXY_AUTH_ENCODING = 'latin-1'

+ # Item processing settings
+ # 项目处理设置
+
+ # Class to use for processing items
+ # 用于处理项目的类
  ITEM_PROCESSOR = 'aioscrapy.middleware.ItemPipelineManager'

+ # Custom item pipelines to enable
+ # 要启用的自定义项目管道
  ITEM_PIPELINES = {}
+
+ # Base item pipelines
+ # 基本项目管道
  ITEM_PIPELINES_BASE = {}

+ # Logging settings
+ # 日志设置
+
+ # Whether to enable logging
+ # 是否启用日志记录
  LOG_ENABLED = True
+
+ # Encoding used for log files
+ # 用于日志文件的编码
  LOG_ENCODING = 'utf-8'
+
+ # Log file rotation size
+ # 日志文件轮转大小
  LOG_ROTATION = '20MB'
+
+ # Number of log files to keep
+ # 要保留的日志文件数量
  LOG_RETENTION = 10
+
+ # Class to use for formatting log messages
+ # 用于格式化日志消息的类
  LOG_FORMATTER = 'aioscrapy.logformatter.LogFormatter'
+
+ # Format string for log messages
+ # 日志消息的格式字符串
  LOG_FORMAT = "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{process}</cyan> | <cyan>{extra[spidername]}</cyan> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | <level>{message}</level>"
+
+ # Whether to log to standard output
+ # 是否记录到标准输出
  LOG_STDOUT = True
+
+ # Minimum level of messages to log
+ # 要记录的消息的最低级别
  LOG_LEVEL = 'DEBUG'
+
+ # Log file path (None means no log file)
+ # 日志文件路径(None表示没有日志文件)
  LOG_FILE = None

+ # Whether to enable debug logging for the scheduler
+ # 是否为调度器启用调试日志记录
  SCHEDULER_DEBUG = False

+ # Interval in seconds between logging of crawl stats
+ # 爬取统计信息日志记录之间的间隔(秒)
  LOGSTATS_INTERVAL = 60.0

+ # Module where newly created spiders will be placed
+ # 新创建的爬虫将被放置的模块
  NEWSPIDER_MODULE = ''

+ # Whether to randomize the download delay (between 0.5 * DOWNLOAD_DELAY and 1.5 * DOWNLOAD_DELAY)
+ # 是否随机化下载延迟(在0.5 * DOWNLOAD_DELAY和1.5 * DOWNLOAD_DELAY之间)
  RANDOMIZE_DOWNLOAD_DELAY = True

+ # Redirect settings
+ # 重定向设置
+
+ # Whether to follow redirects
+ # 是否跟随重定向
  REDIRECT_ENABLED = True
+
+ # Maximum number of redirects to follow for a request
+ # 一个请求要跟随的最大重定向次数
  REDIRECT_MAX_TIMES = 20

+ # Referer settings
+ # 引用设置
+
+ # Whether to enable referer middleware
+ # 是否启用引用中间件
  REFERER_ENABLED = True
+
+ # Policy for setting the Referer header
+ # 设置Referer头的策略
  REFERRER_POLICY = 'aioscrapy.libs.spider.referer.DefaultReferrerPolicy'

+ # Retry settings
+ # 重试设置
+
+ # Whether to retry failed requests
+ # 是否重试失败的请求
  RETRY_ENABLED = True
- RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
+
+ # Number of times to retry a failed request (initial response + 2 retries = 3 requests)
+ # 重试失败请求的次数(初始响应 + 2次重试 = 3个请求)
+ RETRY_TIMES = 2
+
+ # HTTP status codes to retry
+ # 要重试的HTTP状态码
  RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
+
+ # Priority adjustment for retried requests (negative means lower priority)
+ # 重试请求的优先级调整(负数表示较低优先级)
  RETRY_PRIORITY_ADJUST = -1

+ # Scheduler settings
+ # 调度器设置
+
+ # Scheduler class to use
+ # 要使用的调度器类
  SCHEDULER = 'aioscrapy.core.scheduler.Scheduler'
+
+ # Queue class used by the scheduler
+ # 调度器使用的队列类
  SCHEDULER_QUEUE_CLASS = 'aioscrapy.queue.memory.SpiderPriorityQueue'
+
+ # Serializer class used by the scheduler for serializing requests
+ # 调度器用于序列化请求的序列化器类
  SCHEDULER_SERIALIZER = 'aioscrapy.serializer.JsonSerializer'

+ # Maximum size in bytes for the scraper slot (controls memory usage)
+ # 刮取器槽的最大大小(字节)(控制内存使用)
  SCRAPER_SLOT_MAX_ACTIVE_SIZE = 5000000

+ # Spider loader settings
+ # 爬虫加载器设置
+
+ # Class to use for loading spiders
+ # 用于加载爬虫的类
  SPIDER_LOADER_CLASS = 'aioscrapy.spiderloader.SpiderLoader'
+
+ # Whether to only warn (instead of error) when a spider module cannot be imported
+ # 当爬虫模块无法导入时是否只发出警告(而不是错误)
  SPIDER_LOADER_WARN_ONLY = False

+ # Spider middleware settings
+ # 爬虫中间件设置
+
+ # Custom spider middlewares to enable
+ # 要启用的自定义爬虫中间件
  SPIDER_MIDDLEWARES = {}
+
+ # Base spider middlewares with their priorities
+ # 基本爬虫中间件及其优先级
  SPIDER_MIDDLEWARES_BASE = {
+     # Handles HTTP errors (e.g., 404, 500)
+     # 处理HTTP错误(例如,404、500)
      'aioscrapy.libs.spider.httperror.HttpErrorMiddleware': 50,
+
+     # Filters out requests to URLs outside the domains allowed by the spider
+     # 过滤掉对爬虫允许的域之外的URL的请求
      'aioscrapy.libs.spider.offsite.OffsiteMiddleware': 500,
+
+     # Sets the Referer header
+     # 设置Referer头
      'aioscrapy.libs.spider.referer.RefererMiddleware': 700,
+
+     # Filters out requests with URLs longer than URLLENGTH_LIMIT
+     # 过滤掉URL长度超过URLLENGTH_LIMIT的请求
      'aioscrapy.libs.spider.urllength.UrlLengthMiddleware': 800,
+
+     # Tracks request depth
+     # 跟踪请求深度
      'aioscrapy.libs.spider.depth.DepthMiddleware': 900,
  }

+ # List of modules where spiders are expected to be defined
+ # 预期定义爬虫的模块列表
  SPIDER_MODULES = []

+ # Statistics collection settings
+ # 统计收集设置
+
+ # Class to use for collecting crawler stats
+ # 用于收集爬虫统计信息的类
  STATS_CLASS = 'aioscrapy.statscollectors.MemoryStatsCollector'
+
+ # Whether to dump stats when the spider finishes
+ # 爬虫完成时是否转储统计信息
  STATS_DUMP = True

+ # Directory where project templates are stored
+ # 存储项目模板的目录
  TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))

+ # Maximum allowed length for URLs
+ # URL的最大允许长度
  URLLENGTH_LIMIT = 2083

- CLOSE_SPIDER_ON_IDLE = False
+ # Whether to close the spider when it becomes idle (no more requests)
+ # 当爬虫变为空闲状态(没有更多请求)时是否关闭爬虫
+ CLOSE_SPIDER_ON_IDLE = True
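
The settings hunk above carries the two changes most likely to affect existing projects: the Playwright handler class moved from aioscrapy.core.downloader.handlers.playwright to aioscrapy.core.downloader.handlers.webdriver.playwright (with a new 'dp' entry for DrissionPage), and CLOSE_SPIDER_ON_IDLE now defaults to True instead of False. The snippet below is a hypothetical project-level settings override, not part of the package; the setting names and class paths are taken from the diff, and it assumes the usual Scrapy-style layout in which a project settings module overrides these defaults per the "Custom download handlers for different schemes" comment.

    # settings.py (hypothetical project override, illustrative only)

    # Route http/https through one of the backends listed in DOWNLOAD_HANDLERS_MAP,
    # here the relocated Playwright handler for JavaScript rendering.
    DOWNLOAD_HANDLERS = {
        'http': 'aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler',
        'https': 'aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightDownloadHandler',
    }

    # 2.1.7 closes idle spiders by default; restore the 2.1.4 behaviour if your
    # spiders are fed from an external queue and should stay alive while idle.
    CLOSE_SPIDER_ON_IDLE = False

Any code that still imports the old aioscrapy.core.downloader.handlers.playwright path (removed in this release, see files 106-108 in the list above) would need to switch to the new webdriver.playwright path.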