aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/conf.py CHANGED
@@ -1,3 +1,14 @@
1
+ """
2
+ Configuration utilities for aioscrapy.
3
+ aioscrapy的配置实用工具。
4
+
5
+ This module provides utility functions for working with aioscrapy configuration.
6
+ It includes functions for handling component lists, command-line arguments,
7
+ finding configuration files, and processing feed export parameters.
8
+ 此模块提供了用于处理aioscrapy配置的实用函数。
9
+ 它包括用于处理组件列表、命令行参数、查找配置文件和处理Feed导出参数的函数。
10
+ """
11
+
1
12
  import numbers
2
13
  import os
3
14
  import sys
@@ -13,14 +24,64 @@ from aioscrapy.utils.python import without_none_values
13
24
 
14
25
 
15
26
  def build_component_list(compdict, custom=None, convert=update_classpath):
16
- """Compose a component list from a { class: order } dictionary."""
27
+ """
28
+ Compose a component list from a dictionary mapping classes to their order.
29
+ 从将类映射到其顺序的字典中组合组件列表。
30
+
31
+ This function builds an ordered list of components from a dictionary that maps
32
+ component classes to their order values. Components with lower order values
33
+ come first in the resulting list. Components with None values are excluded.
34
+ 此函数从将组件类映射到其顺序值的字典构建有序组件列表。
35
+ 具有较低顺序值的组件在结果列表中排在前面。值为None的组件将被排除。
17
36
 
37
+ The function also handles class path updates through the convert function,
38
+ which by default uses update_classpath to handle deprecated class paths.
39
+ 该函数还通过convert函数处理类路径更新,默认情况下使用update_classpath
40
+ 处理已弃用的类路径。
41
+
42
+ Args:
43
+ compdict: Dictionary mapping component classes to order values.
44
+ 将组件类映射到顺序值的字典。
45
+ Values should be real numbers or None.
46
+ 值应为实数或None。
47
+ custom: Additional components to include, either as a dictionary to update
48
+ compdict, or as a list/tuple of components (for backward compatibility).
49
+ 要包含的其他组件,可以是用于更新compdict的字典,
50
+ 也可以是组件的列表/元组(用于向后兼容性)。
51
+ Defaults to None.
52
+ 默认为None。
53
+ convert: Function to convert/update class paths.
54
+ 用于转换/更新类路径的函数。
55
+ Defaults to update_classpath.
56
+ 默认为update_classpath。
57
+
58
+ Returns:
59
+ list: Ordered list of component classes.
60
+ 有序的组件类列表。
61
+
62
+ Raises:
63
+ ValueError: If multiple component paths convert to the same object,
64
+ or if a component value is not a real number or None.
65
+ 如果多个组件路径转换为同一对象,
66
+ 或者如果组件值不是实数或None。
67
+ """
18
68
  def _check_components(complist):
69
+ """
70
+ Check that no two components in the list convert to the same object.
71
+ 检查列表中没有两个组件转换为同一对象。
72
+ """
19
73
  if len({convert(c) for c in complist}) != len(complist):
20
74
  raise ValueError(f'Some paths in {complist!r} convert to the same object, '
21
75
  'please update your settings')
22
76
 
23
77
  def _map_keys(compdict):
78
+ """
79
+ Convert all keys in the component dictionary using the convert function.
80
+ 使用convert函数转换组件字典中的所有键。
81
+
82
+ Handles both BaseSettings objects and regular dictionaries.
83
+ 处理BaseSettings对象和常规字典。
84
+ """
24
85
  if isinstance(compdict, BaseSettings):
25
86
  compbs = BaseSettings()
26
87
  for k, v in compdict.items():
@@ -38,13 +99,17 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
38
99
  return {convert(k): v for k, v in compdict.items()}
39
100
 
40
101
  def _validate_values(compdict):
41
- """Fail if a value in the components dict is not a real number or None."""
102
+ """
103
+ Fail if a value in the components dict is not a real number or None.
104
+ 如果组件字典中的值不是实数或None,则失败。
105
+ """
42
106
  for name, value in compdict.items():
43
107
  if value is not None and not isinstance(value, numbers.Real):
44
108
  raise ValueError(f'Invalid value {value} for component {name}, '
45
109
  'please provide a real number or None instead')
46
110
 
47
111
  # BEGIN Backward compatibility for old (base, custom) call signature
112
+ # 开始向后兼容旧的(base, custom)调用签名
48
113
  if isinstance(custom, (list, tuple)):
49
114
  _check_components(custom)
50
115
  return type(custom)(convert(c) for c in custom)
@@ -52,41 +117,138 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
52
117
  if custom is not None:
53
118
  compdict.update(custom)
54
119
  # END Backward compatibility
120
+ # 结束向后兼容
55
121
 
122
+ # Validate all values in the dictionary
123
+ # 验证字典中的所有值
56
124
  _validate_values(compdict)
125
+
126
+ # Convert keys and remove None values
127
+ # 转换键并删除None值
57
128
  compdict = without_none_values(_map_keys(compdict))
129
+
130
+ # Sort components by their order values and return just the component classes
131
+ # 按组件的顺序值排序,并仅返回组件类
58
132
  return [k for k, v in sorted(compdict.items(), key=itemgetter(1))]
59
133
 
60
134
 
61
135
  def arglist_to_dict(arglist):
62
- """Convert a list of arguments like ['arg1=val1', 'arg2=val2', ...] to a
63
- dict
64
136
  """
137
+ Convert a list of key=value arguments to a dictionary.
138
+ 将key=value参数列表转换为字典。
139
+
140
+ This function takes a list of strings in the format 'key=value' and converts
141
+ them into a dictionary where each key is mapped to its corresponding value.
142
+ 此函数接受格式为'key=value'的字符串列表,并将它们转换为字典,
143
+ 其中每个键都映射到其对应的值。
144
+
145
+ Args:
146
+ arglist: List of strings in the format 'key=value'.
147
+ 格式为'key=value'的字符串列表。
148
+ Example: ['arg1=val1', 'arg2=val2', ...]
149
+ 示例:['arg1=val1', 'arg2=val2', ...]
150
+
151
+ Returns:
152
+ dict: Dictionary mapping keys to values.
153
+ 将键映射到值的字典。
154
+ Example: {'arg1': 'val1', 'arg2': 'val2', ...}
155
+ 示例:{'arg1': 'val1', 'arg2': 'val2', ...}
156
+
157
+ Raises:
158
+ ValueError: If any string in the list doesn't contain an equals sign.
159
+ 如果列表中的任何字符串不包含等号。
160
+ """
161
+ # Split each string at the first equals sign and convert to a dictionary
162
+ # 在第一个等号处分割每个字符串并转换为字典
65
163
  return dict(x.split('=', 1) for x in arglist)
66
164
 
67
165
 
68
166
  def closest_aioscrapy_cfg(path='.', prevpath=None):
69
- """Return the path to the closest scrapy.cfg file by traversing the current
70
- directory and its parents
71
167
  """
168
+ Find the closest aioscrapy.cfg file by traversing up the directory tree.
169
+ 通过向上遍历目录树查找最近的aioscrapy.cfg文件。
170
+
171
+ This function searches for an aioscrapy.cfg file in the specified directory
172
+ and its parent directories. It starts from the given path and moves up the
173
+ directory tree until it finds a configuration file or reaches the root directory.
174
+ 此函数在指定目录及其父目录中搜索aioscrapy.cfg文件。
175
+ 它从给定路径开始,向上移动目录树,直到找到配置文件或到达根目录。
176
+
177
+ Args:
178
+ path: Directory path to start the search from.
179
+ 开始搜索的目录路径。
180
+ Defaults to the current directory ('.').
181
+ 默认为当前目录('.')。
182
+ prevpath: Path from the previous recursive call, used to detect when we've
183
+ reached the root directory.
184
+ 上一个递归调用的路径,用于检测何时到达根目录。
185
+ Defaults to None.
186
+ 默认为None。
187
+
188
+ Returns:
189
+ str: Absolute path to the closest aioscrapy.cfg file, or an empty string
190
+ if no configuration file is found.
191
+ 最近的aioscrapy.cfg文件的绝对路径,如果未找到配置文件,则为空字符串。
192
+ """
193
+ # If we've reached the root directory (path doesn't change between iterations)
194
+ # 如果我们已经到达根目录(路径在迭代之间没有变化)
72
195
  if path == prevpath:
73
196
  return ''
197
+
198
+ # Convert to absolute path to ensure consistent behavior
199
+ # 转换为绝对路径以确保一致的行为
74
200
  path = os.path.abspath(path)
201
+
202
+ # Check if aioscrapy.cfg exists in the current directory
203
+ # 检查当前目录中是否存在aioscrapy.cfg
75
204
  cfgfile = os.path.join(path, 'aioscrapy.cfg')
76
205
  if os.path.exists(cfgfile):
77
206
  return cfgfile
207
+
208
+ # Recursively check the parent directory
209
+ # 递归检查父目录
78
210
  return closest_aioscrapy_cfg(os.path.dirname(path), path)
79
211
 
80
212
 
81
213
  def init_env(project='default', set_syspath=True):
82
- """Initialize environment to use command-line tool from inside a project
83
- dir. This sets the Scrapy settings module and modifies the Python path to
84
- be able to locate the project module.
85
214
  """
215
+ Initialize environment for running aioscrapy from inside a project directory.
216
+ 初始化环境,以便从项目目录内运行aioscrapy。
217
+
218
+ This function sets up the environment for running aioscrapy commands from within
219
+ a project directory. It:
220
+ 1. Sets the AIOSCRAPY_SETTINGS_MODULE environment variable based on the project
221
+ 2. Adds the project directory to sys.path if needed
222
+
223
+ 此函数设置环境,以便从项目目录内运行aioscrapy命令。它:
224
+ 1. 根据项目设置AIOSCRAPY_SETTINGS_MODULE环境变量
225
+ 2. 如果需要,将项目目录添加到sys.path
226
+
227
+ Args:
228
+ project: The project name to use for settings lookup in scrapy.cfg.
229
+ 用于在scrapy.cfg中查找设置的项目名称。
230
+ Defaults to 'default'.
231
+ 默认为'default'。
232
+ set_syspath: Whether to add the project directory to sys.path.
233
+ 是否将项目目录添加到sys.path。
234
+ Defaults to True.
235
+ 默认为True。
236
+ """
237
+ # Get the configuration from scrapy.cfg
238
+ # 从scrapy.cfg获取配置
86
239
  cfg = get_config()
240
+
241
+ # Set the settings module environment variable if defined in the config
242
+ # 如果在配置中定义,则设置设置模块环境变量
87
243
  if cfg.has_option('settings', project):
88
244
  os.environ['AIOSCRAPY_SETTINGS_MODULE'] = cfg.get('settings', project)
245
+
246
+ # Find the closest aioscrapy.cfg file
247
+ # 查找最近的aioscrapy.cfg文件
89
248
  closest = closest_aioscrapy_cfg()
249
+
250
+ # If a config file was found, add its directory to sys.path if needed
251
+ # 如果找到配置文件,则在需要时将其目录添加到sys.path
90
252
  if closest:
91
253
  projdir = os.path.dirname(closest)
92
254
  if set_syspath and projdir not in sys.path:
@@ -94,53 +256,200 @@ def init_env(project='default', set_syspath=True):
94
256
 
95
257
 
96
258
  def get_config(use_closest=True):
97
- """Get Scrapy config file as a ConfigParser"""
259
+ """
260
+ Get aioscrapy configuration as a ConfigParser object.
261
+ 获取aioscrapy配置作为ConfigParser对象。
262
+
263
+ This function reads the aioscrapy configuration from various possible locations
264
+ and returns it as a ConfigParser object. By default, it looks for configuration
265
+ in standard system locations and the closest aioscrapy.cfg file in the current
266
+ directory or its parents.
267
+ 此函数从各种可能的位置读取aioscrapy配置,并将其作为ConfigParser对象返回。
268
+ 默认情况下,它在标准系统位置和当前目录或其父目录中最近的aioscrapy.cfg文件中
269
+ 查找配置。
270
+
271
+ Args:
272
+ use_closest: Whether to include the closest aioscrapy.cfg file in the
273
+ configuration sources.
274
+ 是否在配置源中包含最近的aioscrapy.cfg文件。
275
+ Defaults to True.
276
+ 默认为True。
277
+
278
+ Returns:
279
+ ConfigParser: A ConfigParser object with the loaded configuration.
280
+ 加载了配置的ConfigParser对象。
281
+ """
282
+ # Get the list of configuration file paths to read
283
+ # 获取要读取的配置文件路径列表
98
284
  sources = get_sources(use_closest)
285
+
286
+ # Create a new ConfigParser and read the configuration files
287
+ # 创建一个新的ConfigParser并读取配置文件
99
288
  cfg = ConfigParser()
100
289
  cfg.read(sources)
290
+
101
291
  return cfg
102
292
 
103
293
 
104
294
  def get_sources(use_closest=True):
295
+ """
296
+ Get a list of possible configuration file paths.
297
+ 获取可能的配置文件路径列表。
298
+
299
+ This function returns a list of paths where aioscrapy configuration files might
300
+ be located. It includes standard system locations and optionally the closest
301
+ aioscrapy.cfg file in the current directory or its parents.
302
+ 此函数返回可能位于aioscrapy配置文件的路径列表。它包括标准系统位置,
303
+ 以及可选的当前目录或其父目录中最近的aioscrapy.cfg文件。
304
+
305
+ The function looks for configuration files in the following locations:
306
+ 该函数在以下位置查找配置文件:
307
+ 1. /etc/scrapy.cfg (Unix system-wide)
308
+ 2. c:\\scrapy\\scrapy.cfg (Windows system-wide)
309
+ 3. $XDG_CONFIG_HOME/scrapy.cfg (or ~/.config/scrapy.cfg)
310
+ 4. ~/.scrapy.cfg (user home directory)
311
+ 5. The closest aioscrapy.cfg file (if use_closest is True)
312
+
313
+ Args:
314
+ use_closest: Whether to include the closest aioscrapy.cfg file in the
315
+ returned list.
316
+ 是否在返回的列表中包含最近的aioscrapy.cfg文件。
317
+ Defaults to True.
318
+ 默认为True。
319
+
320
+ Returns:
321
+ list: A list of file paths to check for configuration.
322
+ 要检查配置的文件路径列表。
323
+ """
324
+ # Get XDG config home directory (Linux standard) or default to ~/.config
325
+ # 获取XDG配置主目录(Linux标准)或默认为~/.config
105
326
  xdg_config_home = os.environ.get('XDG_CONFIG_HOME') or os.path.expanduser('~/.config')
327
+
328
+ # List of standard locations to check for configuration files
329
+ # 检查配置文件的标准位置列表
106
330
  sources = [
107
- '/etc/scrapy.cfg',
108
- r'c:\scrapy\scrapy.cfg',
109
- xdg_config_home + '/scrapy.cfg',
110
- os.path.expanduser('~/.scrapy.cfg'),
331
+ '/etc/scrapy.cfg', # Unix system-wide
332
+ r'c:\scrapy\scrapy.cfg', # Windows system-wide
333
+ xdg_config_home + '/scrapy.cfg', # XDG config directory
334
+ os.path.expanduser('~/.scrapy.cfg'), # User home directory
111
335
  ]
336
+
337
+ # Optionally add the closest aioscrapy.cfg file
338
+ # 可选地添加最近的aioscrapy.cfg文件
112
339
  if use_closest:
113
340
  sources.append(closest_aioscrapy_cfg())
341
+
114
342
  return sources
115
343
 
116
344
 
117
345
  def feed_complete_default_values_from_settings(feed, settings):
346
+ """
347
+ Complete feed export configuration with default values from settings.
348
+ 使用设置中的默认值完成Feed导出配置。
349
+
350
+ This function takes a feed export configuration dictionary and fills in any
351
+ missing values with defaults from the project settings. It creates a new
352
+ dictionary without modifying the original.
353
+ 此函数接受Feed导出配置字典,并使用项目设置中的默认值填充任何缺失的值。
354
+ 它创建一个新字典,而不修改原始字典。
355
+
356
+ The following feed export settings are handled:
357
+ 处理以下Feed导出设置:
358
+ - batch_item_count: Number of items per batch
359
+ - encoding: Character encoding for the exported data
360
+ - fields: List of fields to export
361
+ - store_empty: Whether to store empty feeds
362
+ - uri_params: Parameters for URI formatting
363
+ - item_export_kwargs: Additional keyword arguments for item export
364
+ - indent: Indentation level for formatted outputs
365
+
366
+ Args:
367
+ feed: Original feed export configuration dictionary.
368
+ 原始Feed导出配置字典。
369
+ settings: Project settings object.
370
+ 项目设置对象。
371
+
372
+ Returns:
373
+ dict: A new dictionary with the feed export configuration, including
374
+ defaults for any missing values.
375
+ 包含Feed导出配置的新字典,包括任何缺失值的默认值。
376
+ """
377
+ # Create a copy of the original feed dictionary to avoid modifying it
378
+ # 创建原始feed字典的副本,以避免修改它
118
379
  out = feed.copy()
380
+
381
+ # Set default values for all feed export settings from the project settings
382
+ # 从项目设置中为所有Feed导出设置设置默认值
119
383
  out.setdefault("batch_item_count", settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT'))
120
384
  out.setdefault("encoding", settings["FEED_EXPORT_ENCODING"])
121
385
  out.setdefault("fields", settings.getlist("FEED_EXPORT_FIELDS") or None)
122
386
  out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY"))
123
387
  out.setdefault("uri_params", settings["FEED_URI_PARAMS"])
124
388
  out.setdefault("item_export_kwargs", dict())
389
+
390
+ # Handle indentation specially since it might be None
391
+ # 特别处理缩进,因为它可能是None
125
392
  if settings["FEED_EXPORT_INDENT"] is None:
126
393
  out.setdefault("indent", None)
127
394
  else:
128
395
  out.setdefault("indent", settings.getint("FEED_EXPORT_INDENT"))
396
+
129
397
  return out
130
398
 
131
399
 
132
400
  def feed_process_params_from_cli(settings, output, output_format=None,
133
401
  overwrite_output=None):
134
402
  """
135
- Receives feed export params (from the 'crawl' or 'runspider' commands),
136
- checks for inconsistencies in their quantities and returns a dictionary
137
- suitable to be used as the FEEDS setting.
403
+ Process feed export parameters from command-line arguments.
404
+ 处理来自命令行参数的Feed导出参数。
405
+
406
+ This function processes feed export parameters provided via command-line arguments
407
+ (from the 'crawl' or 'runspider' commands), checks for inconsistencies, and
408
+ returns a dictionary suitable to be used as the FEEDS setting.
409
+ 此函数处理通过命令行参数提供的Feed导出参数(来自'crawl'或'runspider'命令),
410
+ 检查不一致性,并返回一个适合用作FEEDS设置的字典。
411
+
412
+ It handles:
413
+ 它处理:
414
+ - Output URIs (-o/--output or -O/--overwrite-output options)
415
+ - Output format (-t option, deprecated)
416
+ - Format specified in the URI (e.g., file.json:json)
417
+ - Format inferred from file extension (e.g., file.json)
418
+ - Overwrite flag (-O/--overwrite-output option)
419
+
420
+ Args:
421
+ settings: Project settings object.
422
+ 项目设置对象。
423
+ output: List of output URIs from -o/--output options.
424
+ 来自-o/--output选项的输出URI列表。
425
+ output_format: Output format from -t option (deprecated).
426
+ 来自-t选项的输出格式(已弃用)。
427
+ Defaults to None.
428
+ 默认为None。
429
+ overwrite_output: List of output URIs from -O/--overwrite-output options.
430
+ 来自-O/--overwrite-output选项的输出URI列表。
431
+ Defaults to None.
432
+ 默认为None。
433
+
434
+ Returns:
435
+ dict: A dictionary suitable for use as the FEEDS setting.
436
+ 适合用作FEEDS设置的字典。
437
+
438
+ Raises:
439
+ UsageError: If there are inconsistencies in the provided parameters.
440
+ 如果提供的参数中存在不一致。
138
441
  """
442
+ # Get the list of valid output formats from settings
443
+ # 从设置中获取有效输出格式的列表
139
444
  valid_output_formats = without_none_values(
140
445
  settings.getwithbase('FEED_EXPORTERS')
141
446
  ).keys()
142
447
 
143
448
  def check_valid_format(output_format):
449
+ """
450
+ Check if the output format is valid and raise an error if not.
451
+ 检查输出格式是否有效,如果无效则引发错误。
452
+ """
144
453
  if output_format not in valid_output_formats:
145
454
  raise UsageError(
146
455
  f"Unrecognized output format '{output_format}'. "
@@ -149,6 +458,8 @@ def feed_process_params_from_cli(settings, output, output_format=None,
149
458
  "<URI>:<FORMAT>) or as a file extension."
150
459
  )
151
460
 
461
+ # Handle -O/--overwrite-output option
462
+ # 处理-O/--overwrite-output选项
152
463
  overwrite = False
153
464
  if overwrite_output:
154
465
  if output:
@@ -158,6 +469,8 @@ def feed_process_params_from_cli(settings, output, output_format=None,
158
469
  output = overwrite_output
159
470
  overwrite = True
160
471
 
472
+ # Handle -t option (deprecated)
473
+ # 处理-t选项(已弃用)
161
474
  if output_format:
162
475
  if len(output) == 1:
163
476
  check_valid_format(output_format)
@@ -174,22 +487,37 @@ def feed_process_params_from_cli(settings, output, output_format=None,
174
487
  'URIs are specified'
175
488
  )
176
489
 
490
+ # Process each output URI
491
+ # 处理每个输出URI
177
492
  result = {}
178
493
  for element in output:
494
+ # Try to extract format from URI (e.g., file.json:json)
495
+ # 尝试从URI中提取格式(例如,file.json:json)
179
496
  try:
180
497
  feed_uri, feed_format = element.rsplit(':', 1)
181
498
  except ValueError:
499
+ # If no format in URI, infer from file extension
500
+ # 如果URI中没有格式,从文件扩展名推断
182
501
  feed_uri = element
183
502
  feed_format = os.path.splitext(element)[1].replace('.', '')
184
503
  else:
504
+ # Special case for stdout
505
+ # stdout的特殊情况
185
506
  if feed_uri == '-':
186
507
  feed_uri = 'stdout:'
508
+
509
+ # Validate the format
510
+ # 验证格式
187
511
  check_valid_format(feed_format)
512
+
513
+ # Add to result dictionary
514
+ # 添加到结果字典
188
515
  result[feed_uri] = {'format': feed_format}
189
516
  if overwrite:
190
517
  result[feed_uri]['overwrite'] = True
191
518
 
192
519
  # FEEDS setting should take precedence over the matching CLI options
520
+ # FEEDS设置应优先于匹配的CLI选项
193
521
  result.update(settings.getdict('FEEDS'))
194
522
 
195
523
  return result