aio-scrapy 2.1.4__py3-none-any.whl → 2.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -41
- aio_scrapy-2.1.6.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +187 -3
- aioscrapy/core/downloader/handlers/curl_cffi.py +124 -3
- aioscrapy/core/downloader/handlers/httpx.py +133 -3
- aioscrapy/core/downloader/handlers/pyhttpx.py +132 -3
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +313 -13
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
aioscrapy/utils/conf.py
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration utilities for aioscrapy.
|
|
3
|
+
aioscrapy的配置实用工具。
|
|
4
|
+
|
|
5
|
+
This module provides utility functions for working with aioscrapy configuration.
|
|
6
|
+
It includes functions for handling component lists, command-line arguments,
|
|
7
|
+
finding configuration files, and processing feed export parameters.
|
|
8
|
+
此模块提供了用于处理aioscrapy配置的实用函数。
|
|
9
|
+
它包括用于处理组件列表、命令行参数、查找配置文件和处理Feed导出参数的函数。
|
|
10
|
+
"""
|
|
11
|
+
|
|
1
12
|
import numbers
|
|
2
13
|
import os
|
|
3
14
|
import sys
|
|
@@ -13,14 +24,64 @@ from aioscrapy.utils.python import without_none_values
|
|
|
13
24
|
|
|
14
25
|
|
|
15
26
|
def build_component_list(compdict, custom=None, convert=update_classpath):
|
|
16
|
-
"""
|
|
27
|
+
"""
|
|
28
|
+
Compose a component list from a dictionary mapping classes to their order.
|
|
29
|
+
从将类映射到其顺序的字典中组合组件列表。
|
|
30
|
+
|
|
31
|
+
This function builds an ordered list of components from a dictionary that maps
|
|
32
|
+
component classes to their order values. Components with lower order values
|
|
33
|
+
come first in the resulting list. Components with None values are excluded.
|
|
34
|
+
此函数从将组件类映射到其顺序值的字典构建有序组件列表。
|
|
35
|
+
具有较低顺序值的组件在结果列表中排在前面。值为None的组件将被排除。
|
|
17
36
|
|
|
37
|
+
The function also handles class path updates through the convert function,
|
|
38
|
+
which by default uses update_classpath to handle deprecated class paths.
|
|
39
|
+
该函数还通过convert函数处理类路径更新,默认情况下使用update_classpath
|
|
40
|
+
处理已弃用的类路径。
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
compdict: Dictionary mapping component classes to order values.
|
|
44
|
+
将组件类映射到顺序值的字典。
|
|
45
|
+
Values should be real numbers or None.
|
|
46
|
+
值应为实数或None。
|
|
47
|
+
custom: Additional components to include, either as a dictionary to update
|
|
48
|
+
compdict, or as a list/tuple of components (for backward compatibility).
|
|
49
|
+
要包含的其他组件,可以是用于更新compdict的字典,
|
|
50
|
+
也可以是组件的列表/元组(用于向后兼容性)。
|
|
51
|
+
Defaults to None.
|
|
52
|
+
默认为None。
|
|
53
|
+
convert: Function to convert/update class paths.
|
|
54
|
+
用于转换/更新类路径的函数。
|
|
55
|
+
Defaults to update_classpath.
|
|
56
|
+
默认为update_classpath。
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
list: Ordered list of component classes.
|
|
60
|
+
有序的组件类列表。
|
|
61
|
+
|
|
62
|
+
Raises:
|
|
63
|
+
ValueError: If multiple component paths convert to the same object,
|
|
64
|
+
or if a component value is not a real number or None.
|
|
65
|
+
如果多个组件路径转换为同一对象,
|
|
66
|
+
或者如果组件值不是实数或None。
|
|
67
|
+
"""
|
|
18
68
|
def _check_components(complist):
|
|
69
|
+
"""
|
|
70
|
+
Check that no two components in the list convert to the same object.
|
|
71
|
+
检查列表中没有两个组件转换为同一对象。
|
|
72
|
+
"""
|
|
19
73
|
if len({convert(c) for c in complist}) != len(complist):
|
|
20
74
|
raise ValueError(f'Some paths in {complist!r} convert to the same object, '
|
|
21
75
|
'please update your settings')
|
|
22
76
|
|
|
23
77
|
def _map_keys(compdict):
|
|
78
|
+
"""
|
|
79
|
+
Convert all keys in the component dictionary using the convert function.
|
|
80
|
+
使用convert函数转换组件字典中的所有键。
|
|
81
|
+
|
|
82
|
+
Handles both BaseSettings objects and regular dictionaries.
|
|
83
|
+
处理BaseSettings对象和常规字典。
|
|
84
|
+
"""
|
|
24
85
|
if isinstance(compdict, BaseSettings):
|
|
25
86
|
compbs = BaseSettings()
|
|
26
87
|
for k, v in compdict.items():
|
|
@@ -38,13 +99,17 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
|
|
|
38
99
|
return {convert(k): v for k, v in compdict.items()}
|
|
39
100
|
|
|
40
101
|
def _validate_values(compdict):
|
|
41
|
-
"""
|
|
102
|
+
"""
|
|
103
|
+
Fail if a value in the components dict is not a real number or None.
|
|
104
|
+
如果组件字典中的值不是实数或None,则失败。
|
|
105
|
+
"""
|
|
42
106
|
for name, value in compdict.items():
|
|
43
107
|
if value is not None and not isinstance(value, numbers.Real):
|
|
44
108
|
raise ValueError(f'Invalid value {value} for component {name}, '
|
|
45
109
|
'please provide a real number or None instead')
|
|
46
110
|
|
|
47
111
|
# BEGIN Backward compatibility for old (base, custom) call signature
|
|
112
|
+
# 开始向后兼容旧的(base, custom)调用签名
|
|
48
113
|
if isinstance(custom, (list, tuple)):
|
|
49
114
|
_check_components(custom)
|
|
50
115
|
return type(custom)(convert(c) for c in custom)
|
|
@@ -52,41 +117,138 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
|
|
|
52
117
|
if custom is not None:
|
|
53
118
|
compdict.update(custom)
|
|
54
119
|
# END Backward compatibility
|
|
120
|
+
# 结束向后兼容
|
|
55
121
|
|
|
122
|
+
# Validate all values in the dictionary
|
|
123
|
+
# 验证字典中的所有值
|
|
56
124
|
_validate_values(compdict)
|
|
125
|
+
|
|
126
|
+
# Convert keys and remove None values
|
|
127
|
+
# 转换键并删除None值
|
|
57
128
|
compdict = without_none_values(_map_keys(compdict))
|
|
129
|
+
|
|
130
|
+
# Sort components by their order values and return just the component classes
|
|
131
|
+
# 按组件的顺序值排序,并仅返回组件类
|
|
58
132
|
return [k for k, v in sorted(compdict.items(), key=itemgetter(1))]
|
|
59
133
|
|
|
60
134
|
|
|
61
135
|
def arglist_to_dict(arglist):
    """
    Convert a list of ``key=value`` strings into a dictionary.

    Each string is split at its first equals sign only, so a value may itself
    contain ``=``. When the same key appears more than once, the last
    occurrence wins.

    Args:
        arglist: Iterable of strings in the form ``'key=value'``,
            e.g. ``['arg1=val1', 'arg2=val2']``.

    Returns:
        dict: Mapping of each key to its value,
            e.g. ``{'arg1': 'val1', 'arg2': 'val2'}``.

    Raises:
        ValueError: If any string does not contain an equals sign. The
            message names the offending argument.
    """
    parsed = {}
    for arg in arglist:
        key, separator, value = arg.partition('=')
        if not separator:
            # Fail with a message that points at the bad argument instead of
            # the generic "dictionary update sequence" error dict() raises.
            raise ValueError(
                f"Invalid argument {arg!r}: expected the form 'key=value'"
            )
        parsed[key] = value
    return parsed
|
|
66
164
|
|
|
67
165
|
|
|
68
166
|
def closest_aioscrapy_cfg(path='.', prevpath=None):
    """
    Return the path of the closest ``aioscrapy.cfg``, searching upwards.

    Starting at ``path``, each directory is made absolute and checked for an
    ``aioscrapy.cfg`` entry; if none exists the search moves to the parent
    directory. The search stops when the parent of a directory is the
    directory itself (the filesystem root).

    The walk is iterative, so the depth of the starting directory cannot
    exhaust the interpreter's recursion limit.

    Args:
        path: Directory to start the search from. Defaults to ``'.'``.
        prevpath: The previously examined directory. Kept for backward
            compatibility with the former recursive signature; when it
            equals ``path`` the search ends immediately. Defaults to None.

    Returns:
        str: Absolute path to the closest ``aioscrapy.cfg``, or an empty
        string if none is found.
    """
    # path == prevpath means dirname() no longer moved us up: root reached.
    while path != prevpath:
        path = os.path.abspath(path)
        candidate = os.path.join(path, 'aioscrapy.cfg')
        if os.path.exists(candidate):
            return candidate
        # Step to the parent, remembering where we came from.
        path, prevpath = os.path.dirname(path), path
    return ''
|
|
79
211
|
|
|
80
212
|
|
|
81
213
|
def init_env(project='default', set_syspath=True):
|
|
82
|
-
"""Initialize environment to use command-line tool from inside a project
|
|
83
|
-
dir. This sets the Scrapy settings module and modifies the Python path to
|
|
84
|
-
be able to locate the project module.
|
|
85
214
|
"""
|
|
215
|
+
Initialize environment for running aioscrapy from inside a project directory.
|
|
216
|
+
初始化环境,以便从项目目录内运行aioscrapy。
|
|
217
|
+
|
|
218
|
+
This function sets up the environment for running aioscrapy commands from within
|
|
219
|
+
a project directory. It:
|
|
220
|
+
1. Sets the AIOSCRAPY_SETTINGS_MODULE environment variable based on the project
|
|
221
|
+
2. Adds the project directory to sys.path if needed
|
|
222
|
+
|
|
223
|
+
此函数设置环境,以便从项目目录内运行aioscrapy命令。它:
|
|
224
|
+
1. 根据项目设置AIOSCRAPY_SETTINGS_MODULE环境变量
|
|
225
|
+
2. 如果需要,将项目目录添加到sys.path
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
project: The project name to use for settings lookup in scrapy.cfg.
|
|
229
|
+
用于在scrapy.cfg中查找设置的项目名称。
|
|
230
|
+
Defaults to 'default'.
|
|
231
|
+
默认为'default'。
|
|
232
|
+
set_syspath: Whether to add the project directory to sys.path.
|
|
233
|
+
是否将项目目录添加到sys.path。
|
|
234
|
+
Defaults to True.
|
|
235
|
+
默认为True。
|
|
236
|
+
"""
|
|
237
|
+
# Get the configuration from scrapy.cfg
|
|
238
|
+
# 从scrapy.cfg获取配置
|
|
86
239
|
cfg = get_config()
|
|
240
|
+
|
|
241
|
+
# Set the settings module environment variable if defined in the config
|
|
242
|
+
# 如果在配置中定义,则设置设置模块环境变量
|
|
87
243
|
if cfg.has_option('settings', project):
|
|
88
244
|
os.environ['AIOSCRAPY_SETTINGS_MODULE'] = cfg.get('settings', project)
|
|
245
|
+
|
|
246
|
+
# Find the closest aioscrapy.cfg file
|
|
247
|
+
# 查找最近的aioscrapy.cfg文件
|
|
89
248
|
closest = closest_aioscrapy_cfg()
|
|
249
|
+
|
|
250
|
+
# If a config file was found, add its directory to sys.path if needed
|
|
251
|
+
# 如果找到配置文件,则在需要时将其目录添加到sys.path
|
|
90
252
|
if closest:
|
|
91
253
|
projdir = os.path.dirname(closest)
|
|
92
254
|
if set_syspath and projdir not in sys.path:
|
|
@@ -94,53 +256,200 @@ def init_env(project='default', set_syspath=True):
|
|
|
94
256
|
|
|
95
257
|
|
|
96
258
|
def get_config(use_closest=True):
    """
    Load the aioscrapy configuration into a ConfigParser.

    The candidate file paths come from ``get_sources(use_closest)`` and are
    handed to ``ConfigParser.read``, which skips files it cannot open.

    Args:
        use_closest: Passed through to ``get_sources``; when true the closest
            ``aioscrapy.cfg`` is included among the sources.
            Defaults to True.

    Returns:
        ConfigParser: Parser populated from whichever sources were readable.
    """
    parser = ConfigParser()
    parser.read(get_sources(use_closest))
    return parser
|
|
102
292
|
|
|
103
293
|
|
|
104
294
|
def get_sources(use_closest=True):
    """
    Return the list of configuration file paths to try, in order.

    The list always contains, in this order:

    1. ``/etc/scrapy.cfg``
    2. ``c:\\scrapy\\scrapy.cfg``
    3. ``<XDG_CONFIG_HOME>/scrapy.cfg``, where ``XDG_CONFIG_HOME`` falls
       back to ``~/.config`` when the variable is unset or empty
    4. ``~/.scrapy.cfg`` (with ``~`` expanded)

    When ``use_closest`` is true, the result of ``closest_aioscrapy_cfg()``
    is appended as a fifth entry.

    NOTE(review): the fixed locations use the ``scrapy.cfg`` file name while
    the project-local file is ``aioscrapy.cfg`` -- confirm this is intended.

    Args:
        use_closest: Whether to append the closest ``aioscrapy.cfg``.
            Defaults to True.

    Returns:
        list: File paths to check for configuration. Existence of the paths
        is not verified here.
    """
    config_root = os.environ.get('XDG_CONFIG_HOME') or os.path.expanduser('~/.config')
    home_cfg = os.path.expanduser('~/.scrapy.cfg')

    candidates = [
        '/etc/scrapy.cfg',
        r'c:\scrapy\scrapy.cfg',
        config_root + '/scrapy.cfg',
        home_cfg,
    ]
    if not use_closest:
        return candidates

    candidates.append(closest_aioscrapy_cfg())
    return candidates
|
|
115
343
|
|
|
116
344
|
|
|
117
345
|
def feed_complete_default_values_from_settings(feed, settings):
    """
    Return a copy of a feed configuration with missing options filled in.

    The input ``feed`` is copied, and every option below that is absent from
    it is set from ``settings``. Options already present in ``feed`` are
    kept as they are.

    Options and their sources:
        - ``batch_item_count``: ``settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT')``
        - ``encoding``: ``settings['FEED_EXPORT_ENCODING']``
        - ``fields``: ``settings.getlist('FEED_EXPORT_FIELDS')``, or None
          when that list is empty
        - ``store_empty``: ``settings.getbool('FEED_STORE_EMPTY')``
        - ``uri_params``: ``settings['FEED_URI_PARAMS']``
        - ``item_export_kwargs``: a new empty dict
        - ``indent``: None when ``settings['FEED_EXPORT_INDENT']`` is None,
          otherwise ``settings.getint('FEED_EXPORT_INDENT')``

    Args:
        feed: Feed export configuration dictionary; it is not modified.
        settings: Settings object providing ``getint``, ``getlist``,
            ``getbool`` and item access.

    Returns:
        dict: New dictionary with the feed configuration plus defaults.
    """
    completed = feed.copy()

    # "indent" may legitimately be None, in which case getint() is skipped.
    if settings["FEED_EXPORT_INDENT"] is None:
        indent_default = None
    else:
        indent_default = settings.getint("FEED_EXPORT_INDENT")

    defaults = (
        ("batch_item_count", settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT')),
        ("encoding", settings["FEED_EXPORT_ENCODING"]),
        ("fields", settings.getlist("FEED_EXPORT_FIELDS") or None),
        ("store_empty", settings.getbool("FEED_STORE_EMPTY")),
        ("uri_params", settings["FEED_URI_PARAMS"]),
        ("item_export_kwargs", dict()),
        ("indent", indent_default),
    )
    for option, default in defaults:
        completed.setdefault(option, default)

    return completed
|
|
130
398
|
|
|
131
399
|
|
|
132
400
|
def feed_process_params_from_cli(settings, output, output_format=None,
|
|
133
401
|
overwrite_output=None):
|
|
134
402
|
"""
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
403
|
+
Process feed export parameters from command-line arguments.
|
|
404
|
+
处理来自命令行参数的Feed导出参数。
|
|
405
|
+
|
|
406
|
+
This function processes feed export parameters provided via command-line arguments
|
|
407
|
+
(from the 'crawl' or 'runspider' commands), checks for inconsistencies, and
|
|
408
|
+
returns a dictionary suitable to be used as the FEEDS setting.
|
|
409
|
+
此函数处理通过命令行参数提供的Feed导出参数(来自'crawl'或'runspider'命令),
|
|
410
|
+
检查不一致性,并返回一个适合用作FEEDS设置的字典。
|
|
411
|
+
|
|
412
|
+
It handles:
|
|
413
|
+
它处理:
|
|
414
|
+
- Output URIs (-o/--output or -O/--overwrite-output options)
|
|
415
|
+
- Output format (-t option, deprecated)
|
|
416
|
+
- Format specified in the URI (e.g., file.json:json)
|
|
417
|
+
- Format inferred from file extension (e.g., file.json)
|
|
418
|
+
- Overwrite flag (-O/--overwrite-output option)
|
|
419
|
+
|
|
420
|
+
Args:
|
|
421
|
+
settings: Project settings object.
|
|
422
|
+
项目设置对象。
|
|
423
|
+
output: List of output URIs from -o/--output options.
|
|
424
|
+
来自-o/--output选项的输出URI列表。
|
|
425
|
+
output_format: Output format from -t option (deprecated).
|
|
426
|
+
来自-t选项的输出格式(已弃用)。
|
|
427
|
+
Defaults to None.
|
|
428
|
+
默认为None。
|
|
429
|
+
overwrite_output: List of output URIs from -O/--overwrite-output options.
|
|
430
|
+
来自-O/--overwrite-output选项的输出URI列表。
|
|
431
|
+
Defaults to None.
|
|
432
|
+
默认为None。
|
|
433
|
+
|
|
434
|
+
Returns:
|
|
435
|
+
dict: A dictionary suitable for use as the FEEDS setting.
|
|
436
|
+
适合用作FEEDS设置的字典。
|
|
437
|
+
|
|
438
|
+
Raises:
|
|
439
|
+
UsageError: If there are inconsistencies in the provided parameters.
|
|
440
|
+
如果提供的参数中存在不一致。
|
|
138
441
|
"""
|
|
442
|
+
# Get the list of valid output formats from settings
|
|
443
|
+
# 从设置中获取有效输出格式的列表
|
|
139
444
|
valid_output_formats = without_none_values(
|
|
140
445
|
settings.getwithbase('FEED_EXPORTERS')
|
|
141
446
|
).keys()
|
|
142
447
|
|
|
143
448
|
def check_valid_format(output_format):
|
|
449
|
+
"""
|
|
450
|
+
Check if the output format is valid and raise an error if not.
|
|
451
|
+
检查输出格式是否有效,如果无效则引发错误。
|
|
452
|
+
"""
|
|
144
453
|
if output_format not in valid_output_formats:
|
|
145
454
|
raise UsageError(
|
|
146
455
|
f"Unrecognized output format '{output_format}'. "
|
|
@@ -149,6 +458,8 @@ def feed_process_params_from_cli(settings, output, output_format=None,
|
|
|
149
458
|
"<URI>:<FORMAT>) or as a file extension."
|
|
150
459
|
)
|
|
151
460
|
|
|
461
|
+
# Handle -O/--overwrite-output option
|
|
462
|
+
# 处理-O/--overwrite-output选项
|
|
152
463
|
overwrite = False
|
|
153
464
|
if overwrite_output:
|
|
154
465
|
if output:
|
|
@@ -158,6 +469,8 @@ def feed_process_params_from_cli(settings, output, output_format=None,
|
|
|
158
469
|
output = overwrite_output
|
|
159
470
|
overwrite = True
|
|
160
471
|
|
|
472
|
+
# Handle -t option (deprecated)
|
|
473
|
+
# 处理-t选项(已弃用)
|
|
161
474
|
if output_format:
|
|
162
475
|
if len(output) == 1:
|
|
163
476
|
check_valid_format(output_format)
|
|
@@ -174,22 +487,37 @@ def feed_process_params_from_cli(settings, output, output_format=None,
|
|
|
174
487
|
'URIs are specified'
|
|
175
488
|
)
|
|
176
489
|
|
|
490
|
+
# Process each output URI
|
|
491
|
+
# 处理每个输出URI
|
|
177
492
|
result = {}
|
|
178
493
|
for element in output:
|
|
494
|
+
# Try to extract format from URI (e.g., file.json:json)
|
|
495
|
+
# 尝试从URI中提取格式(例如,file.json:json)
|
|
179
496
|
try:
|
|
180
497
|
feed_uri, feed_format = element.rsplit(':', 1)
|
|
181
498
|
except ValueError:
|
|
499
|
+
# If no format in URI, infer from file extension
|
|
500
|
+
# 如果URI中没有格式,从文件扩展名推断
|
|
182
501
|
feed_uri = element
|
|
183
502
|
feed_format = os.path.splitext(element)[1].replace('.', '')
|
|
184
503
|
else:
|
|
504
|
+
# Special case for stdout
|
|
505
|
+
# stdout的特殊情况
|
|
185
506
|
if feed_uri == '-':
|
|
186
507
|
feed_uri = 'stdout:'
|
|
508
|
+
|
|
509
|
+
# Validate the format
|
|
510
|
+
# 验证格式
|
|
187
511
|
check_valid_format(feed_format)
|
|
512
|
+
|
|
513
|
+
# Add to result dictionary
|
|
514
|
+
# 添加到结果字典
|
|
188
515
|
result[feed_uri] = {'format': feed_format}
|
|
189
516
|
if overwrite:
|
|
190
517
|
result[feed_uri]['overwrite'] = True
|
|
191
518
|
|
|
192
519
|
# FEEDS setting should take precedence over the matching CLI options
|
|
520
|
+
# FEEDS设置应优先于匹配的CLI选项
|
|
193
521
|
result.update(settings.getdict('FEEDS'))
|
|
194
522
|
|
|
195
523
|
return result
|