aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/project.py
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Project utility functions for aioscrapy.
|
|
3
|
+
aioscrapy的项目实用函数。
|
|
4
|
+
|
|
5
|
+
This module provides utility functions for working with aioscrapy projects.
|
|
6
|
+
It includes functions for determining if code is running inside a project,
|
|
7
|
+
accessing project data directories, and loading project settings.
|
|
8
|
+
此模块提供了用于处理aioscrapy项目的实用函数。
|
|
9
|
+
它包括用于确定代码是否在项目内运行、访问项目数据目录和加载项目设置的函数。
|
|
10
|
+
"""
|
|
11
|
+
|
|
1
12
|
import os
|
|
2
13
|
import warnings
|
|
3
14
|
|
|
@@ -9,73 +20,221 @@ from aioscrapy.settings import Settings
|
|
|
9
20
|
from aioscrapy.exceptions import NotConfigured, AioScrapyDeprecationWarning
|
|
10
21
|
|
|
11
22
|
|
|
23
|
+
# Environment variable name that defines the settings module path
|
|
24
|
+
# 定义设置模块路径的环境变量名称
|
|
12
25
|
ENVVAR = 'AIOSCRAPY_SETTINGS_MODULE'
|
|
26
|
+
|
|
27
|
+
# Configuration section name for data directory settings in scrapy.cfg
|
|
28
|
+
# scrapy.cfg中数据目录设置的配置部分名称
|
|
13
29
|
DATADIR_CFG_SECTION = 'datadir'
|
|
14
30
|
|
|
15
31
|
|
|
16
32
|
def inside_project():
|
|
33
|
+
"""
|
|
34
|
+
Check if the code is running inside an aioscrapy project.
|
|
35
|
+
检查代码是否在aioscrapy项目内运行。
|
|
36
|
+
|
|
37
|
+
This function determines if the current code is running inside an aioscrapy project
|
|
38
|
+
by checking either:
|
|
39
|
+
1. If the AIOSCRAPY_SETTINGS_MODULE environment variable is set and the module can be imported
|
|
40
|
+
2. If a scrapy.cfg file can be found in the current directory or any parent directory
|
|
41
|
+
|
|
42
|
+
此函数通过以下方式确定当前代码是否在aioscrapy项目内运行:
|
|
43
|
+
1. 检查AIOSCRAPY_SETTINGS_MODULE环境变量是否设置且模块可以导入
|
|
44
|
+
2. 检查当前目录或任何父目录中是否可以找到scrapy.cfg文件
|
|
45
|
+
|
|
46
|
+
Returns:
|
|
47
|
+
bool: True if running inside a project, False otherwise.
|
|
48
|
+
如果在项目内运行则为True,否则为False。
|
|
49
|
+
"""
|
|
50
|
+
# Check if the settings module environment variable is set
|
|
51
|
+
# 检查是否设置了设置模块环境变量
|
|
17
52
|
scrapy_module = os.environ.get('AIOSCRAPY_SETTINGS_MODULE')
|
|
18
53
|
if scrapy_module is not None:
|
|
19
54
|
try:
|
|
55
|
+
# Try to import the settings module
|
|
56
|
+
# 尝试导入设置模块
|
|
20
57
|
import_module(scrapy_module)
|
|
21
58
|
except ImportError as exc:
|
|
59
|
+
# If import fails, warn but continue checking
|
|
60
|
+
# 如果导入失败,发出警告但继续检查
|
|
22
61
|
warnings.warn(f"Cannot import scrapy settings module {scrapy_module}: {exc}")
|
|
23
62
|
else:
|
|
63
|
+
# If import succeeds, we're inside a project
|
|
64
|
+
# 如果导入成功,我们在项目内
|
|
24
65
|
return True
|
|
66
|
+
|
|
67
|
+
# If no settings module is configured, or importing it failed, fall back to looking for a scrapy.cfg file
|
|
68
|
+
# 如果没有设置模块或导入失败,检查scrapy.cfg文件
|
|
25
69
|
return bool(closest_aioscrapy_cfg())
|
|
26
70
|
|
|
27
71
|
|
|
28
72
|
def project_data_dir(project='default'):
|
|
29
|
-
"""
|
|
73
|
+
"""
|
|
74
|
+
Get the project data directory, creating it if it doesn't exist.
|
|
75
|
+
获取项目数据目录,如果不存在则创建它。
|
|
76
|
+
|
|
77
|
+
This function returns the path to the data directory for the specified project.
|
|
78
|
+
The directory is determined in the following order:
|
|
79
|
+
1. From the [datadir] section in scrapy.cfg if it exists
|
|
80
|
+
2. Otherwise, defaults to a '.scrapy' directory in the same directory as scrapy.cfg
|
|
81
|
+
|
|
82
|
+
此函数返回指定项目的数据目录的路径。
|
|
83
|
+
目录按以下顺序确定:
|
|
84
|
+
1. 如果存在,从scrapy.cfg的[datadir]部分获取
|
|
85
|
+
2. 否则,默认为与scrapy.cfg相同目录中的'.scrapy'目录
|
|
86
|
+
|
|
87
|
+
The function will create the directory if it doesn't exist.
|
|
88
|
+
如果目录不存在,该函数将创建它。
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
project: The project name to get the data directory for.
|
|
92
|
+
要获取数据目录的项目名称。
|
|
93
|
+
Defaults to 'default'.
|
|
94
|
+
默认为'default'。
|
|
95
|
+
|
|
96
|
+
Returns:
|
|
97
|
+
str: The absolute path to the project data directory.
|
|
98
|
+
项目数据目录的绝对路径。
|
|
99
|
+
|
|
100
|
+
Raises:
|
|
101
|
+
NotConfigured: If not running inside a project or if scrapy.cfg cannot be found.
|
|
102
|
+
如果不在项目内运行或找不到scrapy.cfg。
|
|
103
|
+
"""
|
|
104
|
+
# Check if we're inside a project
|
|
105
|
+
# 检查我们是否在项目内
|
|
30
106
|
if not inside_project():
|
|
31
107
|
raise NotConfigured("Not inside a project")
|
|
108
|
+
|
|
109
|
+
# Get the configuration
|
|
110
|
+
# 获取配置
|
|
32
111
|
cfg = get_config()
|
|
112
|
+
|
|
113
|
+
# Try to get the data directory from the config
|
|
114
|
+
# 尝试从配置中获取数据目录
|
|
33
115
|
if cfg.has_option(DATADIR_CFG_SECTION, project):
|
|
34
116
|
d = cfg.get(DATADIR_CFG_SECTION, project)
|
|
35
117
|
else:
|
|
118
|
+
# Fall back to default location
|
|
119
|
+
# 回退到默认位置
|
|
36
120
|
scrapy_cfg = closest_aioscrapy_cfg()
|
|
37
121
|
if not scrapy_cfg:
|
|
38
122
|
raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
|
|
39
123
|
d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
|
|
124
|
+
|
|
125
|
+
# Create the directory if it doesn't exist
|
|
126
|
+
# 如果目录不存在,则创建它
|
|
40
127
|
if not exists(d):
|
|
41
128
|
os.makedirs(d)
|
|
129
|
+
|
|
42
130
|
return d
|
|
43
131
|
|
|
44
132
|
|
|
45
133
|
def data_path(path, createdir=False):
|
|
46
134
|
"""
|
|
47
|
-
|
|
48
|
-
|
|
135
|
+
Get the absolute path for a file within the project data directory.
|
|
136
|
+
获取项目数据目录中文件的绝对路径。
|
|
137
|
+
|
|
138
|
+
This function resolves a path relative to the project data directory.
|
|
139
|
+
If the given path is already absolute, it returns it unmodified.
|
|
140
|
+
If not inside a project, it uses a '.scrapy' directory in the current directory.
|
|
141
|
+
|
|
142
|
+
此函数解析相对于项目数据目录的路径。
|
|
143
|
+
如果给定的路径已经是绝对路径,则原样返回。
|
|
144
|
+
如果不在项目内,则使用当前目录中的'.scrapy'目录。
|
|
145
|
+
|
|
146
|
+
Args:
|
|
147
|
+
path: The path to resolve. Can be absolute or relative.
|
|
148
|
+
要解析的路径。可以是绝对路径或相对路径。
|
|
149
|
+
createdir: Whether to create the directory if it doesn't exist.
|
|
150
|
+
如果目录不存在,是否创建它。
|
|
151
|
+
Defaults to False.
|
|
152
|
+
默认为False。
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
str: The absolute path to the file or directory.
|
|
156
|
+
文件或目录的绝对路径。
|
|
49
157
|
"""
|
|
158
|
+
# If the path is not absolute, make it relative to the data directory
|
|
159
|
+
# 如果路径不是绝对的,使其相对于数据目录
|
|
50
160
|
if not isabs(path):
|
|
51
161
|
if inside_project():
|
|
162
|
+
# If inside a project, use the project data directory
|
|
163
|
+
# 如果在项目内,使用项目数据目录
|
|
52
164
|
path = join(project_data_dir(), path)
|
|
53
165
|
else:
|
|
166
|
+
# Otherwise, use a .scrapy directory in the current directory
|
|
167
|
+
# 否则,使用当前目录中的.scrapy目录
|
|
54
168
|
path = join('.scrapy', path)
|
|
169
|
+
|
|
170
|
+
# Create the directory if requested and it doesn't exist
|
|
171
|
+
# 如果请求且目录不存在,则创建目录
|
|
55
172
|
if createdir and not exists(path):
|
|
56
173
|
os.makedirs(path)
|
|
174
|
+
|
|
57
175
|
return path
|
|
58
176
|
|
|
59
177
|
|
|
60
178
|
def get_project_settings():
|
|
179
|
+
"""
|
|
180
|
+
Get a Settings instance with the project settings.
|
|
181
|
+
获取包含项目设置的Settings实例。
|
|
182
|
+
|
|
183
|
+
This function loads the project settings from the module specified in the
|
|
184
|
+
AIOSCRAPY_SETTINGS_MODULE environment variable. If the variable is not set,
|
|
185
|
+
it tries to initialize the environment using the project name from the
|
|
186
|
+
AIOSCRAPY_PROJECT environment variable (defaulting to 'default').
|
|
187
|
+
|
|
188
|
+
此函数从AIOSCRAPY_SETTINGS_MODULE环境变量指定的模块加载项目设置。
|
|
189
|
+
如果未设置该变量,它会尝试使用AIOSCRAPY_PROJECT环境变量中的项目名称
|
|
190
|
+
(默认为'default')初始化环境。
|
|
191
|
+
|
|
192
|
+
The function also handles environment variables prefixed with AIOSCRAPY_
|
|
193
|
+
as a way to override settings, though this method is deprecated.
|
|
194
|
+
|
|
195
|
+
该函数还处理以AIOSCRAPY_为前缀的环境变量作为覆盖设置的方式,
|
|
196
|
+
尽管此方法已弃用。
|
|
197
|
+
|
|
198
|
+
Returns:
|
|
199
|
+
Settings: A Settings instance with the project settings loaded.
|
|
200
|
+
加载了项目设置的Settings实例。
|
|
201
|
+
"""
|
|
202
|
+
# Initialize the environment if the settings module is not set
|
|
203
|
+
# 如果未设置设置模块,则初始化环境
|
|
61
204
|
if ENVVAR not in os.environ:
|
|
62
205
|
project = os.environ.get('AIOSCRAPY_PROJECT', 'default')
|
|
63
206
|
init_env(project)
|
|
64
207
|
|
|
208
|
+
# Create a new Settings instance
|
|
209
|
+
# 创建一个新的Settings实例
|
|
65
210
|
settings = Settings()
|
|
211
|
+
|
|
212
|
+
# Load settings from the module specified in the environment variable
|
|
213
|
+
# 从环境变量指定的模块加载设置
|
|
66
214
|
settings_module_path = os.environ.get(ENVVAR)
|
|
67
215
|
if settings_module_path:
|
|
68
216
|
settings.setmodule(settings_module_path, priority='project')
|
|
69
217
|
|
|
218
|
+
# Collect all environment variables prefixed with AIOSCRAPY_, keyed by their name with that prefix stripped
|
|
219
|
+
# 获取所有以AIOSCRAPY_为前缀的环境变量
|
|
70
220
|
aioscrapy_envvars = {k[10:]: v for k, v in os.environ.items() if
|
|
71
221
|
k.startswith('AIOSCRAPY_')}
|
|
222
|
+
|
|
223
|
+
# Names (without the AIOSCRAPY_ prefix) of environment variables that are legitimate and are not setting overrides
|
|
224
|
+
# 定义哪些环境变量是有效的且不是设置
|
|
72
225
|
valid_envvars = {
|
|
73
226
|
'CHECK',
|
|
74
227
|
'PROJECT',
|
|
75
228
|
'PYTHON_SHELL',
|
|
76
229
|
'SETTINGS_MODULE',
|
|
77
230
|
}
|
|
231
|
+
|
|
232
|
+
# Find environment variables that are being used to override settings
|
|
233
|
+
# 查找用于覆盖设置的环境变量
|
|
78
234
|
setting_envvars = {k for k in aioscrapy_envvars if k not in valid_envvars}
|
|
235
|
+
|
|
236
|
+
# Warn about deprecated usage of environment variables to override settings
|
|
237
|
+
# 警告关于使用环境变量覆盖设置的已弃用用法
|
|
79
238
|
if setting_envvars:
|
|
80
239
|
setting_envvar_list = ', '.join(sorted(setting_envvars))
|
|
81
240
|
warnings.warn(
|
|
@@ -84,6 +243,9 @@ def get_project_settings():
|
|
|
84
243
|
f'currently defined: {setting_envvar_list}',
|
|
85
244
|
AioScrapyDeprecationWarning
|
|
86
245
|
)
|
|
246
|
+
|
|
247
|
+
# Apply the environment variable overrides
|
|
248
|
+
# 应用环境变量覆盖
|
|
87
249
|
settings.setdict(aioscrapy_envvars, priority='project')
|
|
88
250
|
|
|
89
251
|
return settings
|
aioscrapy/utils/python.py
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Python utility functions for aioscrapy.
|
|
3
|
+
aioscrapy的Python实用函数。
|
|
4
|
+
|
|
5
|
+
This module contains essential utility functions for Python that enhance the standard library.
|
|
6
|
+
It provides functions for type conversion, string handling, regular expression searching,
|
|
7
|
+
memoization, and other common operations.
|
|
8
|
+
此模块包含增强标准库的Python基本实用函数。
|
|
9
|
+
它提供了用于类型转换、字符串处理、正则表达式搜索、记忆化和其他常见操作的函数。
|
|
3
10
|
"""
|
|
4
11
|
import gc
|
|
5
12
|
import re
|
|
@@ -12,31 +19,72 @@ from aioscrapy.utils.decorators import deprecated
|
|
|
12
19
|
|
|
13
20
|
def is_listlike(x):
|
|
14
21
|
"""
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
22
|
+
Check if the given object is list-like (iterable but not a string or bytes).
|
|
23
|
+
检查给定对象是否类似列表(可迭代但不是字符串或字节)。
|
|
24
|
+
|
|
25
|
+
This function determines if an object is iterable (has the __iter__ method)
|
|
26
|
+
but is not a string or bytes object. This is useful for functions that
|
|
27
|
+
should treat strings differently from other iterables.
|
|
28
|
+
此函数确定对象是否可迭代(具有__iter__方法)但不是字符串或字节对象。
|
|
29
|
+
这对于应该将字符串与其他可迭代对象区别对待的函数很有用。
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
x: The object to check.
|
|
33
|
+
要检查的对象。
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
bool: True if the object is list-like, False otherwise.
|
|
37
|
+
如果对象类似列表则为True,否则为False。
|
|
38
|
+
|
|
39
|
+
Examples:
|
|
40
|
+
>>> is_listlike("foo")
|
|
41
|
+
False
|
|
42
|
+
>>> is_listlike(5)
|
|
43
|
+
False
|
|
44
|
+
>>> is_listlike(b"foo")
|
|
45
|
+
False
|
|
46
|
+
>>> is_listlike([b"foo"])
|
|
47
|
+
True
|
|
48
|
+
>>> is_listlike((b"foo",))
|
|
49
|
+
True
|
|
50
|
+
>>> is_listlike({})
|
|
51
|
+
True
|
|
52
|
+
>>> is_listlike(set())
|
|
53
|
+
True
|
|
54
|
+
>>> is_listlike((x for x in range(3)))
|
|
55
|
+
True
|
|
56
|
+
>>> is_listlike(range(5))
|
|
57
|
+
True
|
|
33
58
|
"""
|
|
34
59
|
return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))
|
|
35
60
|
|
|
36
61
|
|
|
37
62
|
def to_unicode(text, encoding=None, errors='strict'):
|
|
38
|
-
"""
|
|
39
|
-
|
|
63
|
+
"""
|
|
64
|
+
Convert a bytes object to a unicode (str) object.
|
|
65
|
+
将字节对象转换为unicode(str)对象。
|
|
66
|
+
|
|
67
|
+
This function converts a bytes object to a unicode string using the specified
|
|
68
|
+
encoding. If the input is already a unicode string, it is returned unchanged.
|
|
69
|
+
此函数使用指定的编码将字节对象转换为unicode字符串。
|
|
70
|
+
如果输入已经是unicode字符串,则原样返回。
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
text: The text to convert. Must be a bytes or str object.
|
|
74
|
+
要转换的文本。必须是bytes或str对象。
|
|
75
|
+
encoding: The encoding to use for decoding bytes. Defaults to 'utf-8'.
|
|
76
|
+
用于解码字节的编码。默认为'utf-8'。
|
|
77
|
+
errors: The error handling scheme for decoding. Defaults to 'strict'.
|
|
78
|
+
解码的错误处理方案。默认为'strict'。
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
str: The unicode representation of the input text.
|
|
82
|
+
输入文本的unicode表示。
|
|
83
|
+
|
|
84
|
+
Raises:
|
|
85
|
+
TypeError: If the input is not a bytes or str object.
|
|
86
|
+
如果输入不是bytes或str对象。
|
|
87
|
+
"""
|
|
40
88
|
if isinstance(text, str):
|
|
41
89
|
return text
|
|
42
90
|
if not isinstance(text, (bytes, str)):
|
|
@@ -48,8 +96,31 @@ def to_unicode(text, encoding=None, errors='strict'):
|
|
|
48
96
|
|
|
49
97
|
|
|
50
98
|
def to_bytes(text, encoding=None, errors='strict'):
|
|
51
|
-
"""
|
|
52
|
-
|
|
99
|
+
"""
|
|
100
|
+
Convert a unicode (str) object to a bytes object.
|
|
101
|
+
将unicode(str)对象转换为字节对象。
|
|
102
|
+
|
|
103
|
+
This function converts a unicode string to a bytes object using the specified
|
|
104
|
+
encoding. If the input is already a bytes object, it is returned unchanged.
|
|
105
|
+
此函数使用指定的编码将unicode字符串转换为字节对象。
|
|
106
|
+
如果输入已经是字节对象,则原样返回。
|
|
107
|
+
|
|
108
|
+
Args:
|
|
109
|
+
text: The text to convert. Must be a str or bytes object.
|
|
110
|
+
要转换的文本。必须是str或bytes对象。
|
|
111
|
+
encoding: The encoding to use for encoding the string. Defaults to 'utf-8'.
|
|
112
|
+
用于编码字符串的编码。默认为'utf-8'。
|
|
113
|
+
errors: The error handling scheme for encoding. Defaults to 'strict'.
|
|
114
|
+
编码的错误处理方案。默认为'strict'。
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
bytes: The binary representation of the input text.
|
|
118
|
+
输入文本的二进制表示。
|
|
119
|
+
|
|
120
|
+
Raises:
|
|
121
|
+
TypeError: If the input is not a str or bytes object.
|
|
122
|
+
如果输入不是str或bytes对象。
|
|
123
|
+
"""
|
|
53
124
|
if isinstance(text, bytes):
|
|
54
125
|
return text
|
|
55
126
|
if not isinstance(text, str):
|
|
@@ -62,24 +133,71 @@ def to_bytes(text, encoding=None, errors='strict'):
|
|
|
62
133
|
|
|
63
134
|
@deprecated('to_unicode')
|
|
64
135
|
def to_native_str(text, encoding=None, errors='strict'):
|
|
65
|
-
"""
|
|
136
|
+
"""
|
|
137
|
+
Convert text to native string type (str).
|
|
138
|
+
将文本转换为本地字符串类型(str)。
|
|
139
|
+
|
|
140
|
+
This function is deprecated. Use to_unicode() instead.
|
|
141
|
+
此函数已弃用。请改用to_unicode()。
|
|
142
|
+
|
|
143
|
+
Args:
|
|
144
|
+
text: The text to convert.
|
|
145
|
+
要转换的文本。
|
|
146
|
+
encoding: The encoding to use for decoding bytes. Defaults to 'utf-8'.
|
|
147
|
+
用于解码字节的编码。默认为'utf-8'。
|
|
148
|
+
errors: The error handling scheme. Defaults to 'strict'.
|
|
149
|
+
错误处理方案。默认为'strict'。
|
|
150
|
+
|
|
151
|
+
Returns:
|
|
152
|
+
str: The string representation of the input text.
|
|
153
|
+
输入文本的字符串表示。
|
|
154
|
+
"""
|
|
66
155
|
return to_unicode(text, encoding, errors)
|
|
67
156
|
|
|
68
157
|
|
|
69
158
|
def re_rsearch(pattern, text, chunk_size=1024):
|
|
70
159
|
"""
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
160
|
+
Perform a reverse search in text using a regular expression pattern.
|
|
161
|
+
使用正则表达式模式在文本中执行反向搜索。
|
|
162
|
+
|
|
163
|
+
This function searches for the last occurrence of a pattern in a text,
|
|
164
|
+
starting from the end. Since the re module does not provide reverse search
|
|
165
|
+
functionality, this function implements it by searching in chunks from the
|
|
166
|
+
end of the text for efficiency.
|
|
167
|
+
此函数从文本末尾开始搜索模式的最后一次出现。
|
|
168
|
+
由于re模块不提供反向搜索功能,此函数通过从文本末尾的块中搜索来实现它,以提高效率。
|
|
169
|
+
|
|
170
|
+
The algorithm works as follows:
|
|
171
|
+
1. Extract a chunk of 'chunk_size' kilobytes from the end of the text
|
|
172
|
+
2. Search for the pattern in this chunk
|
|
173
|
+
3. If not found, extract another chunk further from the end and search again
|
|
174
|
+
4. Continue until a match is found or the entire text has been searched
|
|
175
|
+
|
|
176
|
+
算法工作原理如下:
|
|
177
|
+
1. 从文本末尾提取'chunk_size'千字节的块
|
|
178
|
+
2. 在此块中搜索模式
|
|
179
|
+
3. 如果未找到,从末尾进一步提取另一个块并再次搜索
|
|
180
|
+
4. 继续直到找到匹配项或已搜索整个文本
|
|
181
|
+
|
|
182
|
+
Args:
|
|
183
|
+
pattern: The regular expression pattern to search for.
|
|
184
|
+
要搜索的正则表达式模式。
|
|
185
|
+
Can be a string or a compiled regex pattern.
|
|
186
|
+
可以是字符串或已编译的正则表达式模式。
|
|
187
|
+
text: The text to search in.
|
|
188
|
+
要搜索的文本。
|
|
189
|
+
chunk_size: The size of each chunk in kilobytes. Defaults to 1024 (1MB).
|
|
190
|
+
每个块的大小(千字节)。默认为1024(1MB)。
|
|
82
191
|
|
|
192
|
+
Returns:
|
|
193
|
+
tuple or None: If a match is found, returns a tuple (start, end) with the
|
|
194
|
+
positions of the match in the entire text. If no match is found,
|
|
195
|
+
returns None.
|
|
196
|
+
如果找到匹配项,返回一个元组(start, end),其中包含整个文本中
|
|
197
|
+
匹配项的位置。如果未找到匹配项,返回None。
|
|
198
|
+
"""
|
|
199
|
+
# Inner function to generate chunks from the end of the text
|
|
200
|
+
# 从文本末尾生成块的内部函数
|
|
83
201
|
def _chunk_iter():
|
|
84
202
|
offset = len(text)
|
|
85
203
|
while True:
|
|
@@ -89,60 +207,152 @@ def re_rsearch(pattern, text, chunk_size=1024):
|
|
|
89
207
|
yield (text[offset:], offset)
|
|
90
208
|
yield (text, 0)
|
|
91
209
|
|
|
210
|
+
# Compile the pattern if it's a string
|
|
211
|
+
# 如果模式是字符串,则编译它
|
|
92
212
|
if isinstance(pattern, str):
|
|
93
213
|
pattern = re.compile(pattern)
|
|
94
214
|
|
|
215
|
+
# Search for the pattern in each chunk
|
|
216
|
+
# 在每个块中搜索模式
|
|
95
217
|
for chunk, offset in _chunk_iter():
|
|
96
218
|
matches = [match for match in pattern.finditer(chunk)]
|
|
97
219
|
if matches:
|
|
220
|
+
# Return the position of the last match in the chunk
|
|
221
|
+
# 返回块中最后一个匹配项的位置
|
|
98
222
|
start, end = matches[-1].span()
|
|
99
223
|
return offset + start, offset + end
|
|
100
224
|
return None
|
|
101
225
|
|
|
102
226
|
|
|
103
227
|
def memoizemethod_noargs(method):
|
|
104
|
-
"""Decorator to cache the result of a method (without arguments) using a
|
|
105
|
-
weak reference to its object
|
|
106
228
|
"""
|
|
229
|
+
Decorator to cache the result of a method with no arguments.
|
|
230
|
+
装饰器,用于缓存无参数方法的结果。
|
|
231
|
+
|
|
232
|
+
This decorator caches the result of a method call using a weak reference
|
|
233
|
+
to the object. This means the cache entry will be automatically removed
|
|
234
|
+
when the object is garbage collected.
|
|
235
|
+
此装饰器使用对对象的弱引用缓存方法调用的结果。
|
|
236
|
+
这意味着当对象被垃圾回收时,缓存条目将自动被移除。
|
|
237
|
+
|
|
238
|
+
Note that while the decorated method can accept arguments, the caching
|
|
239
|
+
is based only on the object instance, not on the arguments. This means
|
|
240
|
+
that the result of the first call on an instance is cached and returned for every later call on that instance, whatever arguments are passed.
|
|
241
|
+
请注意,虽然装饰的方法可以接受参数,但缓存仅基于对象实例,而不是参数。
|
|
242
|
+
这意味着无论参数如何,只有第一次调用的结果将被缓存。
|
|
243
|
+
|
|
244
|
+
Args:
|
|
245
|
+
method: The method to be decorated.
|
|
246
|
+
要装饰的方法。
|
|
247
|
+
|
|
248
|
+
Returns:
|
|
249
|
+
function: A new method that caches its result.
|
|
250
|
+
缓存其结果的新方法。
|
|
251
|
+
"""
|
|
252
|
+
# Create a weak key dictionary to store cached results
|
|
253
|
+
# 创建一个弱键字典来存储缓存的结果
|
|
107
254
|
cache = weakref.WeakKeyDictionary()
|
|
108
255
|
|
|
109
256
|
@wraps(method)
|
|
110
257
|
def new_method(self, *args, **kwargs):
|
|
258
|
+
# If the result is not cached for this object, call the method
|
|
259
|
+
# 如果此对象的结果未缓存,则调用该方法
|
|
111
260
|
if self not in cache:
|
|
112
261
|
cache[self] = method(self, *args, **kwargs)
|
|
262
|
+
# Return the cached result
|
|
263
|
+
# 返回缓存的结果
|
|
113
264
|
return cache[self]
|
|
114
265
|
|
|
115
266
|
return new_method
|
|
116
267
|
|
|
117
268
|
|
|
118
269
|
def without_none_values(iterable):
|
|
119
|
-
"""
|
|
270
|
+
"""
|
|
271
|
+
Return a copy of an iterable with all None entries removed.
|
|
272
|
+
返回一个去除所有None条目的可迭代对象的副本。
|
|
273
|
+
|
|
274
|
+
This function creates a new iterable of the same type as the input,
|
|
275
|
+
but with all None values removed. It handles both mappings (like dictionaries)
|
|
276
|
+
and sequences (like lists, tuples). Note that a mapping input always yields a plain dict, whatever its original type.
|
|
277
|
+
此函数创建一个与输入相同类型的新可迭代对象,但移除了所有None值。
|
|
278
|
+
它处理映射(如字典)和序列(如列表、元组)。
|
|
279
|
+
|
|
280
|
+
Args:
|
|
281
|
+
iterable: The iterable to process. Can be a mapping or a sequence.
|
|
282
|
+
要处理的可迭代对象。可以是映射或序列。
|
|
283
|
+
|
|
284
|
+
Returns:
|
|
285
|
+
A plain dict for mapping input, otherwise a new iterable of the same type as the input, with all None values removed.
|
|
286
|
+
一个与输入相同类型的新可迭代对象,但移除了所有None值。
|
|
120
287
|
|
|
121
|
-
|
|
122
|
-
|
|
288
|
+
Examples:
|
|
289
|
+
>>> without_none_values({'a': 1, 'b': None, 'c': 3})
|
|
290
|
+
{'a': 1, 'c': 3}
|
|
291
|
+
>>> without_none_values([1, None, 3, None, 5])
|
|
292
|
+
[1, 3, 5]
|
|
123
293
|
"""
|
|
124
294
|
try:
|
|
295
|
+
# Handle mappings (objects with .items() method)
|
|
296
|
+
# 处理映射(具有.items()方法的对象)
|
|
125
297
|
return {k: v for k, v in iterable.items() if v is not None}
|
|
126
298
|
except AttributeError:
|
|
299
|
+
# Handle sequences and other iterables
|
|
300
|
+
# 处理序列和其他可迭代对象
|
|
127
301
|
return type(iterable)((v for v in iterable if v is not None))
|
|
128
302
|
|
|
129
303
|
|
|
130
304
|
def global_object_name(obj):
|
|
131
305
|
"""
|
|
132
|
-
Return full name of a global object.
|
|
306
|
+
Return the fully qualified name of a global object.
|
|
307
|
+
返回全局对象的完全限定名称。
|
|
133
308
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
309
|
+
This function returns the full name of an object, including its module path.
|
|
310
|
+
It's useful for debugging, logging, and serialization purposes.
|
|
311
|
+
此函数返回对象的完整名称,包括其模块路径。
|
|
312
|
+
它对于调试、日志记录和序列化目的很有用。
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
obj: The object to get the name for. Must have __module__ and __name__ attributes.
|
|
316
|
+
要获取名称的对象。必须具有__module__和__name__属性。
|
|
317
|
+
|
|
318
|
+
Returns:
|
|
319
|
+
str: The fully qualified name of the object in the format "module.name".
|
|
320
|
+
对象的完全限定名称,格式为"module.name"。
|
|
321
|
+
|
|
322
|
+
Examples:
|
|
323
|
+
>>> from aioscrapy import Request
|
|
324
|
+
>>> global_object_name(Request)
|
|
325
|
+
'aioscrapy.http.request.Request'
|
|
137
326
|
"""
|
|
138
327
|
return f"{obj.__module__}.{obj.__name__}"
|
|
139
328
|
|
|
140
329
|
|
|
141
330
|
if hasattr(sys, "pypy_version_info"):
|
|
142
331
|
def garbage_collect():
|
|
332
|
+
"""
|
|
333
|
+
Force garbage collection, with special handling for PyPy.
|
|
334
|
+
强制垃圾回收,对PyPy进行特殊处理。
|
|
335
|
+
|
|
336
|
+
On PyPy, collecting weak references can take two collection cycles,
|
|
337
|
+
so this function calls gc.collect() twice.
|
|
338
|
+
在PyPy上,收集弱引用可能需要两个收集周期,
|
|
339
|
+
因此此函数调用gc.collect()两次。
|
|
340
|
+
"""
|
|
143
341
|
# Collecting weakreferences can take two collections on PyPy.
|
|
342
|
+
# 在PyPy上收集弱引用可能需要两次收集。
|
|
144
343
|
gc.collect()
|
|
145
344
|
gc.collect()
|
|
146
345
|
else:
|
|
147
346
|
def garbage_collect():
|
|
347
|
+
"""
|
|
348
|
+
Force garbage collection.
|
|
349
|
+
强制垃圾回收。
|
|
350
|
+
|
|
351
|
+
This function calls Python's garbage collector to force a collection cycle.
|
|
352
|
+
It's useful when you need to ensure that objects with no references are
|
|
353
|
+
properly cleaned up, especially those with __del__ methods or weak references.
|
|
354
|
+
此函数调用Python的垃圾收集器来强制进行收集周期。
|
|
355
|
+
当您需要确保没有引用的对象被正确清理时,这很有用,
|
|
356
|
+
特别是那些具有__del__方法或弱引用的对象。
|
|
357
|
+
"""
|
|
148
358
|
gc.collect()
|