aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/http/response/text.py
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This module implements the TextResponse class which adds encoding handling and
|
|
3
|
-
discovering (through HTTP headers) to base Response class.
|
|
4
1
|
|
|
5
|
-
|
|
2
|
+
"""
|
|
3
|
+
Text response implementation for aioscrapy.
|
|
4
|
+
aioscrapy的文本响应实现。
|
|
5
|
+
|
|
6
|
+
This module provides the TextResponse class, which extends the base Response
|
|
7
|
+
to handle text content. It supports encoding detection, text extraction,
|
|
8
|
+
and provides methods for CSS and XPath selectors.
|
|
9
|
+
此模块提供了TextResponse类,它扩展了基本Response以处理文本内容。
|
|
10
|
+
它支持编码检测、文本提取,并提供CSS和XPath选择器的方法。
|
|
6
11
|
"""
|
|
7
12
|
|
|
8
13
|
import warnings
|
|
@@ -23,46 +28,190 @@ from aioscrapy.http.response import Response
|
|
|
23
28
|
from aioscrapy.utils.python import memoizemethod_noargs, to_unicode
|
|
24
29
|
from aioscrapy.utils.response import get_base_url
|
|
25
30
|
|
|
31
|
+
# Sentinel object to indicate that a value hasn't been cached yet
|
|
32
|
+
# 表示值尚未缓存的哨兵对象
|
|
26
33
|
_NONE = object()
|
|
27
34
|
|
|
28
35
|
|
|
29
36
|
class TextResponse(Response):
|
|
37
|
+
"""
|
|
38
|
+
A Response subclass that adds support for text processing.
|
|
39
|
+
添加文本处理支持的Response子类。
|
|
40
|
+
|
|
41
|
+
This class extends the base Response to handle text content, with features for:
|
|
42
|
+
此类扩展了基本Response以处理文本内容,具有以下功能:
|
|
43
|
+
|
|
44
|
+
- Automatic encoding detection
|
|
45
|
+
自动编码检测
|
|
46
|
+
- Unicode conversion
|
|
47
|
+
Unicode转换
|
|
48
|
+
- CSS and XPath selectors
|
|
49
|
+
CSS和XPath选择器
|
|
50
|
+
- JSON parsing
|
|
51
|
+
JSON解析
|
|
52
|
+
- Enhanced link following
|
|
53
|
+
增强的链接跟踪
|
|
54
|
+
"""
|
|
30
55
|
|
|
56
|
+
# Default encoding to use if no encoding is specified or detected
|
|
57
|
+
# 如果未指定或检测到编码,则使用的默认编码
|
|
31
58
|
_DEFAULT_ENCODING = 'ascii'
|
|
59
|
+
|
|
60
|
+
# Cache for decoded JSON content
|
|
61
|
+
# 解码的JSON内容的缓存
|
|
32
62
|
_cached_decoded_json = _NONE
|
|
33
63
|
|
|
34
64
|
def __init__(self, *args, encoding=None, **kwargs):
|
|
65
|
+
"""
|
|
66
|
+
Initialize a TextResponse.
|
|
67
|
+
初始化TextResponse。
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
*args: Positional arguments passed to the Response constructor.
|
|
71
|
+
传递给Response构造函数的位置参数。
|
|
72
|
+
encoding: The encoding of the response. If None, it will be auto-detected.
|
|
73
|
+
响应的编码。如果为None,将自动检测。
|
|
74
|
+
**kwargs: Keyword arguments passed to the Response constructor.
|
|
75
|
+
传递给Response构造函数的关键字参数。
|
|
76
|
+
"""
|
|
77
|
+
# The explicitly declared encoding
|
|
78
|
+
# 明确声明的编码
|
|
35
79
|
self._encoding = encoding
|
|
80
|
+
|
|
81
|
+
# Cache for inferred body encoding
|
|
82
|
+
# 推断的正文编码的缓存
|
|
36
83
|
self._cached_benc = None
|
|
84
|
+
|
|
85
|
+
# Cache for unicode body
|
|
86
|
+
# Unicode正文的缓存
|
|
37
87
|
self._cached_ubody = None
|
|
88
|
+
|
|
89
|
+
# Cache for selector
|
|
90
|
+
# 选择器的缓存
|
|
38
91
|
self._cached_selector = None
|
|
92
|
+
|
|
39
93
|
super().__init__(*args, **kwargs)
|
|
40
94
|
|
|
41
95
|
def _set_url(self, url):
|
|
96
|
+
"""
|
|
97
|
+
Set the response URL, ensuring it's properly encoded.
|
|
98
|
+
设置响应URL,确保其正确编码。
|
|
99
|
+
|
|
100
|
+
This method overrides the base Response._set_url to handle string URLs
|
|
101
|
+
by converting them to unicode using the response's encoding.
|
|
102
|
+
此方法重写了基本Response._set_url,通过使用响应的编码将字符串URL转换为unicode来处理它们。
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
url: The URL to set.
|
|
106
|
+
要设置的URL。
|
|
107
|
+
|
|
108
|
+
Raises:
|
|
109
|
+
TypeError: If the URL is not a string (raised by the parent method).
|
|
110
|
+
如果URL不是字符串(由父方法引发)。
|
|
111
|
+
"""
|
|
42
112
|
if isinstance(url, str):
|
|
113
|
+
# Convert the URL to unicode using the response's encoding
|
|
114
|
+
# 使用响应的编码将URL转换为unicode
|
|
43
115
|
self._url = to_unicode(url, self.encoding)
|
|
44
116
|
else:
|
|
117
|
+
# Let the parent class handle non-string URLs
|
|
118
|
+
# 让父类处理非字符串URL
|
|
45
119
|
super()._set_url(url)
|
|
46
120
|
|
|
47
121
|
def _set_body(self, body):
|
|
122
|
+
"""
|
|
123
|
+
Set the response body, handling both bytes and string inputs.
|
|
124
|
+
设置响应体,处理字节和字符串输入。
|
|
125
|
+
|
|
126
|
+
This method overrides the base Response._set_body to handle string bodies
|
|
127
|
+
by encoding them using the response's encoding.
|
|
128
|
+
此方法重写了基本Response._set_body,通过使用响应的编码对字符串正文进行编码来处理它们。
|
|
129
|
+
|
|
130
|
+
Args:
|
|
131
|
+
body: The body to set, either as bytes or string.
|
|
132
|
+
要设置的正文,可以是字节或字符串。
|
|
133
|
+
|
|
134
|
+
Raises:
|
|
135
|
+
TypeError: If the body is a string but no encoding is specified.
|
|
136
|
+
如果正文是字符串但未指定编码。
|
|
137
|
+
"""
|
|
138
|
+
# Initialize with empty bytes for encoding detection
|
|
139
|
+
# 初始化为空字节以进行编码检测
|
|
48
140
|
self._body = b'' # used by encoding detection
|
|
141
|
+
|
|
49
142
|
if isinstance(body, str):
|
|
143
|
+
# Handle string bodies by encoding them
|
|
144
|
+
# 通过编码字符串正文来处理它们
|
|
50
145
|
if self._encoding is None:
|
|
51
146
|
raise TypeError('Cannot convert unicode body - '
|
|
52
147
|
f'{type(self).__name__} has no encoding')
|
|
53
148
|
self._body = body.encode(self._encoding)
|
|
54
149
|
else:
|
|
150
|
+
# Let the parent class handle non-string bodies
|
|
151
|
+
# 让父类处理非字符串正文
|
|
55
152
|
super()._set_body(body)
|
|
56
153
|
|
|
57
154
|
def replace(self, *args, **kwargs):
|
|
155
|
+
"""
|
|
156
|
+
Create a new TextResponse with the same attributes except for those given new values.
|
|
157
|
+
创建一个新的TextResponse,除了给定的新值外,其他属性与当前TextResponse相同。
|
|
158
|
+
|
|
159
|
+
This method extends the base Response.replace() method to ensure that
|
|
160
|
+
the encoding is preserved when creating a new TextResponse.
|
|
161
|
+
此方法扩展了基本Response.replace()方法,以确保在创建新的TextResponse时保留编码。
|
|
162
|
+
|
|
163
|
+
Args:
|
|
164
|
+
*args: Positional arguments passed to the base replace() method.
|
|
165
|
+
传递给基本replace()方法的位置参数。
|
|
166
|
+
**kwargs: Keyword arguments passed to the base replace() method.
|
|
167
|
+
传递给基本replace()方法的关键字参数。
|
|
168
|
+
|
|
169
|
+
Returns:
|
|
170
|
+
TextResponse: A new TextResponse object.
|
|
171
|
+
一个新的TextResponse对象。
|
|
172
|
+
"""
|
|
173
|
+
# Ensure the encoding is preserved
|
|
174
|
+
# 确保编码被保留
|
|
58
175
|
kwargs.setdefault('encoding', self.encoding)
|
|
59
176
|
return Response.replace(self, *args, **kwargs)
|
|
60
177
|
|
|
61
178
|
@property
|
|
62
179
|
def encoding(self):
|
|
180
|
+
"""
|
|
181
|
+
Get the response encoding.
|
|
182
|
+
获取响应编码。
|
|
183
|
+
|
|
184
|
+
This property returns the encoding of the response, using a cascading approach:
|
|
185
|
+
1. First, try to get the explicitly declared encoding
|
|
186
|
+
2. If not available, try to infer the encoding from the body
|
|
187
|
+
此属性返回响应的编码,使用级联方法:
|
|
188
|
+
1. 首先,尝试获取明确声明的编码
|
|
189
|
+
2. 如果不可用,尝试从正文推断编码
|
|
190
|
+
|
|
191
|
+
Returns:
|
|
192
|
+
str: The response encoding.
|
|
193
|
+
响应编码。
|
|
194
|
+
"""
|
|
63
195
|
return self._declared_encoding() or self._body_inferred_encoding()
|
|
64
196
|
|
|
65
197
|
def _declared_encoding(self):
|
|
198
|
+
"""
|
|
199
|
+
Get the explicitly declared encoding.
|
|
200
|
+
获取明确声明的编码。
|
|
201
|
+
|
|
202
|
+
This method tries to find the encoding from various sources, in order:
|
|
203
|
+
1. The encoding specified in the constructor
|
|
204
|
+
2. The encoding specified in the Content-Type header
|
|
205
|
+
3. The encoding declared in the HTML/XML body
|
|
206
|
+
此方法尝试从各种来源按顺序查找编码:
|
|
207
|
+
1. 构造函数中指定的编码
|
|
208
|
+
2. Content-Type头部中指定的编码
|
|
209
|
+
3. HTML/XML正文中声明的编码
|
|
210
|
+
|
|
211
|
+
Returns:
|
|
212
|
+
str or None: The declared encoding, or None if not found.
|
|
213
|
+
声明的编码,如果未找到则为None。
|
|
214
|
+
"""
|
|
66
215
|
return (
|
|
67
216
|
self._encoding
|
|
68
217
|
or self._headers_encoding()
|
|
@@ -70,7 +219,17 @@ class TextResponse(Response):
|
|
|
70
219
|
)
|
|
71
220
|
|
|
72
221
|
def body_as_unicode(self):
|
|
73
|
-
"""
|
|
222
|
+
"""
|
|
223
|
+
Return the response body as unicode.
|
|
224
|
+
将响应体作为unicode返回。
|
|
225
|
+
|
|
226
|
+
This method is deprecated. Use the text property instead.
|
|
227
|
+
此方法已弃用。请改用text属性。
|
|
228
|
+
|
|
229
|
+
Returns:
|
|
230
|
+
str: The response body as unicode.
|
|
231
|
+
响应体作为unicode。
|
|
232
|
+
"""
|
|
74
233
|
warnings.warn('Response.body_as_unicode() is deprecated, '
|
|
75
234
|
'please use Response.text instead.',
|
|
76
235
|
AioScrapyDeprecationWarning, stacklevel=2)
|
|
@@ -78,35 +237,106 @@ class TextResponse(Response):
|
|
|
78
237
|
|
|
79
238
|
def json(self):
|
|
80
239
|
"""
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
240
|
+
Parse the response body as JSON.
|
|
241
|
+
将响应体解析为JSON。
|
|
242
|
+
|
|
243
|
+
This method deserializes the response body as a JSON document
|
|
244
|
+
and returns the corresponding Python object. The result is cached
|
|
245
|
+
for subsequent calls.
|
|
246
|
+
此方法将响应体反序列化为JSON文档,并返回相应的Python对象。
|
|
247
|
+
结果会被缓存以供后续调用。
|
|
248
|
+
|
|
249
|
+
Returns:
|
|
250
|
+
object: The deserialized JSON document.
|
|
251
|
+
反序列化的JSON文档。
|
|
252
|
+
|
|
253
|
+
Raises:
|
|
254
|
+
ValueError: If the body is not valid JSON.
|
|
255
|
+
如果正文不是有效的JSON。
|
|
84
256
|
"""
|
|
257
|
+
# Use cached result if available
|
|
258
|
+
# 如果可用,使用缓存的结果
|
|
85
259
|
if self._cached_decoded_json is _NONE:
|
|
86
260
|
self._cached_decoded_json = ujson.loads(self.text)
|
|
87
261
|
return self._cached_decoded_json
|
|
88
262
|
|
|
89
263
|
@property
|
|
90
264
|
def text(self):
|
|
91
|
-
"""
|
|
92
|
-
|
|
265
|
+
"""
|
|
266
|
+
Get the response body as unicode text.
|
|
267
|
+
将响应体作为unicode文本获取。
|
|
268
|
+
|
|
269
|
+
This property converts the response body to unicode using the detected
|
|
270
|
+
or specified encoding. The result is cached for subsequent access.
|
|
271
|
+
此属性使用检测到的或指定的编码将响应体转换为unicode。
|
|
272
|
+
结果会被缓存以供后续访问。
|
|
273
|
+
|
|
274
|
+
Returns:
|
|
275
|
+
str: The response body as unicode text.
|
|
276
|
+
响应体作为unicode文本。
|
|
277
|
+
"""
|
|
278
|
+
# Access self.encoding before _cached_ubody to make sure
|
|
93
279
|
# _body_inferred_encoding is called
|
|
280
|
+
# 在_cached_ubody之前访问self.encoding,以确保调用_body_inferred_encoding
|
|
94
281
|
if self._cached_ubody is None:
|
|
95
282
|
charset = f'charset={self.encoding}'
|
|
96
283
|
self._cached_ubody = html_to_unicode(charset, self.body)[1]
|
|
97
284
|
return self._cached_ubody
|
|
98
285
|
|
|
99
286
|
def urljoin(self, url):
|
|
100
|
-
"""
|
|
101
|
-
|
|
287
|
+
"""
|
|
288
|
+
Join this Response's url with a possible relative url.
|
|
289
|
+
将此Response的url与可能的相对url连接。
|
|
290
|
+
|
|
291
|
+
This method extends the base Response.urljoin() method to use the base URL
|
|
292
|
+
from the HTML document (if available) instead of the response URL.
|
|
293
|
+
此方法扩展了基本Response.urljoin()方法,使用HTML文档中的基本URL
|
|
294
|
+
(如果可用)而不是响应URL。
|
|
295
|
+
|
|
296
|
+
Args:
|
|
297
|
+
url: The URL to join. Can be a relative URL.
|
|
298
|
+
要连接的URL。可以是相对URL。
|
|
299
|
+
|
|
300
|
+
Returns:
|
|
301
|
+
str: The absolute URL.
|
|
302
|
+
绝对URL。
|
|
303
|
+
"""
|
|
304
|
+
# Use get_base_url to extract the base URL from the HTML document
|
|
305
|
+
# 使用get_base_url从HTML文档中提取基本URL
|
|
102
306
|
return urljoin(get_base_url(self), url)
|
|
103
307
|
|
|
104
308
|
@memoizemethod_noargs
|
|
105
309
|
def _headers_encoding(self):
|
|
310
|
+
"""
|
|
311
|
+
Get the encoding declared in the Content-Type header.
|
|
312
|
+
获取Content-Type头部中声明的编码。
|
|
313
|
+
|
|
314
|
+
This method extracts the charset parameter from the Content-Type header.
|
|
315
|
+
The result is memoized for performance.
|
|
316
|
+
此方法从Content-Type头部提取charset参数。
|
|
317
|
+
结果会被记忆化以提高性能。
|
|
318
|
+
|
|
319
|
+
Returns:
|
|
320
|
+
str or None: The encoding declared in the header, or None if not found.
|
|
321
|
+
头部中声明的编码,如果未找到则为None。
|
|
322
|
+
"""
|
|
106
323
|
content_type = self.headers.get('Content-Type', '')
|
|
107
324
|
return http_content_type_encoding(to_unicode(content_type))
|
|
108
325
|
|
|
109
326
|
def _body_inferred_encoding(self):
|
|
327
|
+
"""
|
|
328
|
+
Infer the encoding from the response body.
|
|
329
|
+
从响应体推断编码。
|
|
330
|
+
|
|
331
|
+
This method tries to detect the encoding from the response body
|
|
332
|
+
using various heuristics. The result is cached for subsequent calls.
|
|
333
|
+
此方法尝试使用各种启发式方法从响应体检测编码。
|
|
334
|
+
结果会被缓存以供后续调用。
|
|
335
|
+
|
|
336
|
+
Returns:
|
|
337
|
+
str: The inferred encoding.
|
|
338
|
+
推断的编码。
|
|
339
|
+
"""
|
|
110
340
|
if self._cached_benc is None:
|
|
111
341
|
content_type = to_unicode(self.headers.get('Content-Type', ''))
|
|
112
342
|
benc, ubody = html_to_unicode(content_type, self.body,
|
|
@@ -117,27 +347,105 @@ class TextResponse(Response):
|
|
|
117
347
|
return self._cached_benc
|
|
118
348
|
|
|
119
349
|
def _auto_detect_fun(self, text):
|
|
350
|
+
"""
|
|
351
|
+
Auto-detect the encoding of the given text.
|
|
352
|
+
自动检测给定文本的编码。
|
|
353
|
+
|
|
354
|
+
This method tries to decode the text using a sequence of common encodings
|
|
355
|
+
and returns the first one that succeeds.
|
|
356
|
+
此方法尝试使用一系列常见编码解码文本,并返回第一个成功的编码。
|
|
357
|
+
|
|
358
|
+
Args:
|
|
359
|
+
text: The text to detect the encoding for.
|
|
360
|
+
要检测编码的文本。
|
|
361
|
+
|
|
362
|
+
Returns:
|
|
363
|
+
str or None: The detected encoding, or None if none of the encodings work.
|
|
364
|
+
检测到的编码,如果没有编码有效则为None。
|
|
365
|
+
"""
|
|
366
|
+
# Try a sequence of common encodings
|
|
367
|
+
# 尝试一系列常见编码
|
|
120
368
|
for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
|
|
121
369
|
try:
|
|
122
370
|
text.decode(enc)
|
|
123
371
|
except UnicodeError:
|
|
124
372
|
continue
|
|
125
373
|
return resolve_encoding(enc)
|
|
374
|
+
return None
|
|
126
375
|
|
|
127
376
|
@memoizemethod_noargs
|
|
128
377
|
def _body_declared_encoding(self):
|
|
378
|
+
"""
|
|
379
|
+
Get the encoding declared in the HTML/XML body.
|
|
380
|
+
获取HTML/XML正文中声明的编码。
|
|
381
|
+
|
|
382
|
+
This method extracts the encoding from meta tags or XML declarations
|
|
383
|
+
in the response body. The result is memoized for performance.
|
|
384
|
+
此方法从响应体中的meta标签或XML声明中提取编码。
|
|
385
|
+
结果会被记忆化以提高性能。
|
|
386
|
+
|
|
387
|
+
Returns:
|
|
388
|
+
str or None: The encoding declared in the body, or None if not found.
|
|
389
|
+
正文中声明的编码,如果未找到则为None。
|
|
390
|
+
"""
|
|
129
391
|
return html_body_declared_encoding(self.body)
|
|
130
392
|
|
|
131
393
|
@property
|
|
132
394
|
def selector(self):
|
|
395
|
+
"""
|
|
396
|
+
Get a Selector for this response.
|
|
397
|
+
获取此响应的选择器。
|
|
398
|
+
|
|
399
|
+
This property creates a parsel.Selector instance for the response text,
|
|
400
|
+
which allows for XPath and CSS queries. The result is cached for
|
|
401
|
+
subsequent access.
|
|
402
|
+
此属性为响应文本创建一个parsel.Selector实例,允许XPath和CSS查询。
|
|
403
|
+
结果会被缓存以供后续访问。
|
|
404
|
+
|
|
405
|
+
Returns:
|
|
406
|
+
parsel.Selector: A Selector instance for this response.
|
|
407
|
+
此响应的Selector实例。
|
|
408
|
+
"""
|
|
133
409
|
if self._cached_selector is None:
|
|
134
410
|
self._cached_selector = Selector(self.text)
|
|
135
411
|
return self._cached_selector
|
|
136
412
|
|
|
137
413
|
def xpath(self, query, **kwargs):
|
|
414
|
+
"""
|
|
415
|
+
Apply the given XPath selector to this response's content.
|
|
416
|
+
将给定的XPath选择器应用于此响应的内容。
|
|
417
|
+
|
|
418
|
+
This is a shortcut method that creates a selector and applies the XPath query.
|
|
419
|
+
此方法是一个快捷方法,创建选择器并应用XPath查询。
|
|
420
|
+
|
|
421
|
+
Args:
|
|
422
|
+
query: The XPath query string.
|
|
423
|
+
XPath查询字符串。
|
|
424
|
+
**kwargs: Additional keyword arguments passed to the selector's xpath method.
|
|
425
|
+
传递给选择器的xpath方法的额外关键字参数。
|
|
426
|
+
|
|
427
|
+
Returns:
|
|
428
|
+
parsel.SelectorList: The result of the XPath query.
|
|
429
|
+
XPath查询的结果。
|
|
430
|
+
"""
|
|
138
431
|
return self.selector.xpath(query, **kwargs)
|
|
139
432
|
|
|
140
433
|
def css(self, query):
|
|
434
|
+
"""
|
|
435
|
+
Apply the given CSS selector to this response's content.
|
|
436
|
+
将给定的CSS选择器应用于此响应的内容。
|
|
437
|
+
|
|
438
|
+
This is a shortcut method that creates a selector and applies the CSS query.
|
|
439
|
+
此方法是一个快捷方法,创建选择器并应用CSS查询。
|
|
440
|
+
|
|
441
|
+
Args:
|
|
442
|
+
query: The CSS query string.
|
|
443
|
+
CSS查询字符串。
|
|
444
|
+
|
|
445
|
+
Returns:
|
|
446
|
+
parsel.SelectorList: The result of the CSS query.
|
|
447
|
+
CSS查询的结果。
|
|
448
|
+
"""
|
|
141
449
|
return self.selector.css(query)
|
|
142
450
|
|
|
143
451
|
def follow(self, url, callback=None, method='GET', headers=None, body=None,
|
|
@@ -145,20 +453,65 @@ class TextResponse(Response):
|
|
|
145
453
|
fingerprint=None, errback=None, cb_kwargs=None, flags=None):
|
|
146
454
|
# type: (...) -> Request
|
|
147
455
|
"""
|
|
148
|
-
Return a
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
456
|
+
Return a Request instance to follow a link.
|
|
457
|
+
返回一个Request实例以跟踪链接。
|
|
458
|
+
|
|
459
|
+
This method extends the base Response.follow() method to handle additional
|
|
460
|
+
URL types, including Selector objects for HTML elements and attributes.
|
|
461
|
+
此方法扩展了基本Response.follow()方法,以处理额外的URL类型,
|
|
462
|
+
包括HTML元素和属性的Selector对象。
|
|
463
|
+
|
|
464
|
+
The URL can be:
|
|
465
|
+
URL可以是:
|
|
466
|
+
|
|
467
|
+
* An absolute URL (string)
|
|
468
|
+
绝对URL(字符串)
|
|
469
|
+
* A relative URL (string)
|
|
470
|
+
相对URL(字符串)
|
|
471
|
+
* A Link object
|
|
472
|
+
Link对象
|
|
473
|
+
* A Selector object for a <link> or <a> element
|
|
474
|
+
<link>或<a>元素的Selector对象
|
|
475
|
+
* An attribute Selector (not SelectorList), e.g., from css('a::attr(href)')[0]
|
|
476
|
+
属性Selector(非SelectorList),例如,来自css('a::attr(href)')[0]
|
|
477
|
+
|
|
478
|
+
Args:
|
|
479
|
+
url: The URL to follow. Can be any of the types described above.
|
|
480
|
+
要跟踪的URL。可以是上述任何类型。
|
|
481
|
+
callback: A function to be called with the response from the request.
|
|
482
|
+
使用请求的响应调用的函数。
|
|
483
|
+
method: The HTTP method to use.
|
|
484
|
+
要使用的HTTP方法。
|
|
485
|
+
headers: The headers to use for the request.
|
|
486
|
+
请求使用的头部。
|
|
487
|
+
body: The body of the request.
|
|
488
|
+
请求的正文。
|
|
489
|
+
cookies: The cookies to send with the request.
|
|
490
|
+
与请求一起发送的Cookie。
|
|
491
|
+
meta: Extra data to pass to the request.
|
|
492
|
+
传递给请求的额外数据。
|
|
493
|
+
encoding: The encoding to use for the request. Defaults to this response's encoding.
|
|
494
|
+
请求使用的编码。默认为此响应的编码。
|
|
495
|
+
priority: The priority of the request.
|
|
496
|
+
请求的优先级。
|
|
497
|
+
dont_filter: Whether to filter duplicate requests.
|
|
498
|
+
是否过滤重复请求。
|
|
499
|
+
fingerprint: The fingerprint for the request.
|
|
500
|
+
请求的指纹。
|
|
501
|
+
errback: A function to be called if the request fails.
|
|
502
|
+
如果请求失败时调用的函数。
|
|
503
|
+
cb_kwargs: Additional keyword arguments to pass to the callback.
|
|
504
|
+
传递给回调的额外关键字参数。
|
|
505
|
+
flags: Flags for the request.
|
|
506
|
+
请求的标志。
|
|
507
|
+
|
|
508
|
+
Returns:
|
|
509
|
+
Request: A new Request instance.
|
|
510
|
+
一个新的Request实例。
|
|
511
|
+
|
|
512
|
+
Raises:
|
|
513
|
+
ValueError: If the URL is a SelectorList or cannot be extracted from a Selector.
|
|
514
|
+
如果URL是SelectorList或无法从Selector中提取。
|
|
162
515
|
"""
|
|
163
516
|
if isinstance(url, parsel.Selector):
|
|
164
517
|
url = _url_from_selector(url)
|
|
@@ -188,27 +541,78 @@ class TextResponse(Response):
|
|
|
188
541
|
css=None, xpath=None):
|
|
189
542
|
# type: (...) -> Generator[Request, None, None]
|
|
190
543
|
"""
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
544
|
+
Return a generator of Request instances to follow all links in urls.
|
|
545
|
+
返回一个Request实例的生成器,以跟踪urls中的所有链接。
|
|
546
|
+
|
|
547
|
+
This method extends the base Response.follow_all() method to handle additional
|
|
548
|
+
URL types and to support direct extraction of links using CSS or XPath selectors.
|
|
549
|
+
此方法扩展了基本Response.follow_all()方法,以处理额外的URL类型,
|
|
550
|
+
并支持使用CSS或XPath选择器直接提取链接。
|
|
551
|
+
|
|
552
|
+
The URLs can be provided in several ways:
|
|
553
|
+
URLs可以通过几种方式提供:
|
|
554
|
+
|
|
555
|
+
1. As a list in the 'urls' parameter, where each element can be:
|
|
556
|
+
作为'urls'参数中的列表,其中每个元素可以是:
|
|
557
|
+
* An absolute URL (string)
|
|
558
|
+
绝对URL(字符串)
|
|
559
|
+
* A relative URL (string)
|
|
560
|
+
相对URL(字符串)
|
|
561
|
+
* A Link object
|
|
562
|
+
Link对象
|
|
563
|
+
* A Selector object for a <link> or <a> element
|
|
564
|
+
<link>或<a>元素的Selector对象
|
|
565
|
+
* An attribute Selector (not SelectorList)
|
|
566
|
+
属性Selector(非SelectorList)
|
|
567
|
+
|
|
568
|
+
2. By providing a CSS selector in the 'css' parameter
|
|
569
|
+
通过在'css'参数中提供CSS选择器
|
|
570
|
+
|
|
571
|
+
3. By providing an XPath selector in the 'xpath' parameter
|
|
572
|
+
通过在'xpath'参数中提供XPath选择器
|
|
573
|
+
|
|
574
|
+
Note: Only one of 'urls', 'css', or 'xpath' should be provided.
|
|
575
|
+
注意:只应提供'urls'、'css'或'xpath'中的一个。
|
|
576
|
+
|
|
577
|
+
Args:
|
|
578
|
+
urls: An iterable of URLs to follow. Each can be any of the types described above.
|
|
579
|
+
要跟踪的URL的可迭代对象。每个可以是上述任何类型。
|
|
580
|
+
callback: A function to be called with the response from each request.
|
|
581
|
+
使用每个请求的响应调用的函数。
|
|
582
|
+
method: The HTTP method to use.
|
|
583
|
+
要使用的HTTP方法。
|
|
584
|
+
headers: The headers to use for the requests.
|
|
585
|
+
请求使用的头部。
|
|
586
|
+
body: The body of the requests.
|
|
587
|
+
请求的正文。
|
|
588
|
+
cookies: The cookies to send with the requests.
|
|
589
|
+
与请求一起发送的Cookie。
|
|
590
|
+
meta: Extra data to pass to the requests.
|
|
591
|
+
传递给请求的额外数据。
|
|
592
|
+
encoding: The encoding to use for the requests. Defaults to this response's encoding.
|
|
593
|
+
请求使用的编码。默认为此响应的编码。
|
|
594
|
+
priority: The priority of the requests.
|
|
595
|
+
请求的优先级。
|
|
596
|
+
dont_filter: Whether to filter duplicate requests.
|
|
597
|
+
是否过滤重复请求。
|
|
598
|
+
errback: A function to be called if the requests fail.
|
|
599
|
+
如果请求失败时调用的函数。
|
|
600
|
+
cb_kwargs: Additional keyword arguments to pass to the callback.
|
|
601
|
+
传递给回调的额外关键字参数。
|
|
602
|
+
flags: Flags for the requests.
|
|
603
|
+
请求的标志。
|
|
604
|
+
css: A CSS selector to extract links from this response.
|
|
605
|
+
从此响应中提取链接的CSS选择器。
|
|
606
|
+
xpath: An XPath selector to extract links from this response.
|
|
607
|
+
从此响应中提取链接的XPath选择器。
|
|
608
|
+
|
|
609
|
+
Returns:
|
|
610
|
+
Generator[Request, None, None]: A generator of Request instances.
|
|
611
|
+
Request实例的生成器。
|
|
612
|
+
|
|
613
|
+
Raises:
|
|
614
|
+
ValueError: If more than one of 'urls', 'css', or 'xpath' is provided.
|
|
615
|
+
如果提供了'urls'、'css'或'xpath'中的多个。
|
|
212
616
|
"""
|
|
213
617
|
arguments = [x for x in (urls, css, xpath) if x is not None]
|
|
214
618
|
if len(arguments) != 1:
|
|
@@ -245,21 +649,64 @@ class TextResponse(Response):
|
|
|
245
649
|
|
|
246
650
|
class _InvalidSelector(ValueError):
|
|
247
651
|
"""
|
|
248
|
-
Raised when a URL cannot be obtained from a Selector
|
|
652
|
+
Raised when a URL cannot be obtained from a Selector.
|
|
653
|
+
当无法从Selector获取URL时引发。
|
|
654
|
+
|
|
655
|
+
This exception is used internally by the _url_from_selector function
|
|
656
|
+
to indicate that a Selector object cannot be converted to a URL.
|
|
657
|
+
此异常由_url_from_selector函数内部使用,
|
|
658
|
+
表示无法将Selector对象转换为URL。
|
|
249
659
|
"""
|
|
250
660
|
|
|
251
661
|
|
|
252
662
|
def _url_from_selector(sel):
|
|
253
663
|
# type: (parsel.Selector) -> str
|
|
664
|
+
"""
|
|
665
|
+
Extract a URL from a Selector object.
|
|
666
|
+
从Selector对象中提取URL。
|
|
667
|
+
|
|
668
|
+
This function extracts a URL from different types of Selector objects:
|
|
669
|
+
此函数从不同类型的Selector对象中提取URL:
|
|
670
|
+
|
|
671
|
+
1. If the selector root is a string (e.g., from ::attr(href)), it returns that string
|
|
672
|
+
如果选择器根是字符串(例如,来自::attr(href)),则返回该字符串
|
|
673
|
+
2. If the selector is for an <a> or <link> element, it returns the href attribute
|
|
674
|
+
如果选择器是<a>或<link>元素,则返回href属性
|
|
675
|
+
|
|
676
|
+
Args:
|
|
677
|
+
sel: The Selector object to extract a URL from.
|
|
678
|
+
要从中提取URL的Selector对象。
|
|
679
|
+
|
|
680
|
+
Returns:
|
|
681
|
+
str: The extracted URL with whitespace stripped.
|
|
682
|
+
提取的URL,已去除空白。
|
|
683
|
+
|
|
684
|
+
Raises:
|
|
685
|
+
_InvalidSelector: If the URL cannot be extracted from the Selector.
|
|
686
|
+
如果无法从Selector中提取URL。
|
|
687
|
+
"""
|
|
254
688
|
if isinstance(sel.root, str):
|
|
255
|
-
# e.g
|
|
689
|
+
# For attribute selectors (e.g., ::attr(href) result)
|
|
690
|
+
# 对于属性选择器(例如,::attr(href)结果)
|
|
256
691
|
return strip_html5_whitespace(sel.root)
|
|
692
|
+
|
|
257
693
|
if not hasattr(sel.root, 'tag'):
|
|
694
|
+
# For selectors that don't represent HTML elements
|
|
695
|
+
# 对于不表示HTML元素的选择器
|
|
258
696
|
raise _InvalidSelector(f"Unsupported selector: {sel}")
|
|
697
|
+
|
|
259
698
|
if sel.root.tag not in ('a', 'link'):
|
|
699
|
+
# Only <a> and <link> elements are supported
|
|
700
|
+
# 只支持<a>和<link>元素
|
|
260
701
|
raise _InvalidSelector("Only <a> and <link> elements are supported; "
|
|
261
702
|
f"got <{sel.root.tag}>")
|
|
703
|
+
|
|
262
704
|
href = sel.root.get('href')
|
|
263
705
|
if href is None:
|
|
706
|
+
# The element has no href attribute
|
|
707
|
+
# 元素没有href属性
|
|
264
708
|
raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
|
|
709
|
+
|
|
710
|
+
# Return the href with whitespace stripped
|
|
711
|
+
# 返回去除空白的href
|
|
265
712
|
return strip_html5_whitespace(href)
|