aio-scrapy 2.1.4__py3-none-any.whl → 2.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -41
- aio_scrapy-2.1.6.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +187 -3
- aioscrapy/core/downloader/handlers/curl_cffi.py +124 -3
- aioscrapy/core/downloader/handlers/httpx.py +133 -3
- aioscrapy/core/downloader/handlers/pyhttpx.py +132 -3
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +313 -13
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
|
|
1
2
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
3
|
+
HTTP Request implementation for aioscrapy.
|
|
4
|
+
aioscrapy的HTTP请求实现。
|
|
4
5
|
|
|
5
|
-
|
|
6
|
+
This module provides the Request class, which represents an HTTP request to be sent by the crawler.
|
|
7
|
+
It handles URL normalization, fingerprinting, serialization, and other request-related functionality.
|
|
8
|
+
此模块提供了Request类,表示由爬虫发送的HTTP请求。
|
|
9
|
+
它处理URL规范化、指纹生成、序列化和其他与请求相关的功能。
|
|
6
10
|
"""
|
|
11
|
+
|
|
7
12
|
import hashlib
|
|
8
13
|
import inspect
|
|
9
14
|
import json
|
|
@@ -18,6 +23,8 @@ from aioscrapy.utils.curl import curl_to_request_kwargs
|
|
|
18
23
|
from aioscrapy.utils.python import to_unicode
|
|
19
24
|
from aioscrapy.utils.url import escape_ajax
|
|
20
25
|
|
|
26
|
+
# Type variable for Request class to use in class methods
|
|
27
|
+
# 用于在类方法中使用的Request类的类型变量
|
|
21
28
|
RequestTypeVar = TypeVar("RequestTypeVar", bound="Request")
|
|
22
29
|
|
|
23
30
|
|
|
@@ -47,7 +54,27 @@ class Request(object):
|
|
|
47
54
|
fingerprint: Optional[str] = None,
|
|
48
55
|
use_proxy: bool = True,
|
|
49
56
|
):
|
|
50
|
-
|
|
57
|
+
"""
|
|
58
|
+
Initialize a Request object.
|
|
59
|
+
初始化Request对象。
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
url: URL for the request. 请求的URL。
|
|
63
|
+
callback: Function to call when the response is received. 接收到响应时调用的函数。
|
|
64
|
+
method: HTTP method. HTTP方法。
|
|
65
|
+
headers: HTTP headers. HTTP头信息。
|
|
66
|
+
body: Request body. 请求体。
|
|
67
|
+
cookies: Cookies to send with the request. 随请求发送的Cookie。
|
|
68
|
+
meta: Additional metadata. 额外的元数据。
|
|
69
|
+
encoding: Encoding for the URL and body. URL和请求体的编码。
|
|
70
|
+
priority: Request priority. 请求优先级。
|
|
71
|
+
dont_filter: Whether to filter this request through the scheduler's dupefilter. 是否通过调度器的去重过滤器过滤此请求。
|
|
72
|
+
errback: Function to call if an error occurs during processing. 处理过程中发生错误时调用的函数。
|
|
73
|
+
flags: Request flags. 请求标志。
|
|
74
|
+
cb_kwargs: Additional keyword arguments to pass to the callback. 传递给回调函数的额外关键字参数。
|
|
75
|
+
fingerprint: Request fingerprint. 请求指纹。
|
|
76
|
+
use_proxy: Whether to use a proxy for this request. 是否为此请求使用代理。
|
|
77
|
+
"""
|
|
51
78
|
self._encoding = encoding
|
|
52
79
|
self.method = str(method).upper()
|
|
53
80
|
self._set_url(url)
|
|
@@ -71,26 +98,90 @@ class Request(object):
|
|
|
71
98
|
|
|
72
99
|
@property
|
|
73
100
|
def cb_kwargs(self) -> dict:
|
|
101
|
+
"""
|
|
102
|
+
Get the callback keyword arguments dictionary.
|
|
103
|
+
获取回调关键字参数字典。
|
|
104
|
+
|
|
105
|
+
This property ensures that the callback keyword arguments dictionary
|
|
106
|
+
is always initialized, creating an empty dictionary if needed.
|
|
107
|
+
此属性确保回调关键字参数字典始终被初始化,如果需要则创建一个空字典。
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
dict: The callback keyword arguments dictionary.
|
|
111
|
+
回调关键字参数字典。
|
|
112
|
+
"""
|
|
74
113
|
if self._cb_kwargs is None:
|
|
75
114
|
self._cb_kwargs = {}
|
|
76
115
|
return self._cb_kwargs
|
|
77
116
|
|
|
78
117
|
@property
|
|
79
118
|
def meta(self) -> dict:
|
|
119
|
+
"""
|
|
120
|
+
Get the request metadata dictionary.
|
|
121
|
+
获取请求元数据字典。
|
|
122
|
+
|
|
123
|
+
This property ensures that the metadata dictionary is always initialized,
|
|
124
|
+
creating an empty dictionary if needed. The metadata dictionary is used
|
|
125
|
+
to store arbitrary data associated with the request.
|
|
126
|
+
此属性确保元数据字典始终被初始化,如果需要则创建一个空字典。
|
|
127
|
+
元数据字典用于存储与请求相关的任意数据。
|
|
128
|
+
|
|
129
|
+
Returns:
|
|
130
|
+
dict: The request metadata dictionary.
|
|
131
|
+
请求元数据字典。
|
|
132
|
+
"""
|
|
80
133
|
if self._meta is None:
|
|
81
134
|
self._meta = {}
|
|
82
135
|
return self._meta
|
|
83
136
|
|
|
84
137
|
def _get_url(self) -> str:
|
|
138
|
+
"""
|
|
139
|
+
Get the request URL.
|
|
140
|
+
获取请求URL。
|
|
141
|
+
|
|
142
|
+
This is an internal method used by the url property.
|
|
143
|
+
这是由url属性使用的内部方法。
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
str: The request URL.
|
|
147
|
+
请求URL。
|
|
148
|
+
"""
|
|
85
149
|
return self._url
|
|
86
150
|
|
|
87
151
|
def _set_url(self, url: str) -> None:
|
|
152
|
+
"""
|
|
153
|
+
Set the request URL.
|
|
154
|
+
设置请求URL。
|
|
155
|
+
|
|
156
|
+
This method normalizes the URL by:
|
|
157
|
+
此方法通过以下方式规范化URL:
|
|
158
|
+
1. Converting it to a safe string using the request's encoding
|
|
159
|
+
使用请求的编码将其转换为安全字符串
|
|
160
|
+
2. Escaping AJAX-specific characters
|
|
161
|
+
转义AJAX特定字符
|
|
162
|
+
3. Validating that the URL has a scheme
|
|
163
|
+
验证URL具有协议方案
|
|
164
|
+
|
|
165
|
+
Args:
|
|
166
|
+
url: The URL to set.
|
|
167
|
+
要设置的URL。
|
|
168
|
+
|
|
169
|
+
Raises:
|
|
170
|
+
TypeError: If the URL is not a string.
|
|
171
|
+
如果URL不是字符串。
|
|
172
|
+
ValueError: If the URL does not have a scheme.
|
|
173
|
+
如果URL没有协议方案。
|
|
174
|
+
"""
|
|
88
175
|
if not isinstance(url, str):
|
|
89
176
|
raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
|
|
90
177
|
|
|
178
|
+
# Normalize the URL
|
|
179
|
+
# 规范化URL
|
|
91
180
|
s = safe_url_string(url, self.encoding)
|
|
92
181
|
self._url = escape_ajax(s)
|
|
93
182
|
|
|
183
|
+
# Validate that the URL has a scheme
|
|
184
|
+
# 验证URL具有协议方案
|
|
94
185
|
if (
|
|
95
186
|
'://' not in self._url
|
|
96
187
|
and not self._url.startswith('about:')
|
|
@@ -98,41 +189,135 @@ class Request(object):
|
|
|
98
189
|
):
|
|
99
190
|
raise ValueError(f'Missing scheme in request url: {self._url}')
|
|
100
191
|
|
|
192
|
+
# Property that uses the getter and setter methods
|
|
193
|
+
# 使用getter和setter方法的属性
|
|
101
194
|
url = property(_get_url, _set_url)
|
|
102
195
|
|
|
103
196
|
def _get_body(self) -> str:
|
|
197
|
+
"""
|
|
198
|
+
Get the request body.
|
|
199
|
+
获取请求体。
|
|
200
|
+
|
|
201
|
+
This is an internal method used by the body property.
|
|
202
|
+
这是由body属性使用的内部方法。
|
|
203
|
+
|
|
204
|
+
Returns:
|
|
205
|
+
str: The request body.
|
|
206
|
+
请求体。
|
|
207
|
+
"""
|
|
104
208
|
return self._body
|
|
105
209
|
|
|
106
210
|
def _set_body(self, body: str) -> None:
|
|
211
|
+
"""
|
|
212
|
+
Set the request body.
|
|
213
|
+
设置请求体。
|
|
214
|
+
|
|
215
|
+
This method sets the request body, converting None to an empty string.
|
|
216
|
+
此方法设置请求体,将None转换为空字符串。
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
body: The body to set.
|
|
220
|
+
要设置的请求体。
|
|
221
|
+
"""
|
|
107
222
|
self._body = '' if body is None else body
|
|
108
223
|
|
|
224
|
+
# Property that uses the getter and setter methods
|
|
225
|
+
# 使用getter和setter方法的属性
|
|
109
226
|
body = property(_get_body, _set_body)
|
|
110
227
|
|
|
111
228
|
def _set_fingerprint(self, fingerprint: str) -> None:
|
|
229
|
+
"""
|
|
230
|
+
Set the request fingerprint.
|
|
231
|
+
设置请求指纹。
|
|
232
|
+
|
|
233
|
+
This is an internal method used by the fingerprint property.
|
|
234
|
+
The fingerprint is stored in the request's metadata.
|
|
235
|
+
这是由fingerprint属性使用的内部方法。
|
|
236
|
+
指纹存储在请求的元数据中。
|
|
237
|
+
|
|
238
|
+
Args:
|
|
239
|
+
fingerprint: The fingerprint to set.
|
|
240
|
+
要设置的指纹。
|
|
241
|
+
"""
|
|
112
242
|
self._meta['_fingerprint'] = fingerprint
|
|
113
243
|
|
|
114
244
|
def _get_fingerprint(self) -> str:
|
|
245
|
+
"""
|
|
246
|
+
Get the request fingerprint.
|
|
247
|
+
获取请求指纹。
|
|
248
|
+
|
|
249
|
+
This is an internal method used by the fingerprint property.
|
|
250
|
+
If the fingerprint doesn't exist, it's generated using make_fingerprint().
|
|
251
|
+
这是由fingerprint属性使用的内部方法。
|
|
252
|
+
如果指纹不存在,则使用make_fingerprint()生成。
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
str: The request fingerprint.
|
|
256
|
+
请求指纹。
|
|
257
|
+
"""
|
|
115
258
|
if not self._meta.get('_fingerprint'):
|
|
116
259
|
self._meta['_fingerprint'] = self.make_fingerprint()
|
|
117
260
|
return self._meta.get('_fingerprint')
|
|
118
261
|
|
|
262
|
+
# Property that uses the getter and setter methods
|
|
263
|
+
# 使用getter和setter方法的属性
|
|
119
264
|
fingerprint = property(_get_fingerprint, _set_fingerprint)
|
|
120
265
|
|
|
121
266
|
@property
|
|
122
267
|
def encoding(self) -> str:
|
|
268
|
+
"""
|
|
269
|
+
Get the request encoding.
|
|
270
|
+
获取请求编码。
|
|
271
|
+
|
|
272
|
+
This encoding is used for URL and body encoding.
|
|
273
|
+
此编码用于URL和请求体编码。
|
|
274
|
+
|
|
275
|
+
Returns:
|
|
276
|
+
str: The request encoding.
|
|
277
|
+
请求编码。
|
|
278
|
+
"""
|
|
123
279
|
return self._encoding
|
|
124
280
|
|
|
125
281
|
def __str__(self) -> str:
|
|
282
|
+
"""
|
|
283
|
+
Return a string representation of the request.
|
|
284
|
+
返回请求的字符串表示。
|
|
285
|
+
|
|
286
|
+
The string representation includes the HTTP method and URL.
|
|
287
|
+
字符串表示包括HTTP方法和URL。
|
|
288
|
+
|
|
289
|
+
Returns:
|
|
290
|
+
str: A string representation of the request.
|
|
291
|
+
请求的字符串表示。
|
|
292
|
+
"""
|
|
126
293
|
return f"<{self.method} {self.url}>"
|
|
127
294
|
|
|
295
|
+
# Use the same implementation for __repr__
|
|
296
|
+
# 对__repr__使用相同的实现
|
|
128
297
|
__repr__ = __str__
|
|
129
298
|
|
|
130
299
|
def copy(self) -> "Request":
|
|
131
|
-
"""
|
|
300
|
+
"""
|
|
301
|
+
Return a copy of this Request.
|
|
302
|
+
返回此Request的副本。
|
|
303
|
+
|
|
304
|
+
Returns:
|
|
305
|
+
A copy of this Request. 此Request的副本。
|
|
306
|
+
"""
|
|
132
307
|
return self.replace()
|
|
133
308
|
|
|
134
309
|
def replace(self, *args, **kwargs) -> "Request":
|
|
135
|
-
"""
|
|
310
|
+
"""
|
|
311
|
+
Create a new Request with the same attributes except for those given new values.
|
|
312
|
+
创建一个新的Request,除了给定的新值外,其他属性与当前Request相同。
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
*args: Positional arguments for the new Request. 新Request的位置参数。
|
|
316
|
+
**kwargs: Keyword arguments for the new Request. 新Request的关键字参数。
|
|
317
|
+
|
|
318
|
+
Returns:
|
|
319
|
+
A new Request object. 一个新的Request对象。
|
|
320
|
+
"""
|
|
136
321
|
for x in self.attributes:
|
|
137
322
|
kwargs.setdefault(x, getattr(self, x))
|
|
138
323
|
cls = kwargs.pop('cls', self.__class__)
|
|
@@ -142,7 +327,18 @@ class Request(object):
|
|
|
142
327
|
def from_curl(
|
|
143
328
|
cls: Type[RequestTypeVar], curl_command: str, ignore_unknown_options: bool = True, **kwargs
|
|
144
329
|
) -> RequestTypeVar:
|
|
145
|
-
"""
|
|
330
|
+
"""
|
|
331
|
+
Create a Request object from a string containing a cURL command.
|
|
332
|
+
从包含cURL命令的字符串创建Request对象。
|
|
333
|
+
|
|
334
|
+
Args:
|
|
335
|
+
curl_command: The cURL command. cURL命令。
|
|
336
|
+
ignore_unknown_options: Whether to ignore unknown cURL options. 是否忽略未知的cURL选项。
|
|
337
|
+
**kwargs: Additional keyword arguments for the Request. Request的额外关键字参数。
|
|
338
|
+
|
|
339
|
+
Returns:
|
|
340
|
+
A Request object. Request对象。
|
|
341
|
+
"""
|
|
146
342
|
request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
|
|
147
343
|
request_kwargs.update(kwargs)
|
|
148
344
|
return cls(**request_kwargs)
|
|
@@ -151,7 +347,19 @@ class Request(object):
|
|
|
151
347
|
self,
|
|
152
348
|
keep_fragments: bool = False,
|
|
153
349
|
) -> str:
|
|
154
|
-
"""
|
|
350
|
+
"""
|
|
351
|
+
Make the request fingerprint.
|
|
352
|
+
生成请求指纹。
|
|
353
|
+
|
|
354
|
+
The fingerprint is a hash of the request's method, URL, and body.
|
|
355
|
+
指纹是请求的方法、URL和请求体的哈希值。
|
|
356
|
+
|
|
357
|
+
Args:
|
|
358
|
+
keep_fragments: Whether to keep URL fragments in the fingerprint. 是否在指纹中保留URL片段。
|
|
359
|
+
|
|
360
|
+
Returns:
|
|
361
|
+
The request fingerprint. 请求指纹。
|
|
362
|
+
"""
|
|
155
363
|
return hashlib.sha1(
|
|
156
364
|
json.dumps({
|
|
157
365
|
'method': to_unicode(self.method),
|
|
@@ -161,12 +369,22 @@ class Request(object):
|
|
|
161
369
|
).hexdigest()
|
|
162
370
|
|
|
163
371
|
def to_dict(self, *, spider: Optional["aioscrapy.Spider"] = None) -> dict:
|
|
164
|
-
"""
|
|
372
|
+
"""
|
|
373
|
+
Return a dictionary containing the Request's data.
|
|
374
|
+
返回包含Request数据的字典。
|
|
165
375
|
|
|
166
|
-
Use
|
|
376
|
+
Use request_from_dict() to convert back into a Request object.
|
|
377
|
+
使用request_from_dict()将其转换回Request对象。
|
|
167
378
|
|
|
168
379
|
If a spider is given, this method will try to find out the name of the spider methods used as callback
|
|
169
380
|
and errback and include them in the output dict, raising an exception if they cannot be found.
|
|
381
|
+
如果提供了爬虫,此方法将尝试找出用作回调和错误回调的爬虫方法的名称,并将它们包含在输出字典中,如果找不到则引发异常。
|
|
382
|
+
|
|
383
|
+
Args:
|
|
384
|
+
spider: The spider instance. 爬虫实例。
|
|
385
|
+
|
|
386
|
+
Returns:
|
|
387
|
+
A dictionary containing the Request's data. 包含Request数据的字典。
|
|
170
388
|
"""
|
|
171
389
|
d = {
|
|
172
390
|
"url": self.url, # urls are safe (safe_url_string)
|
|
@@ -183,16 +401,44 @@ class Request(object):
|
|
|
183
401
|
|
|
184
402
|
|
|
185
403
|
def _find_method(obj, func):
|
|
186
|
-
"""
|
|
404
|
+
"""
|
|
405
|
+
Find the name of a method in an object.
|
|
406
|
+
在对象中查找方法的名称。
|
|
407
|
+
|
|
408
|
+
This is a helper function for Request.to_dict() that finds the name of a method
|
|
409
|
+
in an object by comparing the underlying function objects.
|
|
410
|
+
这是Request.to_dict()的辅助函数,通过比较底层函数对象在对象中查找方法的名称。
|
|
411
|
+
|
|
412
|
+
Args:
|
|
413
|
+
obj: The object to search in.
|
|
414
|
+
要搜索的对象。
|
|
415
|
+
func: The method to find.
|
|
416
|
+
要查找的方法。
|
|
417
|
+
|
|
418
|
+
Returns:
|
|
419
|
+
str: The name of the method.
|
|
420
|
+
方法的名称。
|
|
421
|
+
|
|
422
|
+
Raises:
|
|
423
|
+
ValueError: If the function is not an instance method in the object.
|
|
424
|
+
如果函数不是对象中的实例方法。
|
|
425
|
+
"""
|
|
187
426
|
# Only instance methods contain ``__func__``
|
|
427
|
+
# 只有实例方法包含``__func__``
|
|
188
428
|
if obj and hasattr(func, '__func__'):
|
|
429
|
+
# Get all methods of the object
|
|
430
|
+
# 获取对象的所有方法
|
|
189
431
|
members = inspect.getmembers(obj, predicate=inspect.ismethod)
|
|
190
432
|
for name, obj_func in members:
|
|
191
433
|
# We need to use __func__ to access the original function object because instance
|
|
192
434
|
# method objects are generated each time attribute is retrieved from instance.
|
|
435
|
+
# 我们需要使用__func__来访问原始函数对象,因为实例方法对象在每次从实例检索属性时都会生成。
|
|
193
436
|
#
|
|
194
437
|
# Reference: The standard type hierarchy
|
|
438
|
+
# 参考:标准类型层次结构
|
|
195
439
|
# https://docs.python.org/3/reference/datamodel.html
|
|
196
440
|
if obj_func.__func__ is func.__func__:
|
|
197
441
|
return name
|
|
442
|
+
# If we get here, the function was not found
|
|
443
|
+
# 如果我们到达这里,则未找到函数
|
|
198
444
|
raise ValueError(f"Function {func} is not an instance method in: {obj}")
|
aioscrapy/http/request/form.py
CHANGED
|
@@ -1,39 +1,119 @@
|
|
|
1
|
+
|
|
1
2
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
3
|
+
Form request implementation for aioscrapy.
|
|
4
|
+
aioscrapy的表单请求实现。
|
|
4
5
|
|
|
5
|
-
|
|
6
|
+
This module provides the FormRequest class, which is a specialized Request
|
|
7
|
+
that handles HTML form submission, both GET and POST methods.
|
|
8
|
+
此模块提供了FormRequest类,这是一个专门处理HTML表单提交的Request,
|
|
9
|
+
支持GET和POST方法。
|
|
6
10
|
"""
|
|
11
|
+
|
|
7
12
|
from typing import List, Optional, Tuple, Union
|
|
8
13
|
from urllib.parse import urlencode
|
|
9
14
|
|
|
10
15
|
from aioscrapy.http.request import Request
|
|
11
16
|
from aioscrapy.utils.python import to_bytes, is_listlike
|
|
12
17
|
|
|
18
|
+
# Type definition for form data, which can be a dictionary or a list of key-value tuples
|
|
19
|
+
# 表单数据的类型定义,可以是字典或键值元组列表
|
|
13
20
|
FormdataType = Optional[Union[dict, List[Tuple[str, str]]]]
|
|
14
21
|
|
|
15
22
|
|
|
16
23
|
class FormRequest(Request):
|
|
24
|
+
"""
|
|
25
|
+
A Request that submits HTML form data.
|
|
26
|
+
提交HTML表单数据的Request。
|
|
27
|
+
|
|
28
|
+
This class extends the base Request to handle form submissions,
|
|
29
|
+
automatically setting the appropriate method, headers, and
|
|
30
|
+
encoding the form data either in the URL (for GET requests)
|
|
31
|
+
or in the body (for POST requests).
|
|
32
|
+
此类扩展了基本Request以处理表单提交,自动设置适当的方法、
|
|
33
|
+
头部,并将表单数据编码到URL中(对于GET请求)或请求体中
|
|
34
|
+
(对于POST请求)。
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
# Valid HTTP methods for form submission
|
|
38
|
+
# 表单提交的有效HTTP方法
|
|
17
39
|
valid_form_methods = ['GET', 'POST']
|
|
18
40
|
|
|
19
41
|
def __init__(self, *args, formdata: FormdataType = None, **kwargs) -> None:
|
|
42
|
+
"""
|
|
43
|
+
Initialize a FormRequest.
|
|
44
|
+
初始化FormRequest。
|
|
45
|
+
|
|
46
|
+
This constructor extends the base Request constructor to handle form data.
|
|
47
|
+
If form data is provided and no method is specified, it defaults to POST.
|
|
48
|
+
此构造函数扩展了基本Request构造函数以处理表单数据。
|
|
49
|
+
如果提供了表单数据且未指定方法,则默认为POST。
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
*args: Positional arguments passed to the Request constructor.
|
|
53
|
+
传递给Request构造函数的位置参数。
|
|
54
|
+
formdata: Form data to submit, either as a dict or a list of (name, value) tuples.
|
|
55
|
+
要提交的表单数据,可以是字典或(名称, 值)元组的列表。
|
|
56
|
+
**kwargs: Keyword arguments passed to the Request constructor.
|
|
57
|
+
传递给Request构造函数的关键字参数。
|
|
58
|
+
"""
|
|
59
|
+
# Default to POST method if form data is provided and no method is specified
|
|
60
|
+
# 如果提供了表单数据且未指定方法,则默认为POST方法
|
|
20
61
|
if formdata and kwargs.get('method') is None:
|
|
21
62
|
kwargs['method'] = 'POST'
|
|
22
63
|
|
|
64
|
+
# Initialize the base Request
|
|
65
|
+
# 初始化基本Request
|
|
23
66
|
super().__init__(*args, **kwargs)
|
|
24
67
|
|
|
68
|
+
# Process form data if provided
|
|
69
|
+
# 如果提供了表单数据,则处理它
|
|
25
70
|
if formdata:
|
|
71
|
+
# Use the dict's items() view when formdata is a dict; otherwise use the sequence as given
|
|
72
|
+
# 如果需要,将字典转换为items()迭代器
|
|
26
73
|
items = formdata.items() if isinstance(formdata, dict) else formdata
|
|
74
|
+
|
|
75
|
+
# URL-encode the form data
|
|
76
|
+
# URL编码表单数据
|
|
27
77
|
form_query: str = _urlencode(items, self.encoding)
|
|
78
|
+
|
|
28
79
|
if self.method == 'POST':
|
|
80
|
+
# For POST requests, set the Content-Type header and put form data in the body
|
|
81
|
+
# 对于POST请求,设置Content-Type头部并将表单数据放入请求体
|
|
29
82
|
self.headers.setdefault('Content-Type', 'application/x-www-form-urlencoded')
|
|
30
83
|
self._set_body(form_query)
|
|
31
84
|
else:
|
|
85
|
+
# For any non-POST method (GET being the usual case), append form data to the URL query string
|
|
86
|
+
# 对于GET请求,将表单数据附加到URL
|
|
32
87
|
self._set_url(self.url + ('&' if '?' in self.url else '?') + form_query)
|
|
33
88
|
|
|
34
89
|
|
|
35
90
|
def _urlencode(seq, enc):
|
|
91
|
+
"""
|
|
92
|
+
URL-encode a sequence of form data.
|
|
93
|
+
URL编码表单数据序列。
|
|
94
|
+
|
|
95
|
+
This internal function handles the encoding of form data for submission,
|
|
96
|
+
converting keys and values to bytes using the specified encoding and
|
|
97
|
+
properly handling list-like values.
|
|
98
|
+
此内部函数处理表单数据的编码以便提交,使用指定的编码将键和值转换为字节,
|
|
99
|
+
并正确处理类似列表的值。
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
seq: A sequence of (name, value) pairs to encode.
|
|
103
|
+
要编码的(名称, 值)对序列。
|
|
104
|
+
enc: The encoding to use for converting strings to bytes.
|
|
105
|
+
用于将字符串转换为字节的编码。
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
str: The URL-encoded form data string.
|
|
109
|
+
URL编码的表单数据字符串。
|
|
110
|
+
"""
|
|
111
|
+
# Convert each key-value pair to bytes and handle list-like values
|
|
112
|
+
# 将每个键值对转换为字节并处理类似列表的值
|
|
36
113
|
values = [(to_bytes(k, enc), to_bytes(v, enc))
|
|
37
114
|
for k, vs in seq
|
|
38
115
|
for v in (vs if is_listlike(vs) else [vs])]
|
|
116
|
+
|
|
117
|
+
# Use urllib's urlencode with doseq=1 to properly handle sequences
|
|
118
|
+
# 使用urllib的urlencode,doseq=1以正确处理序列
|
|
39
119
|
return urlencode(values, doseq=1)
|