aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,13 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This module implements the JsonRequest class which is a more convenient class
|
|
3
|
-
(than Request) to generate JSON Requests.
|
|
4
1
|
|
|
5
|
-
|
|
2
|
+
"""
|
|
3
|
+
JSON request implementation for aioscrapy.
|
|
4
|
+
aioscrapy的JSON请求实现。
|
|
5
|
+
|
|
6
|
+
This module provides the JsonRequest class, which is a specialized Request
|
|
7
|
+
that handles JSON data, automatically setting appropriate headers and
|
|
8
|
+
serializing Python objects to JSON.
|
|
9
|
+
此模块提供了JsonRequest类,这是一个专门处理JSON数据的Request,
|
|
10
|
+
自动设置适当的头部并将Python对象序列化为JSON。
|
|
6
11
|
"""
|
|
7
12
|
|
|
8
13
|
import copy
|
|
@@ -11,53 +16,160 @@ import warnings
|
|
|
11
16
|
from typing import Optional, Tuple
|
|
12
17
|
|
|
13
18
|
from aioscrapy.http.request import Request
|
|
14
|
-
from aioscrapy.utils.deprecate import create_deprecated_class
|
|
15
19
|
|
|
16
20
|
|
|
17
21
|
class JsonRequest(Request):
    """Request whose body is a JSON document.

    Extends ``Request`` so that a Python object passed as ``data`` is
    serialized with ``json.dumps`` and used as the request body, and so that
    JSON-oriented ``Content-Type`` and ``Accept`` headers are filled in when
    the caller has not set them.
    """

    # Extend the attribute names declared by Request with "dumps_kwargs".
    attributes: Tuple[str, ...] = Request.attributes + ("dumps_kwargs",)

    def __init__(self, *args, dumps_kwargs: Optional[dict] = None, **kwargs) -> None:
        """Build the request, serializing ``data`` to JSON when given.

        Args:
            *args: Positional arguments forwarded to ``Request.__init__``.
            dumps_kwargs: Optional keyword arguments for ``json.dumps``.
                A deep copy is stored, so later changes to the caller's dict
                do not affect this request.
            **kwargs: Keyword arguments forwarded to ``Request.__init__``.
                ``data`` (an object to serialize) is consumed here and never
                forwarded; ``body`` (an already serialized payload) takes
                precedence over ``data`` when both are supplied.
        """
        dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
        self._dumps_kwargs = dumps_kwargs

        body_passed = kwargs.get('body', None) is not None
        data = kwargs.pop('data', None)
        data_passed = data is not None

        if body_passed and data_passed:
            # The explicit body wins; data is dropped with a warning.
            warnings.warn('Both body and data passed. data will be ignored')
        elif not body_passed and data_passed:
            kwargs['body'] = self._dumps(data)
            # NOTE(review): the source lost its indentation; nesting this
            # default under the data-only branch follows upstream Scrapy's
            # JsonRequest -- confirm against the packaged file.
            if 'method' not in kwargs:
                kwargs['method'] = 'POST'

        super().__init__(*args, **kwargs)

        # setdefault keeps any header value the caller already provided.
        self.headers.setdefault('Content-Type', 'application/json')
        self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01')

    @property
    def dumps_kwargs(self) -> dict:
        """Return the keyword arguments passed to ``json.dumps``.

        Returns:
            The dict stored at construction time (the internal object itself,
            not a copy).
        """
        return self._dumps_kwargs

    def replace(self, *args, **kwargs) -> Request:
        """Delegate to ``Request.replace`` after converting ``data`` to a body.

        Args:
            *args: Positional arguments forwarded to the base ``replace()``.
            **kwargs: Keyword arguments forwarded to the base ``replace()``.
                ``data`` is consumed here; when given without ``body`` it is
                serialized and forwarded as ``body``. When both are given,
                ``data`` is ignored and a warning is emitted.

        Returns:
            Whatever the base ``replace()`` returns.
        """
        body_passed = kwargs.get('body', None) is not None
        data = kwargs.pop('data', None)
        data_passed = data is not None

        if body_passed and data_passed:
            warnings.warn('Both body and data passed. data will be ignored')
        elif not body_passed and data_passed:
            kwargs['body'] = self._dumps(data)

        return super().replace(*args, **kwargs)

    def _dumps(self, data: dict) -> str:
        """Serialize ``data`` with ``json.dumps`` using the stored kwargs.

        Args:
            data: The object to serialize.

        Returns:
            The JSON text produced by ``json.dumps``.
        """
        return json.dumps(data, **self._dumps_kwargs)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
JSONRequest = create_deprecated_class("JSONRequest", JsonRequest)
|
|
@@ -1,9 +1,15 @@
|
|
|
1
|
+
|
|
1
2
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
3
|
+
HTTP Response implementation for aioscrapy.
|
|
4
|
+
aioscrapy的HTTP响应实现。
|
|
4
5
|
|
|
5
|
-
|
|
6
|
+
This module provides the Response class, which represents an HTTP response
|
|
7
|
+
received by the crawler. It handles response data, headers, cookies, and
|
|
8
|
+
provides methods for URL joining and following links.
|
|
9
|
+
此模块提供了Response类,表示爬虫接收到的HTTP响应。它处理响应数据、
|
|
10
|
+
头部、Cookie,并提供URL连接和跟踪链接的方法。
|
|
6
11
|
"""
|
|
12
|
+
|
|
7
13
|
from typing import Generator, Optional
|
|
8
14
|
from urllib.parse import urljoin
|
|
9
15
|
|
|
@@ -24,6 +30,19 @@ class Response(object):
|
|
|
24
30
|
flags: Optional[list] = None,
|
|
25
31
|
request: Optional[Request] = None,
|
|
26
32
|
):
|
|
33
|
+
"""
|
|
34
|
+
Initialize a Response object.
|
|
35
|
+
初始化Response对象。
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
url: URL for this response. 此响应的URL。
|
|
39
|
+
status: HTTP status code. HTTP状态码。
|
|
40
|
+
headers: HTTP headers. HTTP头信息。
|
|
41
|
+
cookies: Cookies from the response. 响应中的Cookie。
|
|
42
|
+
body: Response body. 响应体。
|
|
43
|
+
flags: Response flags. 响应标志。
|
|
44
|
+
request: The Request object that generated this response. 生成此响应的Request对象。
|
|
45
|
+
"""
|
|
27
46
|
self.headers = headers or {}
|
|
28
47
|
self.status = int(status)
|
|
29
48
|
self._set_body(body)
|
|
@@ -34,6 +53,24 @@ class Response(object):
|
|
|
34
53
|
|
|
35
54
|
@property
|
|
36
55
|
def cb_kwargs(self):
|
|
56
|
+
"""
|
|
57
|
+
Get the callback keyword arguments from the request that generated this response.
|
|
58
|
+
从生成此响应的请求中获取回调关键字参数。
|
|
59
|
+
|
|
60
|
+
This property provides access to the cb_kwargs dictionary of the request
|
|
61
|
+
that generated this response, allowing callback functions to access
|
|
62
|
+
data passed from the request.
|
|
63
|
+
此属性提供对生成此响应的请求的cb_kwargs字典的访问,
|
|
64
|
+
允许回调函数访问从请求传递的数据。
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
dict: The callback keyword arguments dictionary.
|
|
68
|
+
回调关键字参数字典。
|
|
69
|
+
|
|
70
|
+
Raises:
|
|
71
|
+
AttributeError: If this response is not tied to any request.
|
|
72
|
+
如果此响应未与任何请求关联。
|
|
73
|
+
"""
|
|
37
74
|
try:
|
|
38
75
|
return self.request.cb_kwargs
|
|
39
76
|
except AttributeError:
|
|
@@ -44,6 +81,24 @@ class Response(object):
|
|
|
44
81
|
|
|
45
82
|
@property
|
|
46
83
|
def meta(self):
|
|
84
|
+
"""
|
|
85
|
+
Get the metadata from the request that generated this response.
|
|
86
|
+
从生成此响应的请求中获取元数据。
|
|
87
|
+
|
|
88
|
+
This property provides access to the meta dictionary of the request
|
|
89
|
+
that generated this response, allowing callback functions to access
|
|
90
|
+
metadata passed from the request.
|
|
91
|
+
此属性提供对生成此响应的请求的meta字典的访问,
|
|
92
|
+
允许回调函数访问从请求传递的元数据。
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
dict: The request metadata dictionary.
|
|
96
|
+
请求元数据字典。
|
|
97
|
+
|
|
98
|
+
Raises:
|
|
99
|
+
AttributeError: If this response is not tied to any request.
|
|
100
|
+
如果此响应未与任何请求关联。
|
|
101
|
+
"""
|
|
47
102
|
try:
|
|
48
103
|
return self.request.meta
|
|
49
104
|
except AttributeError:
|
|
@@ -53,21 +108,75 @@ class Response(object):
|
|
|
53
108
|
)
|
|
54
109
|
|
|
55
110
|
def _get_url(self):
|
|
111
|
+
"""
|
|
112
|
+
Get the response URL.
|
|
113
|
+
获取响应URL。
|
|
114
|
+
|
|
115
|
+
This is an internal method used by the url property.
|
|
116
|
+
这是由url属性使用的内部方法。
|
|
117
|
+
|
|
118
|
+
Returns:
|
|
119
|
+
str: The response URL.
|
|
120
|
+
响应URL。
|
|
121
|
+
"""
|
|
56
122
|
return self._url
|
|
57
123
|
|
|
58
124
|
def _set_url(self, url):
|
|
125
|
+
"""
|
|
126
|
+
Set the response URL.
|
|
127
|
+
设置响应URL。
|
|
128
|
+
|
|
129
|
+
This method validates that the URL is a string.
|
|
130
|
+
此方法验证URL是一个字符串。
|
|
131
|
+
|
|
132
|
+
Args:
|
|
133
|
+
url: The URL to set.
|
|
134
|
+
要设置的URL。
|
|
135
|
+
|
|
136
|
+
Raises:
|
|
137
|
+
TypeError: If the URL is not a string.
|
|
138
|
+
如果URL不是字符串。
|
|
139
|
+
"""
|
|
59
140
|
if isinstance(url, str):
|
|
60
141
|
self._url = url
|
|
61
142
|
else:
|
|
62
143
|
raise TypeError(f'{type(self).__name__} url must be str, '
|
|
63
144
|
f'got {type(url).__name__}')
|
|
64
145
|
|
|
146
|
+
# Property that uses the getter and setter methods
|
|
147
|
+
# 使用getter和setter方法的属性
|
|
65
148
|
url = property(_get_url, _set_url)
|
|
66
149
|
|
|
67
150
|
def _get_body(self):
|
|
151
|
+
"""
|
|
152
|
+
Get the response body.
|
|
153
|
+
获取响应体。
|
|
154
|
+
|
|
155
|
+
This is an internal method used by the body property.
|
|
156
|
+
这是由body属性使用的内部方法。
|
|
157
|
+
|
|
158
|
+
Returns:
|
|
159
|
+
bytes: The response body.
|
|
160
|
+
响应体。
|
|
161
|
+
"""
|
|
68
162
|
return self._body
|
|
69
163
|
|
|
70
164
|
def _set_body(self, body):
|
|
165
|
+
"""
|
|
166
|
+
Set the response body.
|
|
167
|
+
设置响应体。
|
|
168
|
+
|
|
169
|
+
This method validates that the body is bytes and converts None to an empty bytes object.
|
|
170
|
+
此方法验证body是字节对象,并将None转换为空字节对象。
|
|
171
|
+
|
|
172
|
+
Args:
|
|
173
|
+
body: The body to set.
|
|
174
|
+
要设置的响应体。
|
|
175
|
+
|
|
176
|
+
Raises:
|
|
177
|
+
TypeError: If the body is not bytes.
|
|
178
|
+
如果body不是字节对象。
|
|
179
|
+
"""
|
|
71
180
|
if body is None:
|
|
72
181
|
self._body = b''
|
|
73
182
|
elif not isinstance(body, bytes):
|
|
@@ -78,20 +187,49 @@ class Response(object):
|
|
|
78
187
|
else:
|
|
79
188
|
self._body = body
|
|
80
189
|
|
|
190
|
+
# Property that uses the getter and setter methods
|
|
191
|
+
# 使用getter和setter方法的属性
|
|
81
192
|
body = property(_get_body, _set_body)
|
|
82
193
|
|
|
83
194
|
def __str__(self):
|
|
195
|
+
"""
|
|
196
|
+
Return a string representation of the response.
|
|
197
|
+
返回响应的字符串表示。
|
|
198
|
+
|
|
199
|
+
The string representation includes the HTTP status code and URL.
|
|
200
|
+
字符串表示包括HTTP状态码和URL。
|
|
201
|
+
|
|
202
|
+
Returns:
|
|
203
|
+
str: A string representation of the response.
|
|
204
|
+
响应的字符串表示。
|
|
205
|
+
"""
|
|
84
206
|
return f"<{self.status} {self.url}>"
|
|
85
207
|
|
|
208
|
+
# Use the same implementation for __repr__
|
|
209
|
+
# 对__repr__使用相同的实现
|
|
86
210
|
__repr__ = __str__
|
|
87
211
|
|
|
88
212
|
def copy(self):
    """Return a copy of this response.

    Implemented as ``self.replace()`` with no overrides.

    Returns:
        The object produced by ``replace()``.
    """
    return self.replace()
|
|
91
221
|
|
|
92
222
|
def replace(self, *args, **kwargs):
|
|
93
|
-
"""
|
|
94
|
-
given new values.
|
|
223
|
+
"""
|
|
224
|
+
Create a new Response with the same attributes except for those given new values.
|
|
225
|
+
创建一个新的Response,除了给定的新值外,其他属性与当前Response相同。
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
*args: Positional arguments for the new Response. 新Response的位置参数。
|
|
229
|
+
**kwargs: Keyword arguments for the new Response. 新Response的关键字参数。
|
|
230
|
+
|
|
231
|
+
Returns:
|
|
232
|
+
A new Response object. 一个新的Response对象。
|
|
95
233
|
"""
|
|
96
234
|
for x in [
|
|
97
235
|
"url", "status", "headers", "body", "request", "flags"
|
|
@@ -101,32 +239,102 @@ class Response(object):
|
|
|
101
239
|
return cls(*args, **kwargs)
|
|
102
240
|
|
|
103
241
|
def urljoin(self, url):
    """Resolve ``url`` against this response's URL.

    Delegates to ``urllib.parse.urljoin`` with ``self.url`` as the base, so
    a relative ``url`` is turned into an absolute one.

    Args:
        url: The (possibly relative) URL to resolve.

    Returns:
        The result of ``urllib.parse.urljoin(self.url, url)``.
    """
    return urljoin(self.url, url)
|
|
107
253
|
|
|
108
254
|
@property
def text(self):
    """Placeholder for the response body decoded as text.

    This implementation has no notion of text content and always raises.
    The original documentation states that the working implementation comes
    from the TextResponse family of classes (not visible in this hunk).

    Raises:
        AttributeError: Always.
    """
    raise AttributeError("Response content isn't text")
|
|
114
274
|
|
|
115
275
|
def css(self, *a, **kw):
    """Placeholder for applying a CSS selector to the response content.

    This implementation cannot select anything and always raises. The
    original documentation states that the working implementation comes
    from the TextResponse family of classes (not visible in this hunk).

    Args:
        *a: Positional selector arguments (ignored here).
        **kw: Keyword selector arguments (ignored here).

    Raises:
        NotSupported: Always.
    """
    raise NotSupported("Response content isn't text")
|
|
120
296
|
|
|
121
297
|
def xpath(self, *a, **kw):
    """Placeholder for applying an XPath selector to the response content.

    This implementation cannot select anything and always raises. The
    original documentation states that the working implementation comes
    from the TextResponse family of classes (not visible in this hunk).

    Args:
        *a: Positional selector arguments (ignored here).
        **kw: Keyword selector arguments (ignored here).

    Raises:
        NotSupported: Always.
    """
    raise NotSupported("Response content isn't text")
|
|
126
318
|
|
|
127
319
|
def json(self, *a, **kw):
    """Placeholder for parsing the response body as JSON.

    This implementation cannot parse anything and always raises. The
    original documentation states that the working implementation comes
    from the TextResponse family of classes (not visible in this hunk).

    Args:
        *a: Positional parser arguments (ignored here).
        **kw: Keyword parser arguments (ignored here).

    Raises:
        NotSupported: Always.
    """
    raise NotSupported("Response content isn't text")
|
|
132
340
|
|
|
@@ -135,17 +343,48 @@ class Response(object):
|
|
|
135
343
|
fingerprint=None, errback=None, cb_kwargs=None, flags=None):
|
|
136
344
|
# type: (...) -> Request
|
|
137
345
|
"""
|
|
138
|
-
Return a
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
346
|
+
Return a Request instance to follow a link.
|
|
347
|
+
返回一个Request实例以跟踪链接。
|
|
348
|
+
|
|
349
|
+
This method creates a new Request to follow the given URL. The URL can be
|
|
350
|
+
a relative URL, a Link object, or an absolute URL. If it's a relative URL,
|
|
351
|
+
it will be joined with the current response's URL.
|
|
352
|
+
此方法创建一个新的Request以跟踪给定的URL。URL可以是相对URL、Link对象或绝对URL。
|
|
353
|
+
如果是相对URL,它将与当前响应的URL连接。
|
|
142
354
|
|
|
143
|
-
:
|
|
144
|
-
|
|
145
|
-
|
|
355
|
+
Args:
|
|
356
|
+
url: The URL to follow. Can be a string or a Link object.
|
|
357
|
+
要跟踪的URL。可以是字符串或Link对象。
|
|
358
|
+
callback: A function to be called with the response from the request.
|
|
359
|
+
使用请求的响应调用的函数。
|
|
360
|
+
method: The HTTP method to use.
|
|
361
|
+
要使用的HTTP方法。
|
|
362
|
+
headers: The headers to use for the request.
|
|
363
|
+
请求使用的头部。
|
|
364
|
+
body: The body of the request.
|
|
365
|
+
请求的正文。
|
|
366
|
+
cookies: The cookies to send with the request.
|
|
367
|
+
与请求一起发送的Cookie。
|
|
368
|
+
meta: Extra data to pass to the request.
|
|
369
|
+
传递给请求的额外数据。
|
|
370
|
+
encoding: The encoding to use for the request.
|
|
371
|
+
请求使用的编码。
|
|
372
|
+
priority: The priority of the request.
|
|
373
|
+
请求的优先级。
|
|
374
|
+
dont_filter: Whether to filter duplicate requests.
|
|
375
|
+
是否过滤重复请求。
|
|
376
|
+
fingerprint: The fingerprint for the request.
|
|
377
|
+
请求的指纹。
|
|
378
|
+
errback: A function to be called if the request fails.
|
|
379
|
+
如果请求失败时调用的函数。
|
|
380
|
+
cb_kwargs: Additional keyword arguments to pass to the callback.
|
|
381
|
+
传递给回调的额外关键字参数。
|
|
382
|
+
flags: Flags for the request.
|
|
383
|
+
请求的标志。
|
|
146
384
|
|
|
147
|
-
|
|
148
|
-
|
|
385
|
+
Returns:
|
|
386
|
+
Request: A new Request instance.
|
|
387
|
+
一个新的Request实例。
|
|
149
388
|
"""
|
|
150
389
|
if isinstance(url, Link):
|
|
151
390
|
url = url.url
|
|
@@ -175,16 +414,50 @@ class Response(object):
|
|
|
175
414
|
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
|
|
176
415
|
# type: (...) -> Generator[Request, None, None]
|
|
177
416
|
"""
|
|
178
|
-
|
|
417
|
+
Return an iterable of Request instances to follow all links in urls.
|
|
418
|
+
返回一个Request实例的可迭代对象,以跟踪urls中的所有链接。
|
|
419
|
+
|
|
420
|
+
This method creates multiple Requests to follow the given URLs. Each URL can be
|
|
421
|
+
a relative URL, a Link object, or an absolute URL. If it's a relative URL,
|
|
422
|
+
it will be joined with the current response's URL.
|
|
423
|
+
此方法创建多个Request以跟踪给定的URL。每个URL可以是相对URL、Link对象或绝对URL。
|
|
424
|
+
如果是相对URL,它将与当前响应的URL连接。
|
|
425
|
+
|
|
426
|
+
Args:
|
|
427
|
+
urls: An iterable of URLs to follow. Each can be a string or a Link object.
|
|
428
|
+
要跟踪的URL的可迭代对象。每个可以是字符串或Link对象。
|
|
429
|
+
callback: A function to be called with the response from each request.
|
|
430
|
+
使用每个请求的响应调用的函数。
|
|
431
|
+
method: The HTTP method to use.
|
|
432
|
+
要使用的HTTP方法。
|
|
433
|
+
headers: The headers to use for the requests.
|
|
434
|
+
请求使用的头部。
|
|
435
|
+
body: The body of the requests.
|
|
436
|
+
请求的正文。
|
|
437
|
+
cookies: The cookies to send with the requests.
|
|
438
|
+
与请求一起发送的Cookie。
|
|
439
|
+
meta: Extra data to pass to the requests.
|
|
440
|
+
传递给请求的额外数据。
|
|
441
|
+
encoding: The encoding to use for the requests.
|
|
442
|
+
请求使用的编码。
|
|
443
|
+
priority: The priority of the requests.
|
|
444
|
+
请求的优先级。
|
|
445
|
+
dont_filter: Whether to filter duplicate requests.
|
|
446
|
+
是否过滤重复请求。
|
|
447
|
+
errback: A function to be called if the requests fail.
|
|
448
|
+
如果请求失败时调用的函数。
|
|
449
|
+
cb_kwargs: Additional keyword arguments to pass to the callback.
|
|
450
|
+
传递给回调的额外关键字参数。
|
|
451
|
+
flags: Flags for the requests.
|
|
452
|
+
请求的标志。
|
|
179
453
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
not only absolute URLs.
|
|
454
|
+
Returns:
|
|
455
|
+
Generator[Request, None, None]: A generator of Request instances.
|
|
456
|
+
Request实例的生成器。
|
|
184
457
|
|
|
185
|
-
:
|
|
186
|
-
|
|
187
|
-
|
|
458
|
+
Raises:
|
|
459
|
+
TypeError: If urls is not an iterable.
|
|
460
|
+
如果urls不是可迭代的。
|
|
188
461
|
"""
|
|
189
462
|
if not hasattr(urls, '__iter__'):
|
|
190
463
|
raise TypeError("'urls' argument must be an iterable")
|
aioscrapy/http/response/html.py
CHANGED
|
@@ -1,12 +1,51 @@
|
|
|
1
|
+
|
|
1
2
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
3
|
+
HTML response implementation for aioscrapy.
|
|
4
|
+
aioscrapy的HTML响应实现。
|
|
4
5
|
|
|
5
|
-
|
|
6
|
+
This module provides the HtmlResponse class, which is a specialized TextResponse
|
|
7
|
+
for handling HTML content. It inherits all functionality from TextResponse
|
|
8
|
+
but is specifically intended for HTML responses.
|
|
9
|
+
此模块提供了HtmlResponse类,这是一个专门用于处理HTML内容的TextResponse。
|
|
10
|
+
它继承了TextResponse的所有功能,但专门用于HTML响应。
|
|
6
11
|
"""
|
|
7
12
|
|
|
8
13
|
from aioscrapy.http.response.text import TextResponse
|
|
9
14
|
|
|
10
15
|
|
|
11
16
|
class HtmlResponse(TextResponse):
    """Response type for HTML content.

    Adds no behavior of its own; everything is inherited from
    ``TextResponse``. Per the original documentation, that includes:

    - automatic encoding detection
    - Unicode conversion
    - CSS and XPath selectors
    - JSON parsing
    - enhanced link following

    The class exists to give HTML responses a distinct type, which is useful
    for type checks and for middleware that treats HTML differently.

    Example:
        ```python
        def parse(self, response):
            if isinstance(response, HtmlResponse):
                # Process HTML response
                title = response.css('title::text').get()
            else:
                # Handle other response types
                pass
        ```
    """
|