aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/reqser.py
CHANGED
@@ -1,5 +1,12 @@
 """
-
+Request serialization utilities for aioscrapy.
+aioscrapy的请求序列化实用工具。
+
+This module provides helper functions for serializing and deserializing Request objects.
+These functions are particularly useful for storing requests in queues, databases,
+or transmitting them between different processes or systems.
+此模块提供了用于序列化和反序列化Request对象的辅助函数。
+这些函数对于在队列、数据库中存储请求或在不同进程或系统之间传输请求特别有用。
 """
 from typing import Optional
 
@@ -8,8 +15,75 @@ from aioscrapy.utils.request import request_from_dict as _from_dict
 
 
 def request_to_dict(request: "aioscrapy.Request", spider: Optional["aioscrapy.Spider"] = None) -> dict:
+    """
+    Convert a Request object to a dictionary representation.
+    将Request对象转换为字典表示。
+
+    This function serializes a Request object into a dictionary that can be easily
+    stored or transmitted. The dictionary contains all the necessary information
+    to reconstruct the Request object later using request_from_dict().
+    此函数将Request对象序列化为可以轻松存储或传输的字典。
+    该字典包含稍后使用request_from_dict()重建Request对象所需的所有信息。
+
+    Args:
+        request: The Request object to serialize.
+            要序列化的Request对象。
+        spider: Optional Spider instance that may be used to customize the
+            serialization process. Some Request subclasses may use the spider
+            to properly serialize their attributes.
+            可选的Spider实例,可用于自定义序列化过程。
+            某些Request子类可能使用spider来正确序列化其属性。
+
+    Returns:
+        dict: A dictionary representation of the Request object.
+            Request对象的字典表示。
+
+    Example:
+        >>> request = Request('http://example.com', callback='parse_item')
+        >>> request_dict = request_to_dict(request, spider)
+        >>> # The dictionary can be stored or transmitted
+        >>> new_request = await request_from_dict(request_dict, spider)
+    """
+    # Delegate to the Request object's to_dict method
+    # 委托给Request对象的to_dict方法
     return request.to_dict(spider=spider)
 
 
 async def request_from_dict(d: dict, spider: Optional["aioscrapy.Spider"] = None) -> "aioscrapy.Request":
+    """
+    Convert a dictionary representation back to a Request object.
+    将字典表示转换回Request对象。
+
+    This function deserializes a dictionary (previously created by request_to_dict)
+    back into a Request object. It reconstructs all the attributes and properties
+    of the original Request, including callback and errback methods if a spider
+    is provided.
+    此函数将(先前由request_to_dict创建的)字典反序列化回Request对象。
+    它重建原始Request的所有属性和属性,如果提供了spider,
+    还包括回调和错误回调方法。
+
+    Args:
+        d: The dictionary representation of a Request object.
+            Request对象的字典表示。
+        spider: Optional Spider instance that may be used to resolve callback
+            and errback method names to actual methods on the spider.
+            可选的Spider实例,可用于将回调和错误回调方法名称
+            解析为spider上的实际方法。
+
+    Returns:
+        aioscrapy.Request: A reconstructed Request object.
+            重建的Request对象。
+
+    Example:
+        >>> request_dict = {
+        ...     'url': 'http://example.com',
+        ...     'callback': 'parse_item',
+        ...     'method': 'GET'
+        ... }
+        >>> request = await request_from_dict(request_dict, spider)
+        >>> request.url
+        'http://example.com'
+    """
+    # Delegate to the imported _from_dict function from aioscrapy.utils.request
+    # 委托给从aioscrapy.utils.request导入的_from_dict函数
     return await _from_dict(d, spider=spider)
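Taken together, these two helpers round-trip a Request through a plain dict. A minimal usage sketch, not part of the diff itself, assuming a spider instance whose class defines a parse_item method:

    from aioscrapy import Request
    from aioscrapy.utils.reqser import request_from_dict, request_to_dict

    async def roundtrip(spider):
        request = Request('http://example.com', callback=spider.parse_item)
        # Serialize to a plain dict; per the docstring example, callbacks
        # round-trip by method name, so the dict can be queued or persisted.
        d = request_to_dict(request, spider=spider)
        # Deserialize; the spider resolves 'parse_item' back to a bound method.
        restored = await request_from_dict(d, spider=spider)
        assert restored.url == request.url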
aioscrapy/utils/request.py
CHANGED
@@ -1,6 +1,12 @@
 """
-
-aioscrapy
+Request utility functions for aioscrapy.
+aioscrapy的请求实用函数。
+
+This module provides utility functions for working with aioscrapy.http.Request objects.
+It includes functions for converting requests to raw HTTP representations, extracting
+referrer information, and creating Request objects from dictionaries.
+此模块提供了用于处理aioscrapy.http.Request对象的实用函数。
+它包括将请求转换为原始HTTP表示、提取引用者信息以及从字典创建Request对象的函数。
 """
 
 from typing import Optional
@@ -15,53 +21,208 @@ from aioscrapy.utils.python import to_bytes, to_unicode
 
 
 def request_httprepr(request: Request) -> bytes:
-    """Return the raw HTTP representation (as bytes) of the given request.
-    This is provided only for reference since it's not the actual stream of
-    bytes that will be send when performing the request (that's controlled
-    by Twisted).
     """
+    Return the raw HTTP representation of a request as bytes.
+    以字节形式返回请求的原始HTTP表示。
+
+    This function converts a Request object to its raw HTTP representation,
+    including the request line, headers, and body. This is useful for debugging
+    and logging purposes.
+    此函数将Request对象转换为其原始HTTP表示,包括请求行、头部和正文。
+    这对于调试和日志记录目的很有用。
+
+    Note:
+        This is provided only for reference since it's not the actual stream of
+        bytes that will be sent when performing the request (that's controlled
+        by the HTTP client implementation).
+        这仅供参考,因为它不是执行请求时将发送的实际字节流
+        (那由HTTP客户端实现控制)。
+
+    Args:
+        request: The Request object to convert.
+            要转换的Request对象。
+
+    Returns:
+        bytes: The raw HTTP representation of the request.
+            请求的原始HTTP表示。
+
+    Example:
+        >>> request = Request('http://example.com', method='POST',
+        ...                   headers={'Content-Type': 'application/json'},
+        ...                   body='{"key": "value"}')
+        >>> print(request_httprepr(request).decode())
+        POST / HTTP/1.1
+        Host: example.com
+        Content-Type: application/json
+
+        {"key": "value"}
+    """
+    # Parse the URL
+    # 解析URL
     parsed = urlparse_cached(request)
+
+    # Construct the path including params and query
+    # 构造包含参数和查询的路径
     path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
+
+    # Start with the request line
+    # 从请求行开始
     s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
+
+    # Add the Host header
+    # 添加Host头部
     s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
+
+    # Add other headers if present
+    # 如果存在,添加其他头部
     if request.headers:
         s += headers_dict_to_raw({to_bytes(k): to_bytes(v) for k, v in request.headers.items()}) + b"\r\n"
+
+    # Add the empty line that separates headers from body
+    # 添加分隔头部和正文的空行
     s += b"\r\n"
+
+    # Add the body
+    # 添加正文
     s += to_bytes(request.body)
+
     return s
 
 
 def referer_str(request: Request) -> Optional[str]:
-    """
+    """
+    Return the Referer HTTP header in a format suitable for logging.
+    以适合日志记录的格式返回Referer HTTP头。
+
+    This function extracts the 'Referer' header from a request and converts it
+    to a unicode string, replacing any invalid characters. This is useful for
+    logging purposes to avoid encoding errors.
+    此函数从请求中提取'Referer'头并将其转换为unicode字符串,
+    替换任何无效字符。这对于日志记录很有用,可以避免编码错误。
+
+    Args:
+        request: The Request object to extract the Referer from.
+            要提取Referer的Request对象。
+
+    Returns:
+        Optional[str]: The Referer header as a unicode string, or None if the
+            header is not present.
+            作为unicode字符串的Referer头,如果头不存在则为None。
+    """
+    # Get the Referer header from the request
+    # 从请求中获取Referer头
     referrer = request.headers.get('Referer')
+
+    # If there's no Referer header, return None
+    # 如果没有Referer头,返回None
     if referrer is None:
         return referrer
+
+    # Convert the Referer to unicode, replacing any invalid characters
+    # 将Referer转换为unicode,替换任何无效字符
     return to_unicode(referrer, errors='replace')
 
 
 async def request_from_dict(d: dict, *, spider: Optional[Spider] = None) -> Request:
-    """
+    """
+    Create a Request object from a dictionary.
+    从字典创建Request对象。
+
+    This function converts a dictionary representation of a request into an actual
+    Request object. It's useful for deserializing requests, for example when
+    loading them from a queue or a file.
+    此函数将请求的字典表示转换为实际的Request对象。
+    它对于反序列化请求很有用,例如从队列或文件加载请求时。
+
+    If a spider is provided, the function will:
+    1. First call the spider's request_from_dict method to allow custom processing
+    2. Try to resolve callback and errback strings to actual methods on the spider
 
-
-
+    如果提供了爬虫,该函数将:
+    1. 首先调用爬虫的request_from_dict方法以允许自定义处理
+    2. 尝试将callback和errback字符串解析为爬虫上的实际方法
+
+    Args:
+        d: Dictionary containing the request attributes.
+            包含请求属性的字典。
+        spider: Optional spider instance to resolve callbacks and errbacks.
+            可选的爬虫实例,用于解析回调和错误回调。
+
+    Returns:
+        Request: A Request object (or subclass) with the attributes from the dictionary.
+            具有字典中属性的Request对象(或子类)。
+
+    Raises:
+        ValueError: If a callback or errback name cannot be resolved to a method.
+            如果回调或错误回调名称无法解析为方法。
     """
-
+    # Allow the spider to customize the dictionary
+    # 允许爬虫自定义字典
+    if spider:
+        d = await spider.request_from_dict(d) or d
+
+    # If the spider already returned a Request object, return it directly
+    # 如果爬虫已经返回了一个Request对象,直接返回它
     if isinstance(d, Request):
         return d
 
+    # Determine the request class to use (default is Request)
+    # 确定要使用的请求类(默认为Request)
     request_cls = load_object(d["_class"]) if "_class" in d else Request
+
+    # Filter the dictionary to only include valid attributes for the request class
+    # 过滤字典,只包含请求类的有效属性
     kwargs = {key: value for key, value in d.items() if key in request_cls.attributes}
+
+    # Resolve callback string to actual method if spider is provided
+    # 如果提供了爬虫,将回调字符串解析为实际方法
     if d.get("callback") and spider:
         kwargs["callback"] = _get_method(spider, d["callback"])
+
+    # Resolve errback string to actual method if spider is provided
+    # 如果提供了爬虫,将错误回调字符串解析为实际方法
     if d.get("errback") and spider:
         kwargs["errback"] = _get_method(spider, d["errback"])
+
+    # Create and return the request object
+    # 创建并返回请求对象
     return request_cls(**kwargs)
 
 
 def _get_method(obj, name):
-    """
+    """
+    Get a method from an object by name.
+    通过名称从对象获取方法。
+
+    This is a helper function for request_from_dict that resolves method names
+    to actual method objects. It's used to convert callback and errback strings
+    to callable methods on a spider.
+    这是request_from_dict的辅助函数,用于将方法名称解析为实际的方法对象。
+    它用于将回调和错误回调字符串转换为爬虫上的可调用方法。
+
+    Args:
+        obj: The object to get the method from (typically a spider).
+            要从中获取方法的对象(通常是爬虫)。
+        name: The name of the method to get.
+            要获取的方法的名称。
+
+    Returns:
+        callable: The method object.
+            方法对象。
+
+    Raises:
+        ValueError: If the method is not found on the object.
+            如果在对象上找不到该方法。
+    """
+    # Ensure the name is a string
+    # 确保名称是字符串
    name = str(name)
+
+    # Try to get the method from the object
+    # 尝试从对象获取方法
     try:
         return getattr(obj, name)
     except AttributeError:
+        # Raise a more informative error if the method is not found
+        # 如果找不到该方法,引发更多信息的错误
        raise ValueError(f"Method {name!r} not found in: {obj}")
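For reference, a short sketch exercising the two logging helpers above (the URL and header values are made up and nothing touches the network; the sketch is not part of the diff):

    from aioscrapy import Request
    from aioscrapy.utils.request import referer_str, request_httprepr

    req = Request('http://example.com/page?q=1',
                  headers={'Referer': 'http://example.com/'})

    # request_httprepr rebuilds the request line, Host header, remaining
    # headers, and body into one bytes blob for debugging/logging.
    print(request_httprepr(req).decode())
    # referer_str decodes the Referer header with errors='replace' so
    # logging never raises on odd encodings.
    print(referer_str(req))  # 'http://example.com/'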
aioscrapy/utils/response.py
CHANGED
@@ -1,6 +1,11 @@
 """
-
-
+Response utility functions for aioscrapy.
+aioscrapy的响应实用函数。
+
+This module provides utility functions for working with aioscrapy.http.Response objects.
+It includes functions for extracting base URLs and meta refresh directives from HTML responses.
+此模块提供了用于处理aioscrapy.http.Response对象的实用函数。
+它包括从HTML响应中提取基本URL和元刷新指令的函数。
 """
 from typing import Iterable, Optional, Tuple, Union
 from weakref import WeakKeyDictionary
@@ -10,17 +15,59 @@ from w3lib import html
 import aioscrapy
 from aioscrapy.http.response import Response
 
+# Cache for storing base URLs to avoid repeated parsing of the same response
+# 缓存存储基本URL,以避免重复解析相同的响应
 _baseurl_cache: "WeakKeyDictionary[Response, str]" = WeakKeyDictionary()
 
 
 def get_base_url(response: "aioscrapy.http.response.TextResponse") -> str:
-    """
+    """
+    Extract the base URL from an HTML response.
+    从HTML响应中提取基本URL。
+
+    This function extracts the base URL from an HTML response by looking for
+    the <base> tag in the HTML. If found, it returns the href attribute of the
+    base tag, resolved against the response URL. If not found, it returns the
+    response URL.
+    此函数通过查找HTML中的<base>标签来从HTML响应中提取基本URL。
+    如果找到,它返回base标签的href属性,相对于响应URL解析。
+    如果未找到,它返回响应URL。
+
+    The function uses a cache to avoid repeated parsing of the same response.
+    Only the first 4KB of the response text are examined for performance reasons.
+    该函数使用缓存来避免重复解析相同的响应。
+    出于性能原因,只检查响应文本的前4KB。
+
+    Args:
+        response: The HTML response to extract the base URL from.
+            要从中提取基本URL的HTML响应。
+
+    Returns:
+        str: The base URL of the response, which could be either:
+            响应的基本URL,可能是:
+            - The href attribute of the <base> tag, resolved against the response URL
+              <base>标签的href属性,相对于响应URL解析
+            - The response URL if no <base> tag is found
+              如果未找到<base>标签,则为响应URL
+    """
+    # Check if the base URL is already cached for this response
+    # 检查此响应的基本URL是否已缓存
     if response not in _baseurl_cache:
+        # Only examine the first 4KB of the response for performance
+        # 出于性能考虑,只检查响应的前4KB
         text = response.text[0:4096]
+        # Extract the base URL using w3lib.html
+        # 使用w3lib.html提取基本URL
         _baseurl_cache[response] = html.get_base_url(text, response.url, response.encoding)
+    # Return the cached base URL
+    # 返回缓存的基本URL
     return _baseurl_cache[response]
 
 
+# Cache for storing meta refresh directives to avoid repeated parsing of the same response
+# 缓存存储元刷新指令,以避免重复解析相同的响应
+# The cache stores either (None, None) if no meta refresh is found, or (seconds, url) if found
+# 如果未找到元刷新,缓存存储(None, None),如果找到,则存储(秒数, url)
 _metaref_cache: "WeakKeyDictionary[Response, Union[Tuple[None, None], Tuple[float, str]]]" = WeakKeyDictionary()
 
 
@@ -28,11 +75,49 @@ def get_meta_refresh(
     response: "aioscrapy.http.response.TextResponse",
     ignore_tags: Optional[Iterable[str]] = ('script', 'noscript'),
 ) -> Union[Tuple[None, None], Tuple[float, str]]:
-    """
+    """
+    Extract the meta refresh directive from an HTML response.
+    从HTML响应中提取元刷新指令。
+
+    This function looks for the HTML meta refresh tag in the response and extracts
+    the delay (in seconds) and the URL to redirect to. The meta refresh tag is
+    typically used for automatic page redirection or refreshing.
+    此函数在响应中查找HTML元刷新标签,并提取延迟(以秒为单位)和要重定向到的URL。
+    元刷新标签通常用于自动页面重定向或刷新。
+
+    Example of a meta refresh tag:
+    元刷新标签的示例:
+    <meta http-equiv="refresh" content="5; url=https://example.com">
+
+    The function uses a cache to avoid repeated parsing of the same response.
+    Only the first 4KB of the response text are examined for performance reasons.
+    该函数使用缓存来避免重复解析相同的响应。
+    出于性能原因,只检查响应文本的前4KB。
+
+    Args:
+        response: The HTML response to extract the meta refresh from.
+            要从中提取元刷新的HTML响应。
+        ignore_tags: HTML tags to ignore when parsing. Default is ('script', 'noscript').
+            解析时要忽略的HTML标签。默认为('script', 'noscript')。
+
+    Returns:
+        A tuple containing:
+        包含以下内容的元组:
+        - If meta refresh is found: (delay_seconds, url)
+          如果找到元刷新:(延迟秒数, url)
+        - If no meta refresh is found: (None, None)
+          如果未找到元刷新:(None, None)
+    """
+    # Check if the meta refresh is already cached for this response
+    # 检查此响应的元刷新是否已缓存
     if response not in _metaref_cache:
+        # Only examine the first 4KB of the response for performance
+        # 出于性能考虑,只检查响应的前4KB
         text = response.text[0:4096]
+        # Extract the meta refresh using w3lib.html
+        # 使用w3lib.html提取元刷新
         _metaref_cache[response] = html.get_meta_refresh(
             text, response.url, response.encoding, ignore_tags=ignore_tags)
+    # Return the cached meta refresh
+    # 返回缓存的元刷新
     return _metaref_cache[response]
-
-
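A hedged sketch of the two response helpers (it assumes aioscrapy's TextResponse mirrors Scrapy's constructor, taking url, body, and encoding; the HTML snippet is made up and the sketch is not part of the diff):

    from aioscrapy.http import TextResponse
    from aioscrapy.utils.response import get_base_url, get_meta_refresh

    body = (b'<html><head>'
            b'<base href="/static/">'
            b'<meta http-equiv="refresh" content="5; url=https://example.com/next">'
            b'</head><body></body></html>')

    resp = TextResponse('https://example.com/a/b', body=body, encoding='utf-8')

    # The <base> href resolved against the response URL, cached per response.
    print(get_base_url(resp))      # 'https://example.com/static/'
    # (delay_seconds, url) parsed from the meta refresh tag, or (None, None).
    print(get_meta_refresh(resp))  # (5.0, 'https://example.com/next')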