aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/spider/urllength.py CHANGED

```diff
@@ -1,7 +1,12 @@
 """
 Url Length Spider Middleware
+URL长度爬虫中间件
 
-
+This middleware filters out requests with URLs that exceed a configurable maximum
+length. It helps prevent issues with excessively long URLs that might cause problems
+with servers, proxies, or browsers.
+此中间件过滤掉URL超过可配置最大长度的请求。它有助于防止过长的URL可能导致
+服务器、代理或浏览器出现问题。
 """
 
 from aioscrapy.exceptions import NotConfigured
@@ -10,28 +15,119 @@ from aioscrapy.utils.log import logger
 
 
 class UrlLengthMiddleware:
+    """
+    Spider middleware to filter out requests with excessively long URLs.
+    用于过滤掉URL过长的请求的爬虫中间件。
+
+    This middleware checks the length of URLs in requests and filters out those
+    that exceed a configurable maximum length. This helps prevent issues with
+    servers, proxies, or browsers that might have trouble handling very long URLs.
+    此中间件检查请求中URL的长度,并过滤掉超过可配置最大长度的URL。
+    这有助于防止服务器、代理或浏览器在处理非常长的URL时可能遇到的问题。
+    """
 
     def __init__(self, maxlength):
+        """
+        Initialize the URL length middleware.
+        初始化URL长度中间件。
+
+        Args:
+            maxlength: The maximum allowed URL length in characters.
+                       允许的URL最大长度(以字符为单位)。
+        """
+        # Maximum allowed URL length
+        # 允许的URL最大长度
         self.maxlength = maxlength
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a UrlLengthMiddleware instance from settings.
+        从设置创建UrlLengthMiddleware实例。
+
+        This is the factory method used by AioScrapy to create the middleware.
+        这是AioScrapy用于创建中间件的工厂方法。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+
+        Returns:
+            UrlLengthMiddleware: A new UrlLengthMiddleware instance.
+                                 一个新的UrlLengthMiddleware实例。
+
+        Raises:
+            NotConfigured: If URLLENGTH_LIMIT is not set or is zero in the settings.
+                           如果在设置中未设置URLLENGTH_LIMIT或其值为零。
+        """
+        # Get the maximum URL length from settings
+        # 从设置获取最大URL长度
         maxlength = settings.getint('URLLENGTH_LIMIT')
+
+        # If no maximum length is configured, disable the middleware
+        # 如果未配置最大长度,则禁用中间件
         if not maxlength:
             raise NotConfigured
+
+        # Create and return a new instance
+        # 创建并返回一个新实例
         return cls(maxlength)
 
     async def process_spider_output(self, response, result, spider):
+        """
+        Process the spider output to filter out requests with long URLs.
+        处理爬虫输出以过滤掉具有长URL的请求。
+
+        This method processes each request yielded by the spider and filters out
+        those with URLs that exceed the configured maximum length.
+        此方法处理爬虫产生的每个请求,并过滤掉URL超过配置的最大长度的请求。
+
+        Args:
+            response: The response being processed.
+                      正在处理的响应。
+            result: The result returned by the spider.
+                    爬虫返回的结果。
+            spider: The spider that generated the result.
+                    生成结果的爬虫。
+
+        Returns:
+            An async generator yielding filtered requests and other items.
+            一个产生过滤后的请求和其他项目的异步生成器。
+        """
         def _filter(request):
+            """
+            Filter function to check if a request's URL is too long.
+            检查请求的URL是否过长的过滤函数。
+
+            Args:
+                request: The request to check.
+                         要检查的请求。
+
+            Returns:
+                bool: True if the request should be kept, False if it should be filtered out.
+                      如果应保留请求,则为True;如果应过滤掉请求,则为False。
+            """
+            # Check if the item is a Request and if its URL exceeds the maximum length
+            # 检查项目是否为Request,以及其URL是否超过最大长度
             if isinstance(request, Request) and len(request.url) > self.maxlength:
+                # Log the ignored request
+                # 记录被忽略的请求
                 logger.info(
                     "Ignoring link (url length > %(maxlength)d): %(url)s " % {
                         'maxlength': self.maxlength, 'url': request.url
                     }
                 )
+                # Update statistics
+                # 更新统计信息
                 spider.crawler.stats.inc_value('urllength/request_ignored_count', spider=spider)
+                # Filter out this request
+                # 过滤掉此请求
                 return False
             else:
+                # Keep all other items
+                # 保留所有其他项目
                 return True
 
+        # Filter the results using the _filter function
+        # 使用_filter函数过滤结果
         return (r async for r in result or () if _filter(r))
```
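The docstrings added above describe how `UrlLengthMiddleware.process_spider_output` wraps the spider output in an async generator and drops any `Request` whose URL is longer than the configured `URLLENGTH_LIMIT`. Below is a minimal standalone sketch of that filtering pattern in plain Python; the `Request` stand-in class, the hard-coded limit, and the example URLs are hypothetical and not part of aioscrapy.

```python
import asyncio


class Request:
    """Hypothetical stand-in for aioscrapy's Request; only `url` is needed here."""

    def __init__(self, url):
        self.url = url


MAXLENGTH = 50  # plays the role of the URLLENGTH_LIMIT setting


def keep(obj):
    # Keep everything except Request objects whose URL exceeds the limit.
    return not (isinstance(obj, Request) and len(obj.url) > MAXLENGTH)


async def spider_output():
    yield Request("https://example.com/short")
    yield Request("https://example.com/" + "x" * 200)  # over the limit, dropped
    yield {"note": "non-request items pass through untouched"}


async def main():
    # Same shape as process_spider_output: wrap the spider result in an
    # async generator expression that silently drops over-long requests.
    filtered = (r async for r in spider_output() if keep(r))
    async for r in filtered:
        print(r)


asyncio.run(main())
```

In the real middleware the limit comes from `settings.getint('URLLENGTH_LIMIT')` in `from_settings`, and each dropped request also increments the `urllength/request_ignored_count` stat, as the diff shows.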
aioscrapy/link.py CHANGED

```diff
@@ -1,41 +1,122 @@
 """
-
+Link Module
+链接模块
+
+This module defines the Link object used in Link extractors. The Link class
+represents an extracted link from a web page, containing information such as
+the URL, anchor text, URL fragment, and nofollow status.
+此模块定义了链接提取器中使用的Link对象。Link类表示从网页中提取的链接,
+包含URL、锚文本、URL片段和nofollow状态等信息。
 
 For actual link extractors implementation see scrapy.linkextractors, or
 its documentation in: docs/topics/link-extractors.rst
+有关实际链接提取器的实现,请参见scrapy.linkextractors,
+或其文档:docs/topics/link-extractors.rst
 """
 
 
 class Link:
-    """
+    """
+    Represents an extracted link from a web page.
+    表示从网页中提取的链接。
+
+    Link objects are created by LinkExtractors to represent links extracted from web pages.
+    Each Link object contains information about the URL, anchor text, URL fragment, and
+    nofollow status of the extracted link.
+    Link对象由LinkExtractor创建,用于表示从网页中提取的链接。每个Link对象包含有关
+    提取链接的URL、锚文本、URL片段和nofollow状态的信息。
 
     Using the anchor tag sample below to illustrate the parameters::
+    使用下面的锚标签示例来说明参数::
 
         <a href="https://example.com/nofollow.html#foo" rel="nofollow">Dont follow this one</a>
 
-    :
-
+    Args:
+        url: The absolute URL being linked to in the anchor tag.
+             锚标签中链接到的绝对URL。
+             From the sample, this is ``https://example.com/nofollow.html``.
+             从示例中,这是``https://example.com/nofollow.html``。
 
-
+        text: The text in the anchor tag.
+              锚标签中的文本。
+              From the sample, this is ``Dont follow this one``.
+              从示例中,这是``Dont follow this one``。
+              Defaults to an empty string.
+              默认为空字符串。
 
-
+        fragment: The part of the URL after the hash symbol.
+                  URL中哈希符号后的部分。
+                  From the sample, this is ``foo``.
+                  从示例中,这是``foo``。
+                  Defaults to an empty string.
+                  默认为空字符串。
 
-
-
+        nofollow: An indication of the presence or absence of a nofollow value
+                  in the ``rel`` attribute of the anchor tag.
+                  表示锚标签的``rel``属性中是否存在nofollow值。
+                  Defaults to False.
+                  默认为False。
     """
 
+    # Define __slots__ to save memory when creating many Link objects
+    # 定义__slots__以在创建多个Link对象时节省内存
     __slots__ = ['url', 'text', 'fragment', 'nofollow']
 
     def __init__(self, url, text='', fragment='', nofollow=False):
+        """
+        Initialize a Link object.
+        初始化Link对象。
+
+        Args:
+            url: The absolute URL being linked to.
+                 被链接到的绝对URL。
+            text: The anchor text of the link.
+                  链接的锚文本。
+                  Defaults to an empty string.
+                  默认为空字符串。
+            fragment: The URL fragment (part after the # symbol).
+                      URL片段(#符号后的部分)。
+                      Defaults to an empty string.
+                      默认为空字符串。
+            nofollow: Whether the link has a nofollow attribute.
+                      链接是否具有nofollow属性。
+                      Defaults to False.
+                      默认为False。
+
+        Raises:
+            TypeError: If the URL is not a string.
+                       如果URL不是字符串。
+        """
+        # Ensure the URL is a string
+        # 确保URL是字符串
         if not isinstance(url, str):
             got = url.__class__.__name__
             raise TypeError(f"Link urls must be str objects, got {got}")
+
+        # Store the link attributes
+        # 存储链接属性
         self.url = url
         self.text = text
         self.fragment = fragment
         self.nofollow = nofollow
 
     def __eq__(self, other):
+        """
+        Compare two Link objects for equality.
+        比较两个Link对象是否相等。
+
+        Two Link objects are considered equal if they have the same URL, text,
+        fragment, and nofollow status.
+        如果两个Link对象具有相同的URL、文本、片段和nofollow状态,则它们被认为是相等的。
+
+        Args:
+            other: The other Link object to compare with.
+                   要比较的其他Link对象。
+
+        Returns:
+            bool: True if the Link objects are equal, False otherwise.
+                  如果Link对象相等,则为True,否则为False。
+        """
         return (
             self.url == other.url
             and self.text == other.text
@@ -44,9 +125,35 @@ class Link:
         )
 
     def __hash__(self):
+        """
+        Calculate a hash value for the Link object.
+        计算Link对象的哈希值。
+
+        This method is implemented to allow Link objects to be used as dictionary
+        keys or in sets. The hash value is based on the URL, text, fragment, and
+        nofollow status.
+        实现此方法是为了允许将Link对象用作字典键或集合中的元素。哈希值基于URL、
+        文本、片段和nofollow状态。
+
+        Returns:
+            int: A hash value for the Link object.
+                 Link对象的哈希值。
+        """
         return hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(self.nofollow)
 
     def __repr__(self):
+        """
+        Return a string representation of the Link object.
+        返回Link对象的字符串表示。
+
+        This method returns a string that, when passed to eval(), would create a
+        new Link object with the same attributes.
+        此方法返回一个字符串,当传递给eval()时,将创建一个具有相同属性的新Link对象。
+
+        Returns:
+            str: A string representation of the Link object.
+                 Link对象的字符串表示。
+        """
         return (
             f'Link(url={self.url!r}, text={self.text!r}, '
             f'fragment={self.fragment!r}, nofollow={self.nofollow!r})'
```
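Based on the constructor signature and dunder methods shown in the diff above, here is a short usage sketch of the `Link` API; the import path follows the file location `aioscrapy/link.py`, and the sample values mirror the docstring's anchor-tag example.

```python
from aioscrapy.link import Link

# Construct a Link matching the anchor-tag sample from the docstring above.
link = Link(
    url="https://example.com/nofollow.html",
    text="Dont follow this one",
    fragment="foo",
    nofollow=True,
)

# __eq__ and __hash__ consider url, text, fragment and nofollow,
# so duplicate links collapse naturally inside a set.
same = Link("https://example.com/nofollow.html", "Dont follow this one", "foo", True)
assert link == same
assert len({link, same}) == 1

# __repr__ yields an eval()-able form, e.g.
# Link(url='https://example.com/nofollow.html', text='Dont follow this one', ...)
print(link)

# Non-string URLs are rejected with TypeError.
try:
    Link(b"https://example.com")
except TypeError as exc:
    print(exc)  # Link urls must be str objects, got bytes
```

The `__slots__` declaration noted in the diff keeps per-instance memory low, which matters when a link extractor produces many `Link` objects per page.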
aioscrapy/logformatter.py CHANGED

```diff
@@ -1,7 +1,23 @@
+"""
+Log Formatter Module
+日志格式化模块
+
+This module provides the LogFormatter class, which formats log messages for various
+events that occur during the crawling process, such as crawling a page, scraping an item,
+dropping an item, or encountering errors.
+此模块提供LogFormatter类,用于格式化爬取过程中发生的各种事件的日志消息,
+例如爬取页面、抓取项目、丢弃项目或遇到错误。
+
+The LogFormatter can be customized by subclassing and overriding its methods to
+change the format of log messages.
+可以通过子类化并覆盖其方法来自定义LogFormatter,以更改日志消息的格式。
+"""
 import os
 
 from aioscrapy.utils.request import referer_str
 
+# Standard log message templates
+# 标准日志消息模板
 SCRAPEDMSG = "Scraped from %(src)s" + os.linesep + "%(item)s"
 DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"
 CRAWLEDMSG = "Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
@@ -12,11 +28,48 @@ DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
 
 
 class LogFormatter:
+    """
+    Formats log messages for various events during the crawling process.
+    格式化爬取过程中各种事件的日志消息。
+
+    This class provides methods to format log messages for events such as crawling a page,
+    scraping an item, dropping an item, or encountering errors. It can be customized by
+    subclassing and overriding its methods to change the format of log messages.
+    此类提供方法来格式化爬取页面、抓取项目、丢弃项目或遇到错误等事件的日志消息。
+    可以通过子类化并覆盖其方法来自定义它,以更改日志消息的格式。
+    """
+
     @staticmethod
     def crawled(request, response, spider):
-        """
+        """
+        Format a log message for a crawled page.
+        格式化已爬取页面的日志消息。
+
+        This method is called when the crawler successfully downloads a webpage.
+        It formats a log message that includes the response status, request URL,
+        request flags, referer, and response flags.
+        当爬虫成功下载网页时调用此方法。它格式化一条包含响应状态、请求URL、
+        请求标志、引用者和响应标志的日志消息。
+
+        Args:
+            request: The request that was made.
+                     发出的请求。
+            response: The response that was received.
+                      接收到的响应。
+            spider: The spider that made the request.
+                    发出请求的爬虫。
+
+        Returns:
+            dict: A dictionary with log level and message.
+                  包含日志级别和消息的字典。
+        """
+        # Format request and response flags if they exist
+        # 如果存在,则格式化请求和响应标志
         request_flags = f' {str(request.flags)}' if request.flags else ''
         response_flags = f' {str(response.flags)}' if response.flags else ''
+
+        # Return a dictionary with log level and formatted message
+        # 返回包含日志级别和格式化消息的字典
         return {
             '_Logger__level': "DEBUG",
             '_Logger__message': CRAWLEDMSG % {
@@ -31,8 +84,32 @@ class LogFormatter:
 
     @staticmethod
     def scraped(item, response, spider):
-        """
+        """
+        Format a log message for a scraped item.
+        格式化已抓取项目的日志消息。
+
+        This method is called when a spider successfully scrapes an item from a response.
+        It formats a log message that includes the response source and the item details.
+        当爬虫成功从响应中抓取项目时调用此方法。它格式化一条包含响应源和项目详细信息的日志消息。
+
+        Args:
+            item: The item that was scraped.
+                  被抓取的项目。
+            response: The response from which the item was scraped.
+                      项目被抓取的响应。
+            spider: The spider that scraped the item.
+                    抓取项目的爬虫。
+
+        Returns:
+            dict: A dictionary with log level and message.
+                  包含日志级别和消息的字典。
+        """
+        # Use the response as the source
+        # 使用响应作为源
         src = response
+
+        # Return a dictionary with log level and formatted message
+        # 返回包含日志级别和格式化消息的字典
         return {
             '_Logger__level': "DEBUG",
             '_Logger__message': SCRAPEDMSG % {
@@ -43,7 +120,32 @@ class LogFormatter:
 
     @staticmethod
     def dropped(item, exception, response, spider):
-        """
+        """
+        Format a log message for a dropped item.
+        格式化已丢弃项目的日志消息。
+
+        This method is called when an item is dropped while passing through the item pipeline.
+        It formats a log message that includes the exception that caused the item to be dropped
+        and the item details.
+        当项目在通过项目管道时被丢弃时调用此方法。它格式化一条包含导致项目被丢弃的异常
+        和项目详细信息的日志消息。
+
+        Args:
+            item: The item that was dropped.
+                  被丢弃的项目。
+            exception: The exception that caused the item to be dropped.
+                       导致项目被丢弃的异常。
+            response: The response from which the item was scraped.
+                      项目被抓取的响应。
+            spider: The spider that scraped the item.
+                    抓取项目的爬虫。
+
+        Returns:
+            dict: A dictionary with log level and message.
+                  包含日志级别和消息的字典。
+        """
+        # Return a dictionary with log level and formatted message
+        # 返回包含日志级别和格式化消息的字典
         return {
             '_Logger__level': "WARNING",
             '_Logger__message': DROPPEDMSG % {
@@ -54,21 +156,64 @@ class LogFormatter:
 
     @staticmethod
     def item_error(item, exception, response, spider):
-        """
-
+        """
+        Format a log message for an item processing error.
+        格式化项目处理错误的日志消息。
+
+        This method is called when an item causes an error while passing through the item pipeline.
+        It formats a log message that includes the item details.
+        当项目在通过项目管道时导致错误时调用此方法。它格式化一条包含项目详细信息的日志消息。
+
+        Args:
+            item: The item that caused the error.
+                  导致错误的项目。
+            exception: The exception that was raised.
+                       引发的异常。
+            response: The response from which the item was scraped.
+                      项目被抓取的响应。
+            spider: The spider that scraped the item.
+                    抓取项目的爬虫。
+
+        Returns:
+            str: A formatted log message.
+                 格式化的日志消息。
 
         .. versionadded:: 2.0
         """
+        # Return a formatted message with the item details
+        # 返回包含项目详细信息的格式化消息
         return ITEMERRORMSG % {
             'item': item,
         }
 
     @staticmethod
     def spider_error(failure, request, response, spider):
-        """
+        """
+        Format a log message for a spider error.
+        格式化爬虫错误的日志消息。
+
+        This method is called when a spider raises an exception while processing a response.
+        It formats a log message that includes the request URL and referer.
+        当爬虫在处理响应时引发异常时调用此方法。它格式化一条包含请求URL和引用者的日志消息。
+
+        Args:
+            failure: The failure that occurred.
+                     发生的失败。
+            request: The request that was being processed.
+                     正在处理的请求。
+            response: The response that was being processed.
+                      正在处理的响应。
+            spider: The spider that raised the exception.
+                    引发异常的爬虫。
+
+        Returns:
+            str: A formatted log message.
+                 格式化的日志消息。
 
         .. versionadded:: 2.0
         """
+        # Return a formatted message with the request and referer
+        # 返回包含请求和引用者的格式化消息
         return SPIDERERRORMSG % {
             'request': request,
             'referer': referer_str(request),
@@ -76,19 +221,65 @@ class LogFormatter:
 
     @staticmethod
     def download_error(failure, request, spider, errmsg=None):
-        """
-
+        """
+        Format a log message for a download error.
+        格式化下载错误的日志消息。
+
+        This method is called when there is an error downloading a request.
+        It formats a log message that includes the request URL and optionally the error message.
+        当下载请求时出错时调用此方法。它格式化一条包含请求URL和可选的错误消息的日志消息。
+
+        Args:
+            failure: The failure that occurred.
+                     发生的失败。
+            request: The request that failed to download.
+                     下载失败的请求。
+            spider: The spider that made the request.
+                    发出请求的爬虫。
+            errmsg: An optional error message.
+                    可选的错误消息。
+                    Defaults to None.
+                    默认为None。
+
+        Returns:
+            str: A formatted log message.
+                 格式化的日志消息。
 
         .. versionadded:: 2.0
         """
+        # Prepare arguments for the message
+        # 准备消息的参数
         args = {'request': request}
+
+        # Choose the appropriate message template based on whether an error message is provided
+        # 根据是否提供错误消息选择适当的消息模板
         if errmsg:
             msg = DOWNLOADERRORMSG_LONG
             args['errmsg'] = errmsg
         else:
             msg = DOWNLOADERRORMSG_SHORT
+
+        # Return the formatted message
+        # 返回格式化的消息
         return msg % args
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a LogFormatter instance from a crawler.
+        从爬虫创建LogFormatter实例。
+
+        This is the factory method used by AioScrapy to create the log formatter.
+        这是AioScrapy用于创建日志格式化器的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this log formatter.
+                     将使用此日志格式化器的爬虫。
+
+        Returns:
+            LogFormatter: A new LogFormatter instance.
+                          一个新的LogFormatter实例。
+        """
+        # Create and return a new instance
+        # 创建并返回一个新实例
         return cls()
```