aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in the public registry.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/spider/httperror.py
CHANGED

@@ -1,54 +1,194 @@
 """
 HttpError Spider Middleware
+HTTP错误爬虫中间件
 
-
+This middleware filters out responses with non-200 status codes and generates
+appropriate exceptions. It allows you to specify which status codes should be
+allowed through to the spider via settings or meta attributes.
+此中间件过滤掉具有非200状态码的响应并生成适当的异常。它允许您通过设置或
+元属性指定哪些状态码应该被允许传递给爬虫。
 """
 
 from aioscrapy.exceptions import IgnoreRequest
-
 from aioscrapy.utils.log import logger
 
 
 class HttpError(IgnoreRequest):
-    """
+    """
+    Exception raised when a non-200 response is filtered.
+    当过滤非200响应时引发的异常。
+
+    This exception is raised by the HttpErrorMiddleware when it encounters a
+    response with a status code that is not in the allowed list. It is a subclass
+    of IgnoreRequest, which means the response will be ignored by the spider.
+    当HttpErrorMiddleware遇到状态码不在允许列表中的响应时,会引发此异常。
+    它是IgnoreRequest的子类,这意味着该响应将被爬虫忽略。
+    """
 
     def __init__(self, response, *args, **kwargs):
+        """
+        Initialize the HttpError exception.
+        初始化HttpError异常。
+
+        Args:
+            response: The response that triggered the exception.
+                触发异常的响应。
+            *args: Variable length argument list passed to the parent class.
+                传递给父类的可变长度参数列表。
+            **kwargs: Arbitrary keyword arguments passed to the parent class.
+                传递给父类的任意关键字参数。
+        """
+        # Store the response that triggered the exception
+        # 存储触发异常的响应
         self.response = response
+
+        # Initialize the parent IgnoreRequest class
+        # 初始化父类IgnoreRequest
         super().__init__(*args, **kwargs)
 
 
 class HttpErrorMiddleware:
+    """
+    Spider middleware to filter out responses with non-200 status codes.
+    用于过滤掉具有非200状态码的响应的爬虫中间件。
+
+    This middleware checks the status code of each response and raises an HttpError
+    exception for responses with status codes that are not in the allowed list.
+    The allowed list can be specified via settings, spider attributes, or response
+    meta attributes.
+    此中间件检查每个响应的状态码,并为状态码不在允许列表中的响应引发HttpError异常。
+    允许列表可以通过设置、爬虫属性或响应元属性指定。
+    """
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a HttpErrorMiddleware instance from a crawler.
+        从爬虫创建HttpErrorMiddleware实例。
+
+        This is the factory method used by AioScrapy to create the middleware.
+        这是AioScrapy用于创建中间件的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this middleware.
+                将使用此中间件的爬虫。
+
+        Returns:
+            HttpErrorMiddleware: A new HttpErrorMiddleware instance.
+                一个新的HttpErrorMiddleware实例。
+        """
+        # Create and return a new instance with the crawler's settings
+        # 使用爬虫的设置创建并返回一个新实例
         return cls(crawler.settings)
 
     def __init__(self, settings):
+        """
+        Initialize the HttpErrorMiddleware.
+        初始化HttpErrorMiddleware。
+
+        Args:
+            settings: The AioScrapy settings object.
+                AioScrapy设置对象。
+        """
+        # Whether to allow all HTTP status codes
+        # 是否允许所有HTTP状态码
         self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
+
+        # List of allowed HTTP status codes
+        # 允许的HTTP状态码列表
         self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')
 
     def process_spider_input(self, response, spider):
-
+        """
+        Process a response before it is sent to the spider.
+        在响应发送到爬虫之前处理它。
+
+        This method checks if the response's status code is allowed. If not, it
+        raises an HttpError exception, which will be caught by process_spider_exception.
+        此方法检查响应的状态码是否被允许。如果不允许,它会引发HttpError异常,
+        该异常将被process_spider_exception捕获。
+
+        Args:
+            response: The response being processed.
+                正在处理的响应。
+            spider: The spider that will receive the response.
+                将接收响应的爬虫。
+
+        Raises:
+            HttpError: If the response's status code is not allowed.
+                如果响应的状态码不被允许。
+        """
+        # Allow responses with status codes in the 200-299 range (common case)
+        # 允许状态码在200-299范围内的响应(常见情况)
+        if 200 <= response.status < 300:
             return
+
+        # Allow all status codes if specified in the response meta
+        # 如果在响应元数据中指定,则允许所有状态码
         if response.meta.get('handle_httpstatus_all', False):
             return
+
+        # Get the list of allowed status codes
+        # 获取允许的状态码列表
         if 'handle_httpstatus_list' in response.meta:
+            # Use the list from response meta if available
+            # 如果可用,使用来自响应元数据的列表
            allowed_statuses = response.meta['handle_httpstatus_list']
         elif self.handle_httpstatus_all:
+            # Allow all status codes if specified in settings
+            # 如果在设置中指定,则允许所有状态码
             return
         else:
+            # Use the list from spider attribute or middleware settings
+            # 使用来自爬虫属性或中间件设置的列表
             allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
+
+        # Allow the response if its status code is in the allowed list
+        # 如果响应的状态码在允许列表中,则允许该响应
         if response.status in allowed_statuses:
             return
+
+        # Raise an HttpError for responses with disallowed status codes
+        # 为具有不允许状态码的响应引发HttpError
         raise HttpError(response, 'Ignoring non-200 response')
 
     async def process_spider_exception(self, response, exception, spider):
+        """
+        Handle exceptions raised during spider processing.
+        处理爬虫处理期间引发的异常。
+
+        This method catches HttpError exceptions, logs them, updates statistics,
+        and returns an empty result list to suppress the exception.
+        此方法捕获HttpError异常,记录它们,更新统计信息,并返回一个空结果列表以抑制异常。
+
+        Args:
+            response: The response being processed when the exception was raised.
+                引发异常时正在处理的响应。
+            exception: The exception raised.
+                引发的异常。
+            spider: The spider that was processing the response.
+                正在处理响应的爬虫。
+
+        Returns:
+            list: An empty list if the exception is an HttpError, None otherwise.
+                如果异常是HttpError,则返回空列表;否则返回None。
+        """
+        # Only handle HttpError exceptions
+        # 只处理HttpError异常
         if isinstance(exception, HttpError):
+            # Update statistics
+            # 更新统计信息
             spider.crawler.stats.inc_value('httperror/response_ignored_count')
             spider.crawler.stats.inc_value(
                 f'httperror/response_ignored_status_count/{response.status}'
             )
+
+            # Log the ignored response
+            # 记录被忽略的响应
             logger.info("Ignoring response %(response)r: HTTP status code is not handled or not allowed" % {
                 'response': response
             })
+
+            # Return an empty list to suppress the exception
+            # 返回空列表以抑制异常
             return []
aioscrapy/libs/spider/offsite.py
CHANGED

@@ -1,7 +1,13 @@
 """
 Offsite Spider Middleware
+站外爬虫中间件
 
-
+This middleware filters out requests to URLs not belonging to the domains specified
+in the spider's allowed_domains attribute. It helps prevent the crawler from
+following links to external sites, which is useful for keeping crawls focused on
+specific domains.
+此中间件过滤掉对不属于爬虫的allowed_domains属性中指定的域的URL的请求。
+它有助于防止爬虫跟随指向外部站点的链接,这对于使爬取集中在特定域上很有用。
 """
 import re
 import warnings

@@ -13,71 +19,265 @@ from aioscrapy.utils.log import logger
 
 
 class OffsiteMiddleware:
+    """
+    Spider middleware to filter out requests to offsite domains.
+    用于过滤掉对站外域的请求的爬虫中间件。
+
+    This middleware filters out requests to URLs not belonging to the domains specified
+    in the spider's allowed_domains attribute. It helps prevent the crawler from
+    following links to external sites, which is useful for keeping crawls focused on
+    specific domains.
+    此中间件过滤掉对不属于爬虫的allowed_domains属性中指定的域的URL的请求。
+    它有助于防止爬虫跟随指向外部站点的链接,这对于使爬取集中在特定域上很有用。
+    """
 
     def __init__(self, stats):
+        """
+        Initialize the offsite middleware.
+        初始化站外中间件。
+
+        Args:
+            stats: Stats collector instance.
+                统计收集器实例。
+        """
+        # Stats collector instance
+        # 统计收集器实例
         self.stats = stats
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create an OffsiteMiddleware instance from a crawler.
+        从爬虫创建OffsiteMiddleware实例。
+
+        This is the factory method used by AioScrapy to create the middleware.
+        这是AioScrapy用于创建中间件的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this middleware.
+                将使用此中间件的爬虫。
+
+        Returns:
+            OffsiteMiddleware: A new OffsiteMiddleware instance.
+                一个新的OffsiteMiddleware实例。
+        """
+        # Create a new instance with the crawler's stats collector
+        # 使用爬虫的统计收集器创建一个新实例
         o = cls(crawler.stats)
+
+        # Connect the spider_opened method to the spider_opened signal
+        # 将spider_opened方法连接到spider_opened信号
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+
+        # Return the new instance
+        # 返回新实例
         return o
 
     async def process_spider_output(self, response, result, spider):
+        """
+        Process the spider output to filter out offsite requests.
+        处理爬虫输出以过滤掉站外请求。
+
+        This method processes each request yielded by the spider and filters out
+        requests to URLs not belonging to the domains specified in the spider's
+        allowed_domains attribute.
+        此方法处理爬虫产生的每个请求,并过滤掉对不属于爬虫的allowed_domains属性中
+        指定的域的URL的请求。
+
+        Args:
+            response: The response being processed.
+                正在处理的响应。
+            result: The result returned by the spider.
+                爬虫返回的结果。
+            spider: The spider that generated the result.
+                生成结果的爬虫。
+
+        Returns:
+            An async generator yielding filtered requests and other items.
+            一个产生过滤后的请求和其他项目的异步生成器。
+        """
+        # Process each item in the result
+        # 处理结果中的每个项目
         async for x in result:
+            # If the item is a Request, check if it should be followed
+            # 如果项目是一个Request,检查是否应该跟随它
             if isinstance(x, Request):
+                # If the request has dont_filter set or it's for an allowed domain, yield it
+                # 如果请求设置了dont_filter或它是针对允许的域,则产生它
                 if x.dont_filter or self.should_follow(x, spider):
                     yield x
                 else:
+                    # Get the domain of the request
+                    # 获取请求的域
                     domain = urlparse_cached(x).hostname
+
+                    # If this is a new domain, log it and update stats
+                    # 如果这是一个新域,记录它并更新统计信息
                     if domain and domain not in self.domains_seen:
                         self.domains_seen.add(domain)
                         logger.debug(
                             "Filtered offsite request to %(domain)r: %(request)s" % {'domain': domain, 'request': x}
                         )
                         self.stats.inc_value('offsite/domains', spider=spider)
+
+                    # Update filtered requests stats
+                    # 更新过滤的请求统计信息
                     self.stats.inc_value('offsite/filtered', spider=spider)
             else:
+                # If the item is not a Request, yield it unchanged
+                # 如果项目不是一个Request,则不变地产生它
                 yield x
 
     def should_follow(self, request, spider):
+        """
+        Check if a request should be followed.
+        检查是否应该跟随请求。
+
+        This method checks if the hostname of the request URL matches the allowed
+        domains pattern.
+        此方法检查请求URL的主机名是否匹配允许的域模式。
+
+        Args:
+            request: The request to check.
+                要检查的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+
+        Returns:
+            bool: True if the request should be followed, False otherwise.
+                如果应该跟随请求,则为True;否则为False。
+        """
+        # Get the compiled regex pattern for allowed domains
+        # 获取允许的域的编译正则表达式模式
         regex = self.host_regex
+
+        # Get the hostname from the request URL
+        # 从请求URL获取主机名
         # hostname can be None for wrong urls (like javascript links)
+        # 对于错误的URL(如javascript链接),主机名可能为None
         host = urlparse_cached(request).hostname or ''
+
+        # Check if the hostname matches the allowed domains pattern
+        # 检查主机名是否匹配允许的域模式
         return bool(regex.search(host))
 
     def get_host_regex(self, spider):
-        """
+        """
+        Get a regex pattern for the allowed domains.
+        获取允许的域的正则表达式模式。
+
+        This method creates a regex pattern that matches hostnames belonging to
+        the domains specified in the spider's allowed_domains attribute.
+        此方法创建一个正则表达式模式,匹配属于爬虫的allowed_domains属性中指定的域的主机名。
+
+        Args:
+            spider: The spider whose allowed_domains attribute to use.
+                使用其allowed_domains属性的爬虫。
+
+        Returns:
+            re.Pattern: A compiled regex pattern for the allowed domains.
+                允许的域的编译正则表达式模式。
+
+        Note:
+            Override this method to implement a different offsite policy.
+            覆盖此方法以实现不同的站外策略。
+        """
+        # Get the allowed_domains attribute from the spider
+        # 从爬虫获取allowed_domains属性
         allowed_domains = getattr(spider, 'allowed_domains', None)
+
+        # If no allowed_domains are specified, allow all domains
+        # 如果未指定allowed_domains,则允许所有域
         if not allowed_domains:
             return re.compile('')  # allow all by default
+
+        # Compile patterns for validating domains
+        # 编译用于验证域的模式
         url_pattern = re.compile(r"^https?://.*$")
         port_pattern = re.compile(r":\d+$")
+
+        # Process each domain in allowed_domains
+        # 处理allowed_domains中的每个域
         domains = []
         for domain in allowed_domains:
+            # Skip None values
+            # 跳过None值
             if domain is None:
                 continue
+            # Warn about URL entries
+            # 警告URL条目
             elif url_pattern.match(domain):
                 message = ("allowed_domains accepts only domains, not URLs. "
                            f"Ignoring URL entry {domain} in allowed_domains.")
                 warnings.warn(message, URLWarning)
+            # Warn about domains with ports
+            # 警告带有端口的域
             elif port_pattern.search(domain):
                 message = ("allowed_domains accepts only domains without ports. "
                            f"Ignoring entry {domain} in allowed_domains.")
                 warnings.warn(message, PortWarning)
             else:
+                # Add valid domains to the list, escaping special regex characters
+                # 将有效域添加到列表中,转义特殊的正则表达式字符
                 domains.append(re.escape(domain))
+
+        # Create a regex pattern that matches the allowed domains and their subdomains
+        # 创建一个正则表达式模式,匹配允许的域及其子域
         regex = fr'^(.*\.)?({"|".join(domains)})$'
+
+        # Compile and return the regex pattern
+        # 编译并返回正则表达式模式
         return re.compile(regex)
 
     def spider_opened(self, spider):
+        """
+        Initialize middleware state when a spider is opened.
+        当爬虫打开时初始化中间件状态。
+
+        This method is called when a spider is opened. It initializes the regex
+        pattern for allowed domains and the set of seen domains.
+        当爬虫打开时调用此方法。它初始化允许的域的正则表达式模式和已见过的域集合。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
+        # Initialize the regex pattern for allowed domains
+        # 初始化允许的域的正则表达式模式
         self.host_regex = self.get_host_regex(spider)
+
+        # Initialize the set of seen domains
+        # 初始化已见过的域集合
         self.domains_seen = set()
 
 
 class URLWarning(Warning):
+    """
+    Warning raised when a URL is provided in allowed_domains.
+    当在allowed_domains中提供URL时引发的警告。
+
+    This warning is raised by the OffsiteMiddleware when it encounters a URL
+    (e.g., 'http://example.com') in the spider's allowed_domains attribute.
+    The allowed_domains attribute should only contain domain names without
+    the protocol (e.g., 'example.com').
+    当OffsiteMiddleware在爬虫的allowed_domains属性中遇到URL
+    (例如,'http://example.com')时,会引发此警告。
+    allowed_domains属性应该只包含没有协议的域名(例如,'example.com')。
+    """
     pass
 
 
 class PortWarning(Warning):
+    """
+    Warning raised when a domain with port is provided in allowed_domains.
+    当在allowed_domains中提供带有端口的域时引发的警告。
+
+    This warning is raised by the OffsiteMiddleware when it encounters a domain
+    with a port (e.g., 'example.com:8080') in the spider's allowed_domains attribute.
+    The allowed_domains attribute should only contain domain names without
+    ports (e.g., 'example.com').
+    当OffsiteMiddleware在爬虫的allowed_domains属性中遇到带有端口的域
+    (例如,'example.com:8080')时,会引发此警告。
+    allowed_domains属性应该只包含没有端口的域名(例如,'example.com')。
+    """
     pass