aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/spider/httperror.py
@@ -1,54 +1,194 @@
  """
  HttpError Spider Middleware
+ HTTP错误爬虫中间件

- See documentation in docs/topics/spider-middleware.rst
+ This middleware filters out responses with non-200 status codes and generates
+ appropriate exceptions. It allows you to specify which status codes should be
+ allowed through to the spider via settings or meta attributes.
+ 此中间件过滤掉具有非200状态码的响应并生成适当的异常。它允许您通过设置或
+ 元属性指定哪些状态码应该被允许传递给爬虫。
  """

  from aioscrapy.exceptions import IgnoreRequest
-
  from aioscrapy.utils.log import logger


  class HttpError(IgnoreRequest):
- """A non-200 response was filtered"""
+ """
+ Exception raised when a non-200 response is filtered.
+ 当过滤非200响应时引发的异常。
+
+ This exception is raised by the HttpErrorMiddleware when it encounters a
+ response with a status code that is not in the allowed list. It is a subclass
+ of IgnoreRequest, which means the response will be ignored by the spider.
+ 当HttpErrorMiddleware遇到状态码不在允许列表中的响应时,会引发此异常。
+ 它是IgnoreRequest的子类,这意味着该响应将被爬虫忽略。
+ """

  def __init__(self, response, *args, **kwargs):
+ """
+ Initialize the HttpError exception.
+ 初始化HttpError异常。
+
+ Args:
+ response: The response that triggered the exception.
+ 触发异常的响应。
+ *args: Variable length argument list passed to the parent class.
+ 传递给父类的可变长度参数列表。
+ **kwargs: Arbitrary keyword arguments passed to the parent class.
+ 传递给父类的任意关键字参数。
+ """
+ # Store the response that triggered the exception
+ # 存储触发异常的响应
  self.response = response
+
+ # Initialize the parent IgnoreRequest class
+ # 初始化父类IgnoreRequest
  super().__init__(*args, **kwargs)


  class HttpErrorMiddleware:
+ """
+ Spider middleware to filter out responses with non-200 status codes.
+ 用于过滤掉具有非200状态码的响应的爬虫中间件。
+
+ This middleware checks the status code of each response and raises an HttpError
+ exception for responses with status codes that are not in the allowed list.
+ The allowed list can be specified via settings, spider attributes, or response
+ meta attributes.
+ 此中间件检查每个响应的状态码,并为状态码不在允许列表中的响应引发HttpError异常。
+ 允许列表可以通过设置、爬虫属性或响应元属性指定。
+ """

  @classmethod
  def from_crawler(cls, crawler):
+ """
+ Create a HttpErrorMiddleware instance from a crawler.
+ 从爬虫创建HttpErrorMiddleware实例。
+
+ This is the factory method used by AioScrapy to create the middleware.
+ 这是AioScrapy用于创建中间件的工厂方法。
+
+ Args:
+ crawler: The crawler that will use this middleware.
+ 将使用此中间件的爬虫。
+
+ Returns:
+ HttpErrorMiddleware: A new HttpErrorMiddleware instance.
+ 一个新的HttpErrorMiddleware实例。
+ """
+ # Create and return a new instance with the crawler's settings
+ # 使用爬虫的设置创建并返回一个新实例
  return cls(crawler.settings)

  def __init__(self, settings):
+ """
+ Initialize the HttpErrorMiddleware.
+ 初始化HttpErrorMiddleware。
+
+ Args:
+ settings: The AioScrapy settings object.
+ AioScrapy设置对象。
+ """
+ # Whether to allow all HTTP status codes
+ # 是否允许所有HTTP状态码
  self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
+
+ # List of allowed HTTP status codes
+ # 允许的HTTP状态码列表
  self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')

  def process_spider_input(self, response, spider):
- if 200 <= response.status < 300: # common case
+ """
+ Process a response before it is sent to the spider.
+ 在响应发送到爬虫之前处理它。
+
+ This method checks if the response's status code is allowed. If not, it
+ raises an HttpError exception, which will be caught by process_spider_exception.
+ 此方法检查响应的状态码是否被允许。如果不允许,它会引发HttpError异常,
+ 该异常将被process_spider_exception捕获。
+
+ Args:
+ response: The response being processed.
+ 正在处理的响应。
+ spider: The spider that will receive the response.
+ 将接收响应的爬虫。
+
+ Raises:
+ HttpError: If the response's status code is not allowed.
+ 如果响应的状态码不被允许。
+ """
+ # Allow responses with status codes in the 200-299 range (common case)
+ # 允许状态码在200-299范围内的响应(常见情况)
+ if 200 <= response.status < 300:
  return
+
+ # Allow all status codes if specified in the response meta
+ # 如果在响应元数据中指定,则允许所有状态码
  if response.meta.get('handle_httpstatus_all', False):
  return
+
+ # Get the list of allowed status codes
+ # 获取允许的状态码列表
  if 'handle_httpstatus_list' in response.meta:
+ # Use the list from response meta if available
+ # 如果可用,使用来自响应元数据的列表
  allowed_statuses = response.meta['handle_httpstatus_list']
  elif self.handle_httpstatus_all:
+ # Allow all status codes if specified in settings
+ # 如果在设置中指定,则允许所有状态码
  return
  else:
+ # Use the list from spider attribute or middleware settings
+ # 使用来自爬虫属性或中间件设置的列表
  allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
+
+ # Allow the response if its status code is in the allowed list
+ # 如果响应的状态码在允许列表中,则允许该响应
  if response.status in allowed_statuses:
  return
+
+ # Raise an HttpError for responses with disallowed status codes
+ # 为具有不允许状态码的响应引发HttpError
  raise HttpError(response, 'Ignoring non-200 response')

  async def process_spider_exception(self, response, exception, spider):
+ """
+ Handle exceptions raised during spider processing.
+ 处理爬虫处理期间引发的异常。
+
+ This method catches HttpError exceptions, logs them, updates statistics,
+ and returns an empty result list to suppress the exception.
+ 此方法捕获HttpError异常,记录它们,更新统计信息,并返回一个空结果列表以抑制异常。
+
+ Args:
+ response: The response being processed when the exception was raised.
+ 引发异常时正在处理的响应。
+ exception: The exception raised.
+ 引发的异常。
+ spider: The spider that was processing the response.
+ 正在处理响应的爬虫。
+
+ Returns:
+ list: An empty list if the exception is an HttpError, None otherwise.
+ 如果异常是HttpError,则返回空列表;否则返回None。
+ """
+ # Only handle HttpError exceptions
+ # 只处理HttpError异常
  if isinstance(exception, HttpError):
+ # Update statistics
+ # 更新统计信息
  spider.crawler.stats.inc_value('httperror/response_ignored_count')
  spider.crawler.stats.inc_value(
  f'httperror/response_ignored_status_count/{response.status}'
  )
+
+ # Log the ignored response
+ # 记录被忽略的响应
  logger.info("Ignoring response %(response)r: HTTP status code is not handled or not allowed" % {
  'response': response
  })
+
+ # Return an empty list to suppress the exception
+ # 返回空列表以抑制异常
  return []
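
For reference, a minimal sketch of how a spider can opt in to the non-2xx responses this middleware would otherwise drop. The HTTPERROR_ALLOW_ALL / HTTPERROR_ALLOWED_CODES settings, the handle_httpstatus_list spider attribute, and the handle_httpstatus_list / handle_httpstatus_all request meta keys all appear in the diff above; the import path, the Spider class attributes, and the async callback signature are assumptions about the aioscrapy public API, not something shown in this diff.

from aioscrapy import Request, Spider  # assumed top-level exports


class StatusAwareSpider(Spider):
    name = 'status_aware'
    start_urls = ['https://example.com/']  # hypothetical target

    # Spider-level allow list: the middleware reads it via
    # getattr(spider, 'handle_httpstatus_list', ...) and it takes precedence
    # over the HTTPERROR_ALLOWED_CODES setting.
    handle_httpstatus_list = [404]

    async def parse(self, response):
        # Per-request override via meta: this request also accepts 500 responses.
        # meta={'handle_httpstatus_all': True} would accept any status code.
        yield Request(
            'https://example.com/maybe-broken',
            callback=self.parse_fallible,
            meta={'handle_httpstatus_list': [404, 500]},
        )

    async def parse_fallible(self, response):
        if response.status in (404, 500):
            # The response reached the spider instead of being dropped by
            # HttpErrorMiddleware; handle the error page here.
            ...

Setting HTTPERROR_ALLOW_ALL to True in the project settings disables the filtering globally, as the __init__ and process_spider_input logic above shows.
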
aioscrapy/libs/spider/offsite.py
@@ -1,7 +1,13 @@
  """
  Offsite Spider Middleware
+ 站外爬虫中间件

- See documentation in docs/topics/spider-middleware.rst
+ This middleware filters out requests to URLs not belonging to the domains specified
+ in the spider's allowed_domains attribute. It helps prevent the crawler from
+ following links to external sites, which is useful for keeping crawls focused on
+ specific domains.
+ 此中间件过滤掉对不属于爬虫的allowed_domains属性中指定的域的URL的请求。
+ 它有助于防止爬虫跟随指向外部站点的链接,这对于使爬取集中在特定域上很有用。
  """
  import re
  import warnings
@@ -13,71 +19,265 @@ from aioscrapy.utils.log import logger


  class OffsiteMiddleware:
+ """
+ Spider middleware to filter out requests to offsite domains.
+ 用于过滤掉对站外域的请求的爬虫中间件。
+
+ This middleware filters out requests to URLs not belonging to the domains specified
+ in the spider's allowed_domains attribute. It helps prevent the crawler from
+ following links to external sites, which is useful for keeping crawls focused on
+ specific domains.
+ 此中间件过滤掉对不属于爬虫的allowed_domains属性中指定的域的URL的请求。
+ 它有助于防止爬虫跟随指向外部站点的链接,这对于使爬取集中在特定域上很有用。
+ """

  def __init__(self, stats):
+ """
+ Initialize the offsite middleware.
+ 初始化站外中间件。
+
+ Args:
+ stats: Stats collector instance.
+ 统计收集器实例。
+ """
+ # Stats collector instance
+ # 统计收集器实例
  self.stats = stats

  @classmethod
  def from_crawler(cls, crawler):
+ """
+ Create an OffsiteMiddleware instance from a crawler.
+ 从爬虫创建OffsiteMiddleware实例。
+
+ This is the factory method used by AioScrapy to create the middleware.
+ 这是AioScrapy用于创建中间件的工厂方法。
+
+ Args:
+ crawler: The crawler that will use this middleware.
+ 将使用此中间件的爬虫。
+
+ Returns:
+ OffsiteMiddleware: A new OffsiteMiddleware instance.
+ 一个新的OffsiteMiddleware实例。
+ """
+ # Create a new instance with the crawler's stats collector
+ # 使用爬虫的统计收集器创建一个新实例
  o = cls(crawler.stats)
+
+ # Connect the spider_opened method to the spider_opened signal
+ # 将spider_opened方法连接到spider_opened信号
  crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+
+ # Return the new instance
+ # 返回新实例
  return o

  async def process_spider_output(self, response, result, spider):
+ """
+ Process the spider output to filter out offsite requests.
+ 处理爬虫输出以过滤掉站外请求。
+
+ This method processes each request yielded by the spider and filters out
+ requests to URLs not belonging to the domains specified in the spider's
+ allowed_domains attribute.
+ 此方法处理爬虫产生的每个请求,并过滤掉对不属于爬虫的allowed_domains属性中
+ 指定的域的URL的请求。
+
+ Args:
+ response: The response being processed.
+ 正在处理的响应。
+ result: The result returned by the spider.
+ 爬虫返回的结果。
+ spider: The spider that generated the result.
+ 生成结果的爬虫。
+
+ Returns:
+ An async generator yielding filtered requests and other items.
+ 一个产生过滤后的请求和其他项目的异步生成器。
+ """
+ # Process each item in the result
+ # 处理结果中的每个项目
  async for x in result:
+ # If the item is a Request, check if it should be followed
+ # 如果项目是一个Request,检查是否应该跟随它
  if isinstance(x, Request):
+ # If the request has dont_filter set or it's for an allowed domain, yield it
+ # 如果请求设置了dont_filter或它是针对允许的域,则产生它
  if x.dont_filter or self.should_follow(x, spider):
  yield x
  else:
+ # Get the domain of the request
+ # 获取请求的域
  domain = urlparse_cached(x).hostname
+
+ # If this is a new domain, log it and update stats
+ # 如果这是一个新域,记录它并更新统计信息
  if domain and domain not in self.domains_seen:
  self.domains_seen.add(domain)
  logger.debug(
  "Filtered offsite request to %(domain)r: %(request)s" % {'domain': domain, 'request': x}
  )
  self.stats.inc_value('offsite/domains', spider=spider)
+
+ # Update filtered requests stats
+ # 更新过滤的请求统计信息
  self.stats.inc_value('offsite/filtered', spider=spider)
  else:
+ # If the item is not a Request, yield it unchanged
+ # 如果项目不是一个Request,则不变地产生它
  yield x

  def should_follow(self, request, spider):
+ """
+ Check if a request should be followed.
+ 检查是否应该跟随请求。
+
+ This method checks if the hostname of the request URL matches the allowed
+ domains pattern.
+ 此方法检查请求URL的主机名是否匹配允许的域模式。
+
+ Args:
+ request: The request to check.
+ 要检查的请求。
+ spider: The spider that generated the request.
+ 生成请求的爬虫。
+
+ Returns:
+ bool: True if the request should be followed, False otherwise.
+ 如果应该跟随请求,则为True;否则为False。
+ """
+ # Get the compiled regex pattern for allowed domains
+ # 获取允许的域的编译正则表达式模式
  regex = self.host_regex
+
+ # Get the hostname from the request URL
+ # 从请求URL获取主机名
  # hostname can be None for wrong urls (like javascript links)
+ # 对于错误的URL(如javascript链接),主机名可能为None
  host = urlparse_cached(request).hostname or ''
+
+ # Check if the hostname matches the allowed domains pattern
+ # 检查主机名是否匹配允许的域模式
  return bool(regex.search(host))

  def get_host_regex(self, spider):
- """Override this method to implement a different offsite policy"""
+ """
+ Get a regex pattern for the allowed domains.
+ 获取允许的域的正则表达式模式。
+
+ This method creates a regex pattern that matches hostnames belonging to
+ the domains specified in the spider's allowed_domains attribute.
+ 此方法创建一个正则表达式模式,匹配属于爬虫的allowed_domains属性中指定的域的主机名。
+
+ Args:
+ spider: The spider whose allowed_domains attribute to use.
+ 使用其allowed_domains属性的爬虫。
+
+ Returns:
+ re.Pattern: A compiled regex pattern for the allowed domains.
+ 允许的域的编译正则表达式模式。
+
+ Note:
+ Override this method to implement a different offsite policy.
+ 覆盖此方法以实现不同的站外策略。
+ """
+ # Get the allowed_domains attribute from the spider
+ # 从爬虫获取allowed_domains属性
  allowed_domains = getattr(spider, 'allowed_domains', None)
+
+ # If no allowed_domains are specified, allow all domains
+ # 如果未指定allowed_domains,则允许所有域
  if not allowed_domains:
  return re.compile('') # allow all by default
+
+ # Compile patterns for validating domains
+ # 编译用于验证域的模式
  url_pattern = re.compile(r"^https?://.*$")
  port_pattern = re.compile(r":\d+$")
+
+ # Process each domain in allowed_domains
+ # 处理allowed_domains中的每个域
  domains = []
  for domain in allowed_domains:
+ # Skip None values
+ # 跳过None值
  if domain is None:
  continue
+ # Warn about URL entries
+ # 警告URL条目
  elif url_pattern.match(domain):
  message = ("allowed_domains accepts only domains, not URLs. "
  f"Ignoring URL entry {domain} in allowed_domains.")
  warnings.warn(message, URLWarning)
+ # Warn about domains with ports
+ # 警告带有端口的域
  elif port_pattern.search(domain):
  message = ("allowed_domains accepts only domains without ports. "
  f"Ignoring entry {domain} in allowed_domains.")
  warnings.warn(message, PortWarning)
  else:
+ # Add valid domains to the list, escaping special regex characters
+ # 将有效域添加到列表中,转义特殊的正则表达式字符
  domains.append(re.escape(domain))
+
+ # Create a regex pattern that matches the allowed domains and their subdomains
+ # 创建一个正则表达式模式,匹配允许的域及其子域
  regex = fr'^(.*\.)?({"|".join(domains)})$'
+
+ # Compile and return the regex pattern
+ # 编译并返回正则表达式模式
  return re.compile(regex)

  def spider_opened(self, spider):
+ """
+ Initialize middleware state when a spider is opened.
+ 当爬虫打开时初始化中间件状态。
+
+ This method is called when a spider is opened. It initializes the regex
+ pattern for allowed domains and the set of seen domains.
+ 当爬虫打开时调用此方法。它初始化允许的域的正则表达式模式和已见过的域集合。
+
+ Args:
+ spider: The spider that was opened.
+ 被打开的爬虫。
+ """
+ # Initialize the regex pattern for allowed domains
+ # 初始化允许的域的正则表达式模式
  self.host_regex = self.get_host_regex(spider)
+
+ # Initialize the set of seen domains
+ # 初始化已见过的域集合
  self.domains_seen = set()


  class URLWarning(Warning):
+ """
+ Warning raised when a URL is provided in allowed_domains.
+ 当在allowed_domains中提供URL时引发的警告。
+
+ This warning is raised by the OffsiteMiddleware when it encounters a URL
+ (e.g., 'http://example.com') in the spider's allowed_domains attribute.
+ The allowed_domains attribute should only contain domain names without
+ the protocol (e.g., 'example.com').
+ 当OffsiteMiddleware在爬虫的allowed_domains属性中遇到URL
+ (例如,'http://example.com')时,会引发此警告。
+ allowed_domains属性应该只包含没有协议的域名(例如,'example.com')。
+ """
  pass


  class PortWarning(Warning):
+ """
+ Warning raised when a domain with port is provided in allowed_domains.
+ 当在allowed_domains中提供带有端口的域时引发的警告。
+
+ This warning is raised by the OffsiteMiddleware when it encounters a domain
+ with a port (e.g., 'example.com:8080') in the spider's allowed_domains attribute.
+ The allowed_domains attribute should only contain domain names without
+ ports (e.g., 'example.com').
+ 当OffsiteMiddleware在爬虫的allowed_domains属性中遇到带有端口的域
+ (例如,'example.com:8080')时,会引发此警告。
+ allowed_domains属性应该只包含没有端口的域名(例如,'example.com')。
+ """
  pass
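
For reference, a small standalone sketch of the host pattern that get_host_regex builds. The construction mirrors the code shown in the diff above and uses only the standard library; the domain names are made up for illustration.

import re

# Mirrors regex = fr'^(.*\.)?({"|".join(domains)})$' from get_host_regex above.
allowed_domains = ['example.com', 'docs.example.org']
domains = [re.escape(d) for d in allowed_domains]
host_regex = re.compile(fr'^(.*\.)?({"|".join(domains)})$')

for host in ('example.com', 'api.example.com', 'docs.example.org',
             'example.com.evil.net', 'other.org'):
    print(f'{host:22} allowed={bool(host_regex.search(host))}')

# example.com            allowed=True   (exact domain)
# api.example.com        allowed=True   (subdomains match via the (.*\.)? prefix)
# docs.example.org       allowed=True
# example.com.evil.net   allowed=False  (the trailing $ anchor rejects lookalike hosts)
# other.org              allowed=False

As process_spider_output above shows, requests created with dont_filter=True bypass this check entirely, and a spider without allowed_domains gets re.compile(''), which allows every host.
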