aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/reqser.py CHANGED
@@ -1,5 +1,12 @@
 """
-Helper functions for serializing (and deserializing) requests.
+Request serialization utilities for aioscrapy.
+aioscrapy的请求序列化实用工具。
+
+This module provides helper functions for serializing and deserializing Request objects.
+These functions are particularly useful for storing requests in queues, databases,
+or transmitting them between different processes or systems.
+此模块提供了用于序列化和反序列化Request对象的辅助函数。
+这些函数对于在队列、数据库中存储请求或在不同进程或系统之间传输请求特别有用。
 """
 from typing import Optional
 
@@ -8,8 +15,75 @@ from aioscrapy.utils.request import request_from_dict as _from_dict
 
 
 def request_to_dict(request: "aioscrapy.Request", spider: Optional["aioscrapy.Spider"] = None) -> dict:
+    """
+    Convert a Request object to a dictionary representation.
+    将Request对象转换为字典表示。
+
+    This function serializes a Request object into a dictionary that can be easily
+    stored or transmitted. The dictionary contains all the necessary information
+    to reconstruct the Request object later using request_from_dict().
+    此函数将Request对象序列化为可以轻松存储或传输的字典。
+    该字典包含稍后使用request_from_dict()重建Request对象所需的所有信息。
+
+    Args:
+        request: The Request object to serialize.
+            要序列化的Request对象。
+        spider: Optional Spider instance that may be used to customize the
+            serialization process. Some Request subclasses may use the spider
+            to properly serialize their attributes.
+            可选的Spider实例,可用于自定义序列化过程。
+            某些Request子类可能使用spider来正确序列化其属性。
+
+    Returns:
+        dict: A dictionary representation of the Request object.
+            Request对象的字典表示。
+
+    Example:
+        >>> request = Request('http://example.com', callback='parse_item')
+        >>> request_dict = request_to_dict(request, spider)
+        >>> # The dictionary can be stored or transmitted
+        >>> new_request = await request_from_dict(request_dict, spider)
+    """
+    # Delegate to the Request object's to_dict method
+    # 委托给Request对象的to_dict方法
     return request.to_dict(spider=spider)
 
 
 async def request_from_dict(d: dict, spider: Optional["aioscrapy.Spider"] = None) -> "aioscrapy.Request":
+    """
+    Convert a dictionary representation back to a Request object.
+    将字典表示转换回Request对象。
+
+    This function deserializes a dictionary (previously created by request_to_dict)
+    back into a Request object. It reconstructs all the attributes and properties
+    of the original Request, including callback and errback methods if a spider
+    is provided.
+    此函数将(先前由request_to_dict创建的)字典反序列化回Request对象。
+    它重建原始Request的所有属性和属性,如果提供了spider,
+    还包括回调和错误回调方法。
+
+    Args:
+        d: The dictionary representation of a Request object.
+            Request对象的字典表示。
+        spider: Optional Spider instance that may be used to resolve callback
+            and errback method names to actual methods on the spider.
+            可选的Spider实例,可用于将回调和错误回调方法名称
+            解析为spider上的实际方法。
+
+    Returns:
+        aioscrapy.Request: A reconstructed Request object.
+            重建的Request对象。
+
+    Example:
+        >>> request_dict = {
+        ...     'url': 'http://example.com',
+        ...     'callback': 'parse_item',
+        ...     'method': 'GET'
+        ... }
+        >>> request = await request_from_dict(request_dict, spider)
+        >>> request.url
+        'http://example.com'
+    """
+    # Delegate to the imported _from_dict function from aioscrapy.utils.request
+    # 委托给从aioscrapy.utils.request导入的_from_dict函数
     return await _from_dict(d, spider=spider)
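Note: a minimal round-trip sketch of the two helpers shown above, assuming an aioscrapy spider instance is in scope; the queue object and its push/pop methods are hypothetical placeholders, not part of the package:

from aioscrapy.utils.reqser import request_to_dict, request_from_dict

async def requeue_example(request, spider, queue):
    # Serialize the Request into a plain dict (callbacks become method-name strings).
    d = request_to_dict(request, spider)
    await queue.push(d)  # hypothetical storage call

    # Later: rebuild the Request; callback/errback names resolve back to spider methods.
    stored = await queue.pop()  # hypothetical retrieval call
    return await request_from_dict(stored, spider)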
aioscrapy/utils/request.py CHANGED
@@ -1,6 +1,12 @@
 """
-This module provides some useful functions for working with
-aioscrapy.http.Request objects
+Request utility functions for aioscrapy.
+aioscrapy的请求实用函数。
+
+This module provides utility functions for working with aioscrapy.http.Request objects.
+It includes functions for converting requests to raw HTTP representations, extracting
+referrer information, and creating Request objects from dictionaries.
+此模块提供了用于处理aioscrapy.http.Request对象的实用函数。
+它包括将请求转换为原始HTTP表示、提取引用者信息以及从字典创建Request对象的函数。
 """
 
 from typing import Optional
@@ -15,53 +21,208 @@ from aioscrapy.utils.python import to_bytes, to_unicode
 
 
 def request_httprepr(request: Request) -> bytes:
-    """Return the raw HTTP representation (as bytes) of the given request.
-    This is provided only for reference since it's not the actual stream of
-    bytes that will be send when performing the request (that's controlled
-    by Twisted).
     """
+    Return the raw HTTP representation of a request as bytes.
+    以字节形式返回请求的原始HTTP表示。
+
+    This function converts a Request object to its raw HTTP representation,
+    including the request line, headers, and body. This is useful for debugging
+    and logging purposes.
+    此函数将Request对象转换为其原始HTTP表示,包括请求行、头部和正文。
+    这对于调试和日志记录目的很有用。
+
+    Note:
+        This is provided only for reference since it's not the actual stream of
+        bytes that will be sent when performing the request (that's controlled
+        by the HTTP client implementation).
+        这仅供参考,因为它不是执行请求时将发送的实际字节流
+        (那由HTTP客户端实现控制)。
+
+    Args:
+        request: The Request object to convert.
+            要转换的Request对象。
+
+    Returns:
+        bytes: The raw HTTP representation of the request.
+            请求的原始HTTP表示。
+
+    Example:
+        >>> request = Request('http://example.com', method='POST',
+        ...                   headers={'Content-Type': 'application/json'},
+        ...                   body='{"key": "value"}')
+        >>> print(request_httprepr(request).decode())
+        POST / HTTP/1.1
+        Host: example.com
+        Content-Type: application/json
+
+        {"key": "value"}
+    """
+    # Parse the URL
+    # 解析URL
     parsed = urlparse_cached(request)
+
+    # Construct the path including params and query
+    # 构造包含参数和查询的路径
     path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
+
+    # Start with the request line
+    # 从请求行开始
     s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
+
+    # Add the Host header
+    # 添加Host头部
     s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
+
+    # Add other headers if present
+    # 如果存在,添加其他头部
     if request.headers:
         s += headers_dict_to_raw({to_bytes(k): to_bytes(v) for k, v in request.headers.items()}) + b"\r\n"
+
+    # Add the empty line that separates headers from body
+    # 添加分隔头部和正文的空行
     s += b"\r\n"
+
+    # Add the body
+    # 添加正文
     s += to_bytes(request.body)
+
     return s
 
 
 def referer_str(request: Request) -> Optional[str]:
-    """ Return Referer HTTP header suitable for logging. """
+    """
+    Return the Referer HTTP header in a format suitable for logging.
+    以适合日志记录的格式返回Referer HTTP头。
+
+    This function extracts the 'Referer' header from a request and converts it
+    to a unicode string, replacing any invalid characters. This is useful for
+    logging purposes to avoid encoding errors.
+    此函数从请求中提取'Referer'头并将其转换为unicode字符串,
+    替换任何无效字符。这对于日志记录很有用,可以避免编码错误。
+
+    Args:
+        request: The Request object to extract the Referer from.
+            要提取Referer的Request对象。
+
+    Returns:
+        Optional[str]: The Referer header as a unicode string, or None if the
+            header is not present.
+            作为unicode字符串的Referer头,如果头不存在则为None。
+    """
+    # Get the Referer header from the request
+    # 从请求中获取Referer头
     referrer = request.headers.get('Referer')
+
+    # If there's no Referer header, return None
+    # 如果没有Referer头,返回None
     if referrer is None:
         return referrer
+
+    # Convert the Referer to unicode, replacing any invalid characters
+    # 将Referer转换为unicode,替换任何无效字符
     return to_unicode(referrer, errors='replace')
 
 
 async def request_from_dict(d: dict, *, spider: Optional[Spider] = None) -> Request:
-    """Create a :class:`~scrapy.Request` object from a dict.
+    """
+    Create a Request object from a dictionary.
+    从字典创建Request对象。
+
+    This function converts a dictionary representation of a request into an actual
+    Request object. It's useful for deserializing requests, for example when
+    loading them from a queue or a file.
+    此函数将请求的字典表示转换为实际的Request对象。
+    它对于反序列化请求很有用,例如从队列或文件加载请求时。
+
+    If a spider is provided, the function will:
+    1. First call the spider's request_from_dict method to allow custom processing
+    2. Try to resolve callback and errback strings to actual methods on the spider
 
-    If a spider is given, it will try to resolve the callbacks looking at the
-    spider for methods with the same name.
+    如果提供了爬虫,该函数将:
+    1. 首先调用爬虫的request_from_dict方法以允许自定义处理
+    2. 尝试将callback和errback字符串解析为爬虫上的实际方法
+
+    Args:
+        d: Dictionary containing the request attributes.
+            包含请求属性的字典。
+        spider: Optional spider instance to resolve callbacks and errbacks.
+            可选的爬虫实例,用于解析回调和错误回调。
+
+    Returns:
+        Request: A Request object (or subclass) with the attributes from the dictionary.
+            具有字典中属性的Request对象(或子类)。
+
+    Raises:
+        ValueError: If a callback or errback name cannot be resolved to a method.
+            如果回调或错误回调名称无法解析为方法。
     """
-    d = await spider.request_from_dict(d) or d
+    # Allow the spider to customize the dictionary
+    # 允许爬虫自定义字典
+    if spider:
+        d = await spider.request_from_dict(d) or d
+
+    # If the spider already returned a Request object, return it directly
+    # 如果爬虫已经返回了一个Request对象,直接返回它
     if isinstance(d, Request):
         return d
 
+    # Determine the request class to use (default is Request)
+    # 确定要使用的请求类(默认为Request)
     request_cls = load_object(d["_class"]) if "_class" in d else Request
+
+    # Filter the dictionary to only include valid attributes for the request class
+    # 过滤字典,只包含请求类的有效属性
    kwargs = {key: value for key, value in d.items() if key in request_cls.attributes}
+
+    # Resolve callback string to actual method if spider is provided
+    # 如果提供了爬虫,将回调字符串解析为实际方法
     if d.get("callback") and spider:
         kwargs["callback"] = _get_method(spider, d["callback"])
+
+    # Resolve errback string to actual method if spider is provided
+    # 如果提供了爬虫,将错误回调字符串解析为实际方法
     if d.get("errback") and spider:
         kwargs["errback"] = _get_method(spider, d["errback"])
+
+    # Create and return the request object
+    # 创建并返回请求对象
     return request_cls(**kwargs)
 
 
 def _get_method(obj, name):
-    """Helper function for request_from_dict"""
+    """
+    Get a method from an object by name.
+    通过名称从对象获取方法。
+
+    This is a helper function for request_from_dict that resolves method names
+    to actual method objects. It's used to convert callback and errback strings
+    to callable methods on a spider.
+    这是request_from_dict的辅助函数,用于将方法名称解析为实际的方法对象。
+    它用于将回调和错误回调字符串转换为爬虫上的可调用方法。
+
+    Args:
+        obj: The object to get the method from (typically a spider).
+            要从中获取方法的对象(通常是爬虫)。
+        name: The name of the method to get.
+            要获取的方法的名称。
+
+    Returns:
+        callable: The method object.
+            方法对象。
+
+    Raises:
+        ValueError: If the method is not found on the object.
+            如果在对象上找不到该方法。
+    """
+    # Ensure the name is a string
+    # 确保名称是字符串
     name = str(name)
+
+    # Try to get the method from the object
+    # 尝试从对象获取方法
     try:
         return getattr(obj, name)
     except AttributeError:
+        # Raise a more informative error if the method is not found
+        # 如果找不到该方法,引发更多信息的错误
         raise ValueError(f"Method {name!r} not found in: {obj}")
aioscrapy/utils/response.py CHANGED
@@ -1,6 +1,11 @@
 """
-This module provides some useful functions for working with
-scrapy.http.Response objects
+Response utility functions for aioscrapy.
+aioscrapy的响应实用函数。
+
+This module provides utility functions for working with aioscrapy.http.Response objects.
+It includes functions for extracting base URLs and meta refresh directives from HTML responses.
+此模块提供了用于处理aioscrapy.http.Response对象的实用函数。
+它包括从HTML响应中提取基本URL和元刷新指令的函数。
 """
 from typing import Iterable, Optional, Tuple, Union
 from weakref import WeakKeyDictionary
@@ -10,17 +15,59 @@ from w3lib import html
 import aioscrapy
 from aioscrapy.http.response import Response
 
+# Cache for storing base URLs to avoid repeated parsing of the same response
+# 缓存存储基本URL,以避免重复解析相同的响应
 _baseurl_cache: "WeakKeyDictionary[Response, str]" = WeakKeyDictionary()
 
 
 def get_base_url(response: "aioscrapy.http.response.TextResponse") -> str:
-    """Return the base url of the given response, joined with the response url"""
+    """
+    Extract the base URL from an HTML response.
+    从HTML响应中提取基本URL。
+
+    This function extracts the base URL from an HTML response by looking for
+    the <base> tag in the HTML. If found, it returns the href attribute of the
+    base tag, resolved against the response URL. If not found, it returns the
+    response URL.
+    此函数通过查找HTML中的<base>标签来从HTML响应中提取基本URL。
+    如果找到,它返回base标签的href属性,相对于响应URL解析。
+    如果未找到,它返回响应URL。
+
+    The function uses a cache to avoid repeated parsing of the same response.
+    Only the first 4KB of the response text are examined for performance reasons.
+    该函数使用缓存来避免重复解析相同的响应。
+    出于性能原因,只检查响应文本的前4KB。
+
+    Args:
+        response: The HTML response to extract the base URL from.
+            要从中提取基本URL的HTML响应。
+
+    Returns:
+        str: The base URL of the response, which could be either:
+            响应的基本URL,可能是:
+            - The href attribute of the <base> tag, resolved against the response URL
+              <base>标签的href属性,相对于响应URL解析
+            - The response URL if no <base> tag is found
+              如果未找到<base>标签,则为响应URL
+    """
+    # Check if the base URL is already cached for this response
+    # 检查此响应的基本URL是否已缓存
     if response not in _baseurl_cache:
+        # Only examine the first 4KB of the response for performance
+        # 出于性能考虑,只检查响应的前4KB
         text = response.text[0:4096]
+        # Extract the base URL using w3lib.html
+        # 使用w3lib.html提取基本URL
         _baseurl_cache[response] = html.get_base_url(text, response.url, response.encoding)
+    # Return the cached base URL
+    # 返回缓存的基本URL
     return _baseurl_cache[response]
 
 
+# Cache for storing meta refresh directives to avoid repeated parsing of the same response
+# 缓存存储元刷新指令,以避免重复解析相同的响应
+# The cache stores either (None, None) if no meta refresh is found, or (seconds, url) if found
+# 如果未找到元刷新,缓存存储(None, None),如果找到,则存储(秒数, url)
 _metaref_cache: "WeakKeyDictionary[Response, Union[Tuple[None, None], Tuple[float, str]]]" = WeakKeyDictionary()
 
 
@@ -28,11 +75,49 @@ def get_meta_refresh(
         response: "aioscrapy.http.response.TextResponse",
         ignore_tags: Optional[Iterable[str]] = ('script', 'noscript'),
 ) -> Union[Tuple[None, None], Tuple[float, str]]:
-    """Parse the http-equiv refrsh parameter from the given response"""
+    """
+    Extract the meta refresh directive from an HTML response.
+    从HTML响应中提取元刷新指令。
+
+    This function looks for the HTML meta refresh tag in the response and extracts
+    the delay (in seconds) and the URL to redirect to. The meta refresh tag is
+    typically used for automatic page redirection or refreshing.
+    此函数在响应中查找HTML元刷新标签,并提取延迟(以秒为单位)和要重定向到的URL。
+    元刷新标签通常用于自动页面重定向或刷新。
+
+    Example of a meta refresh tag:
+    元刷新标签的示例:
+    <meta http-equiv="refresh" content="5; url=https://example.com">
+
+    The function uses a cache to avoid repeated parsing of the same response.
+    Only the first 4KB of the response text are examined for performance reasons.
+    该函数使用缓存来避免重复解析相同的响应。
+    出于性能原因,只检查响应文本的前4KB。
+
+    Args:
+        response: The HTML response to extract the meta refresh from.
+            要从中提取元刷新的HTML响应。
+        ignore_tags: HTML tags to ignore when parsing. Default is ('script', 'noscript').
+            解析时要忽略的HTML标签。默认为('script', 'noscript')。
+
+    Returns:
+        A tuple containing:
+        包含以下内容的元组:
+        - If meta refresh is found: (delay_seconds, url)
+          如果找到元刷新:(延迟秒数, url)
+        - If no meta refresh is found: (None, None)
+          如果未找到元刷新:(None, None)
+    """
+    # Check if the meta refresh is already cached for this response
+    # 检查此响应的元刷新是否已缓存
     if response not in _metaref_cache:
+        # Only examine the first 4KB of the response for performance
+        # 出于性能考虑,只检查响应的前4KB
         text = response.text[0:4096]
+        # Extract the meta refresh using w3lib.html
+        # 使用w3lib.html提取元刷新
         _metaref_cache[response] = html.get_meta_refresh(
             text, response.url, response.encoding, ignore_tags=ignore_tags)
+    # Return the cached meta refresh
+    # 返回缓存的元刷新
     return _metaref_cache[response]
-
-
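Note: a hedged sketch of how the two response helpers above might be exercised; it assumes HtmlResponse is exported from aioscrapy.http (as in Scrapy) and uses an inline HTML snippet as a placeholder body:

from aioscrapy.http import HtmlResponse
from aioscrapy.utils.response import get_base_url, get_meta_refresh

body = (b'<html><head><base href="/docs/">'
        b'<meta http-equiv="refresh" content="5; url=/docs/next"></head></html>')
response = HtmlResponse(url='http://example.com/index.html', body=body, encoding='utf-8')

print(get_base_url(response))      # resolved <base> href, e.g. 'http://example.com/docs/'
print(get_meta_refresh(response))  # (delay, absolute url), e.g. (5.0, 'http://example.com/docs/next')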