aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/http/response/text.py (entry 40 above, +496 -49):

@@ -1,8 +1,13 @@
-"""
-This module implements the TextResponse class which adds encoding handling and
-discovering (through HTTP headers) to base Response class.
 
-See documentation in docs/topics/request-response.rst
+"""
+Text response implementation for aioscrapy.
+aioscrapy的文本响应实现。
+
+This module provides the TextResponse class, which extends the base Response
+to handle text content. It supports encoding detection, text extraction,
+and provides methods for CSS and XPath selectors.
+此模块提供了TextResponse类,它扩展了基本Response以处理文本内容。
+它支持编码检测、文本提取,并提供CSS和XPath选择器的方法。
 """
 
 import warnings
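
For orientation, a minimal usage sketch of the features the new module docstring describes, assuming aioscrapy keeps the scrapy-compatible TextResponse constructor:

    from aioscrapy.http.response.text import TextResponse

    resp = TextResponse(
        url='https://example.com',
        body=b'<html><body><a href="/next">next</a></body></html>',
        encoding='utf-8',
    )
    print(resp.text)                        # body decoded to unicode
    print(resp.css('a::attr(href)').get())  # '/next'
    print(resp.urljoin('/next'))            # 'https://example.com/next'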
@@ -23,46 +28,190 @@ from aioscrapy.http.response import Response
 from aioscrapy.utils.python import memoizemethod_noargs, to_unicode
 from aioscrapy.utils.response import get_base_url
 
+# Sentinel object to indicate that a value hasn't been cached yet
+# 表示值尚未缓存的哨兵对象
 _NONE = object()
 
 
 class TextResponse(Response):
+    """
+    A Response subclass that adds support for text processing.
+    添加文本处理支持的Response子类。
+
+    This class extends the base Response to handle text content, with features for:
+    此类扩展了基本Response以处理文本内容,具有以下功能:
+
+    - Automatic encoding detection
+      自动编码检测
+    - Unicode conversion
+      Unicode转换
+    - CSS and XPath selectors
+      CSS和XPath选择器
+    - JSON parsing
+      JSON解析
+    - Enhanced link following
+      增强的链接跟踪
+    """
 
+    # Default encoding to use if no encoding is specified or detected
+    # 如果未指定或检测到编码,则使用的默认编码
     _DEFAULT_ENCODING = 'ascii'
+
+    # Cache for decoded JSON content
+    # 解码的JSON内容的缓存
     _cached_decoded_json = _NONE
 
     def __init__(self, *args, encoding=None, **kwargs):
+        """
+        Initialize a TextResponse.
+        初始化TextResponse。
+
+        Args:
+            *args: Positional arguments passed to the Response constructor.
+                传递给Response构造函数的位置参数。
+            encoding: The encoding of the response. If None, it will be auto-detected.
+                响应的编码。如果为None,将自动检测。
+            **kwargs: Keyword arguments passed to the Response constructor.
+                传递给Response构造函数的关键字参数。
+        """
+        # The explicitly declared encoding
+        # 明确声明的编码
         self._encoding = encoding
+
+        # Cache for inferred body encoding
+        # 推断的正文编码的缓存
         self._cached_benc = None
+
+        # Cache for unicode body
+        # Unicode正文的缓存
         self._cached_ubody = None
+
+        # Cache for selector
+        # 选择器的缓存
         self._cached_selector = None
+
         super().__init__(*args, **kwargs)
 
     def _set_url(self, url):
+        """
+        Set the response URL, ensuring it's properly encoded.
+        设置响应URL,确保其正确编码。
+
+        This method overrides the base Response._set_url to handle string URLs
+        by converting them to unicode using the response's encoding.
+        此方法重写了基本Response._set_url,通过使用响应的编码将字符串URL转换为unicode来处理它们。
+
+        Args:
+            url: The URL to set.
+                要设置的URL。
+
+        Raises:
+            TypeError: If the URL is not a string (raised by the parent method).
+                如果URL不是字符串(由父方法引发)。
+        """
         if isinstance(url, str):
+            # Convert the URL to unicode using the response's encoding
+            # 使用响应的编码将URL转换为unicode
             self._url = to_unicode(url, self.encoding)
         else:
+            # Let the parent class handle non-string URLs
+            # 让父类处理非字符串URL
             super()._set_url(url)
 
     def _set_body(self, body):
+        """
+        Set the response body, handling both bytes and string inputs.
+        设置响应体,处理字节和字符串输入。
+
+        This method overrides the base Response._set_body to handle string bodies
+        by encoding them using the response's encoding.
+        此方法重写了基本Response._set_body,通过使用响应的编码对字符串正文进行编码来处理它们。
+
+        Args:
+            body: The body to set, either as bytes or string.
+                要设置的正文,可以是字节或字符串。
+
+        Raises:
+            TypeError: If the body is a string but no encoding is specified.
+                如果正文是字符串但未指定编码。
+        """
+        # Initialize with empty bytes for encoding detection
+        # 初始化为空字节以进行编码检测
         self._body = b''  # used by encoding detection
+
         if isinstance(body, str):
+            # Handle string bodies by encoding them
+            # 通过编码字符串正文来处理它们
             if self._encoding is None:
                 raise TypeError('Cannot convert unicode body - '
                                 f'{type(self).__name__} has no encoding')
             self._body = body.encode(self._encoding)
         else:
+            # Let the parent class handle non-string bodies
+            # 让父类处理非字符串正文
             super()._set_body(body)
 
     def replace(self, *args, **kwargs):
+        """
+        Create a new TextResponse with the same attributes except for those given new values.
+        创建一个新的TextResponse,除了给定的新值外,其他属性与当前TextResponse相同。
+
+        This method extends the base Response.replace() method to ensure that
+        the encoding is preserved when creating a new TextResponse.
+        此方法扩展了基本Response.replace()方法,以确保在创建新的TextResponse时保留编码。
+
+        Args:
+            *args: Positional arguments passed to the base replace() method.
+                传递给基本replace()方法的位置参数。
+            **kwargs: Keyword arguments passed to the base replace() method.
+                传递给基本replace()方法的关键字参数。
+
+        Returns:
+            TextResponse: A new TextResponse object.
+                一个新的TextResponse对象。
+        """
+        # Ensure the encoding is preserved
+        # 确保编码被保留
         kwargs.setdefault('encoding', self.encoding)
         return Response.replace(self, *args, **kwargs)
 
     @property
     def encoding(self):
+        """
+        Get the response encoding.
+        获取响应编码。
+
+        This property returns the encoding of the response, using a cascading approach:
+        1. First, try to get the explicitly declared encoding
+        2. If not available, try to infer the encoding from the body
+        此属性返回响应的编码,使用级联方法:
+        1. 首先,尝试获取明确声明的编码
+        2. 如果不可用,尝试从正文推断编码
+
+        Returns:
+            str: The response encoding.
+                响应编码。
+        """
         return self._declared_encoding() or self._body_inferred_encoding()
 
     def _declared_encoding(self):
+        """
+        Get the explicitly declared encoding.
+        获取明确声明的编码。
+
+        This method tries to find the encoding from various sources, in order:
+        1. The encoding specified in the constructor
+        2. The encoding specified in the Content-Type header
+        3. The encoding declared in the HTML/XML body
+        此方法尝试从各种来源按顺序查找编码:
+        1. 构造函数中指定的编码
+        2. Content-Type头部中指定的编码
+        3. HTML/XML正文中声明的编码
+
+        Returns:
+            str or None: The declared encoding, or None if not found.
+                声明的编码,如果未找到则为None。
+        """
         return (
             self._encoding
             or self._headers_encoding()
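
A sketch of the cascade _declared_encoding() implements (constructor argument, then Content-Type header, then in-body declaration); the constructor usage below is an assumption based on the scrapy-compatible API:

    from aioscrapy.http.response.text import TextResponse

    body = b'<html><head><meta charset="cp1252"></head></html>'

    # The header beats the <meta> declaration
    r1 = TextResponse('https://example.com', body=body,
                      headers={'Content-Type': 'text/html; charset=utf-8'})
    print(r1.encoding)  # 'utf-8'

    # No header: the encoding declared in the body is used
    r2 = TextResponse('https://example.com', body=body)
    print(r2.encoding)  # 'cp1252'

    # The constructor argument beats both
    r3 = TextResponse('https://example.com', body=body, encoding='latin-1')
    print(r3.encoding)  # 'latin-1'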
@@ -70,7 +219,17 @@ class TextResponse(Response):
         )
 
     def body_as_unicode(self):
-        """Return body as unicode"""
+        """
+        Return the response body as unicode.
+        将响应体作为unicode返回。
+
+        This method is deprecated. Use the text property instead.
+        此方法已弃用。请改用text属性。
+
+        Returns:
+            str: The response body as unicode.
+                响应体作为unicode。
+        """
         warnings.warn('Response.body_as_unicode() is deprecated, '
                       'please use Response.text instead.',
                       AioScrapyDeprecationWarning, stacklevel=2)
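
The migration path the deprecation points at is a one-line change:

    text = response.body_as_unicode()  # deprecated, emits AioScrapyDeprecationWarning
    text = response.text               # preferred replacement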
@@ -78,35 +237,106 @@ class TextResponse(Response):
 
     def json(self):
         """
-        .. versionadded:: 2.2
-
-        Deserialize a JSON document to a Python object.
+        Parse the response body as JSON.
+        将响应体解析为JSON。
+
+        This method deserializes the response body as a JSON document
+        and returns the corresponding Python object. The result is cached
+        for subsequent calls.
+        此方法将响应体反序列化为JSON文档,并返回相应的Python对象。
+        结果会被缓存以供后续调用。
+
+        Returns:
+            object: The deserialized JSON document.
+                反序列化的JSON文档。
+
+        Raises:
+            ValueError: If the body is not valid JSON.
+                如果正文不是有效的JSON。
         """
+        # Use cached result if available
+        # 如果可用,使用缓存的结果
         if self._cached_decoded_json is _NONE:
             self._cached_decoded_json = ujson.loads(self.text)
         return self._cached_decoded_json
 
     @property
     def text(self):
-        """ Body as unicode """
-        # access self.encoding before _cached_ubody to make sure
+        """
+        Get the response body as unicode text.
+        将响应体作为unicode文本获取。
+
+        This property converts the response body to unicode using the detected
+        or specified encoding. The result is cached for subsequent access.
+        此属性使用检测到的或指定的编码将响应体转换为unicode。
+        结果会被缓存以供后续访问。
+
+        Returns:
+            str: The response body as unicode text.
+                响应体作为unicode文本。
+        """
+        # Access self.encoding before _cached_ubody to make sure
         # _body_inferred_encoding is called
+        # 在_cached_ubody之前访问self.encoding,以确保调用_body_inferred_encoding
         if self._cached_ubody is None:
             charset = f'charset={self.encoding}'
             self._cached_ubody = html_to_unicode(charset, self.body)[1]
         return self._cached_ubody
 
     def urljoin(self, url):
-        """Join this Response's url with a possible relative url to form an
-        absolute interpretation of the latter."""
+        """
+        Join this Response's url with a possible relative url.
+        将此Response的url与可能的相对url连接。
+
+        This method extends the base Response.urljoin() method to use the base URL
+        from the HTML document (if available) instead of the response URL.
+        此方法扩展了基本Response.urljoin()方法,使用HTML文档中的基本URL
+        (如果可用)而不是响应URL。
+
+        Args:
+            url: The URL to join. Can be a relative URL.
+                要连接的URL。可以是相对URL。
+
+        Returns:
+            str: The absolute URL.
+                绝对URL。
+        """
+        # Use get_base_url to extract the base URL from the HTML document
+        # 使用get_base_url从HTML文档中提取基本URL
         return urljoin(get_base_url(self), url)
 
     @memoizemethod_noargs
     def _headers_encoding(self):
+        """
+        Get the encoding declared in the Content-Type header.
+        获取Content-Type头部中声明的编码。
+
+        This method extracts the charset parameter from the Content-Type header.
+        The result is memoized for performance.
+        此方法从Content-Type头部提取charset参数。
+        结果会被记忆化以提高性能。
+
+        Returns:
+            str or None: The encoding declared in the header, or None if not found.
+                头部中声明的编码,如果未找到则为None。
+        """
         content_type = self.headers.get('Content-Type', '')
         return http_content_type_encoding(to_unicode(content_type))
 
     def _body_inferred_encoding(self):
+        """
+        Infer the encoding from the response body.
+        从响应体推断编码。
+
+        This method tries to detect the encoding from the response body
+        using various heuristics. The result is cached for subsequent calls.
+        此方法尝试使用各种启发式方法从响应体检测编码。
+        结果会被缓存以供后续调用。
+
+        Returns:
+            str: The inferred encoding.
+                推断的编码。
+        """
         if self._cached_benc is None:
             content_type = to_unicode(self.headers.get('Content-Type', ''))
             benc, ubody = html_to_unicode(content_type, self.body,
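
A short sketch of the two behaviours documented above: json() parses the body once with ujson and caches the result, and urljoin() honours a <base href> when the document declares one (constructor usage assumed, as before):

    from aioscrapy.http.response.text import TextResponse

    api = TextResponse('https://api.example.com/item',
                       body=b'{"id": 1}', encoding='utf-8')
    assert api.json() == {'id': 1}
    assert api.json() is api.json()  # cached: the same object on repeat calls

    page = TextResponse(
        'https://example.com/a/b',
        body=b'<html><head><base href="https://cdn.example.com/"></head></html>',
        encoding='utf-8',
    )
    print(page.urljoin('img/x.png'))  # 'https://cdn.example.com/img/x.png'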
@@ -117,27 +347,105 @@ class TextResponse(Response):
         return self._cached_benc
 
     def _auto_detect_fun(self, text):
+        """
+        Auto-detect the encoding of the given text.
+        自动检测给定文本的编码。
+
+        This method tries to decode the text using a sequence of common encodings
+        and returns the first one that succeeds.
+        此方法尝试使用一系列常见编码解码文本,并返回第一个成功的编码。
+
+        Args:
+            text: The text to detect the encoding for.
+                要检测编码的文本。
+
+        Returns:
+            str or None: The detected encoding, or None if none of the encodings work.
+                检测到的编码,如果没有编码有效则为None。
+        """
+        # Try a sequence of common encodings
+        # 尝试一系列常见编码
         for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
             try:
                 text.decode(enc)
             except UnicodeError:
                 continue
             return resolve_encoding(enc)
+        return None
 
     @memoizemethod_noargs
     def _body_declared_encoding(self):
+        """
+        Get the encoding declared in the HTML/XML body.
+        获取HTML/XML正文中声明的编码。
+
+        This method extracts the encoding from meta tags or XML declarations
+        in the response body. The result is memoized for performance.
+        此方法从响应体中的meta标签或XML声明中提取编码。
+        结果会被记忆化以提高性能。
+
+        Returns:
+            str or None: The encoding declared in the body, or None if not found.
+                正文中声明的编码,如果未找到则为None。
+        """
         return html_body_declared_encoding(self.body)
 
     @property
     def selector(self):
+        """
+        Get a Selector for this response.
+        获取此响应的选择器。
+
+        This property creates a parsel.Selector instance for the response text,
+        which allows for XPath and CSS queries. The result is cached for
+        subsequent access.
+        此属性为响应文本创建一个parsel.Selector实例,允许XPath和CSS查询。
+        结果会被缓存以供后续访问。
+
+        Returns:
+            parsel.Selector: A Selector instance for this response.
+                此响应的Selector实例。
+        """
         if self._cached_selector is None:
             self._cached_selector = Selector(self.text)
         return self._cached_selector
 
     def xpath(self, query, **kwargs):
+        """
+        Apply the given XPath selector to this response's content.
+        将给定的XPath选择器应用于此响应的内容。
+
+        This is a shortcut method that creates a selector and applies the XPath query.
+        此方法是一个快捷方法,创建选择器并应用XPath查询。
+
+        Args:
+            query: The XPath query string.
+                XPath查询字符串。
+            **kwargs: Additional keyword arguments passed to the selector's xpath method.
+                传递给选择器的xpath方法的额外关键字参数。
+
+        Returns:
+            parsel.SelectorList: The result of the XPath query.
+                XPath查询的结果。
+        """
         return self.selector.xpath(query, **kwargs)
 
     def css(self, query):
+        """
+        Apply the given CSS selector to this response's content.
+        将给定的CSS选择器应用于此响应的内容。
+
+        This is a shortcut method that creates a selector and applies the CSS query.
+        此方法是一个快捷方法,创建选择器并应用CSS查询。
+
+        Args:
+            query: The CSS query string.
+                CSS查询字符串。
+
+        Returns:
+            parsel.SelectorList: The result of the CSS query.
+                CSS查询的结果。
+        """
         return self.selector.css(query)
 
     def follow(self, url, callback=None, method='GET', headers=None, body=None,
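
The selector property and the xpath()/css() shortcuts all share one cached parsel.Selector, so mixing query styles costs a single parse; a sketch (constructor usage again assumed):

    from aioscrapy.http.response.text import TextResponse

    resp = TextResponse(
        'https://example.com',
        body=b'<html><body><h1>Title</h1><a href="/next">next</a></body></html>',
        encoding='utf-8',
    )
    assert resp.selector is resp.selector  # built once, then cached
    print(resp.css('h1::text').get())      # 'Title'
    print(resp.xpath('//a/@href').get())   # '/next'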
@@ -145,20 +453,65 @@
                fingerprint=None, errback=None, cb_kwargs=None, flags=None):
         # type: (...) -> Request
         """
-        Return a :class:`~.Request` instance to follow a link ``url``.
-        It accepts the same arguments as ``Request.__init__`` method,
-        but ``url`` can be not only an absolute URL, but also
-
-        * a relative URL
-        * a :class:`~scrapy.link.Link` object, e.g. the result of
-          :ref:`topics-link-extractors`
-        * a :class:`~scrapy.selector.Selector` object for a ``<link>`` or ``<a>`` element, e.g.
-          ``response.css('a.my_link')[0]``
-        * an attribute :class:`~scrapy.selector.Selector` (not SelectorList), e.g.
-          ``response.css('a::attr(href)')[0]`` or
-          ``response.xpath('//img/@src')[0]``
-
-        See :ref:`response-follow-example` for usage examples.
+        Return a Request instance to follow a link.
+        返回一个Request实例以跟踪链接。
+
+        This method extends the base Response.follow() method to handle additional
+        URL types, including Selector objects for HTML elements and attributes.
+        此方法扩展了基本Response.follow()方法,以处理额外的URL类型,
+        包括HTML元素和属性的Selector对象。
+
+        The URL can be:
+        URL可以是:
+
+        * An absolute URL (string)
+          绝对URL(字符串)
+        * A relative URL (string)
+          相对URL(字符串)
+        * A Link object
+          Link对象
+        * A Selector object for a <link> or <a> element
+          <link>或<a>元素的Selector对象
+        * An attribute Selector (not SelectorList), e.g., from css('a::attr(href)')[0]
+          属性Selector(非SelectorList),例如,来自css('a::attr(href)')[0]
+
+        Args:
+            url: The URL to follow. Can be any of the types described above.
+                要跟踪的URL。可以是上述任何类型。
+            callback: A function to be called with the response from the request.
+                使用请求的响应调用的函数。
+            method: The HTTP method to use.
+                要使用的HTTP方法。
+            headers: The headers to use for the request.
+                请求使用的头部。
+            body: The body of the request.
+                请求的正文。
+            cookies: The cookies to send with the request.
+                与请求一起发送的Cookie。
+            meta: Extra data to pass to the request.
+                传递给请求的额外数据。
+            encoding: The encoding to use for the request. Defaults to this response's encoding.
+                请求使用的编码。默认为此响应的编码。
+            priority: The priority of the request.
+                请求的优先级。
+            dont_filter: Whether to filter duplicate requests.
+                是否过滤重复请求。
+            fingerprint: The fingerprint for the request.
+                请求的指纹。
+            errback: A function to be called if the request fails.
+                如果请求失败时调用的函数。
+            cb_kwargs: Additional keyword arguments to pass to the callback.
+                传递给回调的额外关键字参数。
+            flags: Flags for the request.
+                请求的标志。
+
+        Returns:
+            Request: A new Request instance.
+                一个新的Request实例。
+
+        Raises:
+            ValueError: If the URL is a SelectorList or cannot be extracted from a Selector.
+                如果URL是SelectorList或无法从Selector中提取。
         """
         if isinstance(url, parsel.Selector):
             url = _url_from_selector(url)
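
A hedged sketch of follow() inside a spider callback (the spider methods and CSS classes are illustrative): strings, Link objects and <a>/attribute Selectors are all accepted, while a SelectorList raises ValueError:

    def parse(self, response):
        # Relative URL string
        yield response.follow('/page/2', callback=self.parse)
        # <a> element Selector: the href attribute is extracted automatically
        for link in response.css('a.item'):
            yield response.follow(link, callback=self.parse_item)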
@@ -188,27 +541,78 @@
                    css=None, xpath=None):
         # type: (...) -> Generator[Request, None, None]
         """
-        A generator that produces :class:`~.Request` instances to follow all
-        links in ``urls``. It accepts the same arguments as the :class:`~.Request`'s
-        ``__init__`` method, except that each ``urls`` element does not need to be
-        an absolute URL, it can be any of the following:
-
-        * a relative URL
-        * a :class:`~scrapy.link.Link` object, e.g. the result of
-          :ref:`topics-link-extractors`
-        * a :class:`~scrapy.selector.Selector` object for a ``<link>`` or ``<a>`` element, e.g.
-          ``response.css('a.my_link')[0]``
-        * an attribute :class:`~scrapy.selector.Selector` (not SelectorList), e.g.
-          ``response.css('a::attr(href)')[0]`` or
-          ``response.xpath('//img/@src')[0]``
-
-        In addition, ``css`` and ``xpath`` arguments are accepted to perform the link extraction
-        within the ``follow_all`` method (only one of ``urls``, ``css`` and ``xpath`` is accepted).
-
-        Note that when passing a ``SelectorList`` as argument for the ``urls`` parameter or
-        using the ``css`` or ``xpath`` parameters, this method will not produce requests for
-        selectors from which links cannot be obtained (for instance, anchor tags without an
-        ``href`` attribute)
+        Return a generator of Request instances to follow all links in urls.
+        返回一个Request实例的生成器,以跟踪urls中的所有链接。
+
+        This method extends the base Response.follow_all() method to handle additional
+        URL types and to support direct extraction of links using CSS or XPath selectors.
+        此方法扩展了基本Response.follow_all()方法,以处理额外的URL类型,
+        并支持使用CSS或XPath选择器直接提取链接。
+
+        The URLs can be provided in several ways:
+        URLs可以通过几种方式提供:
+
+        1. As a list in the 'urls' parameter, where each element can be:
+           作为'urls'参数中的列表,其中每个元素可以是:
+           * An absolute URL (string)
+             绝对URL(字符串)
+           * A relative URL (string)
+             相对URL(字符串)
+           * A Link object
+             Link对象
+           * A Selector object for a <link> or <a> element
+             <link>或<a>元素的Selector对象
+           * An attribute Selector (not SelectorList)
+             属性Selector(非SelectorList)
+
+        2. By providing a CSS selector in the 'css' parameter
+           通过在'css'参数中提供CSS选择器
+
+        3. By providing an XPath selector in the 'xpath' parameter
+           通过在'xpath'参数中提供XPath选择器
+
+        Note: Only one of 'urls', 'css', or 'xpath' should be provided.
+        注意:只应提供'urls'、'css'或'xpath'中的一个。
+
+        Args:
+            urls: An iterable of URLs to follow. Each can be any of the types described above.
+                要跟踪的URL的可迭代对象。每个可以是上述任何类型。
+            callback: A function to be called with the response from each request.
+                使用每个请求的响应调用的函数。
+            method: The HTTP method to use.
+                要使用的HTTP方法。
+            headers: The headers to use for the requests.
+                请求使用的头部。
+            body: The body of the requests.
+                请求的正文。
+            cookies: The cookies to send with the requests.
+                与请求一起发送的Cookie。
+            meta: Extra data to pass to the requests.
+                传递给请求的额外数据。
+            encoding: The encoding to use for the requests. Defaults to this response's encoding.
+                请求使用的编码。默认为此响应的编码。
+            priority: The priority of the requests.
+                请求的优先级。
+            dont_filter: Whether to filter duplicate requests.
+                是否过滤重复请求。
+            errback: A function to be called if the requests fail.
+                如果请求失败时调用的函数。
+            cb_kwargs: Additional keyword arguments to pass to the callback.
+                传递给回调的额外关键字参数。
+            flags: Flags for the requests.
+                请求的标志。
+            css: A CSS selector to extract links from this response.
+                从此响应中提取链接的CSS选择器。
+            xpath: An XPath selector to extract links from this response.
+                从此响应中提取链接的XPath选择器。
+
+        Returns:
+            Generator[Request, None, None]: A generator of Request instances.
+                Request实例的生成器。
+
+        Raises:
+            ValueError: If more than one of 'urls', 'css', or 'xpath' is provided.
+                如果提供了'urls'、'css'或'xpath'中的多个。
         """
         arguments = [x for x in (urls, css, xpath) if x is not None]
         if len(arguments) != 1:
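
A companion sketch for follow_all(): exactly one of urls, css or xpath may be given, and selectors without an extractable href are skipped (spider context again illustrative):

    def parse(self, response):
        yield from response.follow_all(css='a.product::attr(href)',
                                       callback=self.parse_product)
        # Equivalent XPath form:
        # yield from response.follow_all(xpath='//a[@class="product"]/@href',
        #                                callback=self.parse_product)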
@@ -245,21 +649,64 @@
 
 class _InvalidSelector(ValueError):
     """
-    Raised when a URL cannot be obtained from a Selector
+    Raised when a URL cannot be obtained from a Selector.
+    当无法从Selector获取URL时引发。
+
+    This exception is used internally by the _url_from_selector function
+    to indicate that a Selector object cannot be converted to a URL.
+    此异常由_url_from_selector函数内部使用,
+    表示无法将Selector对象转换为URL。
     """
 
 
 def _url_from_selector(sel):
     # type: (parsel.Selector) -> str
+    """
+    Extract a URL from a Selector object.
+    从Selector对象中提取URL。
+
+    This function extracts a URL from different types of Selector objects:
+    此函数从不同类型的Selector对象中提取URL:
+
+    1. If the selector root is a string (e.g., from ::attr(href)), it returns that string
+       如果选择器根是字符串(例如,来自::attr(href)),则返回该字符串
+    2. If the selector is for an <a> or <link> element, it returns the href attribute
+       如果选择器是<a>或<link>元素,则返回href属性
+
+    Args:
+        sel: The Selector object to extract a URL from.
+            要从中提取URL的Selector对象。
+
+    Returns:
+        str: The extracted URL with whitespace stripped.
+            提取的URL,已去除空白。
+
+    Raises:
+        _InvalidSelector: If the URL cannot be extracted from the Selector.
+            如果无法从Selector中提取URL。
+    """
     if isinstance(sel.root, str):
-        # e.g. ::attr(href) result
+        # For attribute selectors (e.g., ::attr(href) result)
+        # 对于属性选择器(例如,::attr(href)结果)
         return strip_html5_whitespace(sel.root)
+
     if not hasattr(sel.root, 'tag'):
+        # For selectors that don't represent HTML elements
+        # 对于不表示HTML元素的选择器
         raise _InvalidSelector(f"Unsupported selector: {sel}")
+
     if sel.root.tag not in ('a', 'link'):
+        # Only <a> and <link> elements are supported
+        # 只支持<a>和<link>元素
         raise _InvalidSelector("Only <a> and <link> elements are supported; "
                                f"got <{sel.root.tag}>")
+
     href = sel.root.get('href')
     if href is None:
+        # The element has no href attribute
+        # 元素没有href属性
         raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
+
+    # Return the href with whitespace stripped
+    # 返回去除空白的href
     return strip_html5_whitespace(href)
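
A behaviour sketch for the helper above, importing the private names purely for illustration:

    import parsel
    from aioscrapy.http.response.text import _url_from_selector, _InvalidSelector

    sel = parsel.Selector('<a href=" /next ">x</a><p>no link</p>')
    print(_url_from_selector(sel.css('a')[0]))              # '/next' (whitespace stripped)
    print(_url_from_selector(sel.css('a::attr(href)')[0]))  # '/next'
    try:
        _url_from_selector(sel.css('p')[0])
    except _InvalidSelector as exc:
        print(exc)  # Only <a> and <link> elements are supported; got <p>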