aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/url.py CHANGED
@@ -1,9 +1,17 @@
  """
+ URL utility functions for aioscrapy.
+ aioscrapy的URL实用函数。
+
  This module contains general purpose URL functions not found in the standard
- library.
+ library. It provides utilities for URL parsing, manipulation, and validation
+ specific to web crawling needs.
+ 此模块包含标准库中没有的通用URL函数。
+ 它提供了特定于网络爬取需求的URL解析、操作和验证实用工具。

  Some of the functions that used to be imported from this module have been moved
  to the w3lib.url module. Always import those from there instead.
+ 以前从此模块导入的一些函数已移至w3lib.url模块。
+ 始终从那里导入这些函数。
  """
  import posixpath
  import re
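The updated module docstring tells users to import the relocated helpers from w3lib.url directly rather than relying on the star re-export. A minimal sketch of what that looks like, assuming w3lib is available (the module above imports from it); the example URL and parameter values are illustrative only:

    # Import a relocated helper straight from w3lib.url, as the docstring advises.
    from w3lib.url import add_or_replace_parameter

    # add_or_replace_parameter(url, name, new_value) adds the query parameter,
    # or replaces it if it is already present.
    url = "http://www.example.com/index.php?id=1"
    print(add_or_replace_parameter(url, "arg", "v"))
    # Expected (per w3lib's documented behavior):
    # http://www.example.com/index.php?id=1&arg=v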
@@ -11,72 +19,242 @@ from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse

  # scrapy.utils.url was moved to w3lib.url and import * ensures this
  # move doesn't break old code
- from w3lib.url import *
+ from w3lib.url import * # This imports functions like any_to_uri, add_or_replace_parameter, etc.
  from w3lib.url import _safe_chars, _unquotepath # noqa: F401
  from aioscrapy.utils.python import to_unicode


  def url_is_from_any_domain(url, domains):
- """Return True if the url belongs to any of the given domains"""
+ """
+ Check if a URL belongs to any of the given domains.
+ 检查URL是否属于给定域名中的任何一个。
+
+ This function checks if the host part of the URL exactly matches any of the
+ given domains, or if it is a subdomain of any of them. The comparison is
+ case-insensitive.
+ 此函数检查URL的主机部分是否与给定域名中的任何一个完全匹配,
+ 或者它是否是其中任何一个的子域。比较不区分大小写。
+
+ Args:
+ url: The URL to check. Can be a string or a ParseResult object.
+ 要检查的URL。可以是字符串或ParseResult对象。
+ domains: A list of domain names to check against.
+ 要检查的域名列表。
+
+ Returns:
+ bool: True if the URL belongs to any of the given domains, False otherwise.
+ 如果URL属于给定域名中的任何一个,则为True,否则为False。
+
+ Examples:
+ >>> url_is_from_any_domain("http://www.example.com/some/page.html", ["example.com"])
+ True
+ >>> url_is_from_any_domain("http://sub.example.com/", ["example.com"])
+ True
+ >>> url_is_from_any_domain("http://example.org/", ["example.com"])
+ False
+ """
+ # Get the host part of the URL and convert to lowercase
+ # 获取URL的主机部分并转换为小写
  host = parse_url(url).netloc.lower()
+
+ # If there's no host, it's not from any domain
+ # 如果没有主机,则不属于任何域名
  if not host:
  return False
+
+ # Convert all domains to lowercase for case-insensitive comparison
+ # 将所有域名转换为小写以进行不区分大小写的比较
  domains = [d.lower() for d in domains]
+
+ # Check if the host exactly matches any domain or is a subdomain of any domain
+ # 检查主机是否与任何域名完全匹配或是任何域名的子域
  return any((host == d) or (host.endswith(f'.{d}')) for d in domains)


  def url_is_from_spider(url, spider):
- """Return True if the url belongs to the given spider"""
+ """
+ Check if a URL belongs to the given spider.
+ 检查URL是否属于给定的爬虫。
+
+ This function checks if the URL belongs to the domains that the spider
+ is allowed to crawl. It considers both the spider's name and its
+ 'allowed_domains' attribute (if it exists).
+ 此函数检查URL是否属于爬虫允许爬取的域名。
+ 它同时考虑爬虫的名称和其'allowed_domains'属性(如果存在)。
+
+ Args:
+ url: The URL to check. Can be a string or a ParseResult object.
+ 要检查的URL。可以是字符串或ParseResult对象。
+ spider: The spider object to check against.
+ 要检查的爬虫对象。
+
+ Returns:
+ bool: True if the URL belongs to the spider's domains, False otherwise.
+ 如果URL属于爬虫的域名,则为True,否则为False。
+ """
+ # Check if the URL belongs to either the spider's name or any of its allowed domains
+ # 检查URL是否属于爬虫的名称或其任何允许的域名
  return url_is_from_any_domain(url, [spider.name] + list(getattr(spider, 'allowed_domains', [])))


  def url_has_any_extension(url, extensions):
+ """
+ Check if a URL has any of the given extensions.
+ 检查URL是否具有给定扩展名中的任何一个。
+
+ This function extracts the file extension from the URL path and checks
+ if it matches any of the provided extensions. The comparison is case-insensitive.
+ 此函数从URL路径中提取文件扩展名,并检查它是否与提供的任何扩展名匹配。
+ 比较不区分大小写。
+
+ Args:
+ url: The URL to check. Can be a string or a ParseResult object.
+ 要检查的URL。可以是字符串或ParseResult对象。
+ extensions: A list of file extensions to check against (including the dot).
+ 要检查的文件扩展名列表(包括点)。
+
+ Returns:
+ bool: True if the URL has any of the given extensions, False otherwise.
+ 如果URL具有给定扩展名中的任何一个,则为True,否则为False。
+
+ Examples:
+ >>> url_has_any_extension("http://example.com/file.pdf", ['.pdf', '.doc'])
+ True
+ >>> url_has_any_extension("http://example.com/file.PDF", ['.pdf'])
+ True
+ >>> url_has_any_extension("http://example.com/file.txt", ['.pdf', '.doc'])
+ False
+ """
+ # Extract the file extension from the URL path and check if it's in the list
+ # 从URL路径中提取文件扩展名,并检查它是否在列表中
  return posixpath.splitext(parse_url(url).path)[1].lower() in extensions


  def parse_url(url, encoding=None):
- """Return urlparsed url from the given argument (which could be an already
- parsed url)
  """
+ Parse a URL into its components.
+ 将URL解析为其组成部分。
+
+ This function parses a URL into its components using urllib.parse.urlparse.
+ If the input is already a ParseResult object, it is returned unchanged.
+ If the input is a string or bytes, it is first converted to unicode.
+ 此函数使用urllib.parse.urlparse将URL解析为其组成部分。
+ 如果输入已经是ParseResult对象,则原样返回。
+ 如果输入是字符串或字节,则首先将其转换为unicode。
+
+ Args:
+ url: The URL to parse. Can be a string, bytes, or ParseResult object.
+ 要解析的URL。可以是字符串、字节或ParseResult对象。
+ encoding: The encoding to use for decoding bytes. Defaults to 'utf-8'.
+ 用于解码字节的编码。默认为'utf-8'。
+
+ Returns:
+ ParseResult: A named tuple with URL components: scheme, netloc, path,
+ params, query, and fragment.
+ 包含URL组件的命名元组:scheme、netloc、path、
+ params、query和fragment。
+ """
+ # If the URL is already parsed, return it as is
+ # 如果URL已经解析,则原样返回
  if isinstance(url, ParseResult):
  return url
+ # Otherwise, convert to unicode and parse
+ # 否则,转换为unicode并解析
  return urlparse(to_unicode(url, encoding))


  def escape_ajax(url):
  """
- Return the crawleable url according to:
- https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+ Convert AJAX URLs to crawlable URLs according to Google's specification.
+ 根据Google的规范将AJAX URL转换为可爬取的URL。

- >>> escape_ajax("www.example.com/ajax.html#!key=value")
- 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
- >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
- 'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
- >>> escape_ajax("www.example.com/ajax.html?#!key=value")
- 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
- >>> escape_ajax("www.example.com/ajax.html#!")
- 'www.example.com/ajax.html?_escaped_fragment_='
+ This function implements Google's "AJAX crawling scheme" which allows
+ search engines to crawl AJAX-based pages. It converts fragment identifiers
+ that start with an exclamation mark (!) to query parameters with the
+ "_escaped_fragment_" key.
+ 此函数实现了Google的"AJAX爬取方案",该方案允许搜索引擎爬取基于AJAX的页面。
+ 它将以感叹号(!)开头的片段标识符转换为带有"_escaped_fragment_"键的查询参数。

- URLs that are not "AJAX crawlable" (according to Google) returned as-is:
+ See: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

- >>> escape_ajax("www.example.com/ajax.html#key=value")
- 'www.example.com/ajax.html#key=value'
- >>> escape_ajax("www.example.com/ajax.html#")
- 'www.example.com/ajax.html#'
- >>> escape_ajax("www.example.com/ajax.html")
- 'www.example.com/ajax.html'
+ Args:
+ url: The URL to convert.
+ 要转换的URL。
+
+ Returns:
+ str: The crawlable URL with _escaped_fragment_ parameter if the URL
+ contains an AJAX fragment, or the original URL otherwise.
+ 如果URL包含AJAX片段,则返回带有_escaped_fragment_参数的可爬取URL,
+ 否则返回原始URL。
+
+ Examples:
+ >>> escape_ajax("www.example.com/ajax.html#!key=value")
+ 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
+ >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
+ 'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
+ >>> escape_ajax("www.example.com/ajax.html?#!key=value")
+ 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
+ >>> escape_ajax("www.example.com/ajax.html#!")
+ 'www.example.com/ajax.html?_escaped_fragment_='
+
+ URLs that are not "AJAX crawlable" (according to Google) returned as-is:
+
+ >>> escape_ajax("www.example.com/ajax.html#key=value")
+ 'www.example.com/ajax.html#key=value'
+ >>> escape_ajax("www.example.com/ajax.html#")
+ 'www.example.com/ajax.html#'
+ >>> escape_ajax("www.example.com/ajax.html")
+ 'www.example.com/ajax.html'
  """
+ # Split the URL into the part before the fragment and the fragment itself
+ # 将URL拆分为片段之前的部分和片段本身
  defrag, frag = urldefrag(url)
+
+ # If the fragment doesn't start with '!', it's not an AJAX URL
+ # 如果片段不以'!'开头,则它不是AJAX URL
  if not frag.startswith('!'):
  return url
+
+ # Convert the AJAX URL to a crawlable URL by adding the _escaped_fragment_ parameter
+ # 通过添加_escaped_fragment_参数将AJAX URL转换为可爬取的URL
  return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])


  def add_http_if_no_scheme(url):
- """Add http as the default scheme if it is missing from the url."""
+ """
+ Add http as the default scheme if it is missing from the URL.
+ 如果URL中缺少协议,则添加http作为默认协议。
+
+ This function checks if the URL already has a scheme (like http://, https://, ftp://).
+ If not, it adds 'http:' or 'http://' depending on whether the URL already has a netloc.
+ 此函数检查URL是否已有协议(如http://、https://、ftp://)。
+ 如果没有,它会添加'http:'或'http://',具体取决于URL是否已有网络位置。
+
+ Args:
+ url: The URL to check and possibly modify.
+ 要检查并可能修改的URL。
+
+ Returns:
+ str: The URL with a scheme, either the original one or with 'http' added.
+ 带有协议的URL,可能是原始协议或添加了'http'。
+
+ Examples:
+ >>> add_http_if_no_scheme("example.com")
+ 'http://example.com'
+ >>> add_http_if_no_scheme("http://example.com")
+ 'http://example.com'
+ >>> add_http_if_no_scheme("https://example.com")
+ 'https://example.com'
+ """
+ # Check if the URL already has a scheme
+ # 检查URL是否已有协议
  match = re.match(r"^\w+://", url, flags=re.I)
  if not match:
+ # Parse the URL to determine if it has a netloc
+ # 解析URL以确定它是否有网络位置
  parts = urlparse(url)
+ # Add the appropriate http scheme
+ # 添加适当的http协议
  scheme = "http:" if parts.netloc else "http://"
  url = scheme + url

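The public helpers documented in this hunk can be exercised directly. The following sketch just replays values from the doctests added above, so treat the expected results as illustrations of the documented behavior rather than an independent test suite:

    # Replaying the documented behavior of the helpers from this hunk.
    from aioscrapy.utils.url import (
        add_http_if_no_scheme,
        escape_ajax,
        parse_url,
        url_is_from_any_domain,
    )

    # Domain matching is case-insensitive and includes subdomains.
    assert url_is_from_any_domain("http://sub.example.com/", ["example.com"])
    assert not url_is_from_any_domain("http://example.org/", ["example.com"])

    # parse_url returns a urllib.parse.ParseResult and passes one through unchanged.
    parsed = parse_url("http://example.com/file.pdf")
    assert parse_url(parsed) is parsed

    # Missing schemes default to http, and #! fragments become _escaped_fragment_.
    assert add_http_if_no_scheme("example.com") == "http://example.com"
    assert escape_ajax("www.example.com/ajax.html#!key=value") == (
        "www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue"
    )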
@@ -84,6 +262,24 @@ def add_http_if_no_scheme(url):


  def _is_posix_path(string):
+ """
+ Check if a string looks like a POSIX filesystem path.
+ 检查字符串是否看起来像POSIX文件系统路径。
+
+ This function uses a regular expression to check if the string matches
+ common patterns for POSIX filesystem paths, such as absolute paths,
+ relative paths, and paths with home directory references.
+ 此函数使用正则表达式检查字符串是否匹配POSIX文件系统路径的常见模式,
+ 如绝对路径、相对路径和带有主目录引用的路径。
+
+ Args:
+ string: The string to check.
+ 要检查的字符串。
+
+ Returns:
+ bool: True if the string looks like a POSIX path, False otherwise.
+ 如果字符串看起来像POSIX路径,则为True,否则为False。
+ """
  return bool(
  re.match(
  r'''
@@ -106,13 +302,31 @@ def _is_posix_path(string):


  def _is_windows_path(string):
+ """
+ Check if a string looks like a Windows filesystem path.
+ 检查字符串是否看起来像Windows文件系统路径。
+
+ This function uses a regular expression to check if the string matches
+ common patterns for Windows filesystem paths, such as drive letters (C:\)
+ or UNC paths (\\server\share).
+ 此函数使用正则表达式检查字符串是否匹配Windows文件系统路径的常见模式,
+ 如驱动器号(C:\)或UNC路径(\\server\share)。
+
+ Args:
+ string: The string to check.
+ 要检查的字符串。
+
+ Returns:
+ bool: True if the string looks like a Windows path, False otherwise.
+ 如果字符串看起来像Windows路径,则为True,否则为False。
+ """
  return bool(
  re.match(
  r'''
  ^
  (
- [a-z]:\\
- | \\\\
+ [a-z]:\\ # Drive letter followed by :\
+ | \\\\ # Or UNC path starting with \\
  )
  ''',
  string,
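These two private path heuristics feed _is_filesystem_path and guess_scheme in the next hunk. A small usage sketch of the documented behavior follows; the exact file:// form comes from w3lib's any_to_uri, so the outputs in the comments are expectations, not guarantees:

    # guess_scheme (documented in the following hunk) routes filesystem-looking
    # input through any_to_uri and everything else through add_http_if_no_scheme.
    from aioscrapy.utils.url import guess_scheme

    print(guess_scheme("/etc/hosts"))           # expected: a file:// URI such as file:///etc/hosts
    print(guess_scheme(r"c:\temp\notes.txt"))   # drive-letter path, also expected to become a file:// URI
    print(guess_scheme("example.com/page"))     # not a path, expected: http://example.com/page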
@@ -122,38 +336,129 @@ def _is_windows_path(string):


  def _is_filesystem_path(string):
+ """
+ Check if a string looks like a filesystem path (either POSIX or Windows).
+ 检查字符串是否看起来像文件系统路径(POSIX或Windows)。
+
+ This function combines the checks for both POSIX and Windows paths.
+ 此函数结合了对POSIX和Windows路径的检查。
+
+ Args:
+ string: The string to check.
+ 要检查的字符串。
+
+ Returns:
+ bool: True if the string looks like a filesystem path, False otherwise.
+ 如果字符串看起来像文件系统路径,则为True,否则为False。
+ """
  return _is_posix_path(string) or _is_windows_path(string)


  def guess_scheme(url):
- """Add an URL scheme if missing: file:// for filepath-like input or
- http:// otherwise."""
+ """
+ Add an appropriate URL scheme if missing from the input.
+ 如果输入中缺少适当的URL协议,则添加它。
+
+ This function examines the input and adds an appropriate scheme:
+ - 'file://' for filesystem paths (both POSIX and Windows)
+ - 'http://' for other inputs that look like URLs
+ 此函数检查输入并添加适当的协议:
+ - 对于文件系统路径(POSIX和Windows),添加'file://'
+ - 对于看起来像URL的其他输入,添加'http://'
+
+ Args:
+ url: The URL or path to process.
+ 要处理的URL或路径。
+
+ Returns:
+ str: The URL with an appropriate scheme added if it was missing.
+ 添加了适当协议(如果缺少)的URL。
+
+ Note:
+ This function uses any_to_uri() from w3lib.url to convert filesystem
+ paths to proper file:// URLs.
+ 此函数使用w3lib.url中的any_to_uri()将文件系统路径转换为适当的file://URL。
+ """
+ # If it looks like a filesystem path, convert it to a file:// URL
+ # 如果它看起来像文件系统路径,将其转换为file://URL
  if _is_filesystem_path(url):
  return any_to_uri(url)
+ # Otherwise, add http:// if needed
+ # 否则,如果需要,添加http://
  return add_http_if_no_scheme(url)


  def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=False, strip_fragment=True):
+ """
+ Strip a URL string of some of its components.
+ 从URL字符串中去除某些组件。

- """Strip URL string from some of its components:
+ This function allows selectively removing parts of a URL, such as credentials,
+ default ports, paths, queries, and fragments. It's useful for normalizing URLs
+ or removing sensitive information.
+ 此函数允许选择性地移除URL的部分内容,如凭据、默认端口、路径、查询和片段。
+ 它对于规范化URL或移除敏感信息很有用。

- - ``strip_credentials`` removes "user:password@"
- - ``strip_default_port`` removes ":80" (resp. ":443", ":21")
- from http:// (resp. https://, ftp://) URLs
- - ``origin_only`` replaces path component with "/", also dropping
- query and fragment components ; it also strips credentials
- - ``strip_fragment`` drops any #fragment component
- """
+ Args:
+ url: The URL to strip.
+ 要处理的URL。
+ strip_credentials: Whether to remove "user:password@" from the URL.
+ 是否从URL中移除"user:password@"。
+ Defaults to True.
+ 默认为True。
+ strip_default_port: Whether to remove default ports (":80" for http,
+ ":443" for https, ":21" for ftp) from the URL.
+ 是否从URL中移除默认端口(http的":80",
+ https的":443",ftp的":21")。
+ Defaults to True.
+ 默认为True。
+ origin_only: Whether to keep only the origin part of the URL (scheme and netloc),
+ replacing the path with "/" and removing params, query, and fragment.
+ 是否只保留URL的源部分(协议和网络位置),
+ 将路径替换为"/"并移除参数、查询和片段。
+ This also implies strip_credentials=True.
+ 这也意味着strip_credentials=True。
+ Defaults to False.
+ 默认为False。
+ strip_fragment: Whether to remove any #fragment component from the URL.
+ 是否从URL中移除任何#片段组件。
+ Defaults to True.
+ 默认为True。
+
+ Returns:
+ str: The stripped URL.
+ 处理后的URL。

+ Examples:
+ >>> strip_url("http://user:pass@example.com:80/path?query#fragment")
+ 'http://example.com/path?query'
+ >>> strip_url("http://user:pass@example.com:80/path?query#fragment",
+ ... strip_credentials=False, strip_fragment=False)
+ 'http://user:pass@example.com/path?query#fragment'
+ >>> strip_url("http://user:pass@example.com:80/path?query#fragment",
+ ... origin_only=True)
+ 'http://example.com/'
+ """
+ # Parse the URL into its components
+ # 将URL解析为其组件
  parsed_url = urlparse(url)
  netloc = parsed_url.netloc
+
+ # Remove credentials if requested or if origin_only is True
+ # 如果请求或如果origin_only为True,则移除凭据
  if (strip_credentials or origin_only) and (parsed_url.username or parsed_url.password):
  netloc = netloc.split('@')[-1]
+
+ # Remove default ports if requested
+ # 如果请求,则移除默认端口
  if strip_default_port and parsed_url.port:
  if (parsed_url.scheme, parsed_url.port) in (('http', 80),
  ('https', 443),
  ('ftp', 21)):
  netloc = netloc.replace(f':{parsed_url.port}', '')
+
+ # Reconstruct the URL with the desired components
+ # 使用所需组件重建URL
  return urlunparse((
  parsed_url.scheme,
  netloc,