aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/url.py CHANGED
@@ -1,9 +1,17 @@
  """
+ URL utility functions for aioscrapy.
+ aioscrapy的URL实用函数。
+
  This module contains general purpose URL functions not found in the standard
- library.
+ library. It provides utilities for URL parsing, manipulation, and validation
+ specific to web crawling needs.
+ 此模块包含标准库中没有的通用URL函数。
+ 它提供了特定于网络爬取需求的URL解析、操作和验证实用工具。

  Some of the functions that used to be imported from this module have been moved
  to the w3lib.url module. Always import those from there instead.
+ 以前从此模块导入的一些函数已移至w3lib.url模块。
+ 始终从那里导入这些函数。
  """
  import posixpath
  import re
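The updated module docstring tells users to import the relocated helpers from w3lib.url directly rather than relying on the star re-export. A minimal sketch of what that looks like, assuming w3lib is available (the module above imports from it); the example URL and parameter values are illustrative only:

    # Import a relocated helper straight from w3lib.url, as the docstring advises.
    from w3lib.url import add_or_replace_parameter

    # add_or_replace_parameter(url, name, new_value) adds the query parameter,
    # or replaces it if it is already present.
    url = "http://www.example.com/index.php?id=1"
    print(add_or_replace_parameter(url, "arg", "v"))
    # Expected (per w3lib's documented behavior):
    # http://www.example.com/index.php?id=1&arg=v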
@@ -11,72 +19,242 @@ from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse

  # scrapy.utils.url was moved to w3lib.url and import * ensures this
  # move doesn't break old code
- from w3lib.url import *
+ from w3lib.url import * # This imports functions like any_to_uri, add_or_replace_parameter, etc.
  from w3lib.url import _safe_chars, _unquotepath # noqa: F401
  from aioscrapy.utils.python import to_unicode


  def url_is_from_any_domain(url, domains):
- """Return True if the url belongs to any of the given domains"""
+ """
+ Check if a URL belongs to any of the given domains.
+ 检查URL是否属于给定域名中的任何一个。
+
+ This function checks if the host part of the URL exactly matches any of the
+ given domains, or if it is a subdomain of any of them. The comparison is
+ case-insensitive.
+ 此函数检查URL的主机部分是否与给定域名中的任何一个完全匹配,
+ 或者它是否是其中任何一个的子域。比较不区分大小写。
+
+ Args:
+ url: The URL to check. Can be a string or a ParseResult object.
+ 要检查的URL。可以是字符串或ParseResult对象。
+ domains: A list of domain names to check against.
+ 要检查的域名列表。
+
+ Returns:
+ bool: True if the URL belongs to any of the given domains, False otherwise.
+ 如果URL属于给定域名中的任何一个,则为True,否则为False。
+
+ Examples:
+ >>> url_is_from_any_domain("http://www.example.com/some/page.html", ["example.com"])
+ True
+ >>> url_is_from_any_domain("http://sub.example.com/", ["example.com"])
+ True
+ >>> url_is_from_any_domain("http://example.org/", ["example.com"])
+ False
+ """
+ # Get the host part of the URL and convert to lowercase
+ # 获取URL的主机部分并转换为小写
  host = parse_url(url).netloc.lower()
+
+ # If there's no host, it's not from any domain
+ # 如果没有主机,则不属于任何域名
  if not host:
  return False
+
+ # Convert all domains to lowercase for case-insensitive comparison
+ # 将所有域名转换为小写以进行不区分大小写的比较
  domains = [d.lower() for d in domains]
+
+ # Check if the host exactly matches any domain or is a subdomain of any domain
+ # 检查主机是否与任何域名完全匹配或是任何域名的子域
  return any((host == d) or (host.endswith(f'.{d}')) for d in domains)


  def url_is_from_spider(url, spider):
- """Return True if the url belongs to the given spider"""
+ """
+ Check if a URL belongs to the given spider.
+ 检查URL是否属于给定的爬虫。
+
+ This function checks if the URL belongs to the domains that the spider
+ is allowed to crawl. It considers both the spider's name and its
+ 'allowed_domains' attribute (if it exists).
+ 此函数检查URL是否属于爬虫允许爬取的域名。
+ 它同时考虑爬虫的名称和其'allowed_domains'属性(如果存在)。
+
+ Args:
+ url: The URL to check. Can be a string or a ParseResult object.
+ 要检查的URL。可以是字符串或ParseResult对象。
+ spider: The spider object to check against.
+ 要检查的爬虫对象。
+
+ Returns:
+ bool: True if the URL belongs to the spider's domains, False otherwise.
+ 如果URL属于爬虫的域名,则为True,否则为False。
+ """
+ # Check if the URL belongs to either the spider's name or any of its allowed domains
+ # 检查URL是否属于爬虫的名称或其任何允许的域名
  return url_is_from_any_domain(url, [spider.name] + list(getattr(spider, 'allowed_domains', [])))


  def url_has_any_extension(url, extensions):
+ """
+ Check if a URL has any of the given extensions.
+ 检查URL是否具有给定扩展名中的任何一个。
+
+ This function extracts the file extension from the URL path and checks
+ if it matches any of the provided extensions. The comparison is case-insensitive.
+ 此函数从URL路径中提取文件扩展名,并检查它是否与提供的任何扩展名匹配。
+ 比较不区分大小写。
+
+ Args:
+ url: The URL to check. Can be a string or a ParseResult object.
+ 要检查的URL。可以是字符串或ParseResult对象。
+ extensions: A list of file extensions to check against (including the dot).
+ 要检查的文件扩展名列表(包括点)。
+
+ Returns:
+ bool: True if the URL has any of the given extensions, False otherwise.
+ 如果URL具有给定扩展名中的任何一个,则为True,否则为False。
+
+ Examples:
+ >>> url_has_any_extension("http://example.com/file.pdf", ['.pdf', '.doc'])
+ True
+ >>> url_has_any_extension("http://example.com/file.PDF", ['.pdf'])
+ True
+ >>> url_has_any_extension("http://example.com/file.txt", ['.pdf', '.doc'])
+ False
+ """
+ # Extract the file extension from the URL path and check if it's in the list
+ # 从URL路径中提取文件扩展名,并检查它是否在列表中
  return posixpath.splitext(parse_url(url).path)[1].lower() in extensions


  def parse_url(url, encoding=None):
- """Return urlparsed url from the given argument (which could be an already
- parsed url)
  """
+ Parse a URL into its components.
+ 将URL解析为其组成部分。
+
+ This function parses a URL into its components using urllib.parse.urlparse.
+ If the input is already a ParseResult object, it is returned unchanged.
+ If the input is a string or bytes, it is first converted to unicode.
+ 此函数使用urllib.parse.urlparse将URL解析为其组成部分。
+ 如果输入已经是ParseResult对象,则原样返回。
+ 如果输入是字符串或字节,则首先将其转换为unicode。
+
+ Args:
+ url: The URL to parse. Can be a string, bytes, or ParseResult object.
+ 要解析的URL。可以是字符串、字节或ParseResult对象。
+ encoding: The encoding to use for decoding bytes. Defaults to 'utf-8'.
+ 用于解码字节的编码。默认为'utf-8'。
+
+ Returns:
+ ParseResult: A named tuple with URL components: scheme, netloc, path,
+ params, query, and fragment.
+ 包含URL组件的命名元组:scheme、netloc、path、
+ params、query和fragment。
+ """
+ # If the URL is already parsed, return it as is
+ # 如果URL已经解析,则原样返回
  if isinstance(url, ParseResult):
  return url
+ # Otherwise, convert to unicode and parse
+ # 否则,转换为unicode并解析
  return urlparse(to_unicode(url, encoding))


  def escape_ajax(url):
  """
- Return the crawleable url according to:
- https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+ Convert AJAX URLs to crawlable URLs according to Google's specification.
+ 根据Google的规范将AJAX URL转换为可爬取的URL。

- >>> escape_ajax("www.example.com/ajax.html#!key=value")
- 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
- >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
- 'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
- >>> escape_ajax("www.example.com/ajax.html?#!key=value")
- 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
- >>> escape_ajax("www.example.com/ajax.html#!")
- 'www.example.com/ajax.html?_escaped_fragment_='
+ This function implements Google's "AJAX crawling scheme" which allows
+ search engines to crawl AJAX-based pages. It converts fragment identifiers
+ that start with an exclamation mark (!) to query parameters with the
+ "_escaped_fragment_" key.
+ 此函数实现了Google的"AJAX爬取方案",该方案允许搜索引擎爬取基于AJAX的页面。
+ 它将以感叹号(!)开头的片段标识符转换为带有"_escaped_fragment_"键的查询参数。

- URLs that are not "AJAX crawlable" (according to Google) returned as-is:
+ See: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

- >>> escape_ajax("www.example.com/ajax.html#key=value")
- 'www.example.com/ajax.html#key=value'
- >>> escape_ajax("www.example.com/ajax.html#")
- 'www.example.com/ajax.html#'
- >>> escape_ajax("www.example.com/ajax.html")
- 'www.example.com/ajax.html'
+ Args:
+ url: The URL to convert.
+ 要转换的URL。
+
+ Returns:
+ str: The crawlable URL with _escaped_fragment_ parameter if the URL
+ contains an AJAX fragment, or the original URL otherwise.
+ 如果URL包含AJAX片段,则返回带有_escaped_fragment_参数的可爬取URL,
+ 否则返回原始URL。
+
+ Examples:
+ >>> escape_ajax("www.example.com/ajax.html#!key=value")
+ 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
+ >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
+ 'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
+ >>> escape_ajax("www.example.com/ajax.html?#!key=value")
+ 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
+ >>> escape_ajax("www.example.com/ajax.html#!")
+ 'www.example.com/ajax.html?_escaped_fragment_='
+
+ URLs that are not "AJAX crawlable" (according to Google) returned as-is:
+
+ >>> escape_ajax("www.example.com/ajax.html#key=value")
+ 'www.example.com/ajax.html#key=value'
+ >>> escape_ajax("www.example.com/ajax.html#")
+ 'www.example.com/ajax.html#'
+ >>> escape_ajax("www.example.com/ajax.html")
+ 'www.example.com/ajax.html'
  """
+ # Split the URL into the part before the fragment and the fragment itself
+ # 将URL拆分为片段之前的部分和片段本身
  defrag, frag = urldefrag(url)
+
+ # If the fragment doesn't start with '!', it's not an AJAX URL
+ # 如果片段不以'!'开头,则它不是AJAX URL
  if not frag.startswith('!'):
  return url
+
+ # Convert the AJAX URL to a crawlable URL by adding the _escaped_fragment_ parameter
+ # 通过添加_escaped_fragment_参数将AJAX URL转换为可爬取的URL
  return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])


  def add_http_if_no_scheme(url):
- """Add http as the default scheme if it is missing from the url."""
+ """
+ Add http as the default scheme if it is missing from the URL.
+ 如果URL中缺少协议,则添加http作为默认协议。
+
+ This function checks if the URL already has a scheme (like http://, https://, ftp://).
+ If not, it adds 'http:' or 'http://' depending on whether the URL already has a netloc.
+ 此函数检查URL是否已有协议(如http://、https://、ftp://)。
+ 如果没有,它会添加'http:'或'http://',具体取决于URL是否已有网络位置。
+
+ Args:
+ url: The URL to check and possibly modify.
+ 要检查并可能修改的URL。
+
+ Returns:
+ str: The URL with a scheme, either the original one or with 'http' added.
+ 带有协议的URL,可能是原始协议或添加了'http'。
+
+ Examples:
+ >>> add_http_if_no_scheme("example.com")
+ 'http://example.com'
+ >>> add_http_if_no_scheme("http://example.com")
+ 'http://example.com'
+ >>> add_http_if_no_scheme("https://example.com")
+ 'https://example.com'
+ """
+ # Check if the URL already has a scheme
+ # 检查URL是否已有协议
  match = re.match(r"^\w+://", url, flags=re.I)
  if not match:
+ # Parse the URL to determine if it has a netloc
+ # 解析URL以确定它是否有网络位置
  parts = urlparse(url)
+ # Add the appropriate http scheme
+ # 添加适当的http协议
  scheme = "http:" if parts.netloc else "http://"
  url = scheme + url

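The public helpers documented in this hunk can be exercised directly. The following sketch just replays values from the doctests added above, so treat the expected results as illustrations of the documented behavior rather than an independent test suite:

    # Replaying the documented behavior of the helpers from this hunk.
    from aioscrapy.utils.url import (
        add_http_if_no_scheme,
        escape_ajax,
        parse_url,
        url_is_from_any_domain,
    )

    # Domain matching is case-insensitive and includes subdomains.
    assert url_is_from_any_domain("http://sub.example.com/", ["example.com"])
    assert not url_is_from_any_domain("http://example.org/", ["example.com"])

    # parse_url returns a urllib.parse.ParseResult and passes one through unchanged.
    parsed = parse_url("http://example.com/file.pdf")
    assert parse_url(parsed) is parsed

    # Missing schemes default to http, and #! fragments become _escaped_fragment_.
    assert add_http_if_no_scheme("example.com") == "http://example.com"
    assert escape_ajax("www.example.com/ajax.html#!key=value") == (
        "www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue"
    )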
@@ -84,6 +262,24 @@ def add_http_if_no_scheme(url):


  def _is_posix_path(string):
+ """
+ Check if a string looks like a POSIX filesystem path.
+ 检查字符串是否看起来像POSIX文件系统路径。
+
+ This function uses a regular expression to check if the string matches
+ common patterns for POSIX filesystem paths, such as absolute paths,
+ relative paths, and paths with home directory references.
+ 此函数使用正则表达式检查字符串是否匹配POSIX文件系统路径的常见模式,
+ 如绝对路径、相对路径和带有主目录引用的路径。
+
+ Args:
+ string: The string to check.
+ 要检查的字符串。
+
+ Returns:
+ bool: True if the string looks like a POSIX path, False otherwise.
+ 如果字符串看起来像POSIX路径,则为True,否则为False。
+ """
  return bool(
  re.match(
  r'''
@@ -106,13 +302,31 @@ def _is_posix_path(string):


  def _is_windows_path(string):
+ """
+ Check if a string looks like a Windows filesystem path.
+ 检查字符串是否看起来像Windows文件系统路径。
+
+ This function uses a regular expression to check if the string matches
+ common patterns for Windows filesystem paths, such as drive letters (C:\)
+ or UNC paths (\\server\share).
+ 此函数使用正则表达式检查字符串是否匹配Windows文件系统路径的常见模式,
+ 如驱动器号(C:\)或UNC路径(\\server\share)。
+
+ Args:
+ string: The string to check.
+ 要检查的字符串。
+
+ Returns:
+ bool: True if the string looks like a Windows path, False otherwise.
+ 如果字符串看起来像Windows路径,则为True,否则为False。
+ """
  return bool(
  re.match(
  r'''
  ^
  (
- [a-z]:\\
- | \\\\
+ [a-z]:\\ # Drive letter followed by :\
+ | \\\\ # Or UNC path starting with \\
  )
  ''',
  string,
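These two private path heuristics feed _is_filesystem_path and guess_scheme in the next hunk. A small usage sketch of the documented behavior follows; the exact file:// form comes from w3lib's any_to_uri, so the outputs in the comments are expectations, not guarantees:

    # guess_scheme (documented in the following hunk) routes filesystem-looking
    # input through any_to_uri and everything else through add_http_if_no_scheme.
    from aioscrapy.utils.url import guess_scheme

    print(guess_scheme("/etc/hosts"))           # expected: a file:// URI such as file:///etc/hosts
    print(guess_scheme(r"c:\temp\notes.txt"))   # drive-letter path, also expected to become a file:// URI
    print(guess_scheme("example.com/page"))     # not a path, expected: http://example.com/page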
@@ -122,38 +336,129 @@ def _is_windows_path(string):


  def _is_filesystem_path(string):
+ """
+ Check if a string looks like a filesystem path (either POSIX or Windows).
+ 检查字符串是否看起来像文件系统路径(POSIX或Windows)。
+
+ This function combines the checks for both POSIX and Windows paths.
+ 此函数结合了对POSIX和Windows路径的检查。
+
+ Args:
+ string: The string to check.
+ 要检查的字符串。
+
+ Returns:
+ bool: True if the string looks like a filesystem path, False otherwise.
+ 如果字符串看起来像文件系统路径,则为True,否则为False。
+ """
  return _is_posix_path(string) or _is_windows_path(string)


  def guess_scheme(url):
- """Add an URL scheme if missing: file:// for filepath-like input or
- http:// otherwise."""
+ """
+ Add an appropriate URL scheme if missing from the input.
+ 如果输入中缺少适当的URL协议,则添加它。
+
+ This function examines the input and adds an appropriate scheme:
+ - 'file://' for filesystem paths (both POSIX and Windows)
+ - 'http://' for other inputs that look like URLs
+ 此函数检查输入并添加适当的协议:
+ - 对于文件系统路径(POSIX和Windows),添加'file://'
+ - 对于看起来像URL的其他输入,添加'http://'
+
+ Args:
+ url: The URL or path to process.
+ 要处理的URL或路径。
+
+ Returns:
+ str: The URL with an appropriate scheme added if it was missing.
+ 添加了适当协议(如果缺少)的URL。
+
+ Note:
+ This function uses any_to_uri() from w3lib.url to convert filesystem
+ paths to proper file:// URLs.
+ 此函数使用w3lib.url中的any_to_uri()将文件系统路径转换为适当的file://URL。
+ """
+ # If it looks like a filesystem path, convert it to a file:// URL
+ # 如果它看起来像文件系统路径,将其转换为file://URL
  if _is_filesystem_path(url):
  return any_to_uri(url)
+ # Otherwise, add http:// if needed
+ # 否则,如果需要,添加http://
  return add_http_if_no_scheme(url)


  def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=False, strip_fragment=True):
+ """
+ Strip a URL string of some of its components.
+ 从URL字符串中去除某些组件。

- """Strip URL string from some of its components:
+ This function allows selectively removing parts of a URL, such as credentials,
+ default ports, paths, queries, and fragments. It's useful for normalizing URLs
+ or removing sensitive information.
+ 此函数允许选择性地移除URL的部分内容,如凭据、默认端口、路径、查询和片段。
+ 它对于规范化URL或移除敏感信息很有用。

- - ``strip_credentials`` removes "user:password@"
- - ``strip_default_port`` removes ":80" (resp. ":443", ":21")
- from http:// (resp. https://, ftp://) URLs
- - ``origin_only`` replaces path component with "/", also dropping
- query and fragment components ; it also strips credentials
- - ``strip_fragment`` drops any #fragment component
- """
+ Args:
+ url: The URL to strip.
+ 要处理的URL。
+ strip_credentials: Whether to remove "user:password@" from the URL.
+ 是否从URL中移除"user:password@"。
+ Defaults to True.
+ 默认为True。
+ strip_default_port: Whether to remove default ports (":80" for http,
+ ":443" for https, ":21" for ftp) from the URL.
+ 是否从URL中移除默认端口(http的":80",
+ https的":443",ftp的":21")。
+ Defaults to True.
+ 默认为True。
+ origin_only: Whether to keep only the origin part of the URL (scheme and netloc),
+ replacing the path with "/" and removing params, query, and fragment.
+ 是否只保留URL的源部分(协议和网络位置),
+ 将路径替换为"/"并移除参数、查询和片段。
+ This also implies strip_credentials=True.
+ 这也意味着strip_credentials=True。
+ Defaults to False.
+ 默认为False。
+ strip_fragment: Whether to remove any #fragment component from the URL.
+ 是否从URL中移除任何#片段组件。
+ Defaults to True.
+ 默认为True。
+
+ Returns:
+ str: The stripped URL.
+ 处理后的URL。

+ Examples:
+ >>> strip_url("http://user:pass@example.com:80/path?query#fragment")
+ 'http://example.com/path?query'
+ >>> strip_url("http://user:pass@example.com:80/path?query#fragment",
+ ... strip_credentials=False, strip_fragment=False)
+ 'http://user:pass@example.com/path?query#fragment'
+ >>> strip_url("http://user:pass@example.com:80/path?query#fragment",
+ ... origin_only=True)
+ 'http://example.com/'
+ """
+ # Parse the URL into its components
+ # 将URL解析为其组件
  parsed_url = urlparse(url)
  netloc = parsed_url.netloc
+
+ # Remove credentials if requested or if origin_only is True
+ # 如果请求或如果origin_only为True,则移除凭据
  if (strip_credentials or origin_only) and (parsed_url.username or parsed_url.password):
  netloc = netloc.split('@')[-1]
+
+ # Remove default ports if requested
+ # 如果请求,则移除默认端口
  if strip_default_port and parsed_url.port:
  if (parsed_url.scheme, parsed_url.port) in (('http', 80),
  ('https', 443),
  ('ftp', 21)):
  netloc = netloc.replace(f':{parsed_url.port}', '')
+
+ # Reconstruct the URL with the desired components
+ # 使用所需组件重建URL
  return urlunparse((
  parsed_url.scheme,
  netloc,