aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/httpobj.py CHANGED
@@ -1,4 +1,13 @@
- """Helper functions for scrapy.http objects (Request, Response)"""
+ """
+ HTTP object utility functions for aioscrapy.
+ aioscrapy的HTTP对象实用函数。
+
+ This module provides utility functions for working with HTTP objects (Request, Response)
+ in aioscrapy. It includes functions for parsing and caching URL information to improve
+ performance when the same URLs are processed multiple times.
+ 此模块提供了用于处理aioscrapy中HTTP对象(Request, Response)的实用函数。
+ 它包括用于解析和缓存URL信息的函数,以提高多次处理相同URL时的性能。
+ """

  from typing import Union
  from urllib.parse import urlparse, ParseResult
@@ -7,13 +16,56 @@ from weakref import WeakKeyDictionary
  from aioscrapy.http import Request, Response


+ # Cache for storing parsed URLs to avoid repeated parsing of the same URL
+ # Uses WeakKeyDictionary so entries are automatically removed when the Request/Response is garbage collected
+ # 用于存储已解析URL的缓存,以避免重复解析相同的URL
+ # 使用WeakKeyDictionary,因此当Request/Response被垃圾回收时,条目会自动删除
  _urlparse_cache: "WeakKeyDictionary[Union[Request, Response], ParseResult]" = WeakKeyDictionary()


  def urlparse_cached(request_or_response: Union[Request, Response]) -> ParseResult:
- """Return urlparse.urlparse caching the result, where the argument can be a
- Request or Response object
  """
+ Parse the URL of a Request or Response object with caching.
+ 解析Request或Response对象的URL,并进行缓存。
+
+ This function parses the URL of the given Request or Response object using
+ urllib.parse.urlparse and caches the result. If the same object is passed
+ again, the cached result is returned instead of re-parsing the URL.
+ 此函数使用urllib.parse.urlparse解析给定Request或Response对象的URL,
+ 并缓存结果。如果再次传递相同的对象,则返回缓存的结果,而不是重新解析URL。
+
+ The caching mechanism uses a WeakKeyDictionary, so the cache entries are
+ automatically removed when the Request or Response objects are garbage collected.
+ This prevents memory leaks while still providing performance benefits.
+ 缓存机制使用WeakKeyDictionary,因此当Request或Response对象被垃圾回收时,
+ 缓存条目会自动删除。这可以防止内存泄漏,同时仍然提供性能优势。
+
+ Args:
+ request_or_response: A Request or Response object whose URL will be parsed.
+ 将解析其URL的Request或Response对象。
+
+ Returns:
+ ParseResult: The parsed URL components (scheme, netloc, path, params,
+ query, fragment).
+ 解析的URL组件(scheme, netloc, path, params, query, fragment)。
+
+ Example:
+ >>> request = Request('https://example.com/path?query=value')
+ >>> parsed = urlparse_cached(request)
+ >>> parsed.netloc
+ 'example.com'
+ >>> parsed.path
+ '/path'
+ >>> parsed.query
+ 'query=value'
+ """
+ # Check if this object's URL has already been parsed and cached
+ # 检查此对象的URL是否已被解析和缓存
  if request_or_response not in _urlparse_cache:
+ # If not in cache, parse the URL and store the result
+ # 如果不在缓存中,解析URL并存储结果
  _urlparse_cache[request_or_response] = urlparse(request_or_response.url)
+
+ # Return the cached parse result
+ # 返回缓存的解析结果
  return _urlparse_cache[request_or_response]
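For reference, the caching pattern described in the new docstring can be reproduced with nothing but the standard library. The sketch below uses a stand-in Req class instead of aioscrapy's Request and is only meant to illustrate the WeakKeyDictionary behaviour, not the package's actual code:

from urllib.parse import urlparse, ParseResult
from weakref import WeakKeyDictionary


class Req:
    """Stand-in for aioscrapy's Request: any weak-referenceable object with a url attribute."""
    def __init__(self, url: str):
        self.url = url


_cache: "WeakKeyDictionary[Req, ParseResult]" = WeakKeyDictionary()


def urlparse_cached_demo(obj: Req) -> ParseResult:
    # Parse once per object; later calls with the same object hit the cache.
    if obj not in _cache:
        _cache[obj] = urlparse(obj.url)
    return _cache[obj]


req = Req('https://example.com/path?query=value')
assert urlparse_cached_demo(req).netloc == 'example.com'
assert urlparse_cached_demo(req) is urlparse_cached_demo(req)  # cached ParseResult reused
# When `req` is garbage collected, its cache entry disappears automatically.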
aioscrapy/utils/log.py CHANGED
@@ -1,3 +1,13 @@
+ """
+ Logging utilities for aioscrapy.
+ aioscrapy的日志工具。
+
+ This module provides logging functionality for aioscrapy using the loguru library.
+ It configures logging based on settings and provides a spider-aware logger.
+ 此模块使用loguru库为aioscrapy提供日志功能。
+ 它根据设置配置日志记录,并提供一个感知爬虫的日志记录器。
+ """
+
  import asyncio
  import sys
  import warnings
@@ -7,21 +17,46 @@ from loguru import logger as _logger

  from aioscrapy.settings import Settings

+ # Remove the default stderr handler to avoid duplicate logs
+ # 移除默认的stderr处理程序以避免重复日志
  for _handler in _logger._core.handlers.values():
  if _handler._name == '<stderr>':
  _logger.remove(_handler._id)


  def configure_logging(spider: Type["Spider"], settings: Settings):
+ """
+ Configure logging for a spider based on settings.
+ 根据设置为爬虫配置日志记录。
+
+ This function sets up logging handlers for a specific spider based on the provided settings.
+ It can configure logging to stderr and/or to a file, with various options like log level,
+ rotation, retention, etc.
+ 此函数根据提供的设置为特定爬虫设置日志处理程序。
+ 它可以配置日志记录到stderr和/或文件,具有各种选项,如日志级别、轮换、保留等。
+
+ Args:
+ spider: The spider instance for which to configure logging.
+ 要为其配置日志记录的爬虫实例。
+ settings: The settings object containing logging configuration.
+ 包含日志配置的设置对象。
+ """
+ # Get logging configuration from settings
+ # 从设置中获取日志配置
  formatter = settings.get('LOG_FORMAT')
  level = settings.get('LOG_LEVEL', 'INFO')
  enqueue = settings.get('ENQUEUE', True)
+
+ # Configure stderr logging if enabled
+ # 如果启用,配置stderr日志记录
  if settings.get('LOG_STDOUT', True):
  _logger.add(
  sys.stderr, format=formatter, level=level, enqueue=enqueue,
  filter=lambda record: record["extra"].get("spidername") == spider.name,
  )

+ # Configure file logging if a filename is provided
+ # 如果提供了文件名,配置文件日志记录
  if filename := settings.get('LOG_FILE'):
  rotation = settings.get('LOG_ROTATION', '20MB')
  retention = settings.get('LOG_RETENTION', 10)
@@ -34,18 +69,62 @@ def configure_logging(spider: Type["Spider"], settings: Settings):


  class AioScrapyLogger:
+ """
+ Spider-aware logger for aioscrapy.
+ aioscrapy的爬虫感知日志记录器。
+
+ This class provides a wrapper around the loguru logger that automatically
+ binds the current spider name to log records. This allows for filtering
+ logs by spider name and provides context about which spider generated each log.
+ 此类提供了loguru日志记录器的包装器,它自动将当前爬虫名称绑定到日志记录。
+ 这允许按爬虫名称过滤日志,并提供关于哪个爬虫生成了每条日志的上下文。
+
+ The logger methods (debug, info, warning, etc.) are dynamically accessed
+ through __getattr__, so they're not explicitly defined.
+ 日志记录器方法(debug、info、warning等)是通过__getattr__动态访问的,
+ 因此它们没有明确定义。
+ """
  __slots__ = (
  'catch', 'complete', 'critical', 'debug', 'error', 'exception',
  'info', 'log', 'patch', 'success', 'trace', 'warning'
  )

  def __getattr__(self, method):
+ """
+ Dynamically access logger methods with spider name binding.
+ 动态访问带有爬虫名称绑定的日志记录器方法。
+
+ This method intercepts attribute access to provide logger methods that
+ automatically include the current spider name in the log context.
+ 此方法拦截属性访问,以提供自动在日志上下文中包含当前爬虫名称的日志记录器方法。
+
+ Args:
+ method: The name of the logger method to access (e.g., 'info', 'debug').
+ 要访问的日志记录器方法的名称(例如,'info'、'debug')。
+
+ Returns:
+ The requested logger method, bound with the current spider name.
+ 请求的日志记录器方法,绑定了当前爬虫名称。
+
+ Note:
+ If the current task name cannot be determined, it falls back to the
+ original logger method without binding a spider name.
+ 如果无法确定当前任务名称,它会回退到原始日志记录器方法,而不绑定爬虫名称。
+ """
  try:
+ # Get the current task name as the spider name
+ # 获取当前任务名称作为爬虫名称
  spider_name = asyncio.current_task().get_name()
+ # Return the logger method bound with the spider name
+ # 返回绑定了爬虫名称的日志记录器方法
  return getattr(_logger.bind(spidername=spider_name), method)
  except Exception as e:
+ # Fall back to the original logger method if binding fails
+ # 如果绑定失败,回退到原始日志记录器方法
  warnings.warn(f'Error on get logger: {e}')
  return getattr(_logger, method)


+ # Create a singleton instance of the spider-aware logger
+ # 创建爬虫感知日志记录器的单例实例
  logger = AioScrapyLogger()
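A minimal, self-contained sketch of the task-name binding pattern described above, assuming only that loguru is installed; the spider name "demo_spider" and the handler configuration are illustrative, not aioscrapy's defaults:

import asyncio
import sys

from loguru import logger

logger.remove()  # drop existing handlers (aioscrapy removes only the default '<stderr>' one)
logger.add(
    sys.stderr,
    level='INFO',
    # Keep only records bound to the spider name we care about.
    filter=lambda record: record["extra"].get("spidername") == "demo_spider",
)


async def spider_task():
    # Mirrors AioScrapyLogger.__getattr__: bind the current task's name per call.
    name = asyncio.current_task().get_name()
    logger.bind(spidername=name).info("only tasks named 'demo_spider' reach this sink")


async def main():
    await asyncio.create_task(spider_task(), name="demo_spider")   # logged
    await asyncio.create_task(spider_task(), name="other_spider")  # filtered out


asyncio.run(main())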
aioscrapy/utils/misc.py CHANGED
@@ -1,4 +1,12 @@
- """Helper functions which don't fit anywhere else"""
+ """
+ Miscellaneous utility functions for aioscrapy.
+ aioscrapy的杂项实用函数。
+
+ This module contains helper functions that don't fit into other utility categories.
+ It provides functionality for module walking, object loading, and instance creation.
+ 此模块包含不适合其他实用程序类别的辅助函数。
+ 它提供了模块遍历、对象加载和实例创建的功能。
+ """

  from importlib import import_module
  from pkgutil import iter_modules
@@ -7,41 +15,112 @@ from aioscrapy.utils.tools import call_helper


  def walk_modules(path):
- """Loads a module and all its submodules from the given module path and
- returns them. If *any* module throws an exception while importing, that
- exception is thrown back.
-
- For example: walk_modules('aioscrapy.utils')
  """
+ Load a module and all its submodules recursively.
+ 递归加载模块及其所有子模块。
+
+ This function imports a module and all its submodules from the given module path
+ and returns them as a list. It performs a recursive traversal of the module tree.
+ If any module raises an exception during import, that exception is propagated.
+ 此函数从给定的模块路径导入模块及其所有子模块,并将它们作为列表返回。
+ 它执行模块树的递归遍历。如果任何模块在导入过程中引发异常,则该异常会被传播。
+
+ Args:
+ path: The module path to load (e.g., 'aioscrapy.utils').
+ 要加载的模块路径(例如,'aioscrapy.utils')。

+ Returns:
+ list: A list of imported modules, including the root module and all submodules.
+ 导入的模块列表,包括根模块和所有子模块。
+
+ Raises:
+ ImportError: If any module cannot be imported.
+ 如果任何模块无法导入。
+
+ Example:
+ >>> mods = walk_modules('aioscrapy.utils')
+ >>> 'aioscrapy.utils.url' in [mod.__name__ for mod in mods]
+ True
+ """
+ # Initialize the list of modules
+ # 初始化模块列表
  mods = []
+
+ # Import the root module
+ # 导入根模块
  mod = import_module(path)
  mods.append(mod)
+
+ # If the module is a package (has a __path__), process its submodules
+ # 如果模块是一个包(有__path__),处理其子模块
  if hasattr(mod, '__path__'):
  for _, subpath, ispkg in iter_modules(mod.__path__):
+ # Construct the full path for the submodule
+ # 构造子模块的完整路径
  fullpath = path + '.' + subpath
+
+ # If the submodule is a package, recursively walk it
+ # 如果子模块是一个包,递归遍历它
  if ispkg:
  mods += walk_modules(fullpath)
+ # Otherwise, import the submodule and add it to the list
+ # 否则,导入子模块并将其添加到列表中
  else:
  submod = import_module(fullpath)
  mods.append(submod)
+
  return mods


  def load_object(path: str):
- """Load an object given its absolute object path, and return it.
+ """
+ Load an object by its fully qualified name.
+ 通过完全限定名称加载对象。
+
+ This function imports a module and retrieves an object from it based on the
+ provided import path. The object can be a class, function, variable, or an
+ instance defined in the specified module.
+ 此函数根据提供的导入路径导入模块并从中检索对象。
+ 对象可以是在指定模块中定义的类、函数、变量或实例。
+
+ Args:
+ path: The absolute object path (e.g., 'aioscrapy.libs.downloader.redirect.RedirectMiddleware').
+ 对象的绝对路径(例如,'aioscrapy.libs.downloader.redirect.RedirectMiddleware')。

- The object can be the import path of a class, function, variable or an
- instance, e.g. 'aioscrapy.libs.downloader.redirect.RedirectMiddleware'..
+ Returns:
+ The loaded object.
+ 加载的对象。
+
+ Raises:
+ ValueError: If the path is not a full path (doesn't contain a dot).
+ 如果路径不是完整路径(不包含点)。
+ NameError: If the module doesn't define the specified object.
+ 如果模块未定义指定的对象。
+ ImportError: If the module cannot be imported.
+ 如果无法导入模块。
+
+ Example:
+ >>> middleware = load_object('aioscrapy.libs.downloader.redirect.RedirectMiddleware')
+ >>> middleware.__name__
+ 'RedirectMiddleware'
  """
+ # Find the last dot in the path to separate module path from object name
+ # 在路径中查找最后一个点,以将模块路径与对象名称分开
  try:
  dot = path.rindex('.')
  except ValueError:
  raise ValueError(f"Error loading object '{path}': not a full path")

+ # Split the path into module path and object name
+ # 将路径分割为模块路径和对象名称
  module, name = path[:dot], path[dot + 1:]
+
+ # Import the module
+ # 导入模块
  mod = import_module(module)

+ # Get the object from the module
+ # 从模块中获取对象
  try:
  obj = getattr(mod, name)
  except AttributeError:
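Before the next hunk, a quick usage check of the two helpers documented above, pointed at stdlib targets so it runs anywhere aioscrapy is importable; the paths 'json' and 'json.decoder.JSONDecoder' are only examples:

from aioscrapy.utils.misc import load_object, walk_modules

# Resolve a dotted path to the object it names.
decoder_cls = load_object('json.decoder.JSONDecoder')
assert decoder_cls.__name__ == 'JSONDecoder'

# Import a package and every submodule underneath it.
mods = walk_modules('json')
assert 'json.decoder' in [m.__name__ for m in mods]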
@@ -51,49 +130,138 @@ def load_object(path: str):


  async def create_instance(objcls, settings, crawler, *args, spider=None, **kwargs):
- """Construct a class instance using its ``from_crawler`` or
- ``from_settings`` constructors, if available.
+ """
+ Create an instance of a class using its factory methods.
+ 使用类的工厂方法创建类的实例。

- At least one of ``settings`` and ``crawler`` needs to be different from
- ``None``. If ``settings `` is ``None``, ``crawler.settings`` will be used.
- If ``crawler`` is ``None``, only the ``from_settings`` constructor will be
- tried.
+ This function tries to create an instance of the given class using one of its
+ factory methods in the following order of preference:
+ 1. from_crawler(crawler, *args, **kwargs) - if crawler is provided
+ 2. from_spider(spider, *args, **kwargs) - if spider is provided
+ 3. from_settings(settings, *args, **kwargs) - if settings is provided
+ 4. Regular constructor: objcls(*args, **kwargs) - as a fallback

- ``*args`` and ``**kwargs`` are forwarded to the constructors.
+ 此函数尝试使用给定类的一个工厂方法创建实例,优先顺序如下:
+ 1. from_crawler(crawler, *args, **kwargs) - 如果提供了crawler
+ 2. from_spider(spider, *args, **kwargs) - 如果提供了spider
+ 3. from_settings(settings, *args, **kwargs) - 如果提供了settings
+ 4. 常规构造函数:objcls(*args, **kwargs) - 作为后备选项

- Raises ``ValueError`` if both ``settings`` and ``crawler`` are ``None``.
+ Args:
+ objcls: The class to instantiate.
+ 要实例化的类。
+ settings: The settings object to use. Can be None if crawler is provided.
+ 要使用的设置对象。如果提供了crawler,可以为None。
+ crawler: The crawler object to use. Can be None.
+ 要使用的爬虫对象。可以为None。
+ *args: Positional arguments to pass to the constructor.
+ 传递给构造函数的位置参数。
+ spider: The spider object to use. Can be None.
+ 要使用的蜘蛛对象。可以为None。
+ **kwargs: Keyword arguments to pass to the constructor.
+ 传递给构造函数的关键字参数。

- .. versionchanged:: 2.2
- Raises ``TypeError`` if the resulting instance is ``None`` (e.g. if an
- extension has not been implemented correctly).
+ Returns:
+ An instance of the specified class.
+ 指定类的实例。
+
+ Raises:
+ ValueError: If settings, crawler, and spider are all None.
+ 如果settings、crawler和spider都为None。
+ TypeError: If the factory method returns None.
+ 如果工厂方法返回None。
  """
+ # Ensure we have settings from either crawler, spider, or directly provided
+ # 确保我们从crawler、spider或直接提供的参数中获取设置
  if settings is None:
  if crawler is None and spider is None:
  raise ValueError("Specify at least one of settings, crawler and spider.")

+ # Get settings from crawler or spider
+ # 从crawler或spider获取设置
  settings = crawler and crawler.settings or spider and spider.settings
+ # Get spider from crawler if not directly provided
+ # 如果没有直接提供,从crawler获取spider
  spider = spider or crawler and crawler.spider

+ # Try to create instance using the appropriate factory method
+ # 尝试使用适当的工厂方法创建实例
  if crawler and hasattr(objcls, 'from_crawler'):
+ # Use from_crawler if available and crawler is provided
+ # 如果可用且提供了crawler,则使用from_crawler
  instance = await call_helper(objcls.from_crawler, crawler, *args, **kwargs)
  method_name = 'from_crawler'
  elif spider and hasattr(objcls, 'from_spider'):
+ # Use from_spider if available and spider is provided
+ # 如果可用且提供了spider,则使用from_spider
  instance = await call_helper(objcls.from_spider, spider, *args, **kwargs)
  method_name = 'from_spider'
  elif hasattr(objcls, 'from_settings'):
+ # Use from_settings if available
+ # 如果可用,则使用from_settings
  instance = await call_helper(objcls.from_settings, settings, *args, **kwargs)
  method_name = 'from_settings'
  else:
+ # Fall back to regular constructor
+ # 回退到常规构造函数
  instance = objcls(*args, **kwargs)
  method_name = '__new__'
+
+ # Ensure the factory method returned a valid instance
+ # 确保工厂方法返回了有效的实例
  if instance is None:
  raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
+
  return instance


  async def load_instance(clspath: str, *args, settings=None, spider=None, crawler=None, **kwargs):
+ """
+ Load a class by its path and create an instance of it.
+ 通过路径加载类并创建其实例。
+
+ This function combines load_object() and create_instance() to load a class
+ by its fully qualified name and then create an instance of it using the
+ appropriate factory method.
+ 此函数结合了load_object()和create_instance(),通过完全限定名称加载类,
+ 然后使用适当的工厂方法创建其实例。
+
+ Args:
+ clspath: The fully qualified class path (e.g., 'aioscrapy.libs.downloader.redirect.RedirectMiddleware').
+ 完全限定的类路径(例如,'aioscrapy.libs.downloader.redirect.RedirectMiddleware')。
+ *args: Positional arguments to pass to the constructor.
+ 传递给构造函数的位置参数。
+ settings: The settings object to use. Can be None if crawler is provided.
+ 要使用的设置对象。如果提供了crawler,可以为None。
+ spider: The spider object to use. Can be None.
+ 要使用的蜘蛛对象。可以为None。
+ crawler: The crawler object to use. Can be None.
+ 要使用的爬虫对象。可以为None。
+ **kwargs: Keyword arguments to pass to the constructor.
+ 传递给构造函数的关键字参数。
+
+ Returns:
+ An instance of the specified class.
+ 指定类的实例。
+
+ Raises:
+ ValueError: If settings, crawler, and spider are all None.
+ 如果settings、crawler和spider都为None。
+ TypeError: If the factory method returns None.
+ 如果工厂方法返回None。
+ ImportError: If the class cannot be imported.
+ 如果无法导入类。
+ NameError: If the module doesn't define the specified class.
+ 如果模块未定义指定的类。
+ """
+ # First load the class by its path
+ # 首先通过路径加载类
+ cls = load_object(clspath)
+
+ # Then create an instance of the class
+ # 然后创建类的实例
  return await create_instance(
- load_object(clspath),
+ cls,
  settings,
  crawler,
  *args,
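The factory-method preference order documented for create_instance can be sketched synchronously without aioscrapy. The real helper is async and awaits each factory via call_helper; the build function and the SettingsOnlyExtension class below are illustrative stand-ins, not part of the package:

def build(objcls, settings=None, crawler=None, spider=None, *args, **kwargs):
    # Same resolution order as create_instance: crawler, then spider, then settings, then __init__.
    if settings is None:
        if crawler is None and spider is None:
            raise ValueError("Specify at least one of settings, crawler and spider.")
        settings = crawler and crawler.settings or spider and spider.settings
    spider = spider or crawler and crawler.spider

    if crawler and hasattr(objcls, 'from_crawler'):
        instance = objcls.from_crawler(crawler, *args, **kwargs)
    elif spider and hasattr(objcls, 'from_spider'):
        instance = objcls.from_spider(spider, *args, **kwargs)
    elif hasattr(objcls, 'from_settings'):
        instance = objcls.from_settings(settings, *args, **kwargs)
    else:
        instance = objcls(*args, **kwargs)

    if instance is None:
        raise TypeError(f"{objcls.__qualname__} factory returned None")
    return instance


class SettingsOnlyExtension:
    def __init__(self, conn_timeout):
        self.conn_timeout = conn_timeout

    @classmethod
    def from_settings(cls, settings):
        # Chosen because no from_crawler/from_spider is defined on this class.
        return cls(settings.get('CONN_TIMEOUT', 30))


ext = build(SettingsOnlyExtension, settings={'CONN_TIMEOUT': 10})
assert ext.conn_timeout == 10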
aioscrapy/utils/ossignal.py CHANGED
@@ -1,23 +1,85 @@
+ """
+ Operating system signal utilities for aioscrapy.
+ aioscrapy的操作系统信号实用工具。
+
+ This module provides utilities for working with operating system signals in aioscrapy.
+ It includes functions for installing signal handlers and mapping between signal
+ numbers and their names.
+ 此模块提供了用于处理aioscrapy中操作系统信号的实用工具。
+ 它包括用于安装信号处理程序以及在信号编号和其名称之间映射的函数。
+ """
+
  import signal


+ # Dictionary mapping signal numbers to their names (e.g., {2: 'SIGINT', 15: 'SIGTERM'})
+ # 将信号编号映射到其名称的字典(例如,{2: 'SIGINT', 15: 'SIGTERM'})
  signal_names = {}
+
+ # Populate the signal_names dictionary by iterating through all attributes in the signal module
+ # 通过迭代信号模块中的所有属性来填充signal_names字典
  for signame in dir(signal):
+ # Only process attributes that start with 'SIG' but not 'SIG_'
+ # (SIG_ prefixed constants are signal handlers, not signal types)
+ # 只处理以'SIG'开头但不以'SIG_'开头的属性
+ # (SIG_前缀的常量是信号处理程序,而不是信号类型)
  if signame.startswith('SIG') and not signame.startswith('SIG_'):
+ # Get the signal number for this signal name
+ # 获取此信号名称的信号编号
  signum = getattr(signal, signame)
+ # Only add to the dictionary if it's an integer (a valid signal number)
+ # 只有当它是整数(有效的信号编号)时才添加到字典中
  if isinstance(signum, int):
  signal_names[signum] = signame


  def install_shutdown_handlers(function, override_sigint=True):
- """Install the given function as a signal handler for all common shutdown
- signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the
- SIGINT handler won't be install if there is already a handler in place
- (e.g. Pdb)
  """
+ Install a function as a signal handler for common shutdown signals.
+ 为常见的关闭信号安装函数作为信号处理程序。
+
+ This function installs the provided function as a handler for common shutdown
+ signals such as SIGTERM (terminate), SIGINT (keyboard interrupt), and SIGBREAK
+ (Ctrl-Break on Windows). This is useful for graceful shutdown of applications.
+ 此函数将提供的函数安装为常见关闭信号的处理程序,如SIGTERM(终止)、
+ SIGINT(键盘中断)和SIGBREAK(Windows上的Ctrl-Break)。
+ 这对于应用程序的优雅关闭很有用。
+
+ Args:
+ function: The function to be called when a shutdown signal is received.
+ 当收到关闭信号时要调用的函数。
+ This function should accept two parameters: signal number and frame.
+ 此函数应接受两个参数:信号编号和帧。
+ override_sigint: Whether to override an existing SIGINT handler.
+ 是否覆盖现有的SIGINT处理程序。
+ If False, the SIGINT handler won't be installed if there's
+ already a custom handler in place (e.g., a debugger like Pdb).
+ 如果为False,则在已有自定义处理程序(例如Pdb调试器)的情况下
+ 不会安装SIGINT处理程序。
+ Defaults to True.
+ 默认为True。
+
+ Example:
+ >>> def handle_shutdown(signum, frame):
+ ... print(f"Received signal {signal_names.get(signum, signum)}")
+ ... # Perform cleanup operations
+ ... sys.exit(0)
+ >>> install_shutdown_handlers(handle_shutdown)
+ """
+ # Always install handler for SIGTERM (terminate signal)
+ # 始终为SIGTERM(终止信号)安装处理程序
  signal.signal(signal.SIGTERM, function)
+
+ # Install handler for SIGINT (keyboard interrupt) if:
+ # - The current handler is the default handler, or
+ # - override_sigint is True (forcing override of any existing handler)
+ # 在以下情况下为SIGINT(键盘中断)安装处理程序:
+ # - 当前处理程序是默认处理程序,或
+ # - override_sigint为True(强制覆盖任何现有处理程序)
  if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint:
  signal.signal(signal.SIGINT, function)
- # Catch Ctrl-Break in windows
+
+ # Install handler for SIGBREAK (Ctrl-Break) on Windows if available
+ # 如果可用,在Windows上为SIGBREAK(Ctrl-Break)安装处理程序
  if hasattr(signal, 'SIGBREAK'):
  signal.signal(signal.SIGBREAK, function)
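A hedged sketch of wiring install_shutdown_handlers into a long-running process: the handler flips a flag that the loop checks, which is one common way to stop cleanly. The running flag and the loop are illustrative, not part of aioscrapy's API; only the two imported names are taken from the module shown above:

import time

from aioscrapy.utils.ossignal import install_shutdown_handlers, signal_names


running = True


def handle_shutdown(signum, frame):
    # Record the request to stop; the main loop exits on its next check.
    global running
    print(f"Received {signal_names.get(signum, signum)}, shutting down...")
    running = False


install_shutdown_handlers(handle_shutdown)

while running:  # stand-in for the real crawl loop
    time.sleep(0.1)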