aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/httpobj.py
CHANGED
@@ -1,4 +1,13 @@
-"""
+"""
+HTTP object utility functions for aioscrapy.
+aioscrapy的HTTP对象实用函数。
+
+This module provides utility functions for working with HTTP objects (Request, Response)
+in aioscrapy. It includes functions for parsing and caching URL information to improve
+performance when the same URLs are processed multiple times.
+此模块提供了用于处理aioscrapy中HTTP对象(Request, Response)的实用函数。
+它包括用于解析和缓存URL信息的函数,以提高多次处理相同URL时的性能。
+"""
 
 from typing import Union
 from urllib.parse import urlparse, ParseResult
@@ -7,13 +16,56 @@ from weakref import WeakKeyDictionary
 from aioscrapy.http import Request, Response
 
 
+# Cache for storing parsed URLs to avoid repeated parsing of the same URL
+# Uses WeakKeyDictionary so entries are automatically removed when the Request/Response is garbage collected
+# 用于存储已解析URL的缓存,以避免重复解析相同的URL
+# 使用WeakKeyDictionary,因此当Request/Response被垃圾回收时,条目会自动删除
 _urlparse_cache: "WeakKeyDictionary[Union[Request, Response], ParseResult]" = WeakKeyDictionary()
 
 
 def urlparse_cached(request_or_response: Union[Request, Response]) -> ParseResult:
-    """Return urlparse.urlparse caching the result, where the argument can be a
-    Request or Response object
     """
+    Parse the URL of a Request or Response object with caching.
+    解析Request或Response对象的URL,并进行缓存。
+
+    This function parses the URL of the given Request or Response object using
+    urllib.parse.urlparse and caches the result. If the same object is passed
+    again, the cached result is returned instead of re-parsing the URL.
+    此函数使用urllib.parse.urlparse解析给定Request或Response对象的URL,
+    并缓存结果。如果再次传递相同的对象,则返回缓存的结果,而不是重新解析URL。
+
+    The caching mechanism uses a WeakKeyDictionary, so the cache entries are
+    automatically removed when the Request or Response objects are garbage collected.
+    This prevents memory leaks while still providing performance benefits.
+    缓存机制使用WeakKeyDictionary,因此当Request或Response对象被垃圾回收时,
+    缓存条目会自动删除。这可以防止内存泄漏,同时仍然提供性能优势。
+
+    Args:
+        request_or_response: A Request or Response object whose URL will be parsed.
+            将解析其URL的Request或Response对象。
+
+    Returns:
+        ParseResult: The parsed URL components (scheme, netloc, path, params,
+            query, fragment).
+            解析的URL组件(scheme, netloc, path, params, query, fragment)。
+
+    Example:
+        >>> request = Request('https://example.com/path?query=value')
+        >>> parsed = urlparse_cached(request)
+        >>> parsed.netloc
+        'example.com'
+        >>> parsed.path
+        '/path'
+        >>> parsed.query
+        'query=value'
+    """
+    # Check if this object's URL has already been parsed and cached
+    # 检查此对象的URL是否已被解析和缓存
     if request_or_response not in _urlparse_cache:
+        # If not in cache, parse the URL and store the result
+        # 如果不在缓存中,解析URL并存储结果
         _urlparse_cache[request_or_response] = urlparse(request_or_response.url)
+
+    # Return the cached parse result
+    # 返回缓存的解析结果
     return _urlparse_cache[request_or_response]
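Note: the change above memoizes urlparse per Request/Response object, keyed weakly on the object itself. A minimal runnable sketch of the same pattern, using a hypothetical FakeRequest stand-in so it runs without aioscrapy installed:

from urllib.parse import urlparse, ParseResult
from weakref import WeakKeyDictionary


class FakeRequest:
    """Hypothetical stand-in for aioscrapy.http.Request (illustration only)."""

    def __init__(self, url: str):
        self.url = url


_urlparse_cache: "WeakKeyDictionary[FakeRequest, ParseResult]" = WeakKeyDictionary()


def urlparse_cached(req: FakeRequest) -> ParseResult:
    # Parse each object's URL only once; later calls return the cached result
    if req not in _urlparse_cache:
        _urlparse_cache[req] = urlparse(req.url)
    return _urlparse_cache[req]


req = FakeRequest('https://example.com/path?query=value')
assert urlparse_cached(req) is urlparse_cached(req)  # second call hits the cache
print(urlparse_cached(req).netloc)  # example.com
del req  # the weak-keyed entry is dropped once the request is garbage collected

Because the key is the object rather than the URL string, cache hits are driven by object identity, and the WeakKeyDictionary keeps cached entries from pinning requests in memory.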
aioscrapy/utils/log.py
CHANGED
@@ -1,3 +1,13 @@
+"""
+Logging utilities for aioscrapy.
+aioscrapy的日志工具。
+
+This module provides logging functionality for aioscrapy using the loguru library.
+It configures logging based on settings and provides a spider-aware logger.
+此模块使用loguru库为aioscrapy提供日志功能。
+它根据设置配置日志记录,并提供一个感知爬虫的日志记录器。
+"""
+
 import asyncio
 import sys
 import warnings
@@ -7,21 +17,46 @@ from loguru import logger as _logger
 
 from aioscrapy.settings import Settings
 
+# Remove the default stderr handler to avoid duplicate logs
+# 移除默认的stderr处理程序以避免重复日志
 for _handler in _logger._core.handlers.values():
     if _handler._name == '<stderr>':
         _logger.remove(_handler._id)
 
 
 def configure_logging(spider: Type["Spider"], settings: Settings):
+    """
+    Configure logging for a spider based on settings.
+    根据设置为爬虫配置日志记录。
+
+    This function sets up logging handlers for a specific spider based on the provided settings.
+    It can configure logging to stderr and/or to a file, with various options like log level,
+    rotation, retention, etc.
+    此函数根据提供的设置为特定爬虫设置日志处理程序。
+    它可以配置日志记录到stderr和/或文件,具有各种选项,如日志级别、轮换、保留等。
+
+    Args:
+        spider: The spider instance for which to configure logging.
+            要为其配置日志记录的爬虫实例。
+        settings: The settings object containing logging configuration.
+            包含日志配置的设置对象。
+    """
+    # Get logging configuration from settings
+    # 从设置中获取日志配置
     formatter = settings.get('LOG_FORMAT')
     level = settings.get('LOG_LEVEL', 'INFO')
     enqueue = settings.get('ENQUEUE', True)
+
+    # Configure stderr logging if enabled
+    # 如果启用,配置stderr日志记录
     if settings.get('LOG_STDOUT', True):
         _logger.add(
             sys.stderr, format=formatter, level=level, enqueue=enqueue,
             filter=lambda record: record["extra"].get("spidername") == spider.name,
         )
 
+    # Configure file logging if a filename is provided
+    # 如果提供了文件名,配置文件日志记录
     if filename := settings.get('LOG_FILE'):
         rotation = settings.get('LOG_ROTATION', '20MB')
         retention = settings.get('LOG_RETENTION', 10)
@@ -34,18 +69,62 @@ def configure_logging(spider: Type["Spider"], settings: Settings):
 
 
 class AioScrapyLogger:
+    """
+    Spider-aware logger for aioscrapy.
+    aioscrapy的爬虫感知日志记录器。
+
+    This class provides a wrapper around the loguru logger that automatically
+    binds the current spider name to log records. This allows for filtering
+    logs by spider name and provides context about which spider generated each log.
+    此类提供了loguru日志记录器的包装器,它自动将当前爬虫名称绑定到日志记录。
+    这允许按爬虫名称过滤日志,并提供关于哪个爬虫生成了每条日志的上下文。
+
+    The logger methods (debug, info, warning, etc.) are dynamically accessed
+    through __getattr__, so they're not explicitly defined.
+    日志记录器方法(debug、info、warning等)是通过__getattr__动态访问的,
+    因此它们没有明确定义。
+    """
     __slots__ = (
         'catch', 'complete', 'critical', 'debug', 'error', 'exception',
         'info', 'log', 'patch', 'success', 'trace', 'warning'
     )
 
     def __getattr__(self, method):
+        """
+        Dynamically access logger methods with spider name binding.
+        动态访问带有爬虫名称绑定的日志记录器方法。
+
+        This method intercepts attribute access to provide logger methods that
+        automatically include the current spider name in the log context.
+        此方法拦截属性访问,以提供自动在日志上下文中包含当前爬虫名称的日志记录器方法。
+
+        Args:
+            method: The name of the logger method to access (e.g., 'info', 'debug').
+                要访问的日志记录器方法的名称(例如,'info'、'debug')。
+
+        Returns:
+            The requested logger method, bound with the current spider name.
+            请求的日志记录器方法,绑定了当前爬虫名称。
+
+        Note:
+            If the current task name cannot be determined, it falls back to the
+            original logger method without binding a spider name.
+            如果无法确定当前任务名称,它会回退到原始日志记录器方法,而不绑定爬虫名称。
+        """
        try:
+            # Get the current task name as the spider name
+            # 获取当前任务名称作为爬虫名称
            spider_name = asyncio.current_task().get_name()
+            # Return the logger method bound with the spider name
+            # 返回绑定了爬虫名称的日志记录器方法
            return getattr(_logger.bind(spidername=spider_name), method)
        except Exception as e:
+            # Fall back to the original logger method if binding fails
+            # 如果绑定失败,回退到原始日志记录器方法
            warnings.warn(f'Error on get logger: {e}')
            return getattr(_logger, method)
 
 
+# Create a singleton instance of the spider-aware logger
+# 创建爬虫感知日志记录器的单例实例
 logger = AioScrapyLogger()
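Note: the per-spider routing added above combines three pieces: a named asyncio task, loguru's bind() stamping records with a spidername field, and a sink filter matching on that field. A minimal sketch of the mechanism (the task name demo_spider is an arbitrary example, not an aioscrapy convention beyond what the diff shows):

import asyncio
import sys

from loguru import logger

logger.remove()  # drop the default sink, as the module does for <stderr>
logger.add(
    sys.stderr,
    format="{time:HH:mm:ss} {level} [{extra[spidername]}] {message}",
    # Deliver a record to this sink only if it was emitted by "demo_spider"
    filter=lambda record: record["extra"].get("spidername") == "demo_spider",
)


async def crawl():
    # AioScrapyLogger.__getattr__ derives the spider name from the task name
    spider_name = asyncio.current_task().get_name()
    logger.bind(spidername=spider_name).info("parsed one page")


async def main():
    # Naming the task is what makes the spider-name lookup work
    await asyncio.create_task(crawl(), name="demo_spider")


asyncio.run(main())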
aioscrapy/utils/misc.py
CHANGED
@@ -1,4 +1,12 @@
-"""
+"""
+Miscellaneous utility functions for aioscrapy.
+aioscrapy的杂项实用函数。
+
+This module contains helper functions that don't fit into other utility categories.
+It provides functionality for module walking, object loading, and instance creation.
+此模块包含不适合其他实用程序类别的辅助函数。
+它提供了模块遍历、对象加载和实例创建的功能。
+"""
 
 from importlib import import_module
 from pkgutil import iter_modules
@@ -7,41 +15,112 @@ from aioscrapy.utils.tools import call_helper
 
 
 def walk_modules(path):
-    """Loads a module and all its submodules from the given module path and
-    returns them. If *any* module throws an exception while importing, that
-    exception is thrown back.
-
-    For example: walk_modules('aioscrapy.utils')
     """
+    Load a module and all its submodules recursively.
+    递归加载模块及其所有子模块。
+
+    This function imports a module and all its submodules from the given module path
+    and returns them as a list. It performs a recursive traversal of the module tree.
+    If any module raises an exception during import, that exception is propagated.
+    此函数从给定的模块路径导入模块及其所有子模块,并将它们作为列表返回。
+    它执行模块树的递归遍历。如果任何模块在导入过程中引发异常,则该异常会被传播。
+
+    Args:
+        path: The module path to load (e.g., 'aioscrapy.utils').
+            要加载的模块路径(例如,'aioscrapy.utils')。
 
+    Returns:
+        list: A list of imported modules, including the root module and all submodules.
+            导入的模块列表,包括根模块和所有子模块。
+
+    Raises:
+        ImportError: If any module cannot be imported.
+            如果任何模块无法导入。
+
+    Example:
+        >>> mods = walk_modules('aioscrapy.utils')
+        >>> 'aioscrapy.utils.url' in [mod.__name__ for mod in mods]
+        True
+    """
+    # Initialize the list of modules
+    # 初始化模块列表
     mods = []
+
+    # Import the root module
+    # 导入根模块
     mod = import_module(path)
     mods.append(mod)
+
+    # If the module is a package (has a __path__), process its submodules
+    # 如果模块是一个包(有__path__),处理其子模块
     if hasattr(mod, '__path__'):
         for _, subpath, ispkg in iter_modules(mod.__path__):
+            # Construct the full path for the submodule
+            # 构造子模块的完整路径
             fullpath = path + '.' + subpath
+
+            # If the submodule is a package, recursively walk it
+            # 如果子模块是一个包,递归遍历它
             if ispkg:
                 mods += walk_modules(fullpath)
+            # Otherwise, import the submodule and add it to the list
+            # 否则,导入子模块并将其添加到列表中
             else:
                 submod = import_module(fullpath)
                 mods.append(submod)
+
     return mods
 
 
 def load_object(path: str):
-    """
+    """
+    Load an object by its fully qualified name.
+    通过完全限定名称加载对象。
+
+    This function imports a module and retrieves an object from it based on the
+    provided import path. The object can be a class, function, variable, or an
+    instance defined in the specified module.
+    此函数根据提供的导入路径导入模块并从中检索对象。
+    对象可以是在指定模块中定义的类、函数、变量或实例。
+
+    Args:
+        path: The absolute object path (e.g., 'aioscrapy.libs.downloader.redirect.RedirectMiddleware').
+            对象的绝对路径(例如,'aioscrapy.libs.downloader.redirect.RedirectMiddleware')。
 
-
-
+    Returns:
+        The loaded object.
+        加载的对象。
+
+    Raises:
+        ValueError: If the path is not a full path (doesn't contain a dot).
+            如果路径不是完整路径(不包含点)。
+        NameError: If the module doesn't define the specified object.
+            如果模块未定义指定的对象。
+        ImportError: If the module cannot be imported.
+            如果无法导入模块。
+
+    Example:
+        >>> middleware = load_object('aioscrapy.libs.downloader.redirect.RedirectMiddleware')
+        >>> middleware.__name__
+        'RedirectMiddleware'
     """
+    # Find the last dot in the path to separate module path from object name
+    # 在路径中查找最后一个点,以将模块路径与对象名称分开
     try:
         dot = path.rindex('.')
     except ValueError:
         raise ValueError(f"Error loading object '{path}': not a full path")
 
+    # Split the path into module path and object name
+    # 将路径分割为模块路径和对象名称
     module, name = path[:dot], path[dot + 1:]
+
+    # Import the module
+    # 导入模块
     mod = import_module(module)
 
+    # Get the object from the module
+    # 从模块中获取对象
     try:
         obj = getattr(mod, name)
     except AttributeError:
@@ -51,49 +130,138 @@ def load_object(path: str):
 
 
 async def create_instance(objcls, settings, crawler, *args, spider=None, **kwargs):
-    """
-
+    """
+    Create an instance of a class using its factory methods.
+    使用类的工厂方法创建类的实例。
 
-
-
-
-
+    This function tries to create an instance of the given class using one of its
+    factory methods in the following order of preference:
+    1. from_crawler(crawler, *args, **kwargs) - if crawler is provided
+    2. from_spider(spider, *args, **kwargs) - if spider is provided
+    3. from_settings(settings, *args, **kwargs) - if settings is provided
+    4. Regular constructor: objcls(*args, **kwargs) - as a fallback
 
-
+    此函数尝试使用给定类的一个工厂方法创建实例,优先顺序如下:
+    1. from_crawler(crawler, *args, **kwargs) - 如果提供了crawler
+    2. from_spider(spider, *args, **kwargs) - 如果提供了spider
+    3. from_settings(settings, *args, **kwargs) - 如果提供了settings
+    4. 常规构造函数:objcls(*args, **kwargs) - 作为后备选项
 
-
+    Args:
+        objcls: The class to instantiate.
+            要实例化的类。
+        settings: The settings object to use. Can be None if crawler is provided.
+            要使用的设置对象。如果提供了crawler,可以为None。
+        crawler: The crawler object to use. Can be None.
+            要使用的爬虫对象。可以为None。
+        *args: Positional arguments to pass to the constructor.
+            传递给构造函数的位置参数。
+        spider: The spider object to use. Can be None.
+            要使用的蜘蛛对象。可以为None。
+        **kwargs: Keyword arguments to pass to the constructor.
+            传递给构造函数的关键字参数。
 
-
-
-
+    Returns:
+        An instance of the specified class.
+        指定类的实例。
+
+    Raises:
+        ValueError: If settings, crawler, and spider are all None.
+            如果settings、crawler和spider都为None。
+        TypeError: If the factory method returns None.
+            如果工厂方法返回None。
     """
+    # Ensure we have settings from either crawler, spider, or directly provided
+    # 确保我们从crawler、spider或直接提供的参数中获取设置
     if settings is None:
         if crawler is None and spider is None:
             raise ValueError("Specify at least one of settings, crawler and spider.")
+
+        # Get settings from crawler or spider
+        # 从crawler或spider获取设置
         settings = crawler and crawler.settings or spider and spider.settings
+
+    # Get spider from crawler if not directly provided
+    # 如果没有直接提供,从crawler获取spider
     spider = spider or crawler and crawler.spider
 
+    # Try to create instance using the appropriate factory method
+    # 尝试使用适当的工厂方法创建实例
     if crawler and hasattr(objcls, 'from_crawler'):
+        # Use from_crawler if available and crawler is provided
+        # 如果可用且提供了crawler,则使用from_crawler
         instance = await call_helper(objcls.from_crawler, crawler, *args, **kwargs)
         method_name = 'from_crawler'
     elif spider and hasattr(objcls, 'from_spider'):
+        # Use from_spider if available and spider is provided
+        # 如果可用且提供了spider,则使用from_spider
        instance = await call_helper(objcls.from_spider, spider, *args, **kwargs)
        method_name = 'from_spider'
    elif hasattr(objcls, 'from_settings'):
+        # Use from_settings if available
+        # 如果可用,则使用from_settings
        instance = await call_helper(objcls.from_settings, settings, *args, **kwargs)
        method_name = 'from_settings'
    else:
+        # Fall back to regular constructor
+        # 回退到常规构造函数
        instance = objcls(*args, **kwargs)
        method_name = '__new__'
+
+    # Ensure the factory method returned a valid instance
+    # 确保工厂方法返回了有效的实例
    if instance is None:
        raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
+
    return instance
 
 
 async def load_instance(clspath: str, *args, settings=None, spider=None, crawler=None, **kwargs):
+    """
+    Load a class by its path and create an instance of it.
+    通过路径加载类并创建其实例。
+
+    This function combines load_object() and create_instance() to load a class
+    by its fully qualified name and then create an instance of it using the
+    appropriate factory method.
+    此函数结合了load_object()和create_instance(),通过完全限定名称加载类,
+    然后使用适当的工厂方法创建其实例。
+
+    Args:
+        clspath: The fully qualified class path (e.g., 'aioscrapy.libs.downloader.redirect.RedirectMiddleware').
+            完全限定的类路径(例如,'aioscrapy.libs.downloader.redirect.RedirectMiddleware')。
+        *args: Positional arguments to pass to the constructor.
+            传递给构造函数的位置参数。
+        settings: The settings object to use. Can be None if crawler is provided.
+            要使用的设置对象。如果提供了crawler,可以为None。
+        spider: The spider object to use. Can be None.
+            要使用的蜘蛛对象。可以为None。
+        crawler: The crawler object to use. Can be None.
+            要使用的爬虫对象。可以为None。
+        **kwargs: Keyword arguments to pass to the constructor.
+            传递给构造函数的关键字参数。
+
+    Returns:
+        An instance of the specified class.
+        指定类的实例。
+
+    Raises:
+        ValueError: If settings, crawler, and spider are all None.
+            如果settings、crawler和spider都为None。
+        TypeError: If the factory method returns None.
+            如果工厂方法返回None。
+        ImportError: If the class cannot be imported.
+            如果无法导入类。
+        NameError: If the module doesn't define the specified class.
+            如果模块未定义指定的类。
+    """
+    # First load the class by its path
+    # 首先通过路径加载类
+    cls = load_object(clspath)
+
+    # Then create an instance of the class
+    # 然后创建类的实例
     return await create_instance(
-
+        cls,
         settings,
         crawler,
         *args,
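Note: the factory-method preference order spelled out in the new create_instance() docstring is the crux of the function. A self-contained, simplified mirror of that resolution logic (Pipeline, Settings, and Crawler here are made-up stand-ins; the real function also routes calls through call_helper and rejects None results):

import asyncio


class Settings(dict):
    """Stand-in for aioscrapy.settings.Settings (illustration only)."""


class Crawler:
    """Stand-in for aioscrapy's Crawler (illustration only)."""


class Pipeline:
    def __init__(self, tag):
        self.tag = tag

    @classmethod
    def from_crawler(cls, crawler):
        return cls("from_crawler")  # preferred when a crawler is supplied

    @classmethod
    def from_settings(cls, settings):
        return cls("from_settings")  # used when no crawler or spider applies


async def create_instance(objcls, settings, crawler, *args, spider=None, **kwargs):
    # Simplified mirror of the documented order:
    # from_crawler -> from_spider -> from_settings -> plain constructor
    if crawler and hasattr(objcls, "from_crawler"):
        return objcls.from_crawler(crawler, *args, **kwargs)
    if spider and hasattr(objcls, "from_spider"):
        return objcls.from_spider(spider, *args, **kwargs)
    if hasattr(objcls, "from_settings"):
        return objcls.from_settings(settings, *args, **kwargs)
    return objcls(*args, **kwargs)


print(asyncio.run(create_instance(Pipeline, None, Crawler())).tag)   # from_crawler
print(asyncio.run(create_instance(Pipeline, Settings(), None)).tag)  # from_settings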
aioscrapy/utils/ossignal.py
CHANGED
@@ -1,23 +1,85 @@
+"""
+Operating system signal utilities for aioscrapy.
+aioscrapy的操作系统信号实用工具。
+
+This module provides utilities for working with operating system signals in aioscrapy.
+It includes functions for installing signal handlers and mapping between signal
+numbers and their names.
+此模块提供了用于处理aioscrapy中操作系统信号的实用工具。
+它包括用于安装信号处理程序以及在信号编号和其名称之间映射的函数。
+"""
+
 import signal
 
 
+# Dictionary mapping signal numbers to their names (e.g., {2: 'SIGINT', 15: 'SIGTERM'})
+# 将信号编号映射到其名称的字典(例如,{2: 'SIGINT', 15: 'SIGTERM'})
 signal_names = {}
+
+# Populate the signal_names dictionary by iterating through all attributes in the signal module
+# 通过迭代信号模块中的所有属性来填充signal_names字典
 for signame in dir(signal):
+    # Only process attributes that start with 'SIG' but not 'SIG_'
+    # (SIG_ prefixed constants are signal handlers, not signal types)
+    # 只处理以'SIG'开头但不以'SIG_'开头的属性
+    # (SIG_前缀的常量是信号处理程序,而不是信号类型)
     if signame.startswith('SIG') and not signame.startswith('SIG_'):
+        # Get the signal number for this signal name
+        # 获取此信号名称的信号编号
         signum = getattr(signal, signame)
+        # Only add to the dictionary if it's an integer (a valid signal number)
+        # 只有当它是整数(有效的信号编号)时才添加到字典中
         if isinstance(signum, int):
             signal_names[signum] = signame
 
 
 def install_shutdown_handlers(function, override_sigint=True):
-    """Install the given function as a signal handler for all common shutdown
-    signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the
-    SIGINT handler won't be install if there is already a handler in place
-    (e.g. Pdb)
     """
+    Install a function as a signal handler for common shutdown signals.
+    为常见的关闭信号安装函数作为信号处理程序。
+
+    This function installs the provided function as a handler for common shutdown
+    signals such as SIGTERM (terminate), SIGINT (keyboard interrupt), and SIGBREAK
+    (Ctrl-Break on Windows). This is useful for graceful shutdown of applications.
+    此函数将提供的函数安装为常见关闭信号的处理程序,如SIGTERM(终止)、
+    SIGINT(键盘中断)和SIGBREAK(Windows上的Ctrl-Break)。
+    这对于应用程序的优雅关闭很有用。
+
+    Args:
+        function: The function to be called when a shutdown signal is received.
+            当收到关闭信号时要调用的函数。
+            This function should accept two parameters: signal number and frame.
+            此函数应接受两个参数:信号编号和帧。
+        override_sigint: Whether to override an existing SIGINT handler.
+            是否覆盖现有的SIGINT处理程序。
+            If False, the SIGINT handler won't be installed if there's
+            already a custom handler in place (e.g., a debugger like Pdb).
+            如果为False,则在已有自定义处理程序(例如Pdb调试器)的情况下
+            不会安装SIGINT处理程序。
+            Defaults to True.
+            默认为True。
+
+    Example:
+        >>> def handle_shutdown(signum, frame):
+        ...     print(f"Received signal {signal_names.get(signum, signum)}")
+        ...     # Perform cleanup operations
+        ...     sys.exit(0)
+        >>> install_shutdown_handlers(handle_shutdown)
+    """
+    # Always install handler for SIGTERM (terminate signal)
+    # 始终为SIGTERM(终止信号)安装处理程序
     signal.signal(signal.SIGTERM, function)
+
+    # Install handler for SIGINT (keyboard interrupt) if:
+    # - The current handler is the default handler, or
+    # - override_sigint is True (forcing override of any existing handler)
+    # 在以下情况下为SIGINT(键盘中断)安装处理程序:
+    # - 当前处理程序是默认处理程序,或
+    # - override_sigint为True(强制覆盖任何现有处理程序)
     if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint:
         signal.signal(signal.SIGINT, function)
-
+
+    # Install handler for SIGBREAK (Ctrl-Break) on Windows if available
+    # 如果可用,在Windows上为SIGBREAK(Ctrl-Break)安装处理程序
     if hasattr(signal, 'SIGBREAK'):
         signal.signal(signal.SIGBREAK, function)