aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/utils/signal.py
CHANGED
|
@@ -1,4 +1,13 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
Signal utility functions for aioscrapy.
|
|
3
|
+
aioscrapy的信号实用函数。
|
|
4
|
+
|
|
5
|
+
This module provides utility functions for working with signals in aioscrapy.
|
|
6
|
+
It includes functions for sending signals, catching and logging exceptions,
|
|
7
|
+
and managing signal connections.
|
|
8
|
+
此模块提供了用于处理aioscrapy中信号的实用函数。
|
|
9
|
+
它包括用于发送信号、捕获和记录异常以及管理信号连接的函数。
|
|
10
|
+
"""
|
|
2
11
|
import asyncio
|
|
3
12
|
|
|
4
13
|
from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers
|
|
@@ -10,52 +19,225 @@ from aioscrapy.utils.tools import create_task
|
|
|
10
19
|
|
|
11
20
|
|
|
12
21
|
class _IgnoredException(Exception):
|
|
22
|
+
"""
|
|
23
|
+
Internal exception class used to mark exceptions that should be ignored in logs.
|
|
24
|
+
内部异常类,用于标记应在日志中忽略的异常。
|
|
25
|
+
|
|
26
|
+
This exception is used as a marker for exceptions that should not be logged
|
|
27
|
+
when caught in signal handlers. It's used in conjunction with the 'dont_log'
|
|
28
|
+
parameter in signal sending functions.
|
|
29
|
+
此异常用作在信号处理程序中捕获时不应记录的异常的标记。
|
|
30
|
+
它与信号发送函数中的'dont_log'参数一起使用。
|
|
31
|
+
"""
|
|
13
32
|
pass
|
|
14
33
|
|
|
15
34
|
|
|
16
35
|
async def robustApplyWrap(f, recv, *args, **kw):
    """
    Call ``f(recv, *args, **kw)`` and convert failures into a return value.

    If the call returns a coroutine it is awaited and its result returned.
    Any exception raised by the call (or by the awaited coroutine) is caught,
    logged unless its type is listed in ``dont_log``, and returned to the
    caller instead of being raised.

    Args:
        f: Callable invoked as ``f(recv, *args, **kw)`` (callers in this
            module pass ``robustApply``).
        recv: The receiver (signal handler); forwarded to ``f`` as its first
            positional argument and named in the log message on failure.
        *args: Extra positional arguments forwarded to ``f``.
        **kw: Keyword arguments forwarded to ``f``. The key ``dont_log`` is
            removed before forwarding; it may be an exception type or a tuple
            of types that should not be logged. ``None`` (the default) means
            every caught exception is logged.

    Returns:
        The value returned by ``f`` (awaited if it was a coroutine), or the
        exception instance if one was caught.

    Raises:
        asyncio.CancelledError: Cancellation is never swallowed; it is
            re-raised so that cancelling the task that sends the signal
            actually stops it.
    """
    dont_log = kw.pop('dont_log', None)

    try:
        result = f(recv, *args, **kw)
        if asyncio.iscoroutine(result):
            return await result
        return result
    except asyncio.CancelledError:
        # Returning the cancellation as an ordinary "result" would leave the
        # cancelled task running; asyncio expects it to propagate.
        raise
    except BaseException as exc:  # deliberate best-effort: handlers must not break the sender
        if dont_log is None or not isinstance(exc, dont_log):
            logger.exception(f"Error caught on signal handler: {recv}")
        return exc
|
|
27
91
|
|
|
28
92
|
|
|
29
93
|
async def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """
    Send a signal to every live receiver, one at a time, collecting results.

    Each receiver is invoked through ``robustApplyWrap(robustApply, ...)`` and
    awaited before the next one is called. Exceptions raised by a receiver are
    returned in the result list rather than raised. ``StopDownload`` is always
    added to the set of exception types that are not logged.

    Args:
        signal: The signal to send. Defaults to ``Any``.
        sender: The sender of the signal. Defaults to ``Anonymous``.
        *arguments: Positional arguments forwarded to each receiver.
        **named: Keyword arguments forwarded to each receiver. The optional
            key ``dont_log`` is an exception type (or tuple of types) that
            should not be logged when caught; ``None`` or omitting it means
            no extra types beyond ``StopDownload``.

    Returns:
        list: ``(receiver, result)`` tuples, where ``result`` is the
        receiver's return value or the exception instance that was caught.
    """
    # An explicit dont_log=None must not end up inside the isinstance() tuple:
    # isinstance(exc, (None, ...)) raises TypeError from within the error
    # handler. Treat None the same as "not supplied".
    extra_dont_log = named.pop('dont_log', None)
    if extra_dont_log is None:
        extra_dont_log = _IgnoredException
    named['dont_log'] = (extra_dont_log, StopDownload)

    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        result = await robustApplyWrap(
            robustApply,
            receiver,
            *arguments,
            signal=signal,
            sender=sender,
            **named
        )
        responses.append((receiver, result))

    return responses
|
|
39
156
|
|
|
40
157
|
|
|
41
158
|
async def send_catch_log_deferred(signal=Any, sender=Anonymous, *arguments, **named):
    """
    Send a signal, running every receiver in its own task, and await them all.

    One task is created per live receiver via ``create_task`` wrapping
    ``robustApplyWrap(robustApply, ...)``; the tasks are then awaited together
    with ``asyncio.gather``.

    Args:
        signal: The signal to send. Defaults to ``Any``.
        sender: The sender of the signal. Defaults to ``Anonymous``.
        *arguments: Positional arguments forwarded to each receiver.
        **named: Keyword arguments forwarded to each receiver (``dont_log``,
            if present, is consumed by ``robustApplyWrap``).

    Returns:
        list: One entry per receiver, in the order the receivers were yielded
        by ``liveReceivers(getAllReceivers(sender, signal))``. Each entry is
        the receiver's return value or the exception instance caught by
        ``robustApplyWrap``. Empty when there are no receivers.
    """
    pending = [
        create_task(
            robustApplyWrap(
                robustApply,
                receiver,
                *arguments,
                signal=signal,
                sender=sender,
                **named
            )
        )
        for receiver in liveReceivers(getAllReceivers(sender, signal))
    ]
    return await asyncio.gather(*pending)
|
|
54
215
|
|
|
55
216
|
|
|
56
217
|
def disconnect_all(signal=Any, sender=Any):
    """
    Disconnect every live receiver registered for ``signal`` and ``sender``.

    Handy for cleaning up between tests so handlers connected by one test do
    not fire in another.

    Args:
        signal: The signal whose receivers are disconnected. Defaults to ``Any``.
        sender: The sender whose receivers are disconnected. Defaults to ``Any``.
    """
    live = liveReceivers(getAllReceivers(sender, signal))
    for handler in live:
        disconnect(handler, signal=signal, sender=sender)
|
aioscrapy/utils/spider.py
CHANGED
|
@@ -1,20 +1,67 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Spider utility functions for aioscrapy.
|
|
3
|
+
aioscrapy的爬虫实用函数。
|
|
4
|
+
|
|
5
|
+
This module provides utility functions for working with spider classes in aioscrapy.
|
|
6
|
+
It includes functions for discovering and iterating over spider classes.
|
|
7
|
+
此模块提供了用于处理aioscrapy中爬虫类的实用函数。
|
|
8
|
+
它包括用于发现和迭代爬虫类的函数。
|
|
9
|
+
"""
|
|
10
|
+
|
|
1
11
|
import inspect
|
|
2
12
|
|
|
3
13
|
from aioscrapy.spiders import Spider
|
|
4
14
|
|
|
5
15
|
|
|
6
16
|
def iter_spider_classes(module):
    """
    Yield the spider classes defined in ``module`` that can be instantiated.

    An object found in the module's namespace is yielded when it:
      1. is a class,
      2. is a subclass of ``Spider``,
      3. was defined in ``module`` itself (its ``__module__`` equals
         ``module.__name__``), i.e. it was not merely imported there, and
      4. has a truthy ``name`` attribute.

    Args:
        module: The module object whose namespace is inspected.

    Yields:
        class: Each matching spider class, in namespace order.
    """
    for candidate in vars(module).values():
        if not inspect.isclass(candidate):
            continue
        if not issubclass(candidate, Spider):
            continue
        # Skip spiders that were only imported into this module.
        if candidate.__module__ != module.__name__:
            continue
        # A spider without a (non-empty) name is treated as abstract.
        if getattr(candidate, 'name', None):
            yield candidate
|
aioscrapy/utils/template.py
CHANGED
|
@@ -1,4 +1,13 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
Template utility functions for aioscrapy.
|
|
3
|
+
aioscrapy的模板实用函数。
|
|
4
|
+
|
|
5
|
+
This module provides utility functions for working with templates in aioscrapy.
|
|
6
|
+
It includes functions for rendering template files and string transformations
|
|
7
|
+
commonly used in code generation.
|
|
8
|
+
此模块提供了用于处理aioscrapy中模板的实用函数。
|
|
9
|
+
它包括用于渲染模板文件和在代码生成中常用的字符串转换的函数。
|
|
10
|
+
"""
|
|
2
11
|
|
|
3
12
|
import os
|
|
4
13
|
import re
|
|
@@ -6,31 +15,109 @@ import string
|
|
|
6
15
|
|
|
7
16
|
|
|
8
17
|
def render_templatefile(path, **kwargs):
    """
    Render a ``string.Template`` file in place.

    The file at ``path`` is read as UTF-8, ``$name`` / ``${name}``
    placeholders are substituted from ``kwargs``, and the result is written
    back as UTF-8. When ``path`` ends with ``.tmpl`` the file is first renamed
    to the same path without that suffix, and the rendered content is written
    to the renamed file.

    Args:
        path: Path of the template file to render.
        **kwargs: Values for the template placeholders.

    Raises:
        KeyError: If the template references a placeholder missing from
            ``kwargs`` (raised by ``Template.substitute`` before any file is
            renamed or written).

    Example:
        >>> render_templatefile('spider.py.tmpl',
        ...                     classname='MySpider',
        ...                     domain='example.com')
    """
    with open(path, 'rb') as source:
        template_text = source.read().decode('utf8')

    rendered = string.Template(template_text).substitute(**kwargs)

    suffix = '.tmpl'
    if path.endswith(suffix):
        # Move the template to its final name, then overwrite it below.
        render_path = path[:-len(suffix)]
        os.rename(path, render_path)
    else:
        render_path = path

    with open(render_path, 'wb') as target:
        target.write(rendered.encode('utf8'))
|
|
21
76
|
|
|
22
77
|
|
|
78
|
+
# Matches any single character that is neither an ASCII letter nor a digit
# (``\d``); string_camelcase() strips every match.
CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')


def string_camelcase(string):
    """
    Convert a string to CamelCase and strip invalid characters.

    The input is title-cased with ``str.title()`` and every character matched
    by ``CAMELCASE_INVALID_CHARS`` is then removed.

    Args:
        string: The text to convert.

    Returns:
        str: The title-cased text with all non-alphanumeric characters removed.

    Examples:
        >>> string_camelcase('lost-pound')
        'LostPound'

        >>> string_camelcase('missing_images')
        'MissingImages'

        >>> string_camelcase('hello world')
        'HelloWorld'
    """
    titled = string.title()
    return CAMELCASE_INVALID_CHARS.sub('', titled)
|