aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/queue/__init__.py
CHANGED

@@ -7,7 +7,15 @@ from aioscrapy.utils.reqser import request_from_dict
 
 
 class AbsQueue(metaclass=ABCMeta):
-    """
+    """
+    Per-spider base queue class.
+    每个爬虫的基础队列类。
+
+    This abstract class defines the interface for request queues used by spiders.
+    It provides methods for pushing, popping, and managing requests in a queue.
+    此抽象类定义了爬虫使用的请求队列的接口。
+    它提供了推送、弹出和管理队列中请求的方法。
+    """
 
     def __init__(
         self,
@@ -16,52 +24,196 @@ class AbsQueue(metaclass=ABCMeta):
         key: Optional[str] = None,
         serializer: Optional[AbsSerializer] = None
     ) -> None:
-        """
-
-
-
-
+        """
+        Initialize per-spider queue.
+        初始化每个爬虫的队列。
+
+        Args:
+            container: The underlying data structure to store the queue.
+                存储队列的底层数据结构。
+            spider: The spider instance that will use this queue.
+                将使用此队列的爬虫实例。
+            key: Optional key to identify this queue.
+                可选的键,用于标识此队列。
+            serializer: Optional serializer for encoding/decoding requests.
+                可选的序列化器,用于编码/解码请求。
+        """
+        self.container = container  # The underlying data structure
+        # 底层数据结构
+        self.spider = spider  # Associated spider
+        # 关联的爬虫
+        self.key = key  # Queue identifier
+        # 队列标识符
+        self.serializer = serializer  # For serializing requests
+        # 用于序列化请求
 
     @property
     @abstractmethod
     def inc_key(self) -> str:
-        """
+        """
+        Get the key used for incrementing stats.
+        获取用于增加统计信息的键。
+
+        This property should return a string key that will be used with
+        the stats collector's inc_value method to track queue operations.
+        此属性应返回一个字符串键,该键将与统计收集器的inc_value方法一起使用,
+        以跟踪队列操作。
+
+        Returns:
+            str: The stats key for this queue.
+                此队列的统计键。
+        """
 
     @classmethod
     @abstractmethod
     async def from_spider(cls, spider: aioscrapy.Spider) -> "AbsQueue":
-        """
+        """
+        Create a queue instance for a spider.
+        为爬虫创建队列实例。
+
+        This factory method creates a new queue instance configured
+        for the given spider.
+        此工厂方法创建一个为给定爬虫配置的新队列实例。
+
+        Args:
+            spider: The spider that will use the queue.
+                将使用队列的爬虫。
+
+        Returns:
+            AbsQueue: A new queue instance.
+                一个新的队列实例。
+        """
 
     def _encode_request(self, request: aioscrapy.Request) -> Any:
-        """
+        """
+        Encode a request object for storage.
+        编码请求对象以进行存储。
+
+        This method converts a Request object to a serialized form that can
+        be stored in the queue's container.
+        此方法将Request对象转换为可以存储在队列容器中的序列化形式。
+
+        Args:
+            request: The Request object to encode.
+                要编码的Request对象。
+
+        Returns:
+            Any: The serialized form of the request.
+                请求的序列化形式。
+        """
         obj = request.to_dict(spider=self.spider)
         return self.serializer.dumps(obj)
 
     async def _decode_request(self, encoded_request: Any) -> aioscrapy.Request:
-        """
+        """
+        Decode a previously encoded request.
+        解码先前编码的请求。
+
+        This method converts a serialized request back into a Request object.
+        此方法将序列化的请求转换回Request对象。
+
+        Args:
+            encoded_request: The serialized request to decode.
+                要解码的序列化请求。
+
+        Returns:
+            Request: The reconstructed Request object.
+                重建的Request对象。
+        """
         obj = self.serializer.loads(encoded_request)
         return await request_from_dict(obj, spider=self.spider)
 
     def __len__(self) -> None:
-        """
+        """
+        Return the length of the queue (synchronous version).
+        返回队列的长度(同步版本)。
+
+        This method is overridden to prevent synchronous access to the queue length.
+        Use the async len() method instead.
+        此方法被重写以防止同步访问队列长度。
+        请改用异步len()方法。
+
+        Raises:
+            Exception: Always raises an exception to remind users to use the async len() method.
+                始终引发异常,以提醒用户使用异步len()方法。
+        """
         raise Exception('please use len()')
 
     @abstractmethod
     async def len(self) -> int:
-        """
+        """
+        Return the length of the queue (asynchronous version).
+        返回队列的长度(异步版本)。
+
+        This method should return the number of requests currently in the queue.
+        此方法应返回当前队列中的请求数量。
+
+        Returns:
+            int: The number of requests in the queue.
+                队列中的请求数量。
+        """
 
     @abstractmethod
     async def push(self, request: aioscrapy.Request) -> None:
-        """
+        """
+        Push a request to the queue.
+        将请求推送到队列。
+
+        This method adds a single request to the queue.
+        此方法将单个请求添加到队列中。
+
+        Args:
+            request: The request to add to the queue.
+                要添加到队列的请求。
+        """
 
     @abstractmethod
     async def push_batch(self, requests: List[aioscrapy.Request]) -> None:
-        """
+        """
+        Push multiple requests to the queue.
+        将多个请求推送到队列。
+
+        This method adds multiple requests to the queue at once,
+        which may be more efficient than calling push() multiple times.
+        此方法一次将多个请求添加到队列中,
+        这可能比多次调用push()更有效率。
+
+        Args:
+            requests: A list of requests to add to the queue.
+                要添加到队列的请求列表。
+        """
 
     @abstractmethod
     async def pop(self, timeout: int = 0) -> Optional[aioscrapy.Request]:
-        """
+        """
+        Pop a request from the queue.
+        从队列中弹出请求。
+
+        This method removes and returns a request from the queue.
+        If the queue is empty, it may wait up to timeout seconds
+        before returning None.
+        此方法从队列中移除并返回一个请求。
+        如果队列为空,它可能会等待最多timeout秒,
+        然后返回None。
+
+        Args:
+            timeout: Maximum time to wait for a request, in seconds.
+                等待请求的最长时间,以秒为单位。
+
+        Returns:
+            Optional[Request]: The next request from the queue, or None if
+                the queue is empty or the timeout expires.
+                队列中的下一个请求,如果队列为空或超时,则为None。
+        """
 
     @abstractmethod
     async def clear(self) -> None:
-        """
+        """
+        Clear all requests from the queue.
+        清除队列中的所有请求。
+
+        This method removes all pending requests from the queue,
+        effectively resetting it to an empty state.
+        此方法从队列中删除所有待处理的请求,
+        有效地将其重置为空状态。
+        """
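The AbsQueue interface documented in this diff can be exercised with a small concrete subclass. The sketch below is illustrative only and not part of aio-scrapy: the MemoryQueue name, the asyncio.Queue container, and the stats key are assumptions made for the example; the constructor argument order follows the Args documented above.

import asyncio
from typing import List, Optional

import aioscrapy
from aioscrapy.queue import AbsQueue
from aioscrapy.serializer import PickleSerializer


class MemoryQueue(AbsQueue):
    # Hypothetical in-memory AbsQueue implementation (illustration only).

    @property
    def inc_key(self) -> str:
        # Key reported to the stats collector's inc_value (illustrative value).
        return 'scheduler/enqueued/memory'

    @classmethod
    async def from_spider(cls, spider: aioscrapy.Spider) -> "MemoryQueue":
        # Factory method: build a queue bound to the given spider.
        return cls(asyncio.Queue(), spider,
                   key=f'{spider.name}:requests',
                   serializer=PickleSerializer())

    async def len(self) -> int:
        return self.container.qsize()

    async def push(self, request: aioscrapy.Request) -> None:
        # _encode_request serializes the request via the configured serializer.
        await self.container.put(self._encode_request(request))

    async def push_batch(self, requests: List[aioscrapy.Request]) -> None:
        for request in requests:
            await self.push(request)

    async def pop(self, timeout: int = 0) -> Optional[aioscrapy.Request]:
        # Wait up to `timeout` seconds for a request; return None when the
        # queue is empty or the timeout expires, as the docstring describes.
        try:
            if timeout > 0:
                encoded = await asyncio.wait_for(self.container.get(), timeout)
            else:
                encoded = self.container.get_nowait()
        except (asyncio.TimeoutError, asyncio.QueueEmpty):
            return None
        return await self._decode_request(encoded)

    async def clear(self) -> None:
        self.container = asyncio.Queue()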
aioscrapy/scrapyd/runner.py
CHANGED

@@ -1,4 +1,28 @@
 
+"""
+Scrapyd Runner Module
+Scrapyd运行器模块
+
+This module provides utilities for running AioScrapy spiders from egg files deployed
+with Scrapyd. It handles the activation of egg files, setting up the project environment,
+and launching the spider.
+此模块提供了从使用Scrapyd部署的egg文件运行AioScrapy爬虫的实用程序。它处理egg文件的激活、
+设置项目环境和启动爬虫。
+
+The main components are:
+主要组件包括:
+
+1. activate_egg: Activates a Scrapy egg file and sets up the environment
+   激活Scrapy egg文件并设置环境
+2. project_environment: Context manager that sets up the project environment
+   设置项目环境的上下文管理器
+3. main: Entry point for running spiders from Scrapyd
+   从Scrapyd运行爬虫的入口点
+
+This module is designed to be used by Scrapyd to run AioScrapy spiders, but it can
+also be used directly to run spiders from egg files.
+此模块设计用于Scrapyd运行AioScrapy爬虫,但也可以直接用于从egg文件运行爬虫。
+"""
 import os
 import shutil
 import sys
@@ -15,9 +39,28 @@ except ImportError:
 
 
 def activate_egg(eggpath):
-    """
-
-
+    """
+    Activate a Scrapy egg file.
+    激活aioscrapy egg文件。
+
+    This function activates a aioscrapy egg file by adding it to the Python path
+    and setting the AIOSCRAPY_SETTINGS_MODULE environment variable to the
+    settings module specified in the egg's entry points.
+    此函数通过将aioscrapy egg文件添加到Python路径并将AIOSCRAPY_SETTINGS_MODULE
+    环境变量设置为egg入口点中指定的设置模块来激活它。
+
+    This is meant to be used from egg runners to activate a Scrapy egg file.
+    Don't use it from other code as it may leave unwanted side effects.
+    这旨在从egg运行器使用,以激活Scrapy egg文件。不要从其他代码中使用它,
+    因为它可能会留下不必要的副作用。
+
+    Args:
+        eggpath: Path to the egg file to activate.
+            要激活的egg文件的路径。
+
+    Raises:
+        ValueError: If the egg file is unknown or corrupt.
+            如果egg文件未知或损坏。
     """
     try:
         d = next(pkg_resources.find_distributions(eggpath))
@@ -30,31 +73,109 @@ def activate_egg(eggpath):
 
 @contextmanager
 def project_environment(project):
+    """
+    Set up the environment for a aioscrapy project.
+    为aioscrapy项目设置环境。
+
+    This context manager sets up the environment for a aioscrapy project by:
+    此上下文管理器通过以下方式为aioscrapy项目设置环境:
+
+    1. Retrieving the egg file for the project from aioscrapyd's egg storage
+       从aioscrapyd的egg存储中检索项目的egg文件
+    2. Creating a temporary copy of the egg file
+       创建egg文件的临时副本
+    3. Activating the egg file
+       激活egg文件
+    4. Cleaning up the temporary egg file when done
+       完成后清理临时egg文件
+
+    Args:
+        project: The name of the project to set up the environment for.
+            要为其设置环境的项目名称。
+
+    Yields:
+        None: This context manager doesn't yield a value, but sets up the
+            environment for the code inside the with block.
+            此上下文管理器不产生值,但为with块内的代码设置环境。
+
+    Raises:
+        AssertionError: If aioscrapy settings are already loaded.
+            如果aioscrapy设置已加载。
+    """
+    # Get the Scrapyd application and egg storage
+    # 获取Scrapyd应用程序和egg存储
     app = get_application()
     eggstorage = app.getComponent(IEggStorage)
+
+    # Get the egg version from environment or use the latest
+    # 从环境获取egg版本或使用最新版本
     eggversion = os.environ.get('AIOSCRAPY_EGG_VERSION', None)
+
+    # Get the egg file from storage
+    # 从存储中获取egg文件
     version, eggfile = eggstorage.get(project, eggversion)
+
     if eggfile:
+        # Create a temporary copy of the egg file
+        # 创建egg文件的临时副本
        prefix = '%s-%s-' % (project, version)
        fd, eggpath = tempfile.mkstemp(prefix=prefix, suffix='.egg')
        lf = os.fdopen(fd, 'wb')
        shutil.copyfileobj(eggfile, lf)
        lf.close()
+
+        # Activate the egg file
+        # 激活egg文件
        activate_egg(eggpath)
    else:
        eggpath = None
+
    try:
+        # Ensure settings aren't already loaded
+        # 确保设置尚未加载
        assert 'aioscrapy.conf' not in sys.modules, "aioscrapy settings already loaded"
        yield
    finally:
+        # Clean up the temporary egg file
+        # 清理临时egg文件
        if eggpath:
            os.remove(eggpath)
 
 
 def main():
+    """
+    Main entry point for running spiders from Scrapyd.
+    从Scrapyd运行爬虫的主入口点。
+
+    This function:
+    此函数:
+
+    1. Updates environment variables by converting SCRAPY_* variables to AIO* variables
+       通过将SCRAPY_*变量转换为AIO*变量来更新环境变量
+    2. Gets the project name from the AIOSCRAPY_PROJECT environment variable
+       从AIOSCRAPY_PROJECT环境变量获取项目名称
+    3. Sets up the project environment using the project_environment context manager
+       使用project_environment上下文管理器设置项目环境
+    4. Imports and executes the aioscrapy.cmdline.execute function to run the spider
+       导入并执行aioscrapy.cmdline.execute函数来运行爬虫
+
+    This function is designed to be called by Scrapyd to run AioScrapy spiders.
+    此函数设计用于Scrapyd调用以运行AioScrapy爬虫。
+
+    Raises:
+        KeyError: If the AIOSCRAPY_PROJECT environment variable is not set.
+            如果未设置AIOSCRAPY_PROJECT环境变量。
+    """
+    # Update environment variables by converting SCRAPY_* to AIO*
+    # 通过将SCRAPY_*转换为AIO*来更新环境变量
     os.environ.update({f'AIO{k}': v for k, v in os.environ.items() if k.startswith('SCRAPY_')})
 
+    # Get the project name from environment
+    # 从环境获取项目名称
     project = os.environ['AIOSCRAPY_PROJECT']
+
+    # Set up the project environment and run the spider
+    # 设置项目环境并运行爬虫
     with project_environment(project):
         from aioscrapy.cmdline import execute
         execute()
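One line in main() deserves a note: Scrapyd launches the runner with SCRAPY_*-prefixed environment variables, and main() mirrors each one under an AIO prefix (SCRAPY_PROJECT becomes AIOSCRAPY_PROJECT) so that aioscrapy's settings machinery picks them up. A standalone sketch of that transformation, using illustrative values:

import os

# Simulate what Scrapyd would set before invoking the runner (illustrative values).
os.environ['SCRAPY_PROJECT'] = 'myproject'
os.environ['SCRAPY_SETTINGS_MODULE'] = 'myproject.settings'

# The same comprehension used in main(): mirror every SCRAPY_* variable
# under an AIO prefix, leaving the originals untouched.
os.environ.update({f'AIO{k}': v for k, v in os.environ.items() if k.startswith('SCRAPY_')})

assert os.environ['AIOSCRAPY_PROJECT'] == 'myproject'
assert os.environ['AIOSCRAPY_SETTINGS_MODULE'] == 'myproject.settings'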
aioscrapy/serializer.py
CHANGED

@@ -1,3 +1,14 @@
+"""
+Serialization utilities for AioScrapy.
+AioScrapy的序列化实用工具。
+
+This module provides serializer classes for converting Python objects to and from
+serialized formats like JSON and Pickle. These serializers are used throughout
+AioScrapy for data persistence, message passing, and caching.
+此模块提供了用于将Python对象转换为序列化格式(如JSON和Pickle)以及从这些格式转换回来的
+序列化器类。这些序列化器在AioScrapy中用于数据持久化、消息传递和缓存。
+"""
+
 import ujson
 import pickle
 from abc import ABCMeta, abstractmethod
@@ -6,34 +17,203 @@ __all__ = ['PickleSerializer', 'JsonSerializer', 'AbsSerializer']
 
 
 class AbsSerializer(object, metaclass=ABCMeta):
+    """
+    Abstract base class for serializers.
+    序列化器的抽象基类。
+
+    This class defines the interface that all serializers must implement.
+    It provides methods for serializing Python objects to a string format
+    and deserializing strings back to Python objects.
+    此类定义了所有序列化器必须实现的接口。
+    它提供了将Python对象序列化为字符串格式以及将字符串反序列化回Python对象的方法。
+    """
 
     @staticmethod
     @abstractmethod
     def loads(s):
-        """
+        """
+        Deserialize a string to a Python object.
+        将字符串反序列化为Python对象。
+
+        This method takes a serialized string and converts it back to
+        a Python object.
+        此方法接受一个序列化的字符串并将其转换回Python对象。
+
+        Args:
+            s: The serialized string to deserialize.
+                要反序列化的序列化字符串。
+
+        Returns:
+            The deserialized Python object.
+            反序列化的Python对象。
+
+        Raises:
+            Depends on the implementation.
+            取决于实现。
+        """
+        pass
 
     @staticmethod
     @abstractmethod
     def dumps(obj):
-        """
+        """
+        Serialize a Python object to a string.
+        将Python对象序列化为字符串。
+
+        This method takes a Python object and converts it to a serialized
+        string format.
+        此方法接受一个Python对象并将其转换为序列化的字符串格式。
+
+        Args:
+            obj: The Python object to serialize.
+                要序列化的Python对象。
+
+        Returns:
+            The serialized string representation of the object.
+            对象的序列化字符串表示。
+
+        Raises:
+            Depends on the implementation.
+            取决于实现。
+        """
+        pass
 
 
 class PickleSerializer(AbsSerializer):
+    """
+    Serializer that uses Python's pickle module.
+    使用Python的pickle模块的序列化器。
+
+    This serializer uses Python's built-in pickle module to serialize and
+    deserialize Python objects. Pickle can handle a wide range of Python
+    objects, including custom classes, but the resulting serialized data
+    is not human-readable and may not be compatible across different
+    Python versions.
+    此序列化器使用Python内置的pickle模块来序列化和反序列化Python对象。
+    Pickle可以处理各种Python对象,包括自定义类,但生成的序列化数据
+    不是人类可读的,并且可能在不同的Python版本之间不兼容。
+
+    Warning:
+        Pickle is not secure against maliciously constructed data. Never unpickle
+        data received from untrusted or unauthenticated sources.
+        Pickle对恶意构造的数据不安全。切勿对来自不受信任或未经身份验证的
+        来源的数据进行反序列化。
+    """
+
     @staticmethod
     def loads(s):
+        """
+        Deserialize a pickle-encoded string to a Python object.
+        将pickle编码的字符串反序列化为Python对象。
+
+        Args:
+            s: The pickle-encoded string to deserialize.
+                要反序列化的pickle编码字符串。
+
+        Returns:
+            The deserialized Python object.
+            反序列化的Python对象。
+
+        Raises:
+            pickle.UnpicklingError: If the data cannot be unpickled.
+                如果数据无法被反序列化。
+            ValueError: If the pickle data is truncated.
+                如果pickle数据被截断。
+            TypeError: If the serialized data is not a bytes-like object.
+                如果序列化数据不是类字节对象。
+        """
         return pickle.loads(s)
 
     @staticmethod
     def dumps(obj):
+        """
+        Serialize a Python object to a pickle-encoded string.
+        将Python对象序列化为pickle编码的字符串。
+
+        Args:
+            obj: The Python object to serialize.
+                要序列化的Python对象。
+
+        Returns:
+            bytes: The pickle-encoded representation of the object.
+                对象的pickle编码表示。
+
+        Raises:
+            pickle.PicklingError: If the object cannot be pickled.
+                如果对象无法被序列化。
+
+        Note:
+            Uses the highest available pickle protocol for maximum efficiency.
+            使用最高可用的pickle协议以获得最大效率。
+        """
+        # protocol=-1 means use the highest available protocol
+        # protocol=-1表示使用最高可用的协议
         return pickle.dumps(obj, protocol=-1)
 
 
 class JsonSerializer(AbsSerializer):
+    """
+    Serializer that uses the ujson module.
+    使用ujson模块的序列化器。
+
+    This serializer uses the ujson module (UltraJSON) to serialize and
+    deserialize Python objects to and from JSON format. UltraJSON is a fast
+    JSON encoder and decoder written in C with Python bindings.
+    此序列化器使用ujson模块(UltraJSON)将Python对象序列化为JSON格式
+    以及从JSON格式反序列化。UltraJSON是一个用C编写的快速JSON编码器和
+    解码器,带有Python绑定。
+
+    JSON serialization is more limited than pickle in terms of the types it can
+    handle (primarily: dict, list, str, int, float, bool, None), but it produces
+    human-readable output and is safe to use with untrusted data.
+    JSON序列化在可以处理的类型方面比pickle更有限(主要是:dict、list、str、
+    int、float、bool、None),但它产生人类可读的输出,并且可以安全地
+    用于不受信任的数据。
+    """
+
     @staticmethod
     def loads(s):
+        """
+        Deserialize a JSON string to a Python object.
+        将JSON字符串反序列化为Python对象。
+
+        Args:
+            s: The JSON string to deserialize.
+                要反序列化的JSON字符串。
+
+        Returns:
+            The deserialized Python object (typically a dict, list, or primitive type).
+            反序列化的Python对象(通常是dict、list或原始类型)。
+
+        Raises:
+            ValueError: If the string is not valid JSON.
+                如果字符串不是有效的JSON。
+        """
        return ujson.loads(s)
 
     @staticmethod
     def dumps(obj):
+        """
+        Serialize a Python object to a JSON string.
+        将Python对象序列化为JSON字符串。
+
+        Args:
+            obj: The Python object to serialize.
+                要序列化的Python对象。
+                Must be a type that can be represented in JSON (dict, list, str,
+                int, float, bool, None, or a combination of these).
+                必须是可以在JSON中表示的类型(dict、list、str、int、float、
+                bool、None或这些的组合)。
+
+        Returns:
+            str: The JSON string representation of the object.
+                对象的JSON字符串表示。
+
+        Raises:
+            TypeError: If the object contains types that cannot be serialized to JSON.
+                如果对象包含无法序列化为JSON的类型。
+            OverflowError: If an integer is too large to be represented in JSON.
+                如果整数太大而无法在JSON中表示。
+        """
         return ujson.dumps(obj)
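Both serializers in this file expose the same static loads/dumps pair behind the AbsSerializer interface, so they are drop-in replacements for each other whenever the payload is JSON-compatible. A quick round-trip sketch with an illustrative payload:

from aioscrapy.serializer import JsonSerializer, PickleSerializer

# Illustrative payload; any JSON-compatible structure works with both serializers.
obj = {'url': 'https://example.com', 'method': 'GET', 'priority': 0}

# JsonSerializer: human-readable str, safe for untrusted input,
# limited to JSON-compatible types.
json_blob = JsonSerializer.dumps(obj)
assert JsonSerializer.loads(json_blob) == obj

# PickleSerializer: opaque bytes, handles arbitrary Python objects;
# never unpickle data from untrusted sources.
pickle_blob = PickleSerializer.dumps(obj)
assert PickleSerializer.loads(pickle_blob) == obj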