aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/pipelines/csv.py
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CSV Pipeline for AioScrapy
|
|
3
|
+
AioScrapy的CSV管道
|
|
4
|
+
|
|
5
|
+
This module provides a pipeline for storing scraped items in CSV files.
|
|
6
|
+
It includes a mixin class for CSV file handling and a pipeline class that
|
|
7
|
+
uses the mixin to process items and save them to CSV files.
|
|
8
|
+
此模块提供了一个用于将抓取的项目存储在CSV文件中的管道。
|
|
9
|
+
它包括一个用于CSV文件处理的混入类和一个使用该混入类处理项目并将其保存到CSV文件的管道类。
|
|
10
|
+
"""
|
|
11
|
+
|
|
1
12
|
import asyncio
|
|
2
13
|
import csv
|
|
3
14
|
from typing import Optional
|
|
@@ -6,6 +17,19 @@ from aioscrapy.utils.log import logger
|
|
|
6
17
|
|
|
7
18
|
|
|
8
19
|
class CsvSinkMixin:
|
|
20
|
+
"""
|
|
21
|
+
Mixin class for CSV file handling.
|
|
22
|
+
CSV文件处理的混入类。
|
|
23
|
+
|
|
24
|
+
This mixin provides methods for saving items to CSV files, managing CSV writers,
|
|
25
|
+
and closing CSV files. It can be used by any class that needs to write data to
|
|
26
|
+
CSV files.
|
|
27
|
+
此混入类提供了将项目保存到CSV文件、管理CSV写入器和关闭CSV文件的方法。
|
|
28
|
+
它可以被任何需要将数据写入CSV文件的类使用。
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
# Dictionary to store CSV writers and file handles, keyed by filename
|
|
32
|
+
# 用于存储CSV写入器和文件句柄的字典,以文件名为键
|
|
9
33
|
csv_writer = {}
|
|
10
34
|
|
|
11
35
|
async def save_item(
|
|
@@ -14,64 +38,278 @@ class CsvSinkMixin:
|
|
|
14
38
|
*,
|
|
15
39
|
filename: Optional[str] = None,
|
|
16
40
|
):
|
|
41
|
+
"""
|
|
42
|
+
Save an item to a CSV file.
|
|
43
|
+
将项目保存到CSV文件。
|
|
44
|
+
|
|
45
|
+
This method writes a dictionary item as a row in a CSV file. The first row
|
|
46
|
+
of the CSV file will contain the keys of the first item saved to the file.
|
|
47
|
+
此方法将字典项目作为CSV文件中的一行写入。CSV文件的第一行将包含保存到
|
|
48
|
+
文件的第一个项目的键。
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
item: The dictionary item to save.
|
|
52
|
+
要保存的字典项目。
|
|
53
|
+
filename: The name of the CSV file to save to.
|
|
54
|
+
要保存到的CSV文件的名称。
|
|
55
|
+
If not provided, an assertion error will be raised.
|
|
56
|
+
如果未提供,将引发断言错误。
|
|
57
|
+
|
|
58
|
+
Raises:
|
|
59
|
+
AssertionError: If filename is None.
|
|
60
|
+
如果filename为None。
|
|
61
|
+
Exception: Not propagated. Errors raised while writing to the CSV file are caught and logged.
|
|
62
|
+
写入CSV文件时出现的错误会被捕获并记录,不会向上抛出。
|
|
63
|
+
"""
|
|
64
|
+
# Ensure filename is provided
|
|
65
|
+
# 确保提供了文件名
|
|
17
66
|
assert filename is not None, "请传入filename参数"
|
|
67
|
+
|
|
68
|
+
# Add .csv extension if not present
|
|
69
|
+
# 如果不存在,则添加.csv扩展名
|
|
18
70
|
if '.csv' not in filename:
|
|
19
71
|
filename = filename + '.csv'
|
|
72
|
+
|
|
20
73
|
try:
|
|
74
|
+
# Get or create a CSV writer for this file
|
|
75
|
+
# 获取或创建此文件的CSV写入器
|
|
21
76
|
writer = self._get_writer(filename, item)
|
|
77
|
+
|
|
78
|
+
# Write the item values as a row
|
|
79
|
+
# 将项目值作为一行写入
|
|
22
80
|
writer.writerow(item.values())
|
|
23
81
|
except Exception as e:
|
|
82
|
+
# Log any errors that occur
|
|
83
|
+
# 记录发生的任何错误
|
|
24
84
|
logger.exception(f'Save csv Error, filename:{filename}, item:{item}, errMsg: {e}')
|
|
25
85
|
|
|
26
86
|
def _get_writer(self, filename, item):
|
|
87
|
+
"""
|
|
88
|
+
Get or create a CSV writer for a file.
|
|
89
|
+
获取或创建文件的CSV写入器。
|
|
90
|
+
|
|
91
|
+
This method returns an existing CSV writer for the given filename if one
|
|
92
|
+
exists, or creates a new one if not. When creating a new writer, it also
|
|
93
|
+
writes the header row using the keys of the provided item.
|
|
94
|
+
如果存在,此方法返回给定文件名的现有CSV写入器,如果不存在,则创建一个新的。
|
|
95
|
+
创建新写入器时,它还使用提供的项目的键写入标题行。
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
filename: The name of the CSV file.
|
|
99
|
+
CSV文件的名称。
|
|
100
|
+
item: The dictionary item whose keys will be used as headers.
|
|
101
|
+
其键将用作标题的字典项目。
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
csv.writer: A CSV writer object for the file.
|
|
105
|
+
文件的CSV写入器对象。
|
|
106
|
+
"""
|
|
107
|
+
# Try to get an existing writer
|
|
108
|
+
# 尝试获取现有的写入器
|
|
27
109
|
writer, *_ = self.csv_writer.get(filename, (None, None))
|
|
110
|
+
|
|
111
|
+
# If no writer exists, create a new one
|
|
112
|
+
# 如果不存在写入器,则创建一个新的
|
|
28
113
|
if writer is None:
|
|
114
|
+
# Open the file for writing
|
|
115
|
+
# 打开文件进行写入
|
|
29
116
|
file = open(filename, 'w', encoding="UTF8", newline='')
|
|
117
|
+
|
|
118
|
+
# Create a CSV writer
|
|
119
|
+
# 创建CSV写入器
|
|
30
120
|
writer = csv.writer(file)
|
|
121
|
+
|
|
122
|
+
# Write the header row using the item keys
|
|
123
|
+
# 使用项目键写入标题行
|
|
31
124
|
writer.writerow(item.keys())
|
|
125
|
+
|
|
126
|
+
# Store the writer and file handle
|
|
127
|
+
# 存储写入器和文件句柄
|
|
32
128
|
self.csv_writer[filename] = (writer, file)
|
|
129
|
+
|
|
33
130
|
return writer
|
|
34
131
|
|
|
35
132
|
def close_csv(self, filename=None):
|
|
133
|
+
"""
|
|
134
|
+
Close a specific CSV file.
|
|
135
|
+
关闭特定的CSV文件。
|
|
136
|
+
|
|
137
|
+
This method closes the file handle for a specific CSV file and removes
|
|
138
|
+
its writer from the csv_writer dictionary.
|
|
139
|
+
此方法关闭特定CSV文件的文件句柄,并从csv_writer字典中删除其写入器。
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
filename: The name of the CSV file to close.
|
|
143
|
+
要关闭的CSV文件的名称。
|
|
144
|
+
If None, nothing happens.
|
|
145
|
+
如果为None,则不会发生任何事情。
|
|
146
|
+
"""
|
|
147
|
+
# Remove the writer and file handle from the dictionary
|
|
148
|
+
# 从字典中删除写入器和文件句柄
|
|
36
149
|
*_, file = self.csv_writer.pop(filename, (None, None))
|
|
150
|
+
|
|
151
|
+
# If a file handle was found, close it
|
|
152
|
+
# 如果找到文件句柄,则关闭它
|
|
37
153
|
if file is not None:
|
|
38
154
|
logger.info(f'Closing csv: {filename}')
|
|
39
155
|
file.close()
|
|
40
156
|
|
|
41
157
|
def close(self):
|
|
158
|
+
"""
|
|
159
|
+
Close all open CSV files.
|
|
160
|
+
关闭所有打开的CSV文件。
|
|
161
|
+
|
|
162
|
+
This method closes all file handles for all CSV files that have been
|
|
163
|
+
opened by this instance.
|
|
164
|
+
此方法关闭此实例打开的所有CSV文件的所有文件句柄。
|
|
165
|
+
"""
|
|
166
|
+
# Make a copy of the keys to avoid modifying the dictionary during iteration
|
|
167
|
+
# 复制键以避免在迭代期间修改字典
|
|
42
168
|
for filename in list(self.csv_writer.keys()):
|
|
43
169
|
self.close_csv(filename)
|
|
44
170
|
|
|
45
171
|
|
|
46
172
|
class CsvPipeline(CsvSinkMixin):
|
|
173
|
+
"""
|
|
174
|
+
Pipeline for storing scraped items in CSV files.
|
|
175
|
+
用于将抓取的项目存储在CSV文件中的管道。
|
|
176
|
+
|
|
177
|
+
This pipeline uses the CsvSinkMixin to save items to CSV files. It processes
|
|
178
|
+
items that have a '__csv__' key, which contains parameters for the CSV file
|
|
179
|
+
such as the filename.
|
|
180
|
+
此管道使用CsvSinkMixin将项目保存到CSV文件中。它处理具有'__csv__'键的项目,
|
|
181
|
+
该键包含CSV文件的参数,如文件名。
|
|
182
|
+
"""
|
|
183
|
+
|
|
47
184
|
def __init__(self, settings):
|
|
185
|
+
"""
|
|
186
|
+
Initialize the CSV pipeline.
|
|
187
|
+
初始化CSV管道。
|
|
188
|
+
|
|
189
|
+
Args:
|
|
190
|
+
settings: The AioScrapy settings object.
|
|
191
|
+
AioScrapy设置对象。
|
|
192
|
+
Not used in the current implementation, but included for
|
|
193
|
+
compatibility with the pipeline interface.
|
|
194
|
+
在当前实现中未使用,但为了与管道接口兼容而包含。
|
|
195
|
+
"""
|
|
196
|
+
# Create an asyncio lock to serialize concurrent coroutine access to CSV files (asyncio.Lock is not thread-safe)
|
|
197
|
+
# 创建asyncio锁,使并发协程对CSV文件的访问串行化(asyncio.Lock并非线程安全)
|
|
48
198
|
self.lock = asyncio.Lock()
|
|
49
199
|
|
|
50
200
|
@classmethod
|
|
51
201
|
def from_settings(cls, settings):
|
|
202
|
+
"""
|
|
203
|
+
Create a CsvPipeline instance from settings.
|
|
204
|
+
从设置创建CsvPipeline实例。
|
|
205
|
+
|
|
206
|
+
This is the factory method used by AioScrapy to create pipeline instances.
|
|
207
|
+
这是AioScrapy用于创建管道实例的工厂方法。
|
|
208
|
+
|
|
209
|
+
Args:
|
|
210
|
+
settings: The AioScrapy settings object.
|
|
211
|
+
AioScrapy设置对象。
|
|
212
|
+
|
|
213
|
+
Returns:
|
|
214
|
+
CsvPipeline: A new CsvPipeline instance.
|
|
215
|
+
一个新的CsvPipeline实例。
|
|
216
|
+
"""
|
|
52
217
|
return cls(settings)
|
|
53
218
|
|
|
54
219
|
async def process_item(self, item, spider):
|
|
220
|
+
"""
|
|
221
|
+
Process an item and save it to a CSV file if it has a '__csv__' key.
|
|
222
|
+
处理项目,如果它有'__csv__'键,则将其保存到CSV文件。
|
|
223
|
+
|
|
224
|
+
This method checks if the item has a '__csv__' key. If it does, it uses
|
|
225
|
+
the parameters in that key to save the item to a CSV file. If not, it
|
|
226
|
+
logs a warning and returns the item unchanged.
|
|
227
|
+
此方法检查项目是否具有'__csv__'键。如果有,它使用该键中的参数将项目
|
|
228
|
+
保存到CSV文件。如果没有,它会记录警告并返回未更改的项目。
|
|
229
|
+
|
|
230
|
+
Args:
|
|
231
|
+
item: The item to process.
|
|
232
|
+
要处理的项目。
|
|
233
|
+
spider: The spider that generated the item.
|
|
234
|
+
生成项目的爬虫。
|
|
235
|
+
|
|
236
|
+
Returns:
|
|
237
|
+
dict: The processed item.
|
|
238
|
+
处理后的项目。
|
|
239
|
+
"""
|
|
240
|
+
# Extract CSV parameters from the item
|
|
241
|
+
# 从项目中提取CSV参数
|
|
55
242
|
execl_kw: Optional[dict] = item.pop('__csv__', None)
|
|
243
|
+
|
|
244
|
+
# If no CSV parameters, log a warning and return the item
|
|
245
|
+
# 如果没有CSV参数,记录警告并返回项目
|
|
56
246
|
if not execl_kw:
|
|
57
247
|
logger.warning(f"item Missing key __csv__, not stored")
|
|
58
248
|
return item
|
|
59
249
|
|
|
250
|
+
# Use the spider name as the default filename
|
|
251
|
+
# 使用爬虫名称作为默认文件名
|
|
60
252
|
execl_kw.setdefault('filename', spider.name)
|
|
253
|
+
|
|
254
|
+
# Use the asyncio lock so that only one coroutine writes to the CSV files at a time
|
|
254
|
+
# 使用asyncio锁,确保同一时刻只有一个协程写入CSV文件
|
|
61
256
|
async with self.lock:
|
|
257
|
+
# Save the item to a CSV file
|
|
258
|
+
# 将项目保存到CSV文件
|
|
62
259
|
await self.save_item(item, **execl_kw)
|
|
63
260
|
|
|
261
|
+
return item
|
|
262
|
+
|
|
64
263
|
async def close_spider(self, spider):
|
|
264
|
+
"""
|
|
265
|
+
Close all open CSV files when the spider is closed.
|
|
266
|
+
当爬虫关闭时关闭所有打开的CSV文件。
|
|
267
|
+
|
|
268
|
+
This method is called by AioScrapy when a spider is closed. It ensures
|
|
269
|
+
that all CSV files opened by this pipeline are properly closed.
|
|
270
|
+
当爬虫关闭时,AioScrapy调用此方法。它确保此管道打开的所有CSV文件
|
|
271
|
+
都正确关闭。
|
|
272
|
+
|
|
273
|
+
Args:
|
|
274
|
+
spider: The spider that was closed.
|
|
275
|
+
被关闭的爬虫。
|
|
276
|
+
"""
|
|
277
|
+
# Close all open CSV files
|
|
278
|
+
# 关闭所有打开的CSV文件
|
|
65
279
|
self.close()
|
|
66
280
|
|
|
67
281
|
|
|
282
|
+
# Test code for the CSV pipeline
|
|
283
|
+
# CSV管道的测试代码
|
|
68
284
|
if __name__ == '__main__':
|
|
285
|
+
"""
|
|
286
|
+
Test code to demonstrate the usage of the CsvPipeline.
|
|
287
|
+
演示CsvPipeline用法的测试代码。
|
|
288
|
+
|
|
289
|
+
This code creates a simple test spider and pipeline, processes a test item,
|
|
290
|
+
and then closes the pipeline.
|
|
291
|
+
此代码创建一个简单的测试爬虫和管道,处理一个测试项目,然后关闭管道。
|
|
292
|
+
"""
|
|
293
|
+
|
|
69
294
|
class TestSpider:
|
|
295
|
+
"""
|
|
296
|
+
Simple test spider class with a name attribute.
|
|
297
|
+
具有name属性的简单测试爬虫类。
|
|
298
|
+
"""
|
|
70
299
|
name = 'TestSpider'
|
|
71
300
|
|
|
72
301
|
|
|
73
302
|
async def test():
|
|
303
|
+
"""
|
|
304
|
+
Async test function to demonstrate the CsvPipeline.
|
|
305
|
+
演示CsvPipeline的异步测试函数。
|
|
306
|
+
"""
|
|
307
|
+
# Create a new CSV pipeline
|
|
308
|
+
# 创建一个新的CSV管道
|
|
74
309
|
p = CsvPipeline({})
|
|
310
|
+
|
|
311
|
+
# Process a test item with CSV parameters
|
|
312
|
+
# 处理带有CSV参数的测试项目
|
|
75
313
|
await p.process_item({
|
|
76
314
|
'title': '测试',
|
|
77
315
|
'img': '//www.baidu.com/img/flexible/logo/pc/result.png',
|
|
@@ -80,7 +318,11 @@ if __name__ == '__main__':
|
|
|
80
318
|
}
|
|
81
319
|
}, TestSpider())
|
|
82
320
|
|
|
321
|
+
# Close the pipeline
|
|
322
|
+
# 关闭管道
|
|
83
323
|
await p.close_spider(None)
|
|
84
324
|
|
|
85
325
|
|
|
326
|
+
# Run the test function
|
|
327
|
+
# 运行测试函数
|
|
86
328
|
asyncio.run(test())
|