aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/pipelines/mongo.py
CHANGED

@@ -1,3 +1,14 @@
+"""
+MongoDB Pipeline for AioScrapy
+AioScrapy的MongoDB管道
+
+This module provides a pipeline for storing scraped items in a MongoDB database.
+It extends the base database pipeline to implement MongoDB-specific functionality
+for batch inserting items.
+此模块提供了一个用于将抓取的项目存储在MongoDB数据库中的管道。
+它扩展了基本数据库管道,以实现MongoDB特定的批量插入项目功能。
+"""
+
 from aioscrapy.db import db_manager
 from aioscrapy.libs.pipelines import DBPipelineBase
 
@@ -5,28 +16,114 @@ from aioscrapy.utils.log import logger
 
 
 class MongoPipeline(DBPipelineBase):
+    """
+    Pipeline for storing scraped items in a MongoDB database.
+    用于将抓取的项目存储在MongoDB数据库中的管道。
+
+    This pipeline extends the base database pipeline to implement MongoDB-specific
+    functionality for batch inserting items. It supports multiple database connections,
+    custom database names, and ordered/unordered inserts.
+    此管道扩展了基本数据库管道,以实现MongoDB特定的批量插入项目功能。
+    它支持多个数据库连接、自定义数据库名称和有序/无序插入。
+    """
 
     def __init__(self, settings, db_type: str):
+        """
+        Initialize the MongoDB pipeline.
+        初始化MongoDB管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+            db_type: The database type, should be 'mongo'.
+                     数据库类型,应为'mongo'。
+        """
         super().__init__(settings, db_type)
+
+        # Dictionary to cache database names by cache key
+        # 按缓存键缓存数据库名称的字典
         self.db_cache = {}
+
+        # Dictionary to cache ordered insert flags by cache key
+        # 按缓存键缓存有序插入标志的字典
         self.ordered_cache = {}
+
+        # Number of times to retry MongoDB operations on timeout
+        # MongoDB操作超时时重试的次数
         self.retry_times = settings.getint("MONGO_TIMEOUT_RETRY_TIMES", 3)
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a MongoPipeline instance from settings.
+        从设置创建MongoPipeline实例。
+
+        This is the factory method used by AioScrapy to create pipeline instances.
+        It initializes the pipeline with the appropriate database type ('mongo').
+        这是AioScrapy用于创建管道实例的工厂方法。
+        它使用适当的数据库类型('mongo')初始化管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+
+        Returns:
+            MongoPipeline: A new MongoPipeline instance.
+                           一个新的MongoPipeline实例。
+        """
         return cls(settings, 'mongo')
 
     def parse_item_to_cache(self, item: dict, save_info: dict):
+        """
+        Parse an item and add it to the cache.
+        解析项目并将其添加到缓存中。
+
+        This method overrides the base class method to handle MongoDB-specific
+        caching requirements, such as database names and ordered insert flags.
+        此方法覆盖基类方法,以处理MongoDB特定的缓存需求,如数据库名称和有序插入标志。
+
+        Args:
+            item: The item to cache.
+                  要缓存的项目。
+            save_info: Dictionary containing information about how to save the item.
+                       包含有关如何保存项目的信息的字典。
+                       Must contain 'table_name' and may contain 'db_name',
+                       'ordered', and 'db_alias'.
+                       必须包含'table_name',可能包含'db_name'、'ordered'和'db_alias'。
+
+        Returns:
+            tuple: A tuple containing the cache key and the number of items in the cache.
+                   包含缓存键和缓存中项目数量的元组。
+
+        Raises:
+            AssertionError: If table_name is not provided in save_info.
+                            如果在save_info中未提供table_name。
+        """
+        # Extract information from save_info
+        # 从save_info中提取信息
         db_name = save_info.get('db_name')
         table_name = save_info.get('table_name')
         ordered = save_info.get('ordered', False)
+
+        # Ensure table_name is provided
+        # 确保提供了table_name
         assert table_name is not None, 'please set table_name'
+
+        # Get database aliases, defaulting to ['default']
+        # 获取数据库别名,默认为['default']
         db_alias = save_info.get('db_alias', ['default'])
+
+        # Convert string db_alias to list
+        # 将字符串db_alias转换为列表
         if isinstance(db_alias, str):
             db_alias = [db_alias]
 
+        # Generate a unique cache key based on the save_info
+        # 根据save_info生成唯一的缓存键
         cache_key = ''.join(db_alias) + (db_name or '') + table_name + str(ordered)
 
+        # If this is a new cache key, initialize the caches
+        # 如果这是一个新的缓存键,初始化缓存
         if self.table_cache.get(cache_key) is None:
             self.db_alias_cache[cache_key] = db_alias
             self.table_cache[cache_key] = table_name
@@ -34,23 +131,58 @@ class MongoPipeline(DBPipelineBase):
             self.ordered_cache[cache_key] = ordered
             self.item_cache[cache_key] = []
 
+        # Add the item to the cache
+        # 将项目添加到缓存
         self.item_cache[cache_key].append(item)
+
+        # Return the cache key and the number of items in the cache
+        # 返回缓存键和缓存中的项目数量
         return cache_key, len(self.item_cache[cache_key])
 
     async def _save(self, cache_key):
+        """
+        Save cached items with the given cache key to the MongoDB database.
+        将具有给定缓存键的缓存项目保存到MongoDB数据库。
+
+        This method implements the abstract _save method from the base class.
+        It retrieves the cached items for the given cache key, then executes
+        a batch insert operation on each configured database connection.
+        此方法实现了基类中的抽象_save方法。
+        它检索给定缓存键的缓存项目,然后在每个配置的数据库连接上执行批量插入操作。
+
+        Args:
+            cache_key: The cache key used to retrieve the cached items and metadata.
+                       用于检索缓存项目和元数据的缓存键。
+        """
+        # Get the table name from the cache
+        # 从缓存获取表名
         table_name = self.table_cache[cache_key]
         try:
+            # Process each database alias (connection) configured for this cache key
+            # 处理为此缓存键配置的每个数据库别名(连接)
             for alias in self.db_alias_cache[cache_key]:
                 try:
+                    # Get a MongoDB executor for this alias
+                    # 获取此别名的MongoDB执行器
                    executor = db_manager.mongo.executor(alias)
+
+                    # Execute the batch insert operation
+                    # 执行批量插入操作
                    result = await executor.insert(
                        table_name, self.item_cache[cache_key], db_name=self.db_cache[cache_key],
                        ordered=self.ordered_cache[cache_key], retry_times=self.retry_times
                    )
+
+                    # Log the result of the operation
+                    # 记录操作结果
                    logger.info(
                        f'table:{alias}->{table_name} sum:{len(self.item_cache[cache_key])} ok:{len(result.inserted_ids)}'
                    )
                 except Exception as e:
+                    # Log any errors that occur during the operation
+                    # 记录操作期间发生的任何错误
                    logger.exception(f'save data error, table:{alias}->{table_name}, err_msg:{e}')
         finally:
+            # Clear the cache after processing, regardless of success or failure
+            # 处理后清除缓存,无论成功或失败
             self.item_cache[cache_key] = []
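
For reference, the parse_item_to_cache additions above build a cache key from the item's save_info dictionary. A minimal standalone sketch of that key construction; the 'crawl_db' and 'articles' values are illustrative, not taken from the package:

save_info = {
    'db_alias': 'default',     # a single alias or a list of connection aliases
    'db_name': 'crawl_db',     # optional MongoDB database name (hypothetical)
    'table_name': 'articles',  # required collection name (hypothetical)
    'ordered': False,          # unordered bulk insert
}

db_name = save_info.get('db_name')
table_name = save_info.get('table_name')
ordered = save_info.get('ordered', False)
assert table_name is not None, 'please set table_name'

db_alias = save_info.get('db_alias', ['default'])
if isinstance(db_alias, str):
    db_alias = [db_alias]

# Same expression as in the diff: one key per (aliases, db, collection, ordered) combination
cache_key = ''.join(db_alias) + (db_name or '') + table_name + str(ordered)
print(cache_key)  # -> defaultcrawl_dbarticlesFalse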
aioscrapy/libs/pipelines/mysql.py
CHANGED

@@ -1,3 +1,14 @@
+"""
+MySQL Pipeline for AioScrapy
+AioScrapy的MySQL管道
+
+This module provides a pipeline for storing scraped items in a MySQL database.
+It extends the base database pipeline to implement MySQL-specific functionality
+for batch inserting items.
+此模块提供了一个用于将抓取的项目存储在MySQL数据库中的管道。
+它扩展了基本数据库管道,以实现MySQL特定的批量插入项目功能。
+"""
+
 from aioscrapy.db import db_manager
 from aioscrapy.libs.pipelines import DBPipelineBase
 
@@ -5,22 +16,78 @@ from aioscrapy.utils.log import logger
 
 
 class MysqlPipeline(DBPipelineBase):
+    """
+    Pipeline for storing scraped items in a MySQL database.
+    用于将抓取的项目存储在MySQL数据库中的管道。
+
+    This pipeline extends the base database pipeline to implement MySQL-specific
+    functionality for batch inserting items. It uses the database manager to
+    handle connections and transactions.
+    此管道扩展了基本数据库管道,以实现MySQL特定的批量插入项目功能。
+    它使用数据库管理器来处理连接和事务。
+    """
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a MysqlPipeline instance from settings.
+        从设置创建MysqlPipeline实例。
+
+        This is the factory method used by AioScrapy to create pipeline instances.
+        It initializes the pipeline with the appropriate database type ('mysql').
+        这是AioScrapy用于创建管道实例的工厂方法。
+        它使用适当的数据库类型('mysql')初始化管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+
+        Returns:
+            MysqlPipeline: A new MysqlPipeline instance.
+                           一个新的MysqlPipeline实例。
+        """
         return cls(settings, 'mysql')
 
     async def _save(self, cache_key):
+        """
+        Save cached items with the given cache key to the MySQL database.
+        将具有给定缓存键的缓存项目保存到MySQL数据库。
+
+        This method implements the abstract _save method from the base class.
+        It retrieves the cached items and SQL statement for the given cache key,
+        then executes a batch insert operation on each configured database connection.
+        此方法实现了基类中的抽象_save方法。
+        它检索给定缓存键的缓存项目和SQL语句,然后在每个配置的数据库连接上执行批量插入操作。
+
+        Args:
+            cache_key: The cache key used to retrieve the cached items, SQL statement,
+                       and other metadata needed for the database operation.
+                       用于检索缓存项目、SQL语句和数据库操作所需的其他元数据的缓存键。
+        """
+        # Get the table name from the cache
+        # 从缓存获取表名
         table_name = self.table_cache[cache_key]
         try:
+            # Process each database alias (connection) configured for this cache key
+            # 处理为此缓存键配置的每个数据库别名(连接)
             for alias in self.db_alias_cache[cache_key]:
+                # Get a database connection and cursor with ping to ensure the connection is alive
+                # 获取数据库连接和游标,并使用ping确保连接处于活动状态
                 async with db_manager.mysql.get(alias, ping=True) as (conn, cursor):
                     try:
+                        # Execute the batch insert operation
+                        # 执行批量插入操作
                         num = await cursor.executemany(
                             self.insert_sql_cache[cache_key], self.item_cache[cache_key]
                         )
+                        # Log the result of the operation
+                        # 记录操作结果
                         logger.info(f'table:{alias}->{table_name} sum:{len(self.item_cache[cache_key])} ok:{num}')
                     except Exception as e:
+                        # Log any errors that occur during the operation
+                        # 记录操作期间发生的任何错误
                         logger.exception(f'save data error, table:{alias}->{table_name}, err_msg:{e}')
         finally:
+            # Clear the cache after processing, regardless of success or failure
+            # 处理后清除缓存,无论成功或失败
             self.item_cache[cache_key] = []
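
MysqlPipeline._save above passes a cached SQL template and the cached rows to cursor.executemany. A rough sketch of the shapes involved, assuming the base pipeline builds a parameterized INSERT; the column names, placeholder style, and values below are illustrative, not taken from DBPipelineBase:

# Hypothetical equivalents of insert_sql_cache[cache_key] and item_cache[cache_key]
insert_sql = "INSERT INTO articles (title, url) VALUES (%s, %s)"
rows = [
    ("First title", "https://example.com/a"),
    ("Second title", "https://example.com/b"),
]

# Inside _save this corresponds roughly to:
#     num = await cursor.executemany(self.insert_sql_cache[cache_key],
#                                    self.item_cache[cache_key])
# with num being the row count reported by the driver and logged as 'ok:{num}'.
print(insert_sql, len(rows))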
aioscrapy/libs/pipelines/pg.py
CHANGED

@@ -1,3 +1,14 @@
+"""
+PostgreSQL Pipeline for AioScrapy
+AioScrapy的PostgreSQL管道
+
+This module provides a pipeline for storing scraped items in a PostgreSQL database.
+It extends the base database pipeline to implement PostgreSQL-specific functionality
+for batch inserting items.
+此模块提供了一个用于将抓取的项目存储在PostgreSQL数据库中的管道。
+它扩展了基本数据库管道,以实现PostgreSQL特定的批量插入项目功能。
+"""
+
 from aioscrapy.db import db_manager
 from aioscrapy.libs.pipelines import DBPipelineBase
 
@@ -5,22 +16,78 @@ from aioscrapy.utils.log import logger
 
 
 class PGPipeline(DBPipelineBase):
+    """
+    Pipeline for storing scraped items in a PostgreSQL database.
+    用于将抓取的项目存储在PostgreSQL数据库中的管道。
+
+    This pipeline extends the base database pipeline to implement PostgreSQL-specific
+    functionality for batch inserting items. It uses the database manager to handle
+    connections and transactions.
+    此管道扩展了基本数据库管道,以实现PostgreSQL特定的批量插入项目功能。
+    它使用数据库管理器来处理连接和事务。
+    """
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a PGPipeline instance from settings.
+        从设置创建PGPipeline实例。
+
+        This is the factory method used by AioScrapy to create pipeline instances.
+        It initializes the pipeline with the appropriate database type ('pg').
+        这是AioScrapy用于创建管道实例的工厂方法。
+        它使用适当的数据库类型('pg')初始化管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+
+        Returns:
+            PGPipeline: A new PGPipeline instance.
+                        一个新的PGPipeline实例。
+        """
         return cls(settings, 'pg')
 
     async def _save(self, cache_key):
+        """
+        Save cached items with the given cache key to the PostgreSQL database.
+        将具有给定缓存键的缓存项目保存到PostgreSQL数据库。
+
+        This method implements the abstract _save method from the base class.
+        It retrieves the cached items and SQL statement for the given cache key,
+        then executes a batch insert operation on each configured database connection.
+        此方法实现了基类中的抽象_save方法。
+        它检索给定缓存键的缓存项目和SQL语句,然后在每个配置的数据库连接上执行批量插入操作。
+
+        Args:
+            cache_key: The cache key used to retrieve the cached items, SQL statement,
+                       and other metadata needed for the database operation.
+                       用于检索缓存项目、SQL语句和数据库操作所需的其他元数据的缓存键。
+        """
+        # Get the table name from the cache
+        # 从缓存获取表名
         table_name = self.table_cache[cache_key]
         try:
+            # Process each database alias (connection) configured for this cache key
+            # 处理为此缓存键配置的每个数据库别名(连接)
             for alias in self.db_alias_cache[cache_key]:
+                # Get a database connection with context manager to ensure proper cleanup
+                # 使用上下文管理器获取数据库连接,以确保正确清理
                 async with db_manager.pg.get(alias) as conn:
                     try:
+                        # Execute the batch insert operation
+                        # 执行批量插入操作
                         num = await conn.executemany(
                             self.insert_sql_cache[cache_key], self.item_cache[cache_key]
                         )
+                        # Log the result of the operation
+                        # 记录操作结果
                         logger.info(f'table:{alias}->{table_name} sum:{len(self.item_cache[cache_key])} ok:{num}')
                     except Exception as e:
+                        # Log any errors that occur during the operation
+                        # 记录操作期间发生的任何错误
                         logger.exception(f'save data error, table:{alias}->{table_name}, err_msg:{e}')
         finally:
+            # Clear the cache after processing, regardless of success or failure
+            # 处理后清除缓存,无论成功或失败
             self.item_cache[cache_key] = []
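
All three pipelines in this diff flush batches the same way: write the cached items to every configured alias, log success or failure per alias, and always clear the cache afterwards. A driver-agnostic sketch of that pattern; flush and write_batch below are stand-ins for illustration, not aioscrapy APIs:

import asyncio

async def flush(cache_key, db_alias_cache, item_cache, write_batch):
    """Write one cached batch to every alias, then drop the batch (sketch)."""
    try:
        for alias in db_alias_cache[cache_key]:
            try:
                await write_batch(alias, item_cache[cache_key])
            except Exception as exc:
                # A failing alias is logged and does not stop the remaining aliases
                print(f'save data error, alias:{alias}, err_msg:{exc}')
    finally:
        # Mirror the diffs: the batch is cleared whether or not the writes succeeded
        item_cache[cache_key] = []

async def fake_write(alias, rows):
    print(f'{alias}: wrote {len(rows)} rows')

asyncio.run(flush('k', {'k': ['default']}, {'k': [{'title': 'demo'}]}, fake_write))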
aioscrapy/libs/spider/depth.py
CHANGED

@@ -1,54 +1,192 @@
 """
 Depth Spider Middleware
+深度爬虫中间件
 
-
+This middleware tracks the depth of requests and can be used to limit the maximum
+depth of crawls. It also adjusts request priorities based on depth and collects
+depth statistics.
+此中间件跟踪请求的深度,可用于限制爬取的最大深度。它还根据深度调整请求优先级
+并收集深度统计信息。
 """
 
 from aioscrapy.http import Request
-
 from aioscrapy.utils.log import logger
 
 
 class DepthMiddleware:
+    """
+    Spider middleware to track the depth of requests.
+    用于跟踪请求深度的爬虫中间件。
+
+    This middleware tracks how many nested links the crawler has followed from the
+    initial request (depth). It can be used to limit the maximum depth of crawls,
+    adjust request priorities based on depth, and collect depth statistics.
+    此中间件跟踪爬虫从初始请求开始已经跟随了多少层嵌套链接(深度)。它可用于限制
+    爬取的最大深度,根据深度调整请求优先级,并收集深度统计信息。
+    """
 
     def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
+        """
+        Initialize the depth middleware.
+        初始化深度中间件。
+
+        Args:
+            maxdepth: Maximum allowed depth. If None or 0, no limit is imposed.
+                      允许的最大深度。如果为None或0,则不施加限制。
+            stats: Stats collector instance.
+                   统计收集器实例。
+            verbose_stats: Whether to collect detailed stats for each depth level.
+                           是否收集每个深度级别的详细统计信息。
+                           Defaults to False.
+                           默认为False。
+            prio: Priority adjustment per depth level.
+                  每个深度级别的优先级调整。
+                  Defaults to 1.
+                  默认为1。
+        """
+        # Maximum allowed depth
+        # 允许的最大深度
         self.maxdepth = maxdepth
+
+        # Stats collector instance
+        # 统计收集器实例
         self.stats = stats
+
+        # Whether to collect detailed stats for each depth level
+        # 是否收集每个深度级别的详细统计信息
         self.verbose_stats = verbose_stats
+
+        # Priority adjustment per depth level
+        # 每个深度级别的优先级调整
         self.prio = prio
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a DepthMiddleware instance from a crawler.
+        从爬虫创建DepthMiddleware实例。
+
+        This is the factory method used by AioScrapy to create the middleware.
+        这是AioScrapy用于创建中间件的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this middleware.
+                     将使用此中间件的爬虫。
+
+        Returns:
+            DepthMiddleware: A new DepthMiddleware instance.
+                             一个新的DepthMiddleware实例。
+        """
+        # Get settings from crawler
+        # 从爬虫获取设置
         settings = crawler.settings
+
+        # Get maximum depth from settings
+        # 从设置获取最大深度
         maxdepth = settings.getint('DEPTH_LIMIT')
+
+        # Get verbose stats setting
+        # 获取详细统计设置
         verbose = settings.getbool('DEPTH_STATS_VERBOSE')
+
+        # Get priority adjustment setting
+        # 获取优先级调整设置
         prio = settings.getint('DEPTH_PRIORITY')
+
+        # Create and return a new instance
+        # 创建并返回一个新实例
         return cls(maxdepth, crawler.stats, verbose, prio)
 
     async def process_spider_output(self, response, result, spider):
+        """
+        Process the spider output to track request depth.
+        处理爬虫输出以跟踪请求深度。
+
+        This method processes each request yielded by the spider, tracks its depth,
+        adjusts its priority, and filters out requests that exceed the maximum depth.
+        此方法处理爬虫产生的每个请求,跟踪其深度,调整其优先级,并过滤掉超过最大深度的请求。
+
+        Args:
+            response: The response being processed.
+                      正在处理的响应。
+            result: The result returned by the spider.
+                    爬虫返回的结果。
+            spider: The spider that generated the result.
+                    生成结果的爬虫。
+
+        Returns:
+            An async generator yielding filtered requests.
+            一个产生过滤后请求的异步生成器。
+        """
         def _filter(request):
+            """
+            Filter function to process and possibly filter out requests based on depth.
+            基于深度处理并可能过滤掉请求的过滤函数。
+
+            Args:
+                request: The request to process.
+                         要处理的请求。
+
+            Returns:
+                bool: True if the request should be kept, False if it should be filtered out.
+                      如果应保留请求,则为True;如果应过滤掉请求,则为False。
+            """
+            # Only process Request objects
+            # 只处理Request对象
             if isinstance(request, Request):
+                # Calculate depth of this request (parent depth + 1)
+                # 计算此请求的深度(父深度 + 1)
                 depth = response.meta['depth'] + 1
+
+                # Store depth in request metadata
+                # 将深度存储在请求元数据中
                 request.meta['depth'] = depth
+
+                # Adjust priority based on depth if enabled
+                # 如果启用,则根据深度调整优先级
                 if self.prio:
                     request.priority -= depth * self.prio
+
+                # Check if request exceeds maximum depth
+                # 检查请求是否超过最大深度
                 if self.maxdepth and depth > self.maxdepth:
+                    # Log ignored request
+                    # 记录被忽略的请求
                     logger.debug("Ignoring link (depth > %(maxdepth)d): %(requrl)s " % {
                         'maxdepth': self.maxdepth, 'requrl': request.url
                     })
+                    # Filter out this request
+                    # 过滤掉此请求
                     return False
                 else:
+                    # Update depth statistics
+                    # 更新深度统计信息
                     if self.verbose_stats:
+                        # Increment count for this depth level
+                        # 增加此深度级别的计数
                         self.stats.inc_value(f'request_depth_count/{depth}',
                                              spider=spider)
+
+                    # Update maximum depth reached
+                    # 更新达到的最大深度
                     self.stats.max_value('request_depth_max', depth,
                                          spider=spider)
+            # Keep all non-Request objects and requests that didn't exceed max depth
+            # 保留所有非Request对象和未超过最大深度的请求
             return True
 
-        # base case (depth
+        # Handle the base case (initial response with no depth)
+        # 处理基本情况(没有深度的初始响应)
         if 'depth' not in response.meta:
+            # Set depth to 0 for the initial response
+            # 为初始响应设置深度为0
             response.meta['depth'] = 0
+
+            # Update depth statistics for depth 0
+            # 更新深度0的深度统计信息
             if self.verbose_stats:
                 self.stats.inc_value('request_depth_count/0', spider=spider)
 
+        # Filter the results using the _filter function
+        # 使用_filter函数过滤结果
         return (r async for r in result or () if _filter(r))