aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0

aioscrapy/libs/pipelines/mongo.py
@@ -1,3 +1,14 @@
+"""
+MongoDB Pipeline for AioScrapy
+AioScrapy的MongoDB管道
+
+This module provides a pipeline for storing scraped items in a MongoDB database.
+It extends the base database pipeline to implement MongoDB-specific functionality
+for batch inserting items.
+此模块提供了一个用于将抓取的项目存储在MongoDB数据库中的管道。
+它扩展了基本数据库管道,以实现MongoDB特定的批量插入项目功能。
+"""
+
 from aioscrapy.db import db_manager
 from aioscrapy.libs.pipelines import DBPipelineBase
 
@@ -5,28 +16,114 @@ from aioscrapy.utils.log import logger
 
 
 class MongoPipeline(DBPipelineBase):
+    """
+    Pipeline for storing scraped items in a MongoDB database.
+    用于将抓取的项目存储在MongoDB数据库中的管道。
+
+    This pipeline extends the base database pipeline to implement MongoDB-specific
+    functionality for batch inserting items. It supports multiple database connections,
+    custom database names, and ordered/unordered inserts.
+    此管道扩展了基本数据库管道,以实现MongoDB特定的批量插入项目功能。
+    它支持多个数据库连接、自定义数据库名称和有序/无序插入。
+    """
 
     def __init__(self, settings, db_type: str):
+        """
+        Initialize the MongoDB pipeline.
+        初始化MongoDB管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+            db_type: The database type, should be 'mongo'.
+                     数据库类型,应为'mongo'。
+        """
         super().__init__(settings, db_type)
+
+        # Dictionary to cache database names by cache key
+        # 按缓存键缓存数据库名称的字典
         self.db_cache = {}
+
+        # Dictionary to cache ordered insert flags by cache key
+        # 按缓存键缓存有序插入标志的字典
         self.ordered_cache = {}
+
+        # Number of times to retry MongoDB operations on timeout
+        # MongoDB操作超时时重试的次数
         self.retry_times = settings.getint("MONGO_TIMEOUT_RETRY_TIMES", 3)
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a MongoPipeline instance from settings.
+        从设置创建MongoPipeline实例。
+
+        This is the factory method used by AioScrapy to create pipeline instances.
+        It initializes the pipeline with the appropriate database type ('mongo').
+        这是AioScrapy用于创建管道实例的工厂方法。
+        它使用适当的数据库类型('mongo')初始化管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+
+        Returns:
+            MongoPipeline: A new MongoPipeline instance.
+                           一个新的MongoPipeline实例。
+        """
         return cls(settings, 'mongo')
 
     def parse_item_to_cache(self, item: dict, save_info: dict):
+        """
+        Parse an item and add it to the cache.
+        解析项目并将其添加到缓存中。
+
+        This method overrides the base class method to handle MongoDB-specific
+        caching requirements, such as database names and ordered insert flags.
+        此方法覆盖基类方法,以处理MongoDB特定的缓存需求,如数据库名称和有序插入标志。
+
+        Args:
+            item: The item to cache.
+                  要缓存的项目。
+            save_info: Dictionary containing information about how to save the item.
+                       包含有关如何保存项目的信息的字典。
+                       Must contain 'table_name' and may contain 'db_name',
+                       'ordered', and 'db_alias'.
+                       必须包含'table_name',可能包含'db_name'、'ordered'和'db_alias'。
+
+        Returns:
+            tuple: A tuple containing the cache key and the number of items in the cache.
+                   包含缓存键和缓存中项目数量的元组。
+
+        Raises:
+            AssertionError: If table_name is not provided in save_info.
+                            如果在save_info中未提供table_name。
+        """
+        # Extract information from save_info
+        # 从save_info中提取信息
         db_name = save_info.get('db_name')
         table_name = save_info.get('table_name')
         ordered = save_info.get('ordered', False)
+
+        # Ensure table_name is provided
+        # 确保提供了table_name
         assert table_name is not None, 'please set table_name'
+
+        # Get database aliases, defaulting to ['default']
+        # 获取数据库别名,默认为['default']
         db_alias = save_info.get('db_alias', ['default'])
+
+        # Convert string db_alias to list
+        # 将字符串db_alias转换为列表
         if isinstance(db_alias, str):
             db_alias = [db_alias]
 
+        # Generate a unique cache key based on the save_info
+        # 根据save_info生成唯一的缓存键
         cache_key = ''.join(db_alias) + (db_name or '') + table_name + str(ordered)
 
+        # If this is a new cache key, initialize the caches
+        # 如果这是一个新的缓存键,初始化缓存
         if self.table_cache.get(cache_key) is None:
             self.db_alias_cache[cache_key] = db_alias
             self.table_cache[cache_key] = table_name
@@ -34,23 +131,58 @@ class MongoPipeline(DBPipelineBase):
             self.ordered_cache[cache_key] = ordered
             self.item_cache[cache_key] = []
 
+        # Add the item to the cache
+        # 将项目添加到缓存
         self.item_cache[cache_key].append(item)
+
+        # Return the cache key and the number of items in the cache
+        # 返回缓存键和缓存中的项目数量
         return cache_key, len(self.item_cache[cache_key])
 
     async def _save(self, cache_key):
+        """
+        Save cached items with the given cache key to the MongoDB database.
+        将具有给定缓存键的缓存项目保存到MongoDB数据库。
+
+        This method implements the abstract _save method from the base class.
+        It retrieves the cached items for the given cache key, then executes
+        a batch insert operation on each configured database connection.
+        此方法实现了基类中的抽象_save方法。
+        它检索给定缓存键的缓存项目,然后在每个配置的数据库连接上执行批量插入操作。
+
+        Args:
+            cache_key: The cache key used to retrieve the cached items and metadata.
+                       用于检索缓存项目和元数据的缓存键。
+        """
+        # Get the table name from the cache
+        # 从缓存获取表名
         table_name = self.table_cache[cache_key]
         try:
+            # Process each database alias (connection) configured for this cache key
+            # 处理为此缓存键配置的每个数据库别名(连接)
             for alias in self.db_alias_cache[cache_key]:
                 try:
+                    # Get a MongoDB executor for this alias
+                    # 获取此别名的MongoDB执行器
                    executor = db_manager.mongo.executor(alias)
+
+                    # Execute the batch insert operation
+                    # 执行批量插入操作
                     result = await executor.insert(
                         table_name, self.item_cache[cache_key], db_name=self.db_cache[cache_key],
                         ordered=self.ordered_cache[cache_key], retry_times=self.retry_times
                     )
+
+                    # Log the result of the operation
+                    # 记录操作结果
                     logger.info(
                         f'table:{alias}->{table_name} sum:{len(self.item_cache[cache_key])} ok:{len(result.inserted_ids)}'
                     )
                 except Exception as e:
+                    # Log any errors that occur during the operation
+                    # 记录操作期间发生的任何错误
                     logger.exception(f'save data error, table:{alias}->{table_name}, err_msg:{e}')
        finally:
+            # Clear the cache after processing, regardless of success or failure
+            # 处理后清除缓存,无论成功或失败
             self.item_cache[cache_key] = []
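
To make the buffering behaviour documented above concrete, here is a standalone restatement of the cache-key scheme used by parse_item_to_cache (illustrative only; it re-derives the key from the documented save_info fields outside the pipeline class, and the sample values are hypothetical):

    def make_cache_key(save_info: dict) -> str:
        # Mirrors MongoPipeline.parse_item_to_cache: items whose save_info
        # resolves to the same key share one in-memory batch.
        db_name = save_info.get('db_name')
        table_name = save_info.get('table_name')
        ordered = save_info.get('ordered', False)
        assert table_name is not None, 'please set table_name'
        db_alias = save_info.get('db_alias', ['default'])
        if isinstance(db_alias, str):
            db_alias = [db_alias]
        return ''.join(db_alias) + (db_name or '') + table_name + str(ordered)

    # Hypothetical save_info values; 'table_name' is the only required key.
    print(make_cache_key({'table_name': 'articles', 'db_name': 'news', 'db_alias': 'default'}))
    # -> 'defaultnewsarticlesFalse'

Items with the same target collection, database, aliases, and ordered flag therefore accumulate in the same buffer until the base pipeline triggers _save for that key.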

aioscrapy/libs/pipelines/mysql.py
@@ -1,3 +1,14 @@
+"""
+MySQL Pipeline for AioScrapy
+AioScrapy的MySQL管道
+
+This module provides a pipeline for storing scraped items in a MySQL database.
+It extends the base database pipeline to implement MySQL-specific functionality
+for batch inserting items.
+此模块提供了一个用于将抓取的项目存储在MySQL数据库中的管道。
+它扩展了基本数据库管道,以实现MySQL特定的批量插入项目功能。
+"""
+
 from aioscrapy.db import db_manager
 from aioscrapy.libs.pipelines import DBPipelineBase
 
@@ -5,22 +16,78 @@ from aioscrapy.utils.log import logger
 
 
 class MysqlPipeline(DBPipelineBase):
+    """
+    Pipeline for storing scraped items in a MySQL database.
+    用于将抓取的项目存储在MySQL数据库中的管道。
+
+    This pipeline extends the base database pipeline to implement MySQL-specific
+    functionality for batch inserting items. It uses the database manager to
+    handle connections and transactions.
+    此管道扩展了基本数据库管道,以实现MySQL特定的批量插入项目功能。
+    它使用数据库管理器来处理连接和事务。
+    """
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a MysqlPipeline instance from settings.
+        从设置创建MysqlPipeline实例。
+
+        This is the factory method used by AioScrapy to create pipeline instances.
+        It initializes the pipeline with the appropriate database type ('mysql').
+        这是AioScrapy用于创建管道实例的工厂方法。
+        它使用适当的数据库类型('mysql')初始化管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+
+        Returns:
+            MysqlPipeline: A new MysqlPipeline instance.
+                           一个新的MysqlPipeline实例。
+        """
         return cls(settings, 'mysql')
 
     async def _save(self, cache_key):
+        """
+        Save cached items with the given cache key to the MySQL database.
+        将具有给定缓存键的缓存项目保存到MySQL数据库。
+
+        This method implements the abstract _save method from the base class.
+        It retrieves the cached items and SQL statement for the given cache key,
+        then executes a batch insert operation on each configured database connection.
+        此方法实现了基类中的抽象_save方法。
+        它检索给定缓存键的缓存项目和SQL语句,然后在每个配置的数据库连接上执行批量插入操作。
+
+        Args:
+            cache_key: The cache key used to retrieve the cached items, SQL statement,
+                       and other metadata needed for the database operation.
+                       用于检索缓存项目、SQL语句和数据库操作所需的其他元数据的缓存键。
+        """
+        # Get the table name from the cache
+        # 从缓存获取表名
         table_name = self.table_cache[cache_key]
         try:
+            # Process each database alias (connection) configured for this cache key
+            # 处理为此缓存键配置的每个数据库别名(连接)
             for alias in self.db_alias_cache[cache_key]:
+                # Get a database connection and cursor with ping to ensure the connection is alive
+                # 获取数据库连接和游标,并使用ping确保连接处于活动状态
                 async with db_manager.mysql.get(alias, ping=True) as (conn, cursor):
                     try:
+                        # Execute the batch insert operation
+                        # 执行批量插入操作
                         num = await cursor.executemany(
                             self.insert_sql_cache[cache_key], self.item_cache[cache_key]
                         )
+                        # Log the result of the operation
+                        # 记录操作结果
                         logger.info(f'table:{alias}->{table_name} sum:{len(self.item_cache[cache_key])} ok:{num}')
                     except Exception as e:
+                        # Log any errors that occur during the operation
+                        # 记录操作期间发生的任何错误
                         logger.exception(f'save data error, table:{alias}->{table_name}, err_msg:{e}')
         finally:
+            # Clear the cache after processing, regardless of success or failure
+            # 处理后清除缓存,无论成功或失败
             self.item_cache[cache_key] = []
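
The batch write in _save relies on the standard executemany() pattern: one parameterized INSERT statement per cache key (held in insert_sql_cache by the base pipeline) and one parameter tuple per cached item. A minimal sketch of that data shape follows; the table, column names, and placeholder style are hypothetical and depend on the configured driver, and the PGPipeline diff below uses the same pattern via conn.executemany:

    # Hypothetical shape of what MysqlPipeline hands to cursor.executemany():
    insert_sql = "INSERT INTO articles (title, url) VALUES (%s, %s)"
    rows = [
        ("first title", "https://example.com/1"),
        ("second title", "https://example.com/2"),
    ]
    # Inside _save (simplified):
    #     num = await cursor.executemany(insert_sql, rows)
    # `num` is the affected-row count that appears in the log line as `ok:{num}`.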

aioscrapy/libs/pipelines/pg.py
@@ -1,3 +1,14 @@
+"""
+PostgreSQL Pipeline for AioScrapy
+AioScrapy的PostgreSQL管道
+
+This module provides a pipeline for storing scraped items in a PostgreSQL database.
+It extends the base database pipeline to implement PostgreSQL-specific functionality
+for batch inserting items.
+此模块提供了一个用于将抓取的项目存储在PostgreSQL数据库中的管道。
+它扩展了基本数据库管道,以实现PostgreSQL特定的批量插入项目功能。
+"""
+
 from aioscrapy.db import db_manager
 from aioscrapy.libs.pipelines import DBPipelineBase
 
@@ -5,22 +16,78 @@ from aioscrapy.utils.log import logger
 
 
 class PGPipeline(DBPipelineBase):
+    """
+    Pipeline for storing scraped items in a PostgreSQL database.
+    用于将抓取的项目存储在PostgreSQL数据库中的管道。
+
+    This pipeline extends the base database pipeline to implement PostgreSQL-specific
+    functionality for batch inserting items. It uses the database manager to handle
+    connections and transactions.
+    此管道扩展了基本数据库管道,以实现PostgreSQL特定的批量插入项目功能。
+    它使用数据库管理器来处理连接和事务。
+    """
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a PGPipeline instance from settings.
+        从设置创建PGPipeline实例。
+
+        This is the factory method used by AioScrapy to create pipeline instances.
+        It initializes the pipeline with the appropriate database type ('pg').
+        这是AioScrapy用于创建管道实例的工厂方法。
+        它使用适当的数据库类型('pg')初始化管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                      AioScrapy设置对象。
+
+        Returns:
+            PGPipeline: A new PGPipeline instance.
+                        一个新的PGPipeline实例。
+        """
         return cls(settings, 'pg')
 
     async def _save(self, cache_key):
+        """
+        Save cached items with the given cache key to the PostgreSQL database.
+        将具有给定缓存键的缓存项目保存到PostgreSQL数据库。
+
+        This method implements the abstract _save method from the base class.
+        It retrieves the cached items and SQL statement for the given cache key,
+        then executes a batch insert operation on each configured database connection.
+        此方法实现了基类中的抽象_save方法。
+        它检索给定缓存键的缓存项目和SQL语句,然后在每个配置的数据库连接上执行批量插入操作。
+
+        Args:
+            cache_key: The cache key used to retrieve the cached items, SQL statement,
+                       and other metadata needed for the database operation.
+                       用于检索缓存项目、SQL语句和数据库操作所需的其他元数据的缓存键。
+        """
+        # Get the table name from the cache
+        # 从缓存获取表名
         table_name = self.table_cache[cache_key]
         try:
+            # Process each database alias (connection) configured for this cache key
+            # 处理为此缓存键配置的每个数据库别名(连接)
             for alias in self.db_alias_cache[cache_key]:
+                # Get a database connection with context manager to ensure proper cleanup
+                # 使用上下文管理器获取数据库连接,以确保正确清理
                 async with db_manager.pg.get(alias) as conn:
                     try:
+                        # Execute the batch insert operation
+                        # 执行批量插入操作
                         num = await conn.executemany(
                             self.insert_sql_cache[cache_key], self.item_cache[cache_key]
                         )
+                        # Log the result of the operation
+                        # 记录操作结果
                         logger.info(f'table:{alias}->{table_name} sum:{len(self.item_cache[cache_key])} ok:{num}')
                     except Exception as e:
+                        # Log any errors that occur during the operation
+                        # 记录操作期间发生的任何错误
                         logger.exception(f'save data error, table:{alias}->{table_name}, err_msg:{e}')
         finally:
+            # Clear the cache after processing, regardless of success or failure
+            # 处理后清除缓存,无论成功或失败
             self.item_cache[cache_key] = []

aioscrapy/libs/spider/depth.py
@@ -1,54 +1,192 @@
 """
 Depth Spider Middleware
+深度爬虫中间件
 
-See documentation in docs/topics/spider-middleware.rst
+This middleware tracks the depth of requests and can be used to limit the maximum
+depth of crawls. It also adjusts request priorities based on depth and collects
+depth statistics.
+此中间件跟踪请求的深度,可用于限制爬取的最大深度。它还根据深度调整请求优先级
+并收集深度统计信息。
 """
 
 from aioscrapy.http import Request
-
 from aioscrapy.utils.log import logger
 
 
 class DepthMiddleware:
+    """
+    Spider middleware to track the depth of requests.
+    用于跟踪请求深度的爬虫中间件。
+
+    This middleware tracks how many nested links the crawler has followed from the
+    initial request (depth). It can be used to limit the maximum depth of crawls,
+    adjust request priorities based on depth, and collect depth statistics.
+    此中间件跟踪爬虫从初始请求开始已经跟随了多少层嵌套链接(深度)。它可用于限制
+    爬取的最大深度,根据深度调整请求优先级,并收集深度统计信息。
+    """
 
     def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
+        """
+        Initialize the depth middleware.
+        初始化深度中间件。
+
+        Args:
+            maxdepth: Maximum allowed depth. If None or 0, no limit is imposed.
+                      允许的最大深度。如果为None或0,则不施加限制。
+            stats: Stats collector instance.
+                   统计收集器实例。
+            verbose_stats: Whether to collect detailed stats for each depth level.
+                           是否收集每个深度级别的详细统计信息。
+                           Defaults to False.
+                           默认为False。
+            prio: Priority adjustment per depth level.
+                  每个深度级别的优先级调整。
+                  Defaults to 1.
+                  默认为1。
+        """
+        # Maximum allowed depth
+        # 允许的最大深度
         self.maxdepth = maxdepth
+
+        # Stats collector instance
+        # 统计收集器实例
         self.stats = stats
+
+        # Whether to collect detailed stats for each depth level
+        # 是否收集每个深度级别的详细统计信息
         self.verbose_stats = verbose_stats
+
+        # Priority adjustment per depth level
+        # 每个深度级别的优先级调整
         self.prio = prio
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a DepthMiddleware instance from a crawler.
+        从爬虫创建DepthMiddleware实例。
+
+        This is the factory method used by AioScrapy to create the middleware.
+        这是AioScrapy用于创建中间件的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this middleware.
+                     将使用此中间件的爬虫。
+
+        Returns:
+            DepthMiddleware: A new DepthMiddleware instance.
+                             一个新的DepthMiddleware实例。
+        """
+        # Get settings from crawler
+        # 从爬虫获取设置
         settings = crawler.settings
+
+        # Get maximum depth from settings
+        # 从设置获取最大深度
         maxdepth = settings.getint('DEPTH_LIMIT')
+
+        # Get verbose stats setting
+        # 获取详细统计设置
         verbose = settings.getbool('DEPTH_STATS_VERBOSE')
+
+        # Get priority adjustment setting
+        # 获取优先级调整设置
         prio = settings.getint('DEPTH_PRIORITY')
+
+        # Create and return a new instance
+        # 创建并返回一个新实例
         return cls(maxdepth, crawler.stats, verbose, prio)
 
     async def process_spider_output(self, response, result, spider):
+        """
+        Process the spider output to track request depth.
+        处理爬虫输出以跟踪请求深度。
+
+        This method processes each request yielded by the spider, tracks its depth,
+        adjusts its priority, and filters out requests that exceed the maximum depth.
+        此方法处理爬虫产生的每个请求,跟踪其深度,调整其优先级,并过滤掉超过最大深度的请求。
+
+        Args:
+            response: The response being processed.
+                      正在处理的响应。
+            result: The result returned by the spider.
+                    爬虫返回的结果。
+            spider: The spider that generated the result.
+                    生成结果的爬虫。
+
+        Returns:
+            An async generator yielding filtered requests.
+            一个产生过滤后请求的异步生成器。
+        """
         def _filter(request):
+            """
+            Filter function to process and possibly filter out requests based on depth.
+            基于深度处理并可能过滤掉请求的过滤函数。
+
+            Args:
+                request: The request to process.
+                         要处理的请求。
+
+            Returns:
+                bool: True if the request should be kept, False if it should be filtered out.
+                      如果应保留请求,则为True;如果应过滤掉请求,则为False。
+            """
+            # Only process Request objects
+            # 只处理Request对象
             if isinstance(request, Request):
+                # Calculate depth of this request (parent depth + 1)
+                # 计算此请求的深度(父深度 + 1)
                 depth = response.meta['depth'] + 1
+
+                # Store depth in request metadata
+                # 将深度存储在请求元数据中
                 request.meta['depth'] = depth
+
+                # Adjust priority based on depth if enabled
+                # 如果启用,则根据深度调整优先级
                 if self.prio:
                     request.priority -= depth * self.prio
+
+                # Check if request exceeds maximum depth
+                # 检查请求是否超过最大深度
                 if self.maxdepth and depth > self.maxdepth:
+                    # Log ignored request
+                    # 记录被忽略的请求
                     logger.debug("Ignoring link (depth > %(maxdepth)d): %(requrl)s " % {
                         'maxdepth': self.maxdepth, 'requrl': request.url
                     })
+                    # Filter out this request
+                    # 过滤掉此请求
                     return False
                 else:
+                    # Update depth statistics
+                    # 更新深度统计信息
                     if self.verbose_stats:
+                        # Increment count for this depth level
+                        # 增加此深度级别的计数
                         self.stats.inc_value(f'request_depth_count/{depth}',
                                              spider=spider)
+
+                    # Update maximum depth reached
+                    # 更新达到的最大深度
                     self.stats.max_value('request_depth_max', depth,
                                          spider=spider)
+            # Keep all non-Request objects and requests that didn't exceed max depth
+            # 保留所有非Request对象和未超过最大深度的请求
             return True
 
-        # base case (depth=0)
+        # Handle the base case (initial response with no depth)
+        # 处理基本情况(没有深度的初始响应)
         if 'depth' not in response.meta:
+            # Set depth to 0 for the initial response
+            # 为初始响应设置深度为0
            response.meta['depth'] = 0
+
+            # Update depth statistics for depth 0
+            # 更新深度0的深度统计信息
             if self.verbose_stats:
                 self.stats.inc_value('request_depth_count/0', spider=spider)
 
+        # Filter the results using the _filter function
+        # 使用_filter函数过滤结果
         return (r async for r in result or () if _filter(r))
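
The DepthMiddleware diff documents the three settings read in from_crawler (DEPTH_LIMIT, DEPTH_STATS_VERBOSE, DEPTH_PRIORITY) and the priority formula request.priority -= depth * DEPTH_PRIORITY. A configuration sketch (the values are examples, not the package defaults, and would go in the project settings or a spider's custom_settings):

    # Example values only; names come from the diff above.
    custom_settings = {
        "DEPTH_LIMIT": 3,             # requests more than 3 links deep are dropped and logged
        "DEPTH_PRIORITY": 1,          # each extra level subtracts 1 from request.priority
        "DEPTH_STATS_VERBOSE": True,  # also record request_depth_count/<depth> per level
    }

    # With DEPTH_PRIORITY = 1, a request discovered at depth 2 with the default
    # priority of 0 is rescheduled at priority -2, so shallower pages tend to be
    # crawled first (a breadth-first bias).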