aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0

aioscrapy/libs/pipelines/csv.py +242 -0
@@ -1,3 +1,14 @@
+"""
+CSV Pipeline for AioScrapy
+AioScrapy的CSV管道
+
+This module provides a pipeline for storing scraped items in CSV files.
+It includes a mixin class for CSV file handling and a pipeline class that
+uses the mixin to process items and save them to CSV files.
+此模块提供了一个用于将抓取的项目存储在CSV文件中的管道。
+它包括一个用于CSV文件处理的混入类和一个使用该混入类处理项目并将其保存到CSV文件的管道类。
+"""
+
 import asyncio
 import csv
 from typing import Optional
@@ -6,6 +17,19 @@ from aioscrapy.utils.log import logger
 
 
 class CsvSinkMixin:
+    """
+    Mixin class for CSV file handling.
+    CSV文件处理的混入类。
+
+    This mixin provides methods for saving items to CSV files, managing CSV writers,
+    and closing CSV files. It can be used by any class that needs to write data to
+    CSV files.
+    此混入类提供了将项目保存到CSV文件、管理CSV写入器和关闭CSV文件的方法。
+    它可以被任何需要将数据写入CSV文件的类使用。
+    """
+
+    # Dictionary to store CSV writers and file handles, keyed by filename
+    # 用于存储CSV写入器和文件句柄的字典,以文件名为键
     csv_writer = {}
 
     async def save_item(
@@ -14,64 +38,278 @@ class CsvSinkMixin:
         *,
         filename: Optional[str] = None,
     ):
+        """
+        Save an item to a CSV file.
+        将项目保存到CSV文件。
+
+        This method writes a dictionary item as a row in a CSV file. The first row
+        of the CSV file will contain the keys of the first item saved to the file.
+        此方法将字典项目作为CSV文件中的一行写入。CSV文件的第一行将包含保存到
+        文件的第一个项目的键。
+
+        Args:
+            item: The dictionary item to save.
+                要保存的字典项目。
+            filename: The name of the CSV file to save to.
+                要保存到的CSV文件的名称。
+                If not provided, an assertion error will be raised.
+                如果未提供,将引发断言错误。
+
+        Raises:
+            AssertionError: If filename is None.
+                如果filename为None。
+            Exception: If there is an error writing to the CSV file.
+                如果写入CSV文件时出错。
+        """
+        # Ensure filename is provided
+        # 确保提供了文件名
         assert filename is not None, "请传入filename参数"
+
+        # Add .csv extension if not present
+        # 如果不存在,则添加.csv扩展名
         if '.csv' not in filename:
             filename = filename + '.csv'
+
         try:
+            # Get or create a CSV writer for this file
+            # 获取或创建此文件的CSV写入器
             writer = self._get_writer(filename, item)
+
+            # Write the item values as a row
+            # 将项目值作为一行写入
             writer.writerow(item.values())
         except Exception as e:
+            # Log any errors that occur
+            # 记录发生的任何错误
             logger.exception(f'Save csv Error, filename:{filename}, item:{item}, errMsg: {e}')
 
     def _get_writer(self, filename, item):
+        """
+        Get or create a CSV writer for a file.
+        获取或创建文件的CSV写入器。
+
+        This method returns an existing CSV writer for the given filename if one
+        exists, or creates a new one if not. When creating a new writer, it also
+        writes the header row using the keys of the provided item.
+        如果存在,此方法返回给定文件名的现有CSV写入器,如果不存在,则创建一个新的。
+        创建新写入器时,它还使用提供的项目的键写入标题行。
+
+        Args:
+            filename: The name of the CSV file.
+                CSV文件的名称。
+            item: The dictionary item whose keys will be used as headers.
+                其键将用作标题的字典项目。
+
+        Returns:
+            csv.writer: A CSV writer object for the file.
+                文件的CSV写入器对象。
+        """
+        # Try to get an existing writer
+        # 尝试获取现有的写入器
         writer, *_ = self.csv_writer.get(filename, (None, None))
+
+        # If no writer exists, create a new one
+        # 如果不存在写入器,则创建一个新的
         if writer is None:
+            # Open the file for writing
+            # 打开文件进行写入
             file = open(filename, 'w', encoding="UTF8", newline='')
+
+            # Create a CSV writer
+            # 创建CSV写入器
             writer = csv.writer(file)
+
+            # Write the header row using the item keys
+            # 使用项目键写入标题行
             writer.writerow(item.keys())
+
+            # Store the writer and file handle
+            # 存储写入器和文件句柄
             self.csv_writer[filename] = (writer, file)
+
         return writer
 
     def close_csv(self, filename=None):
+        """
+        Close a specific CSV file.
+        关闭特定的CSV文件。
+
+        This method closes the file handle for a specific CSV file and removes
+        its writer from the csv_writer dictionary.
+        此方法关闭特定CSV文件的文件句柄,并从csv_writer字典中删除其写入器。
+
+        Args:
+            filename: The name of the CSV file to close.
+                要关闭的CSV文件的名称。
+                If None, nothing happens.
+                如果为None,则不会发生任何事情。
+        """
+        # Remove the writer and file handle from the dictionary
+        # 从字典中删除写入器和文件句柄
         *_, file = self.csv_writer.pop(filename, (None, None))
+
+        # If a file handle was found, close it
+        # 如果找到文件句柄,则关闭它
         if file is not None:
             logger.info(f'Closing csv: {filename}')
             file.close()
 
     def close(self):
+        """
+        Close all open CSV files.
+        关闭所有打开的CSV文件。
+
+        This method closes all file handles for all CSV files that have been
+        opened by this instance.
+        此方法关闭此实例打开的所有CSV文件的所有文件句柄。
+        """
+        # Make a copy of the keys to avoid modifying the dictionary during iteration
+        # 复制键以避免在迭代期间修改字典
         for filename in list(self.csv_writer.keys()):
             self.close_csv(filename)
 
 
 class CsvPipeline(CsvSinkMixin):
+    """
+    Pipeline for storing scraped items in CSV files.
+    用于将抓取的项目存储在CSV文件中的管道。
+
+    This pipeline uses the CsvSinkMixin to save items to CSV files. It processes
+    items that have a '__csv__' key, which contains parameters for the CSV file
+    such as the filename.
+    此管道使用CsvSinkMixin将项目保存到CSV文件中。它处理具有'__csv__'键的项目,
+    该键包含CSV文件的参数,如文件名。
+    """
+
     def __init__(self, settings):
+        """
+        Initialize the CSV pipeline.
+        初始化CSV管道。
+
+        Args:
+            settings: The AioScrapy settings object.
+                AioScrapy设置对象。
+                Not used in the current implementation, but included for
+                compatibility with the pipeline interface.
+                在当前实现中未使用,但为了与管道接口兼容而包含。
+        """
+        # Create a lock to ensure thread-safe access to CSV files
+        # 创建锁以确保对CSV文件的线程安全访问
         self.lock = asyncio.Lock()
 
     @classmethod
     def from_settings(cls, settings):
+        """
+        Create a CsvPipeline instance from settings.
+        从设置创建CsvPipeline实例。
+
+        This is the factory method used by AioScrapy to create pipeline instances.
+        这是AioScrapy用于创建管道实例的工厂方法。
+
+        Args:
+            settings: The AioScrapy settings object.
+                AioScrapy设置对象。
+
+        Returns:
+            CsvPipeline: A new CsvPipeline instance.
+                一个新的CsvPipeline实例。
+        """
         return cls(settings)
 
     async def process_item(self, item, spider):
+        """
+        Process an item and save it to a CSV file if it has a '__csv__' key.
+        处理项目,如果它有'__csv__'键,则将其保存到CSV文件。
+
+        This method checks if the item has a '__csv__' key. If it does, it uses
+        the parameters in that key to save the item to a CSV file. If not, it
+        logs a warning and returns the item unchanged.
+        此方法检查项目是否具有'__csv__'键。如果有,它使用该键中的参数将项目
+        保存到CSV文件。如果没有,它会记录警告并返回未更改的项目。
+
+        Args:
+            item: The item to process.
+                要处理的项目。
+            spider: The spider that generated the item.
+                生成项目的爬虫。
+
+        Returns:
+            dict: The processed item.
+                处理后的项目。
+        """
+        # Extract CSV parameters from the item
+        # 从项目中提取CSV参数
         execl_kw: Optional[dict] = item.pop('__csv__', None)
+
+        # If no CSV parameters, log a warning and return the item
+        # 如果没有CSV参数,记录警告并返回项目
         if not execl_kw:
             logger.warning(f"item Missing key __csv__, not stored")
             return item
 
+        # Use the spider name as the default filename
+        # 使用爬虫名称作为默认文件名
         execl_kw.setdefault('filename', spider.name)
+
+        # Use a lock to ensure thread-safe access to CSV files
+        # 使用锁确保对CSV文件的线程安全访问
         async with self.lock:
+            # Save the item to a CSV file
+            # 将项目保存到CSV文件
             await self.save_item(item, **execl_kw)
 
+        return item
+
     async def close_spider(self, spider):
+        """
+        Close all open CSV files when the spider is closed.
+        当爬虫关闭时关闭所有打开的CSV文件。
+
+        This method is called by AioScrapy when a spider is closed. It ensures
+        that all CSV files opened by this pipeline are properly closed.
+        当爬虫关闭时,AioScrapy调用此方法。它确保此管道打开的所有CSV文件
+        都正确关闭。
+
+        Args:
+            spider: The spider that was closed.
+                被关闭的爬虫。
+        """
+        # Close all open CSV files
+        # 关闭所有打开的CSV文件
         self.close()
 
 
+# Test code for the CSV pipeline
+# CSV管道的测试代码
 if __name__ == '__main__':
+    """
+    Test code to demonstrate the usage of the CsvPipeline.
+    演示CsvPipeline用法的测试代码。
+
+    This code creates a simple test spider and pipeline, processes a test item,
+    and then closes the pipeline.
+    此代码创建一个简单的测试爬虫和管道,处理一个测试项目,然后关闭管道。
+    """
+
     class TestSpider:
+        """
+        Simple test spider class with a name attribute.
+        具有name属性的简单测试爬虫类。
+        """
         name = 'TestSpider'
 
 
     async def test():
+        """
+        Async test function to demonstrate the CsvPipeline.
+        演示CsvPipeline的异步测试函数。
+        """
+        # Create a new CSV pipeline
+        # 创建一个新的CSV管道
         p = CsvPipeline({})
+
+        # Process a test item with CSV parameters
+        # 处理带有CSV参数的测试项目
         await p.process_item({
             'title': '测试',
             'img': '//www.baidu.com/img/flexible/logo/pc/result.png',
@@ -80,7 +318,11 @@ if __name__ == '__main__':
             }
         }, TestSpider())
 
+        # Close the pipeline
+        # 关闭管道
         await p.close_spider(None)
 
 
+    # Run the test function
+    # 运行测试函数
     asyncio.run(test())
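
The new CsvPipeline shown above is driven entirely by a reserved '__csv__' key on each item: the key is popped before writing, 'filename' defaults to the spider's name, and a '.csv' extension is appended when missing. Below is a minimal sketch of that flow, based only on the code in this diff; the import path follows the wheel layout listed above (aioscrapy/libs/pipelines/csv.py), and the item fields and filename are illustrative, not part of the package.

import asyncio

# Import path follows the wheel layout above (aioscrapy/libs/pipelines/csv.py).
from aioscrapy.libs.pipelines.csv import CsvPipeline


class DemoSpider:
    # The pipeline only reads spider.name, which becomes the default filename.
    name = 'demo'


async def main():
    pipeline = CsvPipeline.from_settings({})

    # '__csv__' carries the sink parameters and is popped before writing;
    # without it the item is skipped with a warning. The remaining keys
    # ('title', 'url') become the CSV header on the first write to this file.
    await pipeline.process_item(
        {
            'title': 'example',
            'url': 'https://example.com/',
            '__csv__': {'filename': 'demo_output'},  # saved as demo_output.csv
        },
        DemoSpider(),
    )

    # Closes every CSV file opened by this pipeline.
    await pipeline.close_spider(None)


asyncio.run(main())

In a real crawl the pipeline would instead be enabled through the project's pipeline settings (aio-scrapy follows Scrapy's conventions, so presumably via the ITEM_PIPELINES setting), after which each yielded item only needs to carry the '__csv__' key.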