aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
@@ -1,169 +0,0 @@
1
- import asyncio
2
- import math
3
- from io import BytesIO
4
- from typing import Tuple, Optional
5
-
6
- import requests
7
- import xlsxwriter
8
- from PIL import Image, ImageFile
9
-
10
- from aioscrapy.utils.log import logger
11
-
12
# Pick the resampling filter used when images are resized. A missing module
# attribute raises AttributeError, so only that is caught; a bare ``except``
# would also swallow KeyboardInterrupt and SystemExit.
try:
    resample = Image.LANCZOS
except AttributeError:
    # Fall back to the legacy name when LANCZOS is not available.
    resample = Image.ANTIALIAS
# Ask Pillow to load image files whose data is truncated instead of failing.
ImageFile.LOAD_TRUNCATED_IMAGES = True
17
-
18
-
19
class ExeclSinkMixin:
    """Mixin that writes dict items as rows of ``.xlsx`` workbooks via xlsxwriter.

    Every cache below is a class attribute, so all instances (and subclasses
    that do not override them) share the same open workbooks and row counters.
    """

    # filename + sheet name -> worksheet object
    ws_cache = {}
    # filename -> open xlsxwriter workbook
    wb_cache = {}
    # filename + sheet name -> column order, taken from the first item's keys
    fields_cache = {}
    # filename + sheet name -> index of the row handed out most recently
    y_cache = {}
    # filename -> (workbook, {properties key: Format}) so rows reuse formats
    format_cache = {}

    @staticmethod
    async def deal_img(url: str, img_size: Optional[Tuple[int, int]]) -> Optional[BytesIO]:
        """Download an image and optionally rescale it to fit ``img_size``.

        Args:
            url: Image address. A protocol-relative ``//host/path`` URL is
                fetched over https.
            img_size: ``(width, height)`` bounding box in pixels; the image is
                scaled, keeping its aspect ratio, so that it fits the box.
                A falsy value keeps the downloaded dimensions.

        Returns:
            A ``BytesIO`` holding the encoded image, or ``None`` when ``url``
            is empty / not a string or the image cannot be downloaded or
            processed. Returning ``None`` lets the caller write the URL text
            instead of aborting the whole row.
        """
        if not url or not isinstance(url, str):
            return None
        if url.startswith('//'):
            url = 'https:' + url
        try:
            # requests is blocking; run it in the default executor so the
            # event loop can keep serving other tasks during the download.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(None, requests.get, url)
            img_bytes = response.content
        except Exception as e:
            logger.error(f"download img error: {e}")
            return None

        try:
            im = Image.open(BytesIO(img_bytes))
            im_format = im.format
            if img_size:
                # The larger of the two ratios is the factor that makes both
                # dimensions fit inside the requested box.
                temp = max(im.size[0] / img_size[0], im.size[1] / img_size[1])
                target_size = (math.ceil(im.size[0] / temp), math.ceil(im.size[1] / temp))
                # JPEG cannot store palette ("P") images, so keep those in
                # RGB; every other format keeps the palette conversion.
                mode = 'RGB' if im_format == 'JPEG' else 'P'
                im = im.resize(target_size, resample).convert(mode)
            result = BytesIO()
            im.save(result, format=im_format)
        except Exception as e:
            logger.error(f"process img error: {e}")
            return None
        return result

    def _get_format(self, filename, wb, properties):
        """Return a Format of ``wb`` for ``properties``, creating it only once."""
        owner, formats = self.format_cache.get(filename, (None, None))
        if owner is not wb:
            # First use, or the workbook behind this filename was replaced:
            # formats belonging to another workbook must not be reused.
            formats = {}
            self.format_cache[filename] = (wb, formats)
        key = tuple(sorted(properties.items()))
        if key not in formats:
            formats[key] = wb.add_format(properties)
        return formats[key]

    async def save_item(
            self,
            item: dict,
            *,
            filename: Optional[str] = None,
            date_fields: Optional[list] = None,
            date_format: str = 'yyyy-mm-dd HH:MM:SS',
            img_fields: Optional[list] = None,
            img_size: Optional[Tuple[int, int]] = None,
            **options
    ):
        """Append ``item`` as one row of a worksheet.

        Args:
            item: Row data; values are looked up by the field names cached
                from the first item written to the sheet.
            filename: Workbook path. ``.xlsx`` is appended when the name does
                not contain it.
            date_fields: Fields written with ``write_datetime``.
            date_format: Number format applied to ``date_fields`` cells.
            img_fields: Fields whose value is an image URL to embed.
            img_size: ``(width, height)`` in pixels used to scale images and
                to size the columns and the row.
            **options: Forwarded to ``_get_write_class``; ``sheet`` selects
                the worksheet and the rest is handed to ``xlsxwriter.Workbook``.

        Raises:
            AssertionError: If ``filename`` is ``None``.

        Any other error raised while writing is logged and swallowed.
        """
        # Raised explicitly (not via ``assert``) so the check survives ``-O``.
        if filename is None:
            raise AssertionError("请传入filename参数")
        if '.xlsx' not in filename:
            filename = filename + '.xlsx'
        try:
            wb, ws, fields, y = self._get_write_class(filename, item, **options)
            # Alternating column styles, fetched from the per-workbook cache
            # instead of allocating two new Format objects for every row.
            bold_format_1 = self._get_format(
                filename, wb, {'align': 'left', 'border': 1, 'valign': 'vcenter'})
            bold_format_2 = self._get_format(
                filename, wb, {'align': 'left', 'border': 1, 'valign': 'vcenter', 'fg_color': '#D0D3D4'})
            for x, field in enumerate(fields):
                bold_format = bold_format_1 if x % 2 == 0 else bold_format_2
                if date_fields is not None and field in date_fields:
                    date_cell_format = self._get_format(filename, wb, {'num_format': date_format})
                    ws.write_datetime(y, x, item.get(field), date_cell_format)

                elif img_fields is not None and field in img_fields:
                    if img_size:
                        ws.set_column_pixels(x, x, width=math.ceil(img_size[0]))
                    url = item.get(field)
                    img_bytes = await self.deal_img(url, img_size)
                    # Fall back to the raw URL when the image is unavailable
                    # or xlsxwriter rejects it (return value -1).
                    if img_bytes is None or ws.insert_image(y, x, '', {'image_data': img_bytes}) == -1:
                        ws.write(y, x, url, bold_format)
                else:
                    ws.write(y, x, item.get(field), bold_format)
            if img_size:
                if fields:
                    # The last column is inclusive, hence ``len(fields) - 1``.
                    ws.set_column_pixels(0, len(fields) - 1, width=math.ceil(img_size[0]))
                ws.set_row_pixels(y, height=math.ceil(img_size[1]))
        except Exception as e:
            logger.exception(f'Save Execl Error, filename:{filename}, item:{item}, errMsg: {e}')

    def _get_write_class(self, filename, item, sheet='sheet1', **options):
        """Return ``(workbook, worksheet, fields, row)`` for the next row.

        On first use of a ``filename``/``sheet`` pair the workbook (if needed)
        and the worksheet are created and a header row is written from the
        keys of ``item``. Each call advances the sheet's row counter by one.
        """
        filename_sheet = filename + sheet
        if self.ws_cache.get(filename_sheet) is None:
            wb = self.wb_cache.get(filename)
            if wb is None:
                logger.info(f'Create Execl: {filename}')
                wb = xlsxwriter.Workbook(filename, options=options)
                self.wb_cache[filename] = wb
            ws = wb.add_worksheet(sheet)
            bold_format = wb.add_format(
                {'bold': True, 'font_size': 12, 'border': 1, 'align': 'center', 'valign': 'vcenter'})
            fields = list(item.keys())
            ws.write_row('A1', fields, cell_format=bold_format)
            ws.set_row(0, height=30)
            self.fields_cache[filename_sheet] = fields
            self.ws_cache[filename_sheet] = ws
            # Row 0 holds the header; the increment below yields row 1 first.
            self.y_cache[filename_sheet] = 0
        self.y_cache[filename_sheet] += 1
        return (
            self.wb_cache[filename],
            self.ws_cache[filename_sheet],
            self.fields_cache[filename_sheet],
            self.y_cache[filename_sheet],
        )

    def close_execl(self, filename=None):
        """Close the workbook cached under ``filename`` and drop its caches.

        Does nothing when no workbook is cached for ``filename``.
        """
        if filename not in self.wb_cache:
            return

        logger.info(f'Closing Execl: {filename}')
        if wb := self.wb_cache.pop(filename):
            wb.close()
        self.format_cache.pop(filename, None)
        # NOTE(review): sheet keys are ``filename + sheet``, so this prefix
        # match would also hit another workbook whose name starts with
        # ``filename``.
        for filename_sheet in list(self.ws_cache.keys()):
            if not filename_sheet.startswith(filename):
                continue
            self.ws_cache.pop(filename_sheet, None)
            self.y_cache.pop(filename_sheet, None)
            self.fields_cache.pop(filename_sheet, None)

    def close(self):
        """Close every cached workbook."""
        # Snapshot the keys: close_execl removes entries while we iterate.
        for filename in list(self.wb_cache.keys()):
            self.close_execl(filename)
125
-
126
-
127
class ExeclPipeline(ExeclSinkMixin):
    """Item pipeline that stores items carrying an ``__execl__`` key in xlsx files."""

    def __init__(self, settings):
        """Create the pipeline; ``settings`` is accepted but not read here."""
        # Serialises save_item calls made through this pipeline instance.
        self.lock = asyncio.Lock()

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from ``settings``."""
        return cls(settings)

    async def process_item(self, item, spider):
        """Write ``item`` to a workbook and hand it back to the caller.

        The ``__execl__`` entry is popped from ``item`` and used as keyword
        arguments for ``save_item``; ``filename`` defaults to ``spider.name``.
        Items without that entry are not stored.

        Returns:
            ``item`` in every case, so the caller still has it after the
            save. Previously ``None`` was returned once the item was saved.
        """
        execl_kw: Optional[dict] = item.pop('__execl__', None)
        if not execl_kw:
            logger.warning("item Missing key __execl__, not stored")
            return item

        execl_kw.setdefault('filename', spider.name)
        async with self.lock:
            await self.save_item(item, **execl_kw)
        return item

    async def close_spider(self, spider):
        """Close every cached workbook; ``spider`` is not used."""
        self.close()
147
-
148
-
149
if __name__ == '__main__':
    # Manual smoke test: push one sample item through the pipeline.
    class TestSpider:
        """Stand-in object exposing only the ``name`` attribute."""
        name = 'TestSpider'


    async def test():
        """Process a single sample item, then close the pipeline."""
        execl_options = {
            'sheet': 'sheet1',
            # Further options that can be enabled for a manual run:
            # 'filename': 'test',
            # 'img_fields': ['img'],
            # 'img_size': (100, 500)
        }
        sample_item = {
            'title': 'tttt',
            'img': '//www.baidu.com/img/flexible/logo/pc/result.png',
            '__execl__': execl_options,
        }
        pipeline = ExeclPipeline({})
        await pipeline.process_item(sample_item, TestSpider())
        await pipeline.close_spider(None)


    asyncio.run(test())