aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
|
@@ -1,169 +0,0 @@
|
|
|
1
|
-
import asyncio
import math
from io import BytesIO
from typing import Tuple, Optional

import requests
import xlsxwriter
from PIL import Image, ImageFile

from aioscrapy.utils.log import logger

# Pick the best available downscaling filter.  Modern Pillow exposes
# ``Image.LANCZOS``; very old releases only had ``Image.ANTIALIAS``
# (removed in Pillow 10).  Catch the narrow AttributeError instead of a
# bare ``except`` so unrelated errors are not silently swallowed.
try:
    resample = Image.LANCZOS
except AttributeError:
    resample = Image.ANTIALIAS

# Let PIL open partially-downloaded/truncated images instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class ExeclSinkMixin:
    """Mixin that appends item dicts as rows of .xlsx workbooks (xlsxwriter).

    Workbooks are cached per filename and worksheets per filename+sheet, so
    successive calls to :meth:`save_item` append to the next free row of the
    same sheet.  The first item seen for a sheet defines its column order
    (header row).  NOTE(review): the caches are *class-level* attributes and
    therefore shared across all instances/subclasses — presumably intentional
    for a process-wide sink, but verify before instantiating more than one.
    """

    # filename+sheet -> worksheet / header fields / next row index
    ws_cache = {}
    fields_cache = {}
    y_cache = {}
    # filename -> open xlsxwriter.Workbook
    wb_cache = {}

    @staticmethod
    async def deal_img(url: str, img_size: Optional[Tuple[int, int]]) -> Optional[BytesIO]:
        """Download an image and optionally shrink it to fit ``img_size``.

        Args:
            url: Image URL; a protocol-relative ``//host/path`` URL is
                fetched over https.
            img_size: Optional (width, height) bounding box in pixels; the
                image is scaled down proportionally to fit inside it.

        Returns:
            BytesIO with the (possibly resized) image in its original
            format, or None when the download fails.
        """
        if url.startswith('//'):
            url = 'https:' + url
        try:
            # requests is synchronous: run it in the default executor so the
            # download does not block the event loop, and bound it with a
            # timeout so a dead server cannot hang the pipeline forever.
            resp = await asyncio.get_event_loop().run_in_executor(
                None, lambda: requests.get(url, timeout=30)
            )
            img_bytes = resp.content
        except Exception as e:
            logger.error(f"download img error: {e}")
            return None
        im = Image.open(BytesIO(img_bytes))
        im_format = im.format  # capture before resize: resize() drops .format
        if img_size:
            # Proportional downscale so the result fits inside img_size.
            temp = max(im.size[0] / img_size[0], im.size[1] / img_size[1])
            img_size = (math.ceil(im.size[0] / temp), math.ceil(im.size[1] / temp))
            im = im.resize(img_size, resample).convert('P')
        result = BytesIO()
        im.save(result, format=im_format)
        return result

    async def save_item(
            self,
            item: dict,
            *,
            filename: Optional[str] = None,
            date_fields: Optional[list] = None,
            date_format: str = 'yyyy-mm-dd HH:MM:SS',
            img_fields: Optional[list] = None,
            img_size: Optional[Tuple[int, int]] = None,
            **options
    ):
        """Write one item as the next row of ``filename`` (sheet via options).

        Args:
            item: Field -> value mapping; keys of the first item define the
                sheet's columns.
            filename: Target workbook path (required; '.xlsx' appended if
                missing).
            date_fields: Fields written via ``write_datetime`` using
                ``date_format``.
            date_format: Excel number format for date fields.
            img_fields: Fields whose value is an image URL to download and
                embed; falls back to writing the URL on failure.
            img_size: (width, height) box for embedded images; also sets
                column/row pixel sizes.
            **options: Extra kwargs forwarded to the Workbook constructor
                and sheet selection (``sheet=``).

        Errors are logged (with traceback) rather than raised so one bad
        item cannot kill the pipeline.
        """
        assert filename is not None, "请传入filename参数"
        if '.xlsx' not in filename:
            filename = filename + '.xlsx'
        try:
            wb, ws, fields, y = self._get_write_class(filename, item, **options)
            bold_format_1 = wb.add_format({'align': 'left', 'border': 1, 'valign': 'vcenter'})
            bold_format_2 = wb.add_format({'align': 'left', 'border': 1, 'valign': 'vcenter', 'fg_color': '#D0D3D4'})
            for x, field in enumerate(fields):
                # Alternate cell shading by column for readability.
                bold_format = bold_format_1 if x % 2 == 0 else bold_format_2
                if date_fields is not None and field in date_fields:
                    ws.write_datetime(y, x, item.get(field), wb.add_format({'num_format': date_format}))
                elif img_fields is not None and field in img_fields:
                    img_size and ws.set_column_pixels(x, x, width=math.ceil(img_size[0]))
                    url = item.get(field)
                    img_bytes = await self.deal_img(url, img_size)
                    # Fall back to the raw URL when the download failed or
                    # xlsxwriter rejected the image (insert_image -> -1).
                    if img_bytes is None or ws.insert_image(y, x, '', {'image_data': img_bytes}) == -1:
                        ws.write(y, x, url, bold_format)
                else:
                    ws.write(y, x, item.get(field), bold_format)
            if img_size is not None:
                # Size every column/this row to the image bounding box.
                ws.set_column_pixels(0, len(fields), width=math.ceil(img_size[0]))
                ws.set_row_pixels(y, height=math.ceil(img_size[1]))
        except Exception as e:
            logger.exception(f'Save Execl Error, filename:{filename}, item:{item}, errMsg: {e}')

    def _get_write_class(self, filename, item, sheet='sheet1', **options):
        """Return (workbook, worksheet, fields, row) for filename/sheet.

        Creates and caches the workbook/worksheet on first use, writes the
        header row (row 0) from ``item``'s keys, then advances and returns
        the next data row index (data starts at row 1).
        """
        filename_sheet = filename + sheet
        if self.ws_cache.get(filename_sheet) is None:
            if self.wb_cache.get(filename) is None:
                logger.info(f'Create Execl: {filename}')
                wb = xlsxwriter.Workbook(filename, options=options)
                self.wb_cache[filename] = wb
            else:
                wb = self.wb_cache[filename]
            ws = wb.add_worksheet(sheet)
            bold_format = wb.add_format(
                {'bold': True, 'font_size': 12, 'border': 1, 'align': 'center', 'valign': 'vcenter'})
            fields = list(item.keys())
            ws.write_row('A1', fields, cell_format=bold_format)
            ws.set_row(0, height=30)
            self.fields_cache[filename_sheet] = fields
            self.ws_cache[filename_sheet] = ws
            self.y_cache[filename_sheet] = 0
        self.y_cache[filename_sheet] += 1
        return self.wb_cache[filename], \
            self.ws_cache[filename_sheet], \
            self.fields_cache[filename_sheet], \
            self.y_cache[filename_sheet]

    def close_execl(self, filename=None):
        """Close one workbook and drop every cache entry belonging to it."""
        if filename not in self.wb_cache:
            return

        logger.info(f'Closing Execl: {filename}')
        if wb := self.wb_cache.pop(filename):
            wb.close()
        # Purge all per-sheet caches keyed by this filename.
        for filename_sheet in list(self.ws_cache.keys()):
            if not filename_sheet.startswith(filename):
                continue
            self.ws_cache.pop(filename_sheet, None)
            self.y_cache.pop(filename_sheet, None)
            self.fields_cache.pop(filename_sheet, None)

    def close(self):
        """Close every cached workbook (flushes the .xlsx files to disk)."""
        for filename in list(self.wb_cache.keys()):
            self.close_execl(filename)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
class ExeclPipeline(ExeclSinkMixin):
    """Item pipeline persisting items to Excel workbooks.

    Items opt in by carrying a ``__execl__`` dict whose entries become
    keyword arguments of :meth:`ExeclSinkMixin.save_item` (``filename``
    defaults to the spider's name).  Items without that key are passed
    through untouched with a warning.
    """

    def __init__(self, settings):
        # xlsxwriter is not coroutine-safe: serialize all writes.
        self.lock = asyncio.Lock()

    @classmethod
    def from_settings(cls, settings):
        """aioscrapy factory hook."""
        return cls(settings)

    async def process_item(self, item, spider):
        execl_kw: Optional[dict] = item.pop('__execl__', None)
        if not execl_kw:
            logger.warning("item Missing key __execl__, not stored")
            return item

        execl_kw.setdefault('filename', spider.name)
        async with self.lock:
            await self.save_item(item, **execl_kw)
        # Pass the item on so downstream pipelines still receive it
        # (the original implicitly returned None here).
        return item

    async def close_spider(self, spider):
        self.close()
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
if __name__ == '__main__':
    # Minimal manual smoke test: run one item through the pipeline.
    class TestSpider:
        name = 'TestSpider'


    async def test():
        pipeline = ExeclPipeline({})
        sample = {
            'title': 'tttt',
            'img': '//www.baidu.com/img/flexible/logo/pc/result.png',
            '__execl__': {
                'sheet': 'sheet1',
                # 'filename': 'test',
                # 'img_fields': ['img'],
                # 'img_size': (100, 500)
            },
        }
        await pipeline.process_item(sample, TestSpider())
        await pipeline.close_spider(None)


    asyncio.run(test())
|
|
File without changes
|
|
File without changes
|