aio-scrapy 2.1.3 → 2.1.6 (py3-none-any.whl)
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -40
- aio_scrapy-2.1.6.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +523 -18
- aioscrapy/core/downloader/handlers/__init__.py +188 -6
- aioscrapy/core/downloader/handlers/aiohttp.py +188 -4
- aioscrapy/core/downloader/handlers/curl_cffi.py +125 -4
- aioscrapy/core/downloader/handlers/httpx.py +134 -4
- aioscrapy/core/downloader/handlers/pyhttpx.py +133 -4
- aioscrapy/core/downloader/handlers/requests.py +121 -3
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +170 -14
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +193 -7
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +313 -13
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.3.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -110
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -53
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
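The deletions below account for the headline restructuring in 2.1.6: the Playwright download handler moved from `aioscrapy/core/downloader/handlers/playwright/` into the new `webdriver/` package (which also gains a DrissionPage handler), the `PlaywrightResponse` module appears superseded by `aioscrapy/http/response/web_driver.py`, and the misspelled `execl.py` pipeline gives way to `excel.py`. As a hypothetical sketch only: a project pinning the old handler path might update it along these lines, assuming aio-scrapy follows Scrapy's `DOWNLOAD_HANDLERS` convention and that the handler class name survived the move (neither is confirmed by this diff):

```python
# settings.py migration sketch -- module paths are taken from the file list
# above; the setting name and class name are assumptions, not confirmed here.
DOWNLOAD_HANDLERS = {
    # 2.1.3 location (deleted):
    # "https": "aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler",
    # 2.1.6 location (added):
    "https": "aioscrapy.core.downloader.handlers.webdriver.playwright.PlaywrightHandler",
}
```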
```diff
--- a/aioscrapy/core/downloader/handlers/playwright/__init__.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from functools import wraps
-
-from playwright._impl._api_types import Error
-from playwright.async_api._generated import Response as EventResponse
-
-from aioscrapy import Request, Spider
-from aioscrapy.core.downloader.handlers import BaseDownloadHandler
-from aioscrapy.core.downloader.handlers.playwright.driverpool import WebDriverPool
-from aioscrapy.core.downloader.handlers.playwright.webdriver import PlaywrightDriver
-from aioscrapy.exceptions import DownloadError
-from aioscrapy.http import PlaywrightResponse
-from aioscrapy.settings import Settings
-from aioscrapy.utils.tools import call_helper
-
-
-class PlaywrightHandler(BaseDownloadHandler):
-    def __init__(self, settings: Settings):
-        self.settings = settings
-        playwright_client_args = settings.getdict('PLAYWRIGHT_CLIENT_ARGS')
-        self.wait_until = playwright_client_args.get('wait_until', 'domcontentloaded')
-        self.url_regexes = playwright_client_args.pop('url_regexes', [])
-        pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
-        self._webdriver_pool = WebDriverPool(pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)
-
-    @classmethod
-    def from_settings(cls, settings: Settings):
-        return cls(settings)
-
-    async def download_request(self, request: Request, spider: Spider) -> PlaywrightResponse:
-        try:
-            return await self._download_request(request, spider)
-        except Error as e:
-            raise DownloadError(e) from e
-
-    async def _download_request(self, request: Request, spider) -> PlaywrightResponse:
-        cookies = dict(request.cookies)
-        timeout = request.meta.get('download_timeout', 30) * 1000
-        user_agent = request.headers.get("User-Agent")
-        proxy: str = request.meta.get("proxy")
-        url = request.url
-
-        cache_response = {}
-
-        # Capture the responses produced by the page's event listeners
-        def on_event_wrap_handler(func):
-            @wraps(func)
-            async def inner(response):
-                ret = await func(response)
-                if ret:
-                    cache_response[ret[0]] = ret[1]
-
-            return inner
-
-        kwargs = dict()
-        if proxy:
-            kwargs['proxy'] = proxy
-        if user_agent:
-            kwargs['user_agent'] = user_agent
-
-        driver: PlaywrightDriver = await self._webdriver_pool.get(**kwargs)
-
-        # Remove all existing event listeners, then re-register them
-        driver.page._events = dict()
-        for name in dir(spider):
-            if not name.startswith('on_event_'):
-                continue
-            driver.page.on(name.replace('on_event_', ''), on_event_wrap_handler(getattr(spider, name)))
-
-        try:
-            if cookies:
-                driver.url = url
-                await driver.set_cookies(cookies)
-            await driver.page.goto(url, wait_until=request.meta.get('wait_until', self.wait_until), timeout=timeout)
-
-            if process_action_fn := getattr(spider, 'process_action', None):
-                action_result = await call_helper(process_action_fn, driver)
-                if action_result:
-                    cache_response[action_result[0]] = action_result[1]
-
-            for cache_key in list(cache_response.keys()):
-                if isinstance(cache_response[cache_key], EventResponse):
-                    cache_ret = cache_response[cache_key]
-                    cache_response[cache_key] = PlaywrightResponse(
-                        url=cache_ret.url,
-                        request=request,
-                        intercept_request=dict(
-                            url=cache_ret.request.url,
-                            headers=cache_ret.request.headers,
-                            data=cache_ret.request.post_data,
-                        ),
-                        headers=cache_ret.headers,
-                        body=await cache_ret.body(),
-                        status=cache_ret.status,
-                    )
-
-            return PlaywrightResponse(
-                url=driver.page.url,
-                status=200,
-                text=await driver.page.content(),
-                cookies=await driver.get_cookies(),
-                cache_response=cache_response,
-                driver=driver,
-                driver_pool=self._webdriver_pool
-            )
-        except Exception as e:
-            await self._webdriver_pool.remove(driver)
-            raise e
-
-    async def close(self):
-        await self._webdriver_pool.close()
```
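The deleted handler wires spider hooks into the pooled Playwright page: every `on_event_<event>` coroutine on the spider is registered as a `page.on(<event>, ...)` listener, and an optional `process_action(driver)` runs after `page.goto()`; any `(key, value)` tuple either hook returns is stashed in `cache_response` and surfaces on the response. A minimal sketch of a spider driving that 2.1.3 API (the URL, selector, and cache keys are invented):

```python
from aioscrapy import Spider


class ExampleSpider(Spider):
    name = "example"
    start_urls = ["https://example.com"]

    # Registered by the handler as page.on("response", ...). Returning a
    # (key, value) tuple stores the value in the response's cache_response.
    async def on_event_response(self, response):
        if "api/items" in response.url:
            return "items_api", response

    # Invoked with the pooled PlaywrightDriver once page.goto() has finished.
    async def process_action(self, driver):
        await driver.page.click("text=Load more")
        return "html_after_click", await driver.page.content()

    async def parse(self, response):
        # Intercepted event responses arrive wrapped as PlaywrightResponse
        # objects, retrievable by the key the hook returned.
        api_response = response.get_response("items_api")
```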
```diff
--- a/aioscrapy/core/downloader/handlers/playwright/driverpool.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from asyncio import Lock
-from asyncio.queues import Queue
-
-from aioscrapy.utils.tools import singleton
-
-
-@singleton
-class WebDriverPool:
-    def __init__(
-            self, pool_size=5, driver_cls=None, **kwargs
-    ):
-        self.pool_size = pool_size
-        self.driver_cls = driver_cls
-        self.kwargs = kwargs
-
-        self.queue = Queue(maxsize=pool_size)
-        self.lock = Lock()
-        self.driver_count = 0
-
-    @property
-    def is_full(self):
-        return self.driver_count >= self.pool_size
-
-    async def create_driver(self, **args):
-        kwargs = self.kwargs.copy()
-        kwargs.update(args)
-        driver = self.driver_cls(**kwargs)
-        await driver.setup()
-        return driver
-
-    async def get(self, **kwargs):
-        async with self.lock:
-            if not self.is_full:
-                driver = await self.create_driver(**kwargs)
-                self.driver_count += 1
-            else:
-                driver = await self.queue.get()
-        return driver
-
-    async def release(self, driver):
-        await self.queue.put(driver)
-
-    async def remove(self, driver):
-        await driver.quit()
-        self.driver_count -= 1
-
-    async def close(self):
-        while not self.queue.empty():
-            driver = await self.queue.get()
-            await driver.quit()
-            self.driver_count -= 1
```
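`WebDriverPool` is a lazy, lock-guarded pool: `get()` constructs a new driver while `driver_count` is under `pool_size`, otherwise it waits for one to be `release()`d back onto the queue; `remove()` quits a broken driver (freeing a slot), and `close()` quits whatever is parked in the queue. Note the `@singleton` decorator: one process gets one pool, regardless of later constructor arguments. A minimal usage sketch, using the 2.1.3 import paths:

```python
import asyncio

from aioscrapy.core.downloader.handlers.playwright.driverpool import WebDriverPool
from aioscrapy.core.downloader.handlers.playwright.webdriver import PlaywrightDriver


async def main():
    pool = WebDriverPool(pool_size=2, driver_cls=PlaywrightDriver, driver_type="chromium")
    driver = await pool.get()  # new driver while under pool_size, else waits
    try:
        await driver.page.goto("https://example.com")
        await pool.release(driver)  # park the driver for reuse
    except Exception:
        await pool.remove(driver)   # quit a broken driver, freeing a slot
        raise
    finally:
        await pool.close()          # quit every driver still in the queue

asyncio.run(main())
```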
```diff
--- a/aioscrapy/core/downloader/handlers/playwright/webdriver.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import os
-from typing import Dict, Optional, Tuple
-
-try:
-    from typing import Literal # python >= 3.8
-except ImportError: # python <3.8
-    from typing_extensions import Literal
-
-from urllib.parse import urlparse, urlunparse
-
-from playwright.async_api import Page, BrowserContext, ViewportSize, ProxySettings
-from playwright.async_api import Playwright, Browser
-from playwright.async_api import async_playwright
-
-
-class PlaywrightDriver:
-    def __init__(
-            self,
-            *,
-            driver_type: Literal["chromium", "firefox", "webkit"] = "chromium",
-            proxy: Optional[str] = None,
-            browser_args: Optional[Dict] = None,
-            context_args: Optional[Dict] = None,
-            window_size: Optional[Tuple[int, int]] = None,
-            user_agent: str = None,
-            **kwargs
-    ):
-
-        self.driver_type = driver_type
-        self.proxy = proxy and self.format_context_proxy(proxy)
-        self.viewport = window_size and ViewportSize(width=window_size[0], height=window_size[1])
-        self.browser_args = browser_args or {}
-        self.context_args = context_args or {}
-        self.user_agent = user_agent
-
-        self.driver: Optional[Playwright] = None
-        self.browser: Optional[Browser] = None
-        self.context: Optional[BrowserContext] = None
-        self.page: Optional[Page] = None
-        self.url = None
-
-    async def setup(self):
-        browser_args = self.browser_args.copy()
-        context_args = self.context_args.copy()
-        if browser_args.get('args') is None:
-            browser_args.update({'args': ["--no-sandbox"]})
-
-        if context_args.get("storage_state") is not None:
-            storage_state_path = context_args.get("storage_state")
-            os.makedirs(os.path.dirname(storage_state_path), exist_ok=True)
-
-        if self.proxy:
-            browser_args.update({'proxy': self.proxy})
-            context_args.update({'proxy': self.proxy})
-        if self.viewport:
-            context_args.update({"viewport": self.viewport})
-            context_args.update({"screen": self.viewport})
-        if self.user_agent:
-            context_args.update({'user_agent': self.user_agent})
-
-        self.driver = await async_playwright().start()
-        self.browser: Browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
-        self.context = await self.browser.new_context(**context_args)
-        self.page = await self.context.new_page()
-
-    @staticmethod
-    def format_context_proxy(proxy) -> ProxySettings:
-        parsed_url = urlparse(proxy)
-        return ProxySettings(
-            server=urlunparse(parsed_url._replace(netloc=parsed_url.netloc.split('@')[-1])),
-            username=parsed_url.username,
-            password=parsed_url.password,
-        )
-
-    async def quit(self):
-        await self.page.close()
-        try:
-            await self.context.close()
-        except:
-            pass
-        finally:
-            await self.browser.close()
-            await self.driver.stop()
-
-    async def get_cookies(self):
-        return {
-            cookie["name"]: cookie["value"]
-            for cookie in await self.page.context.cookies()
-        }
-
-    async def set_cookies(self, cookies: dict):
-        await self.page.context.add_cookies([
-            {"name": key, "value": value, "url": self.url or self.page.url} for key, value in cookies.items()
-        ])
```
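Playwright's `ProxySettings` expects credentials separately from the server URL, which is why `format_context_proxy` strips the `user:pass@` portion out of the netloc and passes the parts individually. A standalone rerun of that logic (the proxy URL is invented):

```python
from urllib.parse import urlparse, urlunparse

proxy = "http://user:secret@127.0.0.1:8888"
parsed = urlparse(proxy)

# Same transformation as format_context_proxy above:
server = urlunparse(parsed._replace(netloc=parsed.netloc.split("@")[-1]))
print(server)           # http://127.0.0.1:8888  (credentials removed)
print(parsed.username)  # user
print(parsed.password)  # secret
```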
```diff
--- a/aioscrapy/http/response/playwright.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from typing import Optional, Any
-
-from aioscrapy.http.response.text import TextResponse
-
-
-class PlaywrightResponse(TextResponse):
-    def __init__(
-            self,
-            *args,
-            text: str = '',
-            cache_response: Optional[dict] = None,
-            driver: Optional["PlaywrightDriver"] = None,
-            driver_pool: Optional["WebDriverPool"] = None,
-            intercept_request: Optional[dict] = None,
-            **kwargs
-    ):
-        self.driver = driver
-        self.driver_pool = driver_pool
-        self._text = text
-        self.cache_response = cache_response or {}
-        self.intercept_request = intercept_request
-        super().__init__(*args, **kwargs)
-
-    async def release(self):
-        self.driver_pool and self.driver and await self.driver_pool.release(self.driver)
-
-    @property
-    def text(self):
-        return self._text or super().text
-
-    @text.setter
-    def text(self, text):
-        self._text = text
-
-    def get_response(self, key) -> Any:
-        return self.cache_response.get(key)
```
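One lifecycle detail worth flagging: on success the handler never releases the driver itself. It attaches `driver` and `driver_pool` to the response, so the consuming callback has to call `release()` or the pool eventually has nothing left to hand out. A sketch of a spider callback honouring that contract:

```python
# Sketch of a Spider callback consuming a 2.1.3 PlaywrightResponse.
async def parse(self, response):
    try:
        html = response.text  # prefers the driver-captured text, falls back
                              # to decoding the body like any TextResponse
    finally:
        await response.release()  # hand the driver back to the WebDriverPool
```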
```diff
--- a/aioscrapy/libs/pipelines/execl.py
+++ /dev/null
@@ -1,169 +0,0 @@
-import asyncio
-import math
-from io import BytesIO
-from typing import Tuple, Optional
-
-import requests
-import xlsxwriter
-from PIL import Image, ImageFile
-
-from aioscrapy.utils.log import logger
-
-try:
-    resample = Image.LANCZOS
-except:
-    resample = Image.ANTIALIAS
-ImageFile.LOAD_TRUNCATED_IMAGES = True
-
-
-class ExeclSinkMixin:
-    ws_cache = {}
-    wb_cache = {}
-    fields_cache = {}
-    y_cache = {}
-
-    @staticmethod
-    async def deal_img(url: str, img_size: Optional[Tuple[int, int]]) -> Optional[BytesIO]:
-        if url.startswith('//'):
-            url = 'https:' + url
-        try:
-            img_bytes = requests.get(url).content
-        except Exception as e:
-            logger.error(f"download img error: {e}")
-            return
-        im = Image.open(BytesIO(img_bytes))
-        im_format = im.format
-        if img_size:
-            temp = max(im.size[0] / img_size[0], im.size[1] / img_size[1])
-            img_size = (math.ceil(im.size[0] / temp), math.ceil(im.size[1] / temp))
-            im = im.resize(img_size, resample).convert('P')
-        result = BytesIO()
-        im.save(result, format=im_format)
-        return result
-
-    async def save_item(
-            self,
-            item: dict,
-            *,
-            filename: Optional[str] = None,
-            date_fields: Optional[list] = None,
-            date_format: str = 'yyyy-mm-dd HH:MM:SS',
-            img_fields: Optional[list] = None,
-            img_size: Optional[Tuple[int, int]] = None,
-            **options
-    ):
-        assert filename is not None, "The filename parameter is required"
-        if '.xlsx' not in filename:
-            filename = filename + '.xlsx'
-        try:
-            wb, ws, fields, y = self._get_write_class(filename, item, **options)
-            bold_format_1 = wb.add_format({'align': 'left', 'border': 1, 'valign': 'vcenter'})
-            bold_format_2 = wb.add_format({'align': 'left', 'border': 1, 'valign': 'vcenter', 'fg_color': '#D0D3D4'})
-            for x, field in enumerate(fields):
-                if x % 2 == 0:
-                    bold_format = bold_format_1
-                else:
-                    bold_format = bold_format_2
-                if date_fields is not None and field in date_fields:
-                    ws.write_datetime(y, x, item.get(field), wb.add_format({'num_format': date_format}))
-
-                elif img_fields is not None and field in img_fields:
-                    img_size and ws.set_column_pixels(x, x, width=math.ceil(img_size[0]))
-                    url = item.get(field)
-                    img_bytes = await self.deal_img(url, img_size)
-                    if img_bytes is None or ws.insert_image(y, x, '', {'image_data': img_bytes}) == -1:
-                        ws.write(y, x, url, bold_format)
-                else:
-                    ws.write(y, x, item.get(field), bold_format)
-            if img_size is not None:
-                ws.set_column_pixels(0, len(fields), width=math.ceil(img_size[0]))
-                ws.set_row_pixels(y, height=math.ceil(img_size[1]))
-        except Exception as e:
-            logger.exception(f'Save Execl Error, filename:{filename}, item:{item}, errMsg: {e}')
-
-    def _get_write_class(self, filename, item, sheet='sheet1', **options):
-        filename_sheet = filename + sheet
-        if self.ws_cache.get(filename_sheet) is None:
-            if self.wb_cache.get(filename) is None:
-                logger.info(f'Create Execl: {filename}')
-                wb = xlsxwriter.Workbook(filename, options=options)
-                self.wb_cache[filename] = wb
-            else:
-                wb = self.wb_cache[filename]
-            ws = wb.add_worksheet(sheet)
-            bold_format = wb.add_format(
-                {'bold': True, 'font_size': 12, 'border': 1, 'align': 'center', 'valign': 'vcenter'})
-            fields = list(item.keys())
-            ws.write_row('A1', fields, cell_format=bold_format)
-            ws.set_row(0, height=30)
-            self.fields_cache[filename_sheet] = fields
-            self.ws_cache[filename_sheet] = ws
-            self.y_cache[filename_sheet] = 0
-        self.y_cache[filename_sheet] += 1
-        return self.wb_cache[filename], \
-            self.ws_cache[filename_sheet], \
-            self.fields_cache[filename_sheet], \
-            self.y_cache[filename_sheet]
-
-    def close_execl(self, filename=None):
-        if filename not in self.wb_cache:
-            return
-
-        logger.info(f'Closing Execl: {filename}')
-        if wb := self.wb_cache.pop(filename):
-            wb.close()
-        for filename_sheet in list(self.ws_cache.keys()):
-            if not filename_sheet.startswith(filename):
-                continue
-            self.ws_cache.pop(filename_sheet, None)
-            self.y_cache.pop(filename_sheet, None)
-            self.fields_cache.pop(filename_sheet, None)
-
-    def close(self):
-        for filename in list(self.wb_cache.keys()):
-            self.close_execl(filename)
-
-
-class ExeclPipeline(ExeclSinkMixin):
-    def __init__(self, settings):
-        self.lock = asyncio.Lock()
-
-    @classmethod
-    def from_settings(cls, settings):
-        return cls(settings)
-
-    async def process_item(self, item, spider):
-        execl_kw: Optional[dict] = item.pop('__execl__', None)
-        if not execl_kw:
-            logger.warning(f"item Missing key __execl__, not stored")
-            return item
-
-        execl_kw.setdefault('filename', spider.name)
-        async with self.lock:
-            await self.save_item(item, **execl_kw)
-
-    async def close_spider(self, spider):
-        self.close()
-
-
-if __name__ == '__main__':
-    class TestSpider:
-        name = 'TestSpider'
-
-
-    async def test():
-        p = ExeclPipeline({})
-        await p.process_item({
-            'title': 'tttt',
-            'img': '//www.baidu.com/img/flexible/logo/pc/result.png',
-            '__execl__': {
-                'sheet': 'sheet1',
-                # 'filename': 'test',
-                # 'img_fields': ['img'],
-                # 'img_size': (100, 500)
-            }
-        }, TestSpider())
-        await p.close_spider(None)
-
-
-    asyncio.run(test())
```
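In the pipeline above, `deal_img` scales a fetched image to fit inside `img_size` while preserving aspect ratio: it divides both dimensions by the larger of the two overflow ratios. A worked example with invented numbers:

```python
import math

# A 1000x400 source image constrained to img_size=(100, 500):
src_w, src_h = 1000, 400
box_w, box_h = 100, 500

temp = max(src_w / box_w, src_h / box_h)    # max(10.0, 0.8) -> 10.0
scaled = (math.ceil(src_w / temp), math.ceil(src_h / temp))
print(scaled)  # (100, 40): fits inside the box, ratio preserved
```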
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt: file without changes
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt: file without changes