@arcblock/crawler 1.2.0 → 1.3.1
- package/lib/cjs/crawler.d.ts +10 -2
- package/lib/cjs/crawler.js +41 -30
- package/lib/cjs/services/carbon.d.ts +3 -0
- package/lib/cjs/services/carbon.js +41 -0
- package/lib/cjs/store/job.d.ts +3 -0
- package/lib/esm/crawler.d.ts +10 -2
- package/lib/esm/crawler.js +39 -30
- package/lib/esm/services/carbon.d.ts +3 -0
- package/lib/esm/services/carbon.js +38 -0
- package/lib/esm/store/job.d.ts +3 -0
- package/package.json +1 -1
package/lib/cjs/crawler.d.ts
CHANGED
@@ -1,10 +1,15 @@
+import { Page } from '@blocklet/puppeteer';
 import { JobState, SnapshotModel } from './store';
-
+type PageHandler = {
+    handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
+    handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
+};
+export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
 export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
@@ -17,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  * @param params
  * @param callback callback when job finished
  */
+export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export {};
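Taken together with the implementation below, the new crawlCode entry point enqueues a job on the codeCrawler queue with ignoreRobots: true, includeHtml: false, and includeScreenshot: true merged into the params, and its screenshot is produced by createCarbonImage. A minimal usage sketch follows; it assumes the package root re-exports these declarations, and the URL and option values are purely illustrative:

    import { crawlCode } from '@arcblock/crawler';

    // Hypothetical job: the target page is expected to render an
    // `.export-container` element, which the default codeCrawler
    // screenshot handler waits for before capturing the image.
    const jobId = await crawlCode(
      { url: 'https://example.com/render/snippet', format: 'png', quality: 90 },
      (snapshot) => {
        // snapshot is null when the crawl failed
        console.log('code crawl finished', snapshot);
      },
    );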
package/lib/cjs/crawler.js
CHANGED
@@ -15,7 +15,9 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.getPageContent = void 0;
 exports.createCrawlQueue = createCrawlQueue;
 exports.getDataDir = getDataDir;
+exports.enqueue = enqueue;
 exports.crawlUrl = crawlUrl;
+exports.crawlCode = crawlCode;
 const queue_1 = __importDefault(require("@abtnode/queue"));
 const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
 const crypto_1 = require("crypto");
@@ -23,44 +25,44 @@ const fs_extra_1 = __importDefault(require("fs-extra"));
 const path_1 = __importDefault(require("path"));
 const config_1 = require("./config");
 const puppeteer_1 = require("./puppeteer");
+const carbon_1 = require("./services/carbon");
 const snapshot_1 = require("./services/snapshot");
 const store_1 = require("./store");
 const utils_1 = require("./utils");
 const { BaseState } = require('@abtnode/models');
 // eslint-disable-next-line import/no-mutable-exports
 const crawlQueue = createCrawlQueue('urlCrawler');
-function createCrawlQueue(queue) {
+const syncQueue = createCrawlQueue('syncCrawler');
+const codeQueue = createCrawlQueue('codeCrawler', {
+    handleScreenshot: carbon_1.createCarbonImage,
+});
+function createCrawlQueue(queue, handler) {
     const db = new BaseState(store_1.Job);
     return (0, queue_1.default)({
         store: new sequelize_1.default(db, queue),
         concurrency: config_1.config.concurrency,
        onJob: (job) => __awaiter(this, void 0, void 0, function* () {
            config_1.logger.info('Starting to execute crawl job', job);
-            (12 removed lines: the previous robots.txt check, not fully captured in this diff view)
+            // check robots.txt
+            if (!job.ignoreRobots) {
+                const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
+                if (!canCrawl) {
+                    config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                    const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                        job,
+                        snapshot: {
+                            status: 'failed',
+                            error: 'Denied by robots.txt',
+                        },
+                    });
+                    yield store_1.Snapshot.upsert(snapshot);
+                    return snapshot;
+                }
             }
-            // if index reach autoCloseBrowserCount, close browser
-            // try {
-            // if (index >= autoCloseBrowserCount) {
-            // await closeBrowser({ trimCache: false });
-            // }
-            // } catch (error) {
-            // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-            // }
            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
            try {
                // get page content later
-                const result = yield (0, exports.getPageContent)(formattedJob);
+                const result = yield (0, exports.getPageContent)(formattedJob, handler);
                if (!result || (!result.html && !result.screenshot)) {
                    config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                    const snapshot = (0, snapshot_1.convertJobToSnapshot)({
@@ -93,6 +95,7 @@ function createCrawlQueue(queue) {
                const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
                    screenshot: result.screenshot,
                    html: result.html,
+                    format: formattedJob.format,
                });
                const snapshot = (0, snapshot_1.convertJobToSnapshot)({
                    job: formattedJob,
@@ -133,13 +136,13 @@ function getDataDir() {
    });
 }
 function saveSnapshotToLocal(_a) {
-    return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+    return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
        const { htmlDir, screenshotDir } = yield getDataDir();
        let screenshotPath = null;
        let htmlPath = null;
        if (screenshot) {
            const hash = (0, utils_1.md5)(screenshot);
-            screenshotPath = path_1.default.join(screenshotDir, `${hash}
+            screenshotPath = path_1.default.join(screenshotDir, `${hash}.${format}`);
            config_1.logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
            yield fs_extra_1.default.writeFile(screenshotPath, screenshot);
        }
@@ -155,7 +158,7 @@ function saveSnapshotToLocal(_a) {
        };
    });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
+const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
    const page = yield (0, puppeteer_1.initPage)();
    if (width && height) {
        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -219,7 +222,9 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
        }
    }
    try {
-        screenshot =
+        screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
+            ? yield handler.handleScreenshot(page)
+            : yield page.screenshot({ fullPage, quality, type: format });
    }
    catch (err) {
        config_1.logger.error('Failed to get screenshot:', err);
@@ -252,7 +257,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
            meta.title = data.title;
            meta.description = data.description;
            if (includeHtml) {
-                html = data.html;
+                html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
            }
        }
        catch (err) {
@@ -280,17 +285,17 @@ exports.getPageContent = getPageContent;
  * @param callback callback when job finished
  */
 // eslint-disable-next-line require-await
-function crawlUrl(params, callback) {
+function enqueue(queue, params, callback) {
    return __awaiter(this, void 0, void 0, function* () {
        // skip duplicate job
        const existsJob = yield store_1.Job.isExists(params);
-        if (existsJob) {
+        if (existsJob && !params.sync) {
            config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
            return existsJob.id;
        }
        config_1.logger.info('enqueue crawl job', params);
        const jobId = (0, crypto_1.randomUUID)();
-        const job =
+        const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
            config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
            callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
@@ -302,3 +307,9 @@ function crawlUrl(params, callback) {
        return jobId;
    });
 }
+function crawlUrl(params, callback) {
+    return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
+}
+function crawlCode(params, callback) {
+    return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
+}
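Because createCrawlQueue now accepts an optional PageHandler (see the diff above), a caller can create a queue whose screenshot or HTML extraction is overridden per job. A minimal sketch, assuming the package root re-exports createCrawlQueue; the queue name and the innerText-based extraction are hypothetical, not part of this release:

    import { createCrawlQueue } from '@arcblock/crawler';
    import type { Page } from '@blocklet/puppeteer';

    // Hypothetical queue: keep the default screenshot behavior
    // (handleScreenshot omitted) but store the page's visible text
    // in place of the serialized HTML returned by getPageContent.
    const textQueue = createCrawlQueue('textCrawler', {
      handleHtml: async (page: Page) => page.evaluate(() => document.body.innerText),
    });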
package/lib/cjs/services/carbon.js
ADDED
@@ -0,0 +1,41 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.createCarbonImage = createCarbonImage;
+const config_1 = require("../config");
+function createCarbonImage(page, params) {
+    return __awaiter(this, void 0, void 0, function* () {
+        try {
+            yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
+            const targetElement = (yield page.$('.export-container'));
+            yield page.evaluate((target = document) => {
+                if (!target) {
+                    throw new Error('Target element not found');
+                }
+                target.querySelectorAll('span[role="presentation"]').forEach((node) => {
+                    var _a;
+                    const el = node;
+                    if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
+                        (_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
+                            el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
+                        });
+                    }
+                });
+            }, targetElement);
+            const buffer = yield targetElement.screenshot({ type: (params === null || params === void 0 ? void 0 : params.format) || 'webp', quality: (params === null || params === void 0 ? void 0 : params.quality) || 100 });
+            return buffer;
+        }
+        catch (e) {
+            config_1.logger.error('failed to crawl from carbon', { error: e });
+            throw e;
+        }
+    });
+}
package/lib/cjs/store/job.d.ts
CHANGED
@@ -9,11 +9,14 @@ export interface JobState {
     width?: number;
     height?: number;
     quality?: number;
+    format?: 'png' | 'jpeg' | 'webp';
     timeout?: number;
     fullPage?: boolean;
     lastModified?: string;
     waitTime?: number;
     replace?: boolean;
+    sync?: boolean;
+    ignoreRobots?: boolean;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
     localStorage?: {
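The three new JobState fields map onto the behavior added in crawler.js: format selects the screenshot type and file extension (default 'webp'), sync routes the job to the syncCrawler queue and re-runs it even if a matching job already exists, and ignoreRobots skips the robots.txt check. A hypothetical crawlUrl call exercising them (the URL and values are illustrative only, and the root re-export is assumed):

    import { crawlUrl } from '@arcblock/crawler';

    // Hypothetical parameters: re-crawl synchronously, skip robots.txt,
    // and store the screenshot as JPEG instead of the default webp.
    await crawlUrl({
      url: 'https://example.com',
      format: 'jpeg',
      sync: true,
      ignoreRobots: true,
    });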
package/lib/esm/crawler.d.ts
CHANGED
@@ -1,10 +1,15 @@
+import { Page } from '@blocklet/puppeteer';
 import { JobState, SnapshotModel } from './store';
-
+type PageHandler = {
+    handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
+    handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
+};
+export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
 export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
@@ -17,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  * @param params
  * @param callback callback when job finished
  */
+export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export {};
package/lib/esm/crawler.js
CHANGED
@@ -14,44 +14,44 @@ import fs from 'fs-extra';
 import path from 'path';
 import { config, logger } from './config';
 import { initPage } from './puppeteer';
+import { createCarbonImage } from './services/carbon';
 import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
 import { Job, Snapshot, sequelize } from './store';
 import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
 const { BaseState } = require('@abtnode/models');
 // eslint-disable-next-line import/no-mutable-exports
 const crawlQueue = createCrawlQueue('urlCrawler');
-export function createCrawlQueue(queue) {
+const syncQueue = createCrawlQueue('syncCrawler');
+const codeQueue = createCrawlQueue('codeCrawler', {
+    handleScreenshot: createCarbonImage,
+});
+export function createCrawlQueue(queue, handler) {
    const db = new BaseState(Job);
    return createQueue({
        store: new SequelizeStore(db, queue),
        concurrency: config.concurrency,
        onJob: (job) => __awaiter(this, void 0, void 0, function* () {
            logger.info('Starting to execute crawl job', job);
-            (12 removed lines: the previous robots.txt check, not fully captured in this diff view)
+            // check robots.txt
+            if (!job.ignoreRobots) {
+                const canCrawl = yield isAcceptCrawler(job.url);
+                if (!canCrawl) {
+                    logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                    const snapshot = convertJobToSnapshot({
+                        job,
+                        snapshot: {
+                            status: 'failed',
+                            error: 'Denied by robots.txt',
+                        },
+                    });
+                    yield Snapshot.upsert(snapshot);
+                    return snapshot;
+                }
             }
-            // if index reach autoCloseBrowserCount, close browser
-            // try {
-            // if (index >= autoCloseBrowserCount) {
-            // await closeBrowser({ trimCache: false });
-            // }
-            // } catch (error) {
-            // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-            // }
            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
            try {
                // get page content later
-                const result = yield getPageContent(formattedJob);
+                const result = yield getPageContent(formattedJob, handler);
                if (!result || (!result.html && !result.screenshot)) {
                    logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                    const snapshot = convertJobToSnapshot({
@@ -84,6 +84,7 @@ export function createCrawlQueue(queue) {
                const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
                    screenshot: result.screenshot,
                    html: result.html,
+                    format: formattedJob.format,
                });
                const snapshot = convertJobToSnapshot({
                    job: formattedJob,
@@ -124,13 +125,13 @@ export function getDataDir() {
    });
 }
 function saveSnapshotToLocal(_a) {
-    return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+    return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
        const { htmlDir, screenshotDir } = yield getDataDir();
        let screenshotPath = null;
        let htmlPath = null;
        if (screenshot) {
            const hash = md5(screenshot);
-            screenshotPath = path.join(screenshotDir, `${hash}
+            screenshotPath = path.join(screenshotDir, `${hash}.${format}`);
            logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
            yield fs.writeFile(screenshotPath, screenshot);
        }
@@ -146,7 +147,7 @@ function saveSnapshotToLocal(_a) {
        };
    });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
+export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
    const page = yield initPage();
    if (width && height) {
        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -210,7 +211,9 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
        }
    }
    try {
-        screenshot =
+        screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
+            ? yield handler.handleScreenshot(page)
+            : yield page.screenshot({ fullPage, quality, type: format });
    }
    catch (err) {
        logger.error('Failed to get screenshot:', err);
@@ -243,7 +246,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
            meta.title = data.title;
            meta.description = data.description;
            if (includeHtml) {
-                html = data.html;
+                html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
            }
        }
        catch (err) {
@@ -270,17 +273,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  * @param callback callback when job finished
  */
 // eslint-disable-next-line require-await
-export function crawlUrl(params, callback) {
+export function enqueue(queue, params, callback) {
    return __awaiter(this, void 0, void 0, function* () {
        // skip duplicate job
        const existsJob = yield Job.isExists(params);
-        if (existsJob) {
+        if (existsJob && !params.sync) {
            logger.info(`Crawl job already exists for ${params.url}, skip`);
            return existsJob.id;
        }
        logger.info('enqueue crawl job', params);
        const jobId = randomUUID();
-        const job =
+        const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
            logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
            callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
@@ -292,3 +295,9 @@ export function crawlUrl(params, callback) {
        return jobId;
    });
 }
+export function crawlUrl(params, callback) {
+    return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
+}
+export function crawlCode(params, callback) {
+    return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
+}
package/lib/esm/services/carbon.js
ADDED
@@ -0,0 +1,38 @@
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+import { logger } from '../config';
+export function createCarbonImage(page, params) {
+    return __awaiter(this, void 0, void 0, function* () {
+        try {
+            yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
+            const targetElement = (yield page.$('.export-container'));
+            yield page.evaluate((target = document) => {
+                if (!target) {
+                    throw new Error('Target element not found');
+                }
+                target.querySelectorAll('span[role="presentation"]').forEach((node) => {
+                    var _a;
+                    const el = node;
+                    if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
+                        (_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
+                            el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
+                        });
+                    }
+                });
+            }, targetElement);
+            const buffer = yield targetElement.screenshot({ type: (params === null || params === void 0 ? void 0 : params.format) || 'webp', quality: (params === null || params === void 0 ? void 0 : params.quality) || 100 });
+            return buffer;
+        }
+        catch (e) {
+            logger.error('failed to crawl from carbon', { error: e });
+            throw e;
+        }
+    });
+}
package/lib/esm/store/job.d.ts
CHANGED
@@ -9,11 +9,14 @@ export interface JobState {
     width?: number;
     height?: number;
     quality?: number;
+    format?: 'png' | 'jpeg' | 'webp';
     timeout?: number;
     fullPage?: boolean;
     lastModified?: string;
     waitTime?: number;
     replace?: boolean;
+    sync?: boolean;
+    ignoreRobots?: boolean;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
     localStorage?: {