@arcblock/crawler 1.2.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,15 @@
+ import { Page } from '@blocklet/puppeteer';
  import { JobState, SnapshotModel } from './store';
- export declare function createCrawlQueue(queue: string): any;
+ type PageHandler = {
+     handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
+     handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
+ };
+ export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
  export declare function getDataDir(): Promise<{
      htmlDir: string;
      screenshotDir: string;
  }>;
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
      html: string | null;
      screenshot: Uint8Array<ArrayBufferLike> | null;
      meta: {
@@ -17,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
   * @param params
   * @param callback callback when job finished
   */
+ export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
  export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+ export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+ export {};
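The declarations above summarize the 1.3.x surface: `createCrawlQueue` and `getPageContent` take an optional `PageHandler` hook, `enqueue` is exported directly, and `crawlCode` joins `crawlUrl`. A minimal consumer sketch based on these signatures (the URLs and the callback body are illustrative, not from the package):

```ts
import { crawlUrl, crawlCode } from '@arcblock/crawler';

// Regular page crawl. `sync: true` routes the job to the dedicated
// syncCrawler queue and bypasses the duplicate-job short-circuit.
const jobId = await crawlUrl(
  { url: 'https://example.com', format: 'png', sync: true },
  (snapshot) => console.log('finished:', snapshot ? 'success' : 'failed'),
);

// Code-image crawl. crawlCode itself defaults ignoreRobots: true,
// includeHtml: false, includeScreenshot: true (see the implementation diff below).
await crawlCode({ url: 'https://carbon.now.sh/' });
```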
@@ -15,7 +15,9 @@ Object.defineProperty(exports, "__esModule", { value: true });
  exports.getPageContent = void 0;
  exports.createCrawlQueue = createCrawlQueue;
  exports.getDataDir = getDataDir;
+ exports.enqueue = enqueue;
  exports.crawlUrl = crawlUrl;
+ exports.crawlCode = crawlCode;
  const queue_1 = __importDefault(require("@abtnode/queue"));
  const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
  const crypto_1 = require("crypto");
@@ -23,44 +25,44 @@ const fs_extra_1 = __importDefault(require("fs-extra"));
  const path_1 = __importDefault(require("path"));
  const config_1 = require("./config");
  const puppeteer_1 = require("./puppeteer");
+ const carbon_1 = require("./services/carbon");
  const snapshot_1 = require("./services/snapshot");
  const store_1 = require("./store");
  const utils_1 = require("./utils");
  const { BaseState } = require('@abtnode/models');
  // eslint-disable-next-line import/no-mutable-exports
  const crawlQueue = createCrawlQueue('urlCrawler');
- function createCrawlQueue(queue) {
+ const syncQueue = createCrawlQueue('syncCrawler');
+ const codeQueue = createCrawlQueue('codeCrawler', {
+     handleScreenshot: carbon_1.createCarbonImage,
+ });
+ function createCrawlQueue(queue, handler) {
      const db = new BaseState(store_1.Job);
      return (0, queue_1.default)({
          store: new sequelize_1.default(db, queue),
          concurrency: config_1.config.concurrency,
          onJob: (job) => __awaiter(this, void 0, void 0, function* () {
              config_1.logger.info('Starting to execute crawl job', job);
-             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
-             if (!canCrawl) {
-                 config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
-                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                     job,
-                     snapshot: {
-                         status: 'failed',
-                         error: 'Denied by robots.txt',
-                     },
-                 });
-                 yield store_1.Snapshot.upsert(snapshot);
-                 return snapshot;
+             // check robots.txt
+             if (!job.ignoreRobots) {
+                 const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
+                 if (!canCrawl) {
+                     config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                         job,
+                         snapshot: {
+                             status: 'failed',
+                             error: 'Denied by robots.txt',
+                         },
+                     });
+                     yield store_1.Snapshot.upsert(snapshot);
+                     return snapshot;
+                 }
              }
-             // if index reach autoCloseBrowserCount, close browser
-             // try {
-             //     if (index >= autoCloseBrowserCount) {
-             //         await closeBrowser({ trimCache: false });
-             //     }
-             // } catch (error) {
-             //     logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-             // }
              const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
              try {
                  // get page content later
-                 const result = yield (0, exports.getPageContent)(formattedJob);
+                 const result = yield (0, exports.getPageContent)(formattedJob, handler);
                  if (!result || (!result.html && !result.screenshot)) {
                      config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                      const snapshot = (0, snapshot_1.convertJobToSnapshot)({
@@ -93,6 +95,7 @@ function createCrawlQueue(queue) {
                  const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
                      screenshot: result.screenshot,
                      html: result.html,
+                     format: formattedJob.format,
                  });
                  const snapshot = (0, snapshot_1.convertJobToSnapshot)({
                      job: formattedJob,
@@ -133,13 +136,13 @@ function getDataDir() {
      });
  }
  function saveSnapshotToLocal(_a) {
-     return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+     return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
          const { htmlDir, screenshotDir } = yield getDataDir();
          let screenshotPath = null;
          let htmlPath = null;
          if (screenshot) {
              const hash = (0, utils_1.md5)(screenshot);
-             screenshotPath = path_1.default.join(screenshotDir, `${hash}.webp`);
+             screenshotPath = path_1.default.join(screenshotDir, `${hash}.${format}`);
              config_1.logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
              yield fs_extra_1.default.writeFile(screenshotPath, screenshot);
          }
@@ -155,7 +158,7 @@ function saveSnapshotToLocal(_a) {
          };
      });
  }
- const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
+ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
      const page = yield (0, puppeteer_1.initPage)();
      if (width && height) {
          yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -219,7 +222,9 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
          }
      }
      try {
-         screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
+         screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
+             ? yield handler.handleScreenshot(page)
+             : yield page.screenshot({ fullPage, quality, type: format });
      }
      catch (err) {
          config_1.logger.error('Failed to get screenshot:', err);
@@ -252,7 +257,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
          meta.title = data.title;
          meta.description = data.description;
          if (includeHtml) {
-             html = data.html;
+             html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
          }
      }
      catch (err) {
@@ -280,17 +285,17 @@ exports.getPageContent = getPageContent;
   * @param callback callback when job finished
   */
  // eslint-disable-next-line require-await
- function crawlUrl(params, callback) {
+ function enqueue(queue, params, callback) {
      return __awaiter(this, void 0, void 0, function* () {
          // skip duplicate job
          const existsJob = yield store_1.Job.isExists(params);
-         if (existsJob) {
+         if (existsJob && !params.sync) {
              config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
              return existsJob.id;
          }
          config_1.logger.info('enqueue crawl job', params);
          const jobId = (0, crypto_1.randomUUID)();
-         const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+         const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
          job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
              config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
              callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
@@ -302,3 +307,9 @@ function crawlUrl(params, callback) {
          return jobId;
      });
  }
+ function crawlUrl(params, callback) {
+     return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
+ }
+ function crawlCode(params, callback) {
+     return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
+ }
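Since `enqueue` is now exported from this compiled output, callers can push jobs onto a queue they created themselves instead of going through `crawlUrl`. A hedged sketch (the `pdfCrawler` queue name is illustrative):

```ts
import { createCrawlQueue, enqueue } from '@arcblock/crawler';

// Illustrative queue name; any string keys a separate persistent queue.
const pdfQueue = createCrawlQueue('pdfCrawler');

// enqueue resolves to the job id; the callback fires when the job finishes.
const jobId = await enqueue(pdfQueue, { url: 'https://example.com' }, (snapshot) => {
  console.log(snapshot ? 'crawled' : 'failed');
});
```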
@@ -0,0 +1,3 @@
+ import { Page } from '@blocklet/puppeteer';
+ import { JobState } from '../store';
+ export declare function createCarbonImage(page: Page, params?: JobState): Promise<Buffer<ArrayBufferLike>>;
@@ -0,0 +1,41 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+     return new (P || (P = Promise))(function (resolve, reject) {
+         function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+         function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+         function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+         step((generator = generator.apply(thisArg, _arguments || [])).next());
+     });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.createCarbonImage = createCarbonImage;
+ const config_1 = require("../config");
+ function createCarbonImage(page, params) {
+     return __awaiter(this, void 0, void 0, function* () {
+         try {
+             yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
+             const targetElement = (yield page.$('.export-container'));
+             yield page.evaluate((target = document) => {
+                 if (!target) {
+                     throw new Error('Target element not found');
+                 }
+                 target.querySelectorAll('span[role="presentation"]').forEach((node) => {
+                     var _a;
+                     const el = node;
+                     if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
+                         (_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
+                             el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
+                         });
+                     }
+                 });
+             }, targetElement);
+             const buffer = yield targetElement.screenshot({ type: (params === null || params === void 0 ? void 0 : params.format) || 'webp', quality: (params === null || params === void 0 ? void 0 : params.quality) || 100 });
+             return buffer;
+         }
+         catch (e) {
+             config_1.logger.error('failed to crawl from carbon', { error: e });
+             throw e;
+         }
+     });
+ }
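`createCarbonImage` is just one implementation of the `PageHandler` contract, wired into the `codeCrawler` queue above. The same hook can carry any per-queue capture logic; a hypothetical handler in modern TypeScript (the `'main'` selector and queue name are illustrative, not part of the package):

```ts
import { Page } from '@blocklet/puppeteer';
import { createCrawlQueue } from '@arcblock/crawler';

// Hypothetical handler: capture only the <main> element instead of the full page.
async function handleScreenshot(page: Page): Promise<Buffer | null> {
  const el = await page.$('main');
  if (!el) return null;
  const shot = await el.screenshot({ type: 'webp' });
  return Buffer.from(shot); // normalize to Buffer, matching PageHandler's return type
}

// Jobs on this queue have their screenshot step routed through the handler.
const mainQueue = createCrawlQueue('mainCrawler', { handleScreenshot });
```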
@@ -9,11 +9,14 @@ export interface JobState {
      width?: number;
      height?: number;
      quality?: number;
+     format?: 'png' | 'jpeg' | 'webp';
      timeout?: number;
      fullPage?: boolean;
      lastModified?: string;
      waitTime?: number;
      replace?: boolean;
+     sync?: boolean;
+     ignoreRobots?: boolean;
      headers?: Record<string, string>;
      cookies?: CookieParam[];
      localStorage?: {
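For reference, a `JobState` payload exercising the three new optional fields might look like this (values are illustrative):

```ts
import { crawlUrl } from '@arcblock/crawler';

await crawlUrl({
  url: 'https://example.com',
  format: 'jpeg',      // screenshot encoding; the implementation defaults to 'webp'
  sync: true,          // route to the syncCrawler queue and skip the dedupe check
  ignoreRobots: false, // keep robots.txt enforcement on (crawlCode defaults it to true)
});
```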
@@ -1,10 +1,15 @@
+ import { Page } from '@blocklet/puppeteer';
  import { JobState, SnapshotModel } from './store';
- export declare function createCrawlQueue(queue: string): any;
+ type PageHandler = {
+     handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
+     handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
+ };
+ export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
  export declare function getDataDir(): Promise<{
      htmlDir: string;
      screenshotDir: string;
  }>;
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
      html: string | null;
      screenshot: Uint8Array<ArrayBufferLike> | null;
      meta: {
@@ -17,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
   * @param params
   * @param callback callback when job finished
   */
+ export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
  export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+ export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+ export {};
@@ -14,44 +14,44 @@ import fs from 'fs-extra';
  import path from 'path';
  import { config, logger } from './config';
  import { initPage } from './puppeteer';
+ import { createCarbonImage } from './services/carbon';
  import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
  import { Job, Snapshot, sequelize } from './store';
  import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
  const { BaseState } = require('@abtnode/models');
  // eslint-disable-next-line import/no-mutable-exports
  const crawlQueue = createCrawlQueue('urlCrawler');
- export function createCrawlQueue(queue) {
+ const syncQueue = createCrawlQueue('syncCrawler');
+ const codeQueue = createCrawlQueue('codeCrawler', {
+     handleScreenshot: createCarbonImage,
+ });
+ export function createCrawlQueue(queue, handler) {
      const db = new BaseState(Job);
      return createQueue({
          store: new SequelizeStore(db, queue),
          concurrency: config.concurrency,
          onJob: (job) => __awaiter(this, void 0, void 0, function* () {
              logger.info('Starting to execute crawl job', job);
-             const canCrawl = yield isAcceptCrawler(job.url);
-             if (!canCrawl) {
-                 logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
-                 const snapshot = convertJobToSnapshot({
-                     job,
-                     snapshot: {
-                         status: 'failed',
-                         error: 'Denied by robots.txt',
-                     },
-                 });
-                 yield Snapshot.upsert(snapshot);
-                 return snapshot;
+             // check robots.txt
+             if (!job.ignoreRobots) {
+                 const canCrawl = yield isAcceptCrawler(job.url);
+                 if (!canCrawl) {
+                     logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                     const snapshot = convertJobToSnapshot({
+                         job,
+                         snapshot: {
+                             status: 'failed',
+                             error: 'Denied by robots.txt',
+                         },
+                     });
+                     yield Snapshot.upsert(snapshot);
+                     return snapshot;
+                 }
              }
-             // if index reach autoCloseBrowserCount, close browser
-             // try {
-             //     if (index >= autoCloseBrowserCount) {
-             //         await closeBrowser({ trimCache: false });
-             //     }
-             // } catch (error) {
-             //     logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-             // }
              const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
              try {
                  // get page content later
-                 const result = yield getPageContent(formattedJob);
+                 const result = yield getPageContent(formattedJob, handler);
                  if (!result || (!result.html && !result.screenshot)) {
                      logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                      const snapshot = convertJobToSnapshot({
@@ -84,6 +84,7 @@ export function createCrawlQueue(queue) {
                  const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
                      screenshot: result.screenshot,
                      html: result.html,
+                     format: formattedJob.format,
                  });
                  const snapshot = convertJobToSnapshot({
                      job: formattedJob,
@@ -124,13 +125,13 @@ export function getDataDir() {
      });
  }
  function saveSnapshotToLocal(_a) {
-     return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+     return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
          const { htmlDir, screenshotDir } = yield getDataDir();
          let screenshotPath = null;
          let htmlPath = null;
          if (screenshot) {
              const hash = md5(screenshot);
-             screenshotPath = path.join(screenshotDir, `${hash}.webp`);
+             screenshotPath = path.join(screenshotDir, `${hash}.${format}`);
              logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
              yield fs.writeFile(screenshotPath, screenshot);
          }
@@ -146,7 +147,7 @@ function saveSnapshotToLocal(_a) {
          };
      });
  }
- export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
+ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
      const page = yield initPage();
      if (width && height) {
          yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -210,7 +211,9 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
          }
      }
      try {
-         screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
+         screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
+             ? yield handler.handleScreenshot(page)
+             : yield page.screenshot({ fullPage, quality, type: format });
      }
      catch (err) {
          logger.error('Failed to get screenshot:', err);
@@ -243,7 +246,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
          meta.title = data.title;
          meta.description = data.description;
          if (includeHtml) {
-             html = data.html;
+             html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
          }
      }
      catch (err) {
@@ -270,17 +273,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
   * @param callback callback when job finished
   */
  // eslint-disable-next-line require-await
- export function crawlUrl(params, callback) {
+ export function enqueue(queue, params, callback) {
      return __awaiter(this, void 0, void 0, function* () {
          // skip duplicate job
          const existsJob = yield Job.isExists(params);
-         if (existsJob) {
+         if (existsJob && !params.sync) {
              logger.info(`Crawl job already exists for ${params.url}, skip`);
              return existsJob.id;
          }
          logger.info('enqueue crawl job', params);
          const jobId = randomUUID();
-         const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+         const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
          job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
              logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
              callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
@@ -292,3 +295,9 @@ export function crawlUrl(params, callback) {
          return jobId;
      });
  }
+ export function crawlUrl(params, callback) {
+     return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
+ }
+ export function crawlCode(params, callback) {
+     return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
+ }
@@ -0,0 +1,3 @@
+ import { Page } from '@blocklet/puppeteer';
+ import { JobState } from '../store';
+ export declare function createCarbonImage(page: Page, params?: JobState): Promise<Buffer<ArrayBufferLike>>;
@@ -0,0 +1,38 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+     return new (P || (P = Promise))(function (resolve, reject) {
+         function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+         function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+         function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+         step((generator = generator.apply(thisArg, _arguments || [])).next());
+     });
+ };
+ import { logger } from '../config';
+ export function createCarbonImage(page, params) {
+     return __awaiter(this, void 0, void 0, function* () {
+         try {
+             yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
+             const targetElement = (yield page.$('.export-container'));
+             yield page.evaluate((target = document) => {
+                 if (!target) {
+                     throw new Error('Target element not found');
+                 }
+                 target.querySelectorAll('span[role="presentation"]').forEach((node) => {
+                     var _a;
+                     const el = node;
+                     if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
+                         (_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
+                             el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
+                         });
+                     }
+                 });
+             }, targetElement);
+             const buffer = yield targetElement.screenshot({ type: (params === null || params === void 0 ? void 0 : params.format) || 'webp', quality: (params === null || params === void 0 ? void 0 : params.quality) || 100 });
+             return buffer;
+         }
+         catch (e) {
+             logger.error('failed to crawl from carbon', { error: e });
+             throw e;
+         }
+     });
+ }
@@ -9,11 +9,14 @@ export interface JobState {
      width?: number;
      height?: number;
      quality?: number;
+     format?: 'png' | 'jpeg' | 'webp';
      timeout?: number;
      fullPage?: boolean;
      lastModified?: string;
      waitTime?: number;
      replace?: boolean;
+     sync?: boolean;
+     ignoreRobots?: boolean;
      headers?: Record<string, string>;
      cookies?: CookieParam[];
      localStorage?: {
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
      "name": "@arcblock/crawler",
-     "version": "1.2.0",
+     "version": "1.3.1",
      "main": "lib/cjs/index.js",
      "module": "lib/esm/index.js",
      "types": "lib/cjs/index.d.ts",