@arcblock/crawler 1.2.0 → 1.3.1
- package/lib/cjs/crawler.d.ts +10 -2
- package/lib/cjs/crawler.js +41 -30
- package/lib/cjs/services/carbon.d.ts +3 -0
- package/lib/cjs/services/carbon.js +41 -0
- package/lib/cjs/store/job.d.ts +3 -0
- package/lib/esm/crawler.d.ts +10 -2
- package/lib/esm/crawler.js +39 -30
- package/lib/esm/services/carbon.d.ts +3 -0
- package/lib/esm/services/carbon.js +38 -0
- package/lib/esm/store/job.d.ts +3 -0
- package/package.json +1 -1
package/lib/cjs/crawler.d.ts
CHANGED
@@ -1,10 +1,15 @@
+import { Page } from '@blocklet/puppeteer';
 import { JobState, SnapshotModel } from './store';
-
+type PageHandler = {
+    handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
+    handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
+};
+export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
 export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
@@ -17,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  * @param params
  * @param callback callback when job finished
  */
+export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export {};
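Taken together with the implementation below, the new crawlCode entry point enqueues a job on the codeCrawler queue with ignoreRobots: true, includeHtml: false, and includeScreenshot: true merged into the params, and its screenshot is produced by createCarbonImage. A minimal usage sketch follows; it assumes the package root re-exports these declarations, and the URL and option values are purely illustrative:

    import { crawlCode } from '@arcblock/crawler';

    // Hypothetical job: the target page is expected to render an
    // `.export-container` element, which the default codeCrawler
    // screenshot handler waits for before capturing the image.
    const jobId = await crawlCode(
      { url: 'https://example.com/render/snippet', format: 'png', quality: 90 },
      (snapshot) => {
        // snapshot is null when the crawl failed
        console.log('code crawl finished', snapshot);
      },
    );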
package/lib/cjs/crawler.js
CHANGED
@@ -15,7 +15,9 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.getPageContent = void 0;
 exports.createCrawlQueue = createCrawlQueue;
 exports.getDataDir = getDataDir;
+exports.enqueue = enqueue;
 exports.crawlUrl = crawlUrl;
+exports.crawlCode = crawlCode;
 const queue_1 = __importDefault(require("@abtnode/queue"));
 const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
 const crypto_1 = require("crypto");
@@ -23,44 +25,44 @@ const fs_extra_1 = __importDefault(require("fs-extra"));
 const path_1 = __importDefault(require("path"));
 const config_1 = require("./config");
 const puppeteer_1 = require("./puppeteer");
+const carbon_1 = require("./services/carbon");
 const snapshot_1 = require("./services/snapshot");
 const store_1 = require("./store");
 const utils_1 = require("./utils");
 const { BaseState } = require('@abtnode/models');
 // eslint-disable-next-line import/no-mutable-exports
 const crawlQueue = createCrawlQueue('urlCrawler');
-function createCrawlQueue(queue) {
+const syncQueue = createCrawlQueue('syncCrawler');
+const codeQueue = createCrawlQueue('codeCrawler', {
+    handleScreenshot: carbon_1.createCarbonImage,
+});
+function createCrawlQueue(queue, handler) {
     const db = new BaseState(store_1.Job);
     return (0, queue_1.default)({
         store: new sequelize_1.default(db, queue),
         concurrency: config_1.config.concurrency,
        onJob: (job) => __awaiter(this, void 0, void 0, function* () {
            config_1.logger.info('Starting to execute crawl job', job);
-            (12 removed lines: the previous robots.txt check, not fully captured in this diff view)
+            // check robots.txt
+            if (!job.ignoreRobots) {
+                const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
+                if (!canCrawl) {
+                    config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                    const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                        job,
+                        snapshot: {
+                            status: 'failed',
+                            error: 'Denied by robots.txt',
+                        },
+                    });
+                    yield store_1.Snapshot.upsert(snapshot);
+                    return snapshot;
+                }
             }
-            // if index reach autoCloseBrowserCount, close browser
-            // try {
-            // if (index >= autoCloseBrowserCount) {
-            // await closeBrowser({ trimCache: false });
-            // }
-            // } catch (error) {
-            // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-            // }
            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
            try {
                // get page content later
-                const result = yield (0, exports.getPageContent)(formattedJob);
+                const result = yield (0, exports.getPageContent)(formattedJob, handler);
                if (!result || (!result.html && !result.screenshot)) {
                    config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                    const snapshot = (0, snapshot_1.convertJobToSnapshot)({
@@ -93,6 +95,7 @@ function createCrawlQueue(queue) {
                const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
                    screenshot: result.screenshot,
                    html: result.html,
+                    format: formattedJob.format,
                });
                const snapshot = (0, snapshot_1.convertJobToSnapshot)({
                    job: formattedJob,
@@ -133,13 +136,13 @@ function getDataDir() {
    });
 }
 function saveSnapshotToLocal(_a) {
-    return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+    return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
        const { htmlDir, screenshotDir } = yield getDataDir();
        let screenshotPath = null;
        let htmlPath = null;
        if (screenshot) {
            const hash = (0, utils_1.md5)(screenshot);
-            screenshotPath = path_1.default.join(screenshotDir, `${hash}
+            screenshotPath = path_1.default.join(screenshotDir, `${hash}.${format}`);
            config_1.logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
            yield fs_extra_1.default.writeFile(screenshotPath, screenshot);
        }
@@ -155,7 +158,7 @@ function saveSnapshotToLocal(_a) {
        };
    });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
+const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
    const page = yield (0, puppeteer_1.initPage)();
    if (width && height) {
        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -219,7 +222,9 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
        }
    }
    try {
-        screenshot =
+        screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
+            ? yield handler.handleScreenshot(page)
+            : yield page.screenshot({ fullPage, quality, type: format });
    }
    catch (err) {
        config_1.logger.error('Failed to get screenshot:', err);
@@ -252,7 +257,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
            meta.title = data.title;
            meta.description = data.description;
            if (includeHtml) {
-                html = data.html;
+                html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
            }
        }
        catch (err) {
@@ -280,17 +285,17 @@ exports.getPageContent = getPageContent;
  * @param callback callback when job finished
  */
 // eslint-disable-next-line require-await
-function crawlUrl(params, callback) {
+function enqueue(queue, params, callback) {
    return __awaiter(this, void 0, void 0, function* () {
        // skip duplicate job
        const existsJob = yield store_1.Job.isExists(params);
-        if (existsJob) {
+        if (existsJob && !params.sync) {
            config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
            return existsJob.id;
        }
        config_1.logger.info('enqueue crawl job', params);
        const jobId = (0, crypto_1.randomUUID)();
-        const job =
+        const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
            config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
            callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
@@ -302,3 +307,9 @@ function crawlUrl(params, callback) {
        return jobId;
    });
 }
+function crawlUrl(params, callback) {
+    return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
+}
+function crawlCode(params, callback) {
+    return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
+}
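Because createCrawlQueue now accepts an optional PageHandler (see the diff above), a caller can create a queue whose screenshot or HTML extraction is overridden per job. A minimal sketch, assuming the package root re-exports createCrawlQueue; the queue name and the innerText-based extraction are hypothetical, not part of this release:

    import { createCrawlQueue } from '@arcblock/crawler';
    import type { Page } from '@blocklet/puppeteer';

    // Hypothetical queue: keep the default screenshot behavior
    // (handleScreenshot omitted) but store the page's visible text
    // in place of the serialized HTML returned by getPageContent.
    const textQueue = createCrawlQueue('textCrawler', {
      handleHtml: async (page: Page) => page.evaluate(() => document.body.innerText),
    });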
package/lib/cjs/services/carbon.js
ADDED
@@ -0,0 +1,41 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.createCarbonImage = createCarbonImage;
+const config_1 = require("../config");
+function createCarbonImage(page, params) {
+    return __awaiter(this, void 0, void 0, function* () {
+        try {
+            yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
+            const targetElement = (yield page.$('.export-container'));
+            yield page.evaluate((target = document) => {
+                if (!target) {
+                    throw new Error('Target element not found');
+                }
+                target.querySelectorAll('span[role="presentation"]').forEach((node) => {
+                    var _a;
+                    const el = node;
+                    if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
+                        (_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
+                            el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
+                        });
+                    }
+                });
+            }, targetElement);
+            const buffer = yield targetElement.screenshot({ type: (params === null || params === void 0 ? void 0 : params.format) || 'webp', quality: (params === null || params === void 0 ? void 0 : params.quality) || 100 });
+            return buffer;
+        }
+        catch (e) {
+            config_1.logger.error('failed to crawl from carbon', { error: e });
+            throw e;
+        }
+    });
+}
package/lib/cjs/store/job.d.ts
CHANGED
@@ -9,11 +9,14 @@ export interface JobState {
     width?: number;
     height?: number;
     quality?: number;
+    format?: 'png' | 'jpeg' | 'webp';
     timeout?: number;
     fullPage?: boolean;
     lastModified?: string;
     waitTime?: number;
     replace?: boolean;
+    sync?: boolean;
+    ignoreRobots?: boolean;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
     localStorage?: {
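The three new JobState fields map onto the behavior added in crawler.js: format selects the screenshot type and file extension (default 'webp'), sync routes the job to the syncCrawler queue and re-runs it even if a matching job already exists, and ignoreRobots skips the robots.txt check. A hypothetical crawlUrl call exercising them (the URL and values are illustrative only, and the root re-export is assumed):

    import { crawlUrl } from '@arcblock/crawler';

    // Hypothetical parameters: re-crawl synchronously, skip robots.txt,
    // and store the screenshot as JPEG instead of the default webp.
    await crawlUrl({
      url: 'https://example.com',
      format: 'jpeg',
      sync: true,
      ignoreRobots: true,
    });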
package/lib/esm/crawler.d.ts
CHANGED
@@ -1,10 +1,15 @@
+import { Page } from '@blocklet/puppeteer';
 import { JobState, SnapshotModel } from './store';
-
+type PageHandler = {
+    handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
+    handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
+};
+export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
 export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
@@ -17,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  * @param params
  * @param callback callback when job finished
  */
+export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export {};
package/lib/esm/crawler.js
CHANGED
@@ -14,44 +14,44 @@ import fs from 'fs-extra';
 import path from 'path';
 import { config, logger } from './config';
 import { initPage } from './puppeteer';
+import { createCarbonImage } from './services/carbon';
 import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
 import { Job, Snapshot, sequelize } from './store';
 import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
 const { BaseState } = require('@abtnode/models');
 // eslint-disable-next-line import/no-mutable-exports
 const crawlQueue = createCrawlQueue('urlCrawler');
-export function createCrawlQueue(queue) {
+const syncQueue = createCrawlQueue('syncCrawler');
+const codeQueue = createCrawlQueue('codeCrawler', {
+    handleScreenshot: createCarbonImage,
+});
+export function createCrawlQueue(queue, handler) {
    const db = new BaseState(Job);
    return createQueue({
        store: new SequelizeStore(db, queue),
        concurrency: config.concurrency,
        onJob: (job) => __awaiter(this, void 0, void 0, function* () {
            logger.info('Starting to execute crawl job', job);
-            (12 removed lines: the previous robots.txt check, not fully captured in this diff view)
+            // check robots.txt
+            if (!job.ignoreRobots) {
+                const canCrawl = yield isAcceptCrawler(job.url);
+                if (!canCrawl) {
+                    logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                    const snapshot = convertJobToSnapshot({
+                        job,
+                        snapshot: {
+                            status: 'failed',
+                            error: 'Denied by robots.txt',
+                        },
+                    });
+                    yield Snapshot.upsert(snapshot);
+                    return snapshot;
+                }
             }
-            // if index reach autoCloseBrowserCount, close browser
-            // try {
-            // if (index >= autoCloseBrowserCount) {
-            // await closeBrowser({ trimCache: false });
-            // }
-            // } catch (error) {
-            // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-            // }
            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
            try {
                // get page content later
-                const result = yield getPageContent(formattedJob);
+                const result = yield getPageContent(formattedJob, handler);
                if (!result || (!result.html && !result.screenshot)) {
                    logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                    const snapshot = convertJobToSnapshot({
@@ -84,6 +84,7 @@ export function createCrawlQueue(queue) {
                const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
                    screenshot: result.screenshot,
                    html: result.html,
+                    format: formattedJob.format,
                });
                const snapshot = convertJobToSnapshot({
                    job: formattedJob,
@@ -124,13 +125,13 @@ export function getDataDir() {
    });
 }
 function saveSnapshotToLocal(_a) {
-    return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+    return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
        const { htmlDir, screenshotDir } = yield getDataDir();
        let screenshotPath = null;
        let htmlPath = null;
        if (screenshot) {
            const hash = md5(screenshot);
-            screenshotPath = path.join(screenshotDir, `${hash}
+            screenshotPath = path.join(screenshotDir, `${hash}.${format}`);
            logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
            yield fs.writeFile(screenshotPath, screenshot);
        }
@@ -146,7 +147,7 @@ function saveSnapshotToLocal(_a) {
        };
    });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
+export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
    const page = yield initPage();
    if (width && height) {
        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -210,7 +211,9 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
        }
    }
    try {
-        screenshot =
+        screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
+            ? yield handler.handleScreenshot(page)
+            : yield page.screenshot({ fullPage, quality, type: format });
    }
    catch (err) {
        logger.error('Failed to get screenshot:', err);
@@ -243,7 +246,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
            meta.title = data.title;
            meta.description = data.description;
            if (includeHtml) {
-                html = data.html;
+                html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
            }
        }
        catch (err) {
@@ -270,17 +273,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  * @param callback callback when job finished
  */
 // eslint-disable-next-line require-await
-export function crawlUrl(params, callback) {
+export function enqueue(queue, params, callback) {
    return __awaiter(this, void 0, void 0, function* () {
        // skip duplicate job
        const existsJob = yield Job.isExists(params);
-        if (existsJob) {
+        if (existsJob && !params.sync) {
            logger.info(`Crawl job already exists for ${params.url}, skip`);
            return existsJob.id;
        }
        logger.info('enqueue crawl job', params);
        const jobId = randomUUID();
-        const job =
+        const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
            logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
            callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
@@ -292,3 +295,9 @@ export function crawlUrl(params, callback) {
        return jobId;
    });
 }
+export function crawlUrl(params, callback) {
+    return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
+}
+export function crawlCode(params, callback) {
+    return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
+}
package/lib/esm/services/carbon.js
ADDED
@@ -0,0 +1,38 @@
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+import { logger } from '../config';
+export function createCarbonImage(page, params) {
+    return __awaiter(this, void 0, void 0, function* () {
+        try {
+            yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
+            const targetElement = (yield page.$('.export-container'));
+            yield page.evaluate((target = document) => {
+                if (!target) {
+                    throw new Error('Target element not found');
+                }
+                target.querySelectorAll('span[role="presentation"]').forEach((node) => {
+                    var _a;
+                    const el = node;
+                    if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
+                        (_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
+                            el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
+                        });
+                    }
+                });
+            }, targetElement);
+            const buffer = yield targetElement.screenshot({ type: (params === null || params === void 0 ? void 0 : params.format) || 'webp', quality: (params === null || params === void 0 ? void 0 : params.quality) || 100 });
+            return buffer;
+        }
+        catch (e) {
+            logger.error('failed to crawl from carbon', { error: e });
+            throw e;
+        }
+    });
+}
package/lib/esm/store/job.d.ts
CHANGED
@@ -9,11 +9,14 @@ export interface JobState {
     width?: number;
     height?: number;
     quality?: number;
+    format?: 'png' | 'jpeg' | 'webp';
     timeout?: number;
     fullPage?: boolean;
     lastModified?: string;
     waitTime?: number;
     replace?: boolean;
+    sync?: boolean;
+    ignoreRobots?: boolean;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
     localStorage?: {