@arcblock/crawler 1.1.6 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.d.ts +11 -4
- package/lib/cjs/crawler.js +96 -59
- package/lib/cjs/index.d.ts +1 -0
- package/lib/cjs/index.js +3 -5
- package/lib/cjs/services/carbon.d.ts +3 -0
- package/lib/cjs/services/carbon.js +87 -0
- package/lib/cjs/services/snapshot.d.ts +5 -2
- package/lib/cjs/services/snapshot.js +36 -6
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +9 -3
- package/lib/cjs/store/index.d.ts +4 -1
- package/lib/cjs/store/index.js +37 -45
- package/lib/cjs/store/job.d.ts +5 -0
- package/lib/cjs/store/migrate.d.ts +4 -0
- package/lib/cjs/store/migrate.js +63 -0
- package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
- package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/cjs/store/migrations/20250616-replace.js +40 -0
- package/lib/cjs/store/snapshot.d.ts +2 -0
- package/lib/cjs/store/snapshot.js +7 -0
- package/lib/esm/crawler.d.ts +11 -4
- package/lib/esm/crawler.js +92 -57
- package/lib/esm/index.d.ts +1 -0
- package/lib/esm/index.js +1 -4
- package/lib/esm/services/carbon.d.ts +3 -0
- package/lib/esm/services/carbon.js +84 -0
- package/lib/esm/services/snapshot.d.ts +5 -2
- package/lib/esm/services/snapshot.js +33 -4
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +9 -3
- package/lib/esm/store/index.d.ts +4 -1
- package/lib/esm/store/index.js +23 -45
- package/lib/esm/store/job.d.ts +5 -0
- package/lib/esm/store/migrate.d.ts +4 -0
- package/lib/esm/store/migrate.js +26 -0
- package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/esm/store/migrations/20250615-genesis.js +110 -0
- package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/esm/store/migrations/20250616-replace.js +36 -0
- package/lib/esm/store/snapshot.d.ts +2 -0
- package/lib/esm/store/snapshot.js +7 -0
- package/package.json +3 -2
package/lib/cjs/crawler.d.ts
CHANGED
@@ -1,11 +1,15 @@
-import {
-import { SnapshotModel } from './store
-
+import { Page } from '@blocklet/puppeteer';
+import { JobState, SnapshotModel } from './store';
+type PageHandler = {
+    handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
+    handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
+};
+export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
 export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
@@ -18,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  * @param params
  * @param callback callback when job finished
  */
+export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export {};
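Taken together, the new declarations suggest a usage pattern along these lines. This is a minimal TypeScript sketch, assuming the root entry re-exports crawlUrl and crawlCode (as index.d.ts below indicates) and that sync is a JobState field, as the compiled queue code implies; the URL and option values are illustrative only.

import { crawlCode, crawlUrl } from '@arcblock/crawler';

async function main() {
  // crawlCode routes the job through the code-screenshot queue; it defaults to
  // ignoreRobots: true, includeScreenshot: true and skips HTML capture.
  const codeJobId = await crawlCode(
    { url: 'https://example.com/code-snippet' }, // illustrative URL
    (snapshot) => console.log('code screenshot snapshot', snapshot?.jobId),
  );

  // crawlUrl picks the sync queue when params.sync is set.
  const pageJobId = await crawlUrl({ url: 'https://www.arcblock.io', sync: true });

  console.log({ codeJobId, pageJobId });
}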
package/lib/cjs/crawler.js
CHANGED
@@ -15,7 +15,9 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.getPageContent = void 0;
 exports.createCrawlQueue = createCrawlQueue;
 exports.getDataDir = getDataDir;
+exports.enqueue = enqueue;
 exports.crawlUrl = crawlUrl;
+exports.crawlCode = crawlCode;
 const queue_1 = __importDefault(require("@abtnode/queue"));
 const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
 const crypto_1 = require("crypto");
@@ -23,44 +25,44 @@ const fs_extra_1 = __importDefault(require("fs-extra"));
 const path_1 = __importDefault(require("path"));
 const config_1 = require("./config");
 const puppeteer_1 = require("./puppeteer");
+const carbon_1 = require("./services/carbon");
 const snapshot_1 = require("./services/snapshot");
-const
-const snapshot_2 = require("./store/snapshot");
+const store_1 = require("./store");
 const utils_1 = require("./utils");
 const { BaseState } = require('@abtnode/models');
-
-
-
-
-
+// eslint-disable-next-line import/no-mutable-exports
+const crawlQueue = createCrawlQueue('urlCrawler');
+const syncQueue = createCrawlQueue('syncCrawler');
+const codeQueue = createCrawlQueue('codeCrawler', {
+    handleScreenshot: carbon_1.createCarbonImage,
+});
+function createCrawlQueue(queue, handler) {
+    const db = new BaseState(store_1.Job);
+    return (0, queue_1.default)({
+        store: new sequelize_1.default(db, queue),
         concurrency: config_1.config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
            config_1.logger.info('Starting to execute crawl job', job);
-
-            if (!
-
-
-                        job,
-                        snapshot
-
-
-
-
-
-
+            // check robots.txt
+            if (!job.ignoreRobots) {
+                const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
+                if (!canCrawl) {
+                    config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                    const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                        job,
+                        snapshot: {
+                            status: 'failed',
+                            error: 'Denied by robots.txt',
+                        },
+                    });
+                    yield store_1.Snapshot.upsert(snapshot);
+                    return snapshot;
+                }
             }
-            // if index reach autoCloseBrowserCount, close browser
-            // try {
-            //   if (index >= autoCloseBrowserCount) {
-            //     await closeBrowser({ trimCache: false });
-            //   }
-            // } catch (error) {
-            //     logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-            // }
             const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
             try {
                 // get page content later
-                const result = yield (0, exports.getPageContent)(formattedJob);
+                const result = yield (0, exports.getPageContent)(formattedJob, handler);
                 if (!result || (!result.html && !result.screenshot)) {
                     config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
@@ -70,25 +72,43 @@ function createCrawlQueue() {
                             error: 'Failed to crawl content',
                         },
                     });
-                    yield
+                    yield store_1.Snapshot.upsert(snapshot);
                     return snapshot;
                 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                const snapshot = yield store_1.sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
+                    // delete old snapshot
+                    if (formattedJob.replace) {
+                        try {
+                            const deletedJobIds = yield (0, snapshot_1.deleteSnapshots)({
+                                url: formattedJob.url,
+                                replace: true,
+                            }, { txn });
+                            if (deletedJobIds) {
+                                config_1.logger.info('Deleted old snapshot', { deletedJobIds });
+                            }
+                        }
+                        catch (error) {
+                            config_1.logger.error('Failed to delete old snapshot', { error, formattedJob });
+                        }
+                    }
+                    // save html and screenshot to data dir
+                    const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
+                        screenshot: result.screenshot,
+                        html: result.html,
+                        format: formattedJob.format,
+                    });
+                    const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                        job: formattedJob,
+                        snapshot: {
+                            status: 'success',
+                            screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
+                            html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+                            meta: result.meta,
+                        },
+                    });
+                    yield store_1.Snapshot.upsert(snapshot, { transaction: txn });
+                    return snapshot;
+                }));
                 return snapshot;
             }
             catch (error) {
@@ -100,7 +120,7 @@ function createCrawlQueue() {
                        error: 'Internal error',
                    },
                });
-                yield
+                yield store_1.Snapshot.upsert(snapshot);
                return snapshot;
            }
        }),
@@ -116,13 +136,13 @@ function getDataDir() {
    });
 }
 function saveSnapshotToLocal(_a) {
-    return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+    return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
        const { htmlDir, screenshotDir } = yield getDataDir();
        let screenshotPath = null;
        let htmlPath = null;
        if (screenshot) {
            const hash = (0, utils_1.md5)(screenshot);
-            screenshotPath = path_1.default.join(screenshotDir, `${hash}
+            screenshotPath = path_1.default.join(screenshotDir, `${hash}.${format}`);
            config_1.logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
            yield fs_extra_1.default.writeFile(screenshotPath, screenshot);
        }
@@ -138,7 +158,7 @@ function saveSnapshotToLocal(_a) {
        };
    });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
+const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
    const page = yield (0, puppeteer_1.initPage)();
    if (width && height) {
        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -175,9 +195,18 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
    }
    // await for networkidle0
    // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
-
-
-
+    try {
+        yield Promise.all([
+            page.waitForNetworkIdle({
+                idleTime: 1.5 * 1000,
+                timeout,
+            }),
+            (0, utils_1.sleep)(waitTime),
+        ]);
+    }
+    catch (err) {
+        config_1.logger.warn(`Failed to wait for network idle in ${url}:`, err);
+    }
    // get screenshot
    if (includeScreenshot) {
        // Try to find the tallest element and set the browser to the same height
@@ -193,7 +222,9 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
            }
        }
        try {
-            screenshot =
+            screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
+                ? yield handler.handleScreenshot(page)
+                : yield page.screenshot({ fullPage, quality, type: format });
        }
        catch (err) {
            config_1.logger.error('Failed to get screenshot:', err);
@@ -221,12 +252,12 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
            // check if the page is an error page
            const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
            if (isErrorPage) {
-                throw new Error(
+                throw new Error(`${url} is an error page`);
            }
            meta.title = data.title;
            meta.description = data.description;
            if (includeHtml) {
-                html = data.html;
+                html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
            }
        }
        catch (err) {
@@ -254,17 +285,17 @@ exports.getPageContent = getPageContent;
  * @param callback callback when job finished
  */
 // eslint-disable-next-line require-await
-function
+function enqueue(queue, params, callback) {
    return __awaiter(this, void 0, void 0, function* () {
        // skip duplicate job
-        const existsJob = yield
-        if (existsJob) {
+        const existsJob = yield store_1.Job.isExists(params);
+        if (existsJob && !params.sync) {
            config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
            return existsJob.id;
        }
        config_1.logger.info('enqueue crawl job', params);
        const jobId = (0, crypto_1.randomUUID)();
-        const job =
+        const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
            config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
            callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
@@ -276,3 +307,9 @@ function crawlUrl(params, callback) {
        return jobId;
    });
 }
+function crawlUrl(params, callback) {
+    return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
+}
+function crawlCode(params, callback) {
+    return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
+}
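The new transaction block above is what the replace flag feeds into: when a job carries replace: true, snapshots matching { url, replace: true } are deleted (rows plus their files) inside the same transaction that upserts the fresh snapshot. A hedged sketch of triggering that path from the public API; the option values are illustrative, and the field names are taken from the JobState destructuring shown above.

import { crawlUrl } from '@arcblock/crawler';

// Re-crawl a URL and let the worker drop the previous snapshot for it.
function refreshSnapshot(url: string) {
  return crawlUrl({
    url,
    replace: true,          // deleteSnapshots({ url, replace: true }) runs inside the txn
    includeHtml: true,
    includeScreenshot: true,
    format: 'webp',         // screenshot file extension used by saveSnapshotToLocal
    waitTime: 1500,         // extra delay awaited alongside waitForNetworkIdle
  });
}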
package/lib/cjs/index.d.ts
CHANGED
@@ -3,4 +3,5 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
+export { migrate } from './store/migrate';
 export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
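With migrate now re-exported from the package root, the host application can presumably run the store migrations itself before crawling. A sketch only: the signature of ./store/migrate is not shown in this diff, so the zero-argument call is an assumption, and the concurrency value is illustrative.

import { initCrawler, migrate } from '@arcblock/crawler';

async function start() {
  // Assumed zero-argument call; the migrate signature is not part of this diff.
  await migrate();
  await initCrawler({ concurrency: 2 });
}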
package/lib/cjs/index.js
CHANGED
@@ -48,28 +48,26 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.utils = void 0;
+exports.migrate = exports.utils = void 0;
 exports.initCrawler = initCrawler;
 /* eslint-disable @typescript-eslint/indent */
 const merge_1 = __importDefault(require("lodash/merge"));
 const config_1 = require("./config");
-const crawler_1 = require("./crawler");
 const cron_1 = require("./cron");
 const puppeteer_1 = require("./puppeteer");
-const store_1 = require("./store");
 __exportStar(require("./crawler"), exports);
 __exportStar(require("./site"), exports);
 __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
+var migrate_1 = require("./store/migrate");
+Object.defineProperty(exports, "migrate", { enumerable: true, get: function () { return migrate_1.migrate; } });
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
         var _a;
         (0, merge_1.default)(config_1.config, params);
         config_1.logger.info('Init crawler', { params, config: config_1.config });
         try {
-            yield (0, store_1.initDatabase)();
             yield (0, puppeteer_1.ensureBrowser)();
-            yield (0, crawler_1.createCrawlQueue)();
             if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield (0, cron_1.initCron)();
             }

package/lib/cjs/services/carbon.js
ADDED
@@ -0,0 +1,87 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.createCarbonImage = createCarbonImage;
+const config_1 = require("../config");
+// TODO expose local version of dom-to-image
+const DOM_TO_IMAGE_URL = 'https://unpkg.com/dom-to-image@2.6.0/dist/dom-to-image.min.js';
+function createCarbonImage(page, params) {
+    return __awaiter(this, void 0, void 0, function* () {
+        try {
+            yield page.addScriptTag({ url: DOM_TO_IMAGE_URL });
+            yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
+            const targetElement = yield page.$('.export-container');
+            const format = (params === null || params === void 0 ? void 0 : params.format) || 'png';
+            const dataUrl = yield page.evaluate((target = document, imageFormat = 'png') => {
+                const query = new URLSearchParams(document.location.search);
+                const EXPORT_SIZES_HASH = {
+                    '1x': '1',
+                    '2x': '2',
+                    '4x': '4',
+                };
+                const exportSize = EXPORT_SIZES_HASH[query.get('es')] || '2';
+                if (!target) {
+                    throw new Error('Target element not found');
+                }
+                target.querySelectorAll('span[role="presentation"]').forEach((node) => {
+                    var _a;
+                    const el = node;
+                    if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
+                        (_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
+                            el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
+                        });
+                    }
+                });
+                const width = target.offsetWidth * exportSize;
+                const height = query.get('si') === 'true'
+                    ? target.offsetWidth * exportSize
+                    : target.offsetHeight * exportSize;
+                const config = {
+                    style: {
+                        transform: `scale(${exportSize})`,
+                        'transform-origin': 'center',
+                        background: query.get('si') ? query.get('bg') : 'none',
+                    },
+                    filter: (n) => {
+                        if (n.className) {
+                            return String(n.className).indexOf('eliminateOnRender') < 0;
+                        }
+                        return true;
+                    },
+                    width,
+                    height,
+                };
+                switch (imageFormat) {
+                    case 'jpeg':
+                        // @ts-ignore: domtoimage is injected by addScriptTag
+                        return domtoimage.toJpeg(target, config);
+                    case 'webp':
+                        // dom-to-image doesn't support webp directly, fall back to png
+                        // @ts-ignore: domtoimage is injected by addScriptTag
+                        return domtoimage.toPng(target, config);
+                    case 'png':
+                    default:
+                        // @ts-ignore: domtoimage is injected by addScriptTag
+                        return domtoimage.toPng(target, config);
+                }
+            }, targetElement, format);
+            const base64Data = dataUrl.split(',')[1];
+            if (!base64Data) {
+                throw new Error('Failed to extract base64 data from image');
+            }
+            return Buffer.from(base64Data, 'base64');
+        }
+        catch (e) {
+            config_1.logger.error('failed to crawl from carbon', { error: e });
+            throw e;
+        }
+    });
+}

package/lib/cjs/services/snapshot.d.ts
CHANGED
@@ -1,5 +1,5 @@
-import {
-import { SnapshotModel } from '../store
+import { Transaction, WhereOptions } from '@sequelize/core';
+import { JobState, SnapshotModel } from '../store';
 export declare function convertJobToSnapshot({ job, snapshot }: {
     job: JobState;
     snapshot?: Partial<SnapshotModel>;
@@ -10,3 +10,6 @@ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<
  */
 export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
 export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
+export declare function deleteSnapshots(where: WhereOptions<SnapshotModel>, { txn }?: {
+    txn?: Transaction;
+}): Promise<string[]>;

package/lib/cjs/services/snapshot.js
CHANGED
@@ -16,17 +16,17 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
 exports.formatSnapshot = formatSnapshot;
 exports.getSnapshot = getSnapshot;
 exports.getLatestSnapshot = getLatestSnapshot;
+exports.deleteSnapshots = deleteSnapshots;
 const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
 const pick_1 = __importDefault(require("lodash/pick"));
 const promises_1 = __importDefault(require("node:fs/promises"));
 const node_path_1 = __importDefault(require("node:path"));
 const ufo_1 = require("ufo");
 const config_1 = require("../config");
-const
-const snapshot_1 = require("../store/snapshot");
+const store_1 = require("../store");
 const utils_1 = require("../utils");
 function convertJobToSnapshot({ job, snapshot }) {
-    return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
+    return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), replace: job.replace, options: {
         width: job.width,
         height: job.height,
         includeScreenshot: job.includeScreenshot,
@@ -64,11 +64,11 @@ function formatSnapshot(snapshot, columns) {
  */
 function getSnapshot(jobId) {
     return __awaiter(this, void 0, void 0, function* () {
-        const snapshot = yield
+        const snapshot = yield store_1.Snapshot.findSnapshot({ where: { jobId } });
         if (snapshot) {
             return formatSnapshot(snapshot);
         }
-        const job = yield
+        const job = yield store_1.Job.findJob({ id: jobId });
         if (job) {
             return {
                 jobId,
@@ -80,12 +80,42 @@ function getSnapshot(jobId) {
 }
 function getLatestSnapshot(url) {
     return __awaiter(this, void 0, void 0, function* () {
-        const snapshot = yield
+        const snapshot = yield store_1.Snapshot.findSnapshot({
            where: {
                url: (0, utils_1.formatUrl)(url),
                status: 'success',
            },
+            order: [
+                ['lastModified', 'DESC'],
+                ['updatedAt', 'DESC'],
+            ],
        });
        return snapshot ? formatSnapshot(snapshot) : null;
    });
 }
+function deleteSnapshots(where_1) {
+    return __awaiter(this, arguments, void 0, function* (where, { txn } = {}) {
+        const snapshots = yield store_1.Snapshot.findAll({
+            where,
+            order: [
+                ['lastModified', 'DESC'],
+                ['updatedAt', 'DESC'],
+            ],
+        });
+        const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
+            try {
+                yield Promise.all([
+                    snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
+                    snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
+                ]);
+                yield snapshot.destroy({ transaction: txn });
+                return snapshot.jobId;
+            }
+            catch (error) {
+                config_1.logger.error('Failed to delete snapshot', { error, snapshot });
+                throw error;
+            }
+        })));
+        return jobIds.filter(Boolean);
+    });
+}
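deleteSnapshots, declared above and re-exported through services/snapshot, removes the matching snapshot rows and unlinks their html and screenshot files under config.dataDir, returning the affected jobIds. A minimal sketch of calling it directly; the where clause is illustrative.

import { deleteSnapshots } from '@arcblock/crawler';

async function purgeUrl(url: string) {
  // Rows are looked up with findAll(where), their files are unlinked, then each row is destroyed.
  const removedJobIds = await deleteSnapshots({ url });
  return removedJobIds;
}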
package/lib/cjs/site.d.ts
CHANGED
@@ -1,2 +1,2 @@
 import { Site } from './config';
-export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
+export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(`${string}-${string}-${string}-${string}-${string}` | null)[]>;
package/lib/cjs/site.js
CHANGED
@@ -14,12 +14,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.crawlSite = void 0;
 const uniq_1 = __importDefault(require("lodash/uniq"));
+const node_crypto_1 = require("node:crypto");
 const p_map_1 = __importDefault(require("p-map"));
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
-const
+const store_1 = require("./store");
 const utils_1 = require("./utils");
 const crawlBlockletRunningMap = new Map();
+const crawlQueue = (0, crawler_1.createCrawlQueue)('cronJobs');
 function parseSitemapUrl(sitemapItem) {
     var _a;
     const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -48,7 +50,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
    try {
        const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
            processCount++;
-            const snapshot = yield
+            const snapshot = yield store_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
            if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
                const lastModified = new Date(snapshot.lastModified);
                // skip if snapshot lastModified is greater than sitemap lastmod
@@ -67,12 +69,16 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
                url,
            });
            crawlCount++;
-
+            const jobId = (0, node_crypto_1.randomUUID)();
+            crawlQueue.push({
+                id: jobId,
                url,
                lastModified: sitemapItem.lastmod,
                includeScreenshot: false,
                includeHtml: true,
+                replace: true,
            });
+            return jobId;
        }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
        config_1.logger.info('Enqueued jobs from sitemap finished', {
            url,
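crawlSite now assigns each enqueued sitemap entry its own randomUUID and pushes it to the cronJobs queue with replace: true, returning that id (or null for skipped entries), which is what the narrowed template-literal return type in site.d.ts reflects. A hedged usage sketch; the Site values, including the interval, are illustrative.

import { crawlSite } from '@arcblock/crawler';

async function crawlDocs() {
  const jobIds = await crawlSite({ url: 'https://www.arcblock.io', pathname: '/docs', interval: 86400 });
  // Entries whose stored snapshot is newer than the sitemap lastmod come back as null.
  return jobIds.filter((id) => id !== null);
}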
package/lib/cjs/store/index.d.ts
CHANGED
@@ -1,3 +1,6 @@
 import { Sequelize } from '@sequelize/core';
 import { SqliteDialect } from '@sequelize/sqlite3';
-
+declare const sequelize: Sequelize<SqliteDialect>;
+export { sequelize };
+export * from './job';
+export * from './snapshot';